Mirror of https://github.com/neondatabase/neon.git (synced 2026-01-25 22:30:38 +00:00)

Compare commits: return-met ... test_multi
1 commit: 8e3f42e0ba
@@ -21,5 +21,4 @@
!workspace_hack/
!neon_local/
!scripts/ninstall.sh
!scripts/combine_control_files.py
!vm-cgconfig.conf
@@ -209,4 +209,4 @@ runs:
uses: ./.github/actions/allure-report-store
with:
report-dir: /tmp/test_output/allure/results
unique-key: ${{ inputs.build_type }}-${{ inputs.pg_version }}
unique-key: ${{ inputs.build_type }}
.github/workflows/build_and_test.yml (vendored, 46 changed lines)
@@ -955,15 +955,22 @@ jobs:
version: [ v14, v15 ]

env:
EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
# While on transition period we extract public extensions from compute-node image and custom extensions from extensions image.
# Later all the extensions will be moved to extensions image.
EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:latest
COMPUTE_NODE_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:latest
AWS_ACCESS_KEY_ID: ${{ github.ref_name == 'release' && secrets.AWS_ACCESS_KEY_PROD || secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ github.ref_name == 'release' && secrets.AWS_SECRET_KEY_PROD || secrets.AWS_SECRET_KEY_DEV }}
S3_BUCKETS: ${{ github.ref_name == 'release' && vars.S3_EXTENSIONS_BUCKETS_PROD || vars.S3_EXTENSIONS_BUCKETS_DEV }}
S3_BUCKETS: |
${{ github.ref_name == 'release' &&
'neon-prod-extensions-ap-southeast-1 neon-prod-extensions-eu-central-1 neon-prod-extensions-us-east-1 neon-prod-extensions-us-east-2 neon-prod-extensions-us-west-2' ||
'neon-dev-extensions-eu-central-1 neon-dev-extensions-eu-west-1 neon-dev-extensions-us-east-2' }}

steps:
- name: Pull postgres-extensions image
run: |
docker pull ${EXTENSIONS_IMAGE}
docker pull ${COMPUTE_NODE_IMAGE}

- name: Create postgres-extensions container
id: create-container

@@ -971,23 +978,46 @@ jobs:
EID=$(docker create ${EXTENSIONS_IMAGE} true)
echo "EID=${EID}" >> $GITHUB_OUTPUT

CID=$(docker create ${COMPUTE_NODE_IMAGE} true)
echo "CID=${CID}" >> $GITHUB_OUTPUT

- name: Extract postgres-extensions from container
run: |
rm -rf ./extensions-to-upload # Just in case
mkdir -p extensions-to-upload
rm -rf ./extensions-to-upload ./custom-extensions # Just in case

docker cp ${{ steps.create-container.outputs.EID }}:/extensions/ ./extensions-to-upload/
docker cp ${{ steps.create-container.outputs.EID }}:/ext_index.json ./extensions-to-upload/
# In compute image we have a bit different directory layout
mkdir -p extensions-to-upload/share
docker cp ${{ steps.create-container.outputs.CID }}:/usr/local/share/extension ./extensions-to-upload/share/extension
docker cp ${{ steps.create-container.outputs.CID }}:/usr/local/lib ./extensions-to-upload/lib

# Delete Neon extensitons (they always present on compute-node image)
rm -rf ./extensions-to-upload/share/extension/neon*
rm -rf ./extensions-to-upload/lib/neon*

# Delete leftovers from the extension build step
rm -rf ./extensions-to-upload/lib/pgxs
rm -rf ./extensions-to-upload/lib/pkgconfig

docker cp ${{ steps.create-container.outputs.EID }}:/extensions ./custom-extensions
for EXT_NAME in $(ls ./custom-extensions); do
mkdir -p ./extensions-to-upload/${EXT_NAME}/share

mv ./custom-extensions/${EXT_NAME}/share/extension ./extensions-to-upload/${EXT_NAME}/share/extension
mv ./custom-extensions/${EXT_NAME}/lib ./extensions-to-upload/${EXT_NAME}/lib
done

- name: Upload postgres-extensions to S3
# TODO: Reenable step after switching to the new extensions format (tar-gzipped + index.json)
if: false
run: |
for BUCKET in $(echo ${S3_BUCKETS:-[]} | jq --raw-output '.[]'); do
for BUCKET in $(echo ${S3_BUCKETS}); do
aws s3 cp --recursive --only-show-errors ./extensions-to-upload s3://${BUCKET}/${{ needs.tag.outputs.build-tag }}/${{ matrix.version }}
done

- name: Cleanup
if: ${{ always() && steps.create-container.outputs.EID }}
if: ${{ always() && (steps.create-container.outputs.CID || steps.create-container.outputs.EID) }}
run: |
docker rm ${{ steps.create-container.outputs.CID }} || true
docker rm ${{ steps.create-container.outputs.EID }} || true

deploy:
Cargo.lock (generated, 12 changed lines)
@@ -2506,7 +2506,6 @@ dependencies = [
"pageserver",
"postgres_ffi",
"svg_fmt",
"tokio",
"utils",
"workspace_hack",
]

@@ -2545,7 +2544,6 @@ dependencies = [
"metrics",
"nix",
"num-traits",
"num_cpus",
"once_cell",
"pageserver_api",
"pin-project-lite",

@@ -2782,7 +2780,7 @@ dependencies = [
[[package]]
name = "postgres"
version = "0.19.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
dependencies = [
"bytes",
"fallible-iterator",

@@ -2795,7 +2793,7 @@ dependencies = [
[[package]]
name = "postgres-native-tls"
version = "0.5.0"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
dependencies = [
"native-tls",
"tokio",

@@ -2806,7 +2804,7 @@ dependencies = [
[[package]]
name = "postgres-protocol"
version = "0.6.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
dependencies = [
"base64 0.20.0",
"byteorder",

@@ -2824,7 +2822,7 @@ dependencies = [
[[package]]
name = "postgres-types"
version = "0.2.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
dependencies = [
"bytes",
"fallible-iterator",

@@ -4314,7 +4312,7 @@ dependencies = [
[[package]]
name = "tokio-postgres"
version = "0.7.7"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
dependencies = [
"async-trait",
"byteorder",
Cargo.toml (12 changed lines)
@@ -144,11 +144,11 @@ env_logger = "0.10"
log = "0.4"

## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }

## Other git libraries
heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending

@@ -183,7 +183,7 @@ tonic-build = "0.9"

# This is only needed for proxy's tests.
# TODO: we should probably fork `tokio-postgres-rustls` instead.
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }

################# Binary contents sections
@@ -13,7 +13,7 @@ FROM debian:bullseye-slim AS build-deps
RUN apt update && \
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \
libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd
libicu-dev libxslt1-dev liblz4-dev libzstd-dev

#########################################################################################
#

@@ -77,7 +77,6 @@ ENV PATH "/usr/local/pgsql/bin:$PATH"
RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postgis.tar.gz && \
echo "9a2a219da005a1730a39d1959a1c7cec619b1efb009b65be80ffc25bad299068 postgis.tar.gz" | sha256sum --check && \
mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
./autogen.sh && \
./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \
make -j $(getconf _NPROCESSORS_ONLN) install && \

@@ -90,28 +89,17 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postg
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control && \
mkdir -p /extensions/postgis && \
cp /usr/local/pgsql/share/extension/postgis.control /extensions/postgis && \
cp /usr/local/pgsql/share/extension/postgis_raster.control /extensions/postgis && \
cp /usr/local/pgsql/share/extension/postgis_sfcgal.control /extensions/postgis && \
cp /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control /extensions/postgis && \
cp /usr/local/pgsql/share/extension/postgis_topology.control /extensions/postgis && \
cp /usr/local/pgsql/share/extension/address_standardizer.control /extensions/postgis && \
cp /usr/local/pgsql/share/extension/address_standardizer_data_us.control /extensions/postgis
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control

RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \
echo "cac297c07d34460887c4f3b522b35c470138760fe358e351ad1db4edb6ee306e pgrouting.tar.gz" | sha256sum --check && \
mkdir pgrouting-src && cd pgrouting-src && tar xvzf ../pgrouting.tar.gz --strip-components=1 -C . && \
mkdir build && cd build && \
mkdir build && \
cd build && \
cmake -DCMAKE_BUILD_TYPE=Release .. && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
cp /usr/local/pgsql/share/extension/pgrouting.control /extensions/postgis && \
sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/postgis.tar.zst -T -
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control

#########################################################################################
#

@@ -431,16 +419,12 @@ RUN apt-get update && \
wget https://github.com/ketteq-neon/postgres-exts/archive/e0bd1a9d9313d7120c1b9c7bb15c48c0dede4c4e.tar.gz -O kq_imcx.tar.gz && \
echo "dc93a97ff32d152d32737ba7e196d9687041cda15e58ab31344c2f2de8855336 kq_imcx.tar.gz" | sha256sum --check && \
mkdir kq_imcx-src && cd kq_imcx-src && tar xvzf ../kq_imcx.tar.gz --strip-components=1 -C . && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
mkdir build && cd build && \
mkdir build && \
cd build && \
cmake -DCMAKE_BUILD_TYPE=Release .. && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
mkdir -p /extensions/kq_imcx && cp /usr/local/pgsql/share/extension/kq_imcx.control /extensions/kq_imcx && \
sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/kq_imcx.tar.zst -T -
echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control

#########################################################################################
#

@@ -551,8 +535,10 @@ FROM build-deps AS pg-embedding-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/0.3.1.tar.gz -O pg_embedding.tar.gz && \
echo "c4ae84eef36fa8ec5868f6e061f39812f19ee5ba3604d428d40935685c7be512 pg_embedding.tar.gz" | sha256sum --check && \
# eeb3ba7c3a60c95b2604dd543c64b2f1bb4a3703 made on 15/07/2023
# There is no release tag yet
RUN wget https://github.com/neondatabase/pg_embedding/archive/eeb3ba7c3a60c95b2604dd543c64b2f1bb4a3703.tar.gz -O pg_embedding.tar.gz && \
echo "030846df723652f99a8689ce63b66fa0c23477a7fd723533ab8a6b28ab70730f pg_embedding.tar.gz" | sha256sum --check && \
mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \

@@ -567,17 +553,16 @@ RUN wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/0.3.1.ta
FROM build-deps AS pg-anon-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

# Kaniko doesn't allow to do `${from#/usr/local/pgsql/}`, so we use `${from:17}` instead
ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgresql_anonymizer-1.1.0.tar.gz -O pg_anon.tar.gz && \
echo "08b09d2ff9b962f96c60db7e6f8e79cf7253eb8772516998fc35ece08633d3ad pg_anon.tar.gz" | sha256sum --check && \
mkdir pg_anon-src && cd pg_anon-src && tar xvzf ../pg_anon.tar.gz --strip-components=1 -C . && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
find /usr/local/pgsql -type f | sort > /before.txt && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
mkdir -p /extensions/anon && cp /usr/local/pgsql/share/extension/anon.control /extensions/anon && \
sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/anon.tar.zst -T -
find /usr/local/pgsql -type f | sort > /after.txt && \
/bin/bash -c 'for from in $(comm -13 /before.txt /after.txt); do to=/extensions/anon/${from:17} && mkdir -p $(dirname ${to}) && cp -a ${from} ${to}; done'

#########################################################################################
#

@@ -769,23 +754,16 @@ RUN rm /usr/local/pgsql/lib/lib*.a
# Extenstion only
#
#########################################################################################
FROM python:3.9-slim-bullseye AS generate-ext-index
ARG PG_VERSION
ARG BUILD_TAG
RUN apt update && apt install -y zstd

# copy the control files here
COPY --from=kq-imcx-pg-build /extensions/ /extensions/
COPY --from=pg-anon-pg-build /extensions/ /extensions/
COPY --from=postgis-build /extensions/ /extensions/
COPY scripts/combine_control_files.py ./combine_control_files.py
RUN python3 ./combine_control_files.py ${PG_VERSION} ${BUILD_TAG} --public_extensions="anon,postgis"

FROM scratch AS postgres-extensions
# After the transition this layer will include all extensitons.
# As for now, it's only a couple for testing purposses
COPY --from=generate-ext-index /extensions/*.tar.zst /extensions/
COPY --from=generate-ext-index /ext_index.json /ext_index.json
# As for now, it's only for new custom ones
#
# # Default extensions
# COPY --from=postgres-cleanup-layer /usr/local/pgsql/share/extension /usr/local/pgsql/share/extension
# COPY --from=postgres-cleanup-layer /usr/local/pgsql/lib /usr/local/pgsql/lib
# Custom extensions
COPY --from=pg-anon-pg-build /extensions/anon/lib/ /extensions/anon/lib
COPY --from=pg-anon-pg-build /extensions/anon/share/extension /extensions/anon/share/extension

#########################################################################################
#
Makefile (2 changed lines)

@@ -108,8 +108,6 @@ postgres-%: postgres-configure-% \
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_buffercache install
+@echo "Compiling pageinspect $*"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install
+@echo "Compiling amcheck $*"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/amcheck install

.PHONY: postgres-clean-%
postgres-clean-%:
@@ -193,13 +193,6 @@ fn main() -> Result<()> {
|
||||
if !spec_set {
|
||||
// No spec provided, hang waiting for it.
|
||||
info!("no compute spec provided, waiting");
|
||||
|
||||
// TODO this can stall startups in the unlikely event that we bind
|
||||
// this compute node while it's busy prewarming. It's not too
|
||||
// bad because it's just 100ms and unlikely, but it's an
|
||||
// avoidable problem.
|
||||
compute.prewarm_postgres()?;
|
||||
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
while state.status != ComputeStatus::ConfigurationPending {
|
||||
state = compute.state_changed.wait(state).unwrap();
|
||||
|
||||
@@ -8,11 +8,9 @@ use std::sync::{Condvar, Mutex};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use chrono::{DateTime, Utc};
|
||||
use futures::stream::FuturesUnordered;
|
||||
use futures::StreamExt;
|
||||
use postgres::{Client, NoTls};
|
||||
use tokio_postgres;
|
||||
use tracing::{error, info, instrument, warn};
|
||||
use tracing::{info, instrument, warn};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
@@ -23,7 +21,6 @@ use utils::measured_stream::MeasuredReader;
|
||||
use crate::config;
|
||||
use crate::pg_helpers::*;
|
||||
use crate::spec::*;
|
||||
use crate::sync_sk::{check_if_synced, ping_safekeeper};
|
||||
|
||||
/// Compute node info shared across several `compute_ctl` threads.
|
||||
pub struct ComputeNode {
|
||||
@@ -89,7 +86,6 @@ pub struct ParsedSpec {
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
pub pageserver_connstr: String,
|
||||
pub safekeeper_connstrings: Vec<String>,
|
||||
pub storage_auth_token: Option<String>,
|
||||
}
|
||||
|
||||
@@ -107,21 +103,6 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
|
||||
.clone()
|
||||
.or_else(|| spec.cluster.settings.find("neon.pageserver_connstring"))
|
||||
.ok_or("pageserver connstr should be provided")?;
|
||||
let safekeeper_connstrings = if spec.safekeeper_connstrings.is_empty() {
|
||||
if matches!(spec.mode, ComputeMode::Primary) {
|
||||
spec.cluster
|
||||
.settings
|
||||
.find("neon.safekeepers")
|
||||
.ok_or("safekeeper connstrings should be provided")?
|
||||
.split(',')
|
||||
.map(|str| str.to_string())
|
||||
.collect()
|
||||
} else {
|
||||
vec![]
|
||||
}
|
||||
} else {
|
||||
spec.safekeeper_connstrings.clone()
|
||||
};
|
||||
let storage_auth_token = spec.storage_auth_token.clone();
|
||||
let tenant_id: TenantId = if let Some(tenant_id) = spec.tenant_id {
|
||||
tenant_id
|
||||
@@ -147,7 +128,6 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
|
||||
Ok(ParsedSpec {
|
||||
spec,
|
||||
pageserver_connstr,
|
||||
safekeeper_connstrings,
|
||||
storage_auth_token,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
@@ -329,102 +309,6 @@ impl ComputeNode {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn check_safekeepers_synced_async(
|
||||
&self,
|
||||
compute_state: &ComputeState,
|
||||
) -> Result<Option<Lsn>> {
|
||||
// Construct a connection config for each safekeeper
|
||||
let pspec: ParsedSpec = compute_state
|
||||
.pspec
|
||||
.as_ref()
|
||||
.expect("spec must be set")
|
||||
.clone();
|
||||
let sk_connstrs: Vec<String> = pspec.safekeeper_connstrings.clone();
|
||||
let sk_configs = sk_connstrs.into_iter().map(|connstr| {
|
||||
// Format connstr
|
||||
let id = connstr.clone();
|
||||
let connstr = format!("postgresql://no_user@{}", connstr);
|
||||
let options = format!(
|
||||
"-c timeline_id={} tenant_id={}",
|
||||
pspec.timeline_id, pspec.tenant_id
|
||||
);
|
||||
|
||||
// Construct client
|
||||
let mut config = tokio_postgres::Config::from_str(&connstr).unwrap();
|
||||
config.options(&options);
|
||||
if let Some(storage_auth_token) = pspec.storage_auth_token.clone() {
|
||||
config.password(storage_auth_token);
|
||||
}
|
||||
|
||||
(id, config)
|
||||
});
|
||||
|
||||
// Create task set to query all safekeepers
|
||||
let mut tasks = FuturesUnordered::new();
|
||||
let quorum = sk_configs.len() / 2 + 1;
|
||||
for (id, config) in sk_configs {
|
||||
let timeout = tokio::time::Duration::from_millis(100);
|
||||
let task = tokio::time::timeout(timeout, ping_safekeeper(id, config));
|
||||
tasks.push(tokio::spawn(task));
|
||||
}
|
||||
|
||||
// Get a quorum of responses or errors
|
||||
let mut responses = Vec::new();
|
||||
let mut join_errors = Vec::new();
|
||||
let mut task_errors = Vec::new();
|
||||
let mut timeout_errors = Vec::new();
|
||||
while let Some(response) = tasks.next().await {
|
||||
match response {
|
||||
Ok(Ok(Ok(r))) => responses.push(r),
|
||||
Ok(Ok(Err(e))) => task_errors.push(e),
|
||||
Ok(Err(e)) => timeout_errors.push(e),
|
||||
Err(e) => join_errors.push(e),
|
||||
};
|
||||
if responses.len() >= quorum {
|
||||
break;
|
||||
}
|
||||
if join_errors.len() + task_errors.len() + timeout_errors.len() >= quorum {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// In case of error, log and fail the check, but don't crash.
|
||||
// We're playing it safe because these errors could be transient
|
||||
// and we don't yet retry. Also being careful here allows us to
|
||||
// be backwards compatible with safekeepers that don't have the
|
||||
// TIMELINE_STATUS API yet.
|
||||
if responses.len() < quorum {
|
||||
error!(
|
||||
"failed sync safekeepers check {:?} {:?} {:?}",
|
||||
join_errors, task_errors, timeout_errors
|
||||
);
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
Ok(check_if_synced(responses))
|
||||
}
|
||||
|
||||
// Fast path for sync_safekeepers. If they're already synced we get the lsn
|
||||
// in one roundtrip. If not, we should do a full sync_safekeepers.
|
||||
pub fn check_safekeepers_synced(&self, compute_state: &ComputeState) -> Result<Option<Lsn>> {
|
||||
let start_time = Utc::now();
|
||||
|
||||
// Run actual work with new tokio runtime
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("failed to create rt");
|
||||
let result = rt.block_on(self.check_safekeepers_synced_async(compute_state));
|
||||
|
||||
// Record runtime
|
||||
self.state.lock().unwrap().metrics.sync_sk_check_ms = Utc::now()
|
||||
.signed_duration_since(start_time)
|
||||
.to_std()
|
||||
.unwrap()
|
||||
.as_millis() as u64;
|
||||
result
|
||||
}
|
||||
|
||||
// Run `postgres` in a special mode with `--sync-safekeepers` argument
|
||||
// and return the reported LSN back to the caller.
|
||||
#[instrument(skip_all)]
|
||||
@@ -487,14 +371,10 @@ impl ComputeNode {
|
||||
// cannot sync safekeepers.
|
||||
let lsn = match spec.mode {
|
||||
ComputeMode::Primary => {
|
||||
info!("checking if safekeepers are synced");
|
||||
let lsn = if let Ok(Some(lsn)) = self.check_safekeepers_synced(compute_state) {
|
||||
lsn
|
||||
} else {
|
||||
info!("starting safekeepers syncing");
|
||||
self.sync_safekeepers(pspec.storage_auth_token.clone())
|
||||
.with_context(|| "failed to sync safekeepers")?
|
||||
};
|
||||
info!("starting safekeepers syncing");
|
||||
let lsn = self
|
||||
.sync_safekeepers(pspec.storage_auth_token.clone())
|
||||
.with_context(|| "failed to sync safekeepers")?;
|
||||
info!("safekeepers synced at LSN {}", lsn);
|
||||
lsn
|
||||
}
|
||||
@@ -532,50 +412,6 @@ impl ComputeNode {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Start and stop a postgres process to warm up the VM for startup.
|
||||
pub fn prewarm_postgres(&self) -> Result<()> {
|
||||
info!("prewarming");
|
||||
|
||||
// Create pgdata
|
||||
let pgdata = &format!("{}.warmup", self.pgdata);
|
||||
create_pgdata(pgdata)?;
|
||||
|
||||
// Run initdb to completion
|
||||
info!("running initdb");
|
||||
let initdb_bin = Path::new(&self.pgbin).parent().unwrap().join("initdb");
|
||||
Command::new(initdb_bin)
|
||||
.args(["-D", pgdata])
|
||||
.output()
|
||||
.expect("cannot start initdb process");
|
||||
|
||||
// Write conf
|
||||
use std::io::Write;
|
||||
let conf_path = Path::new(pgdata).join("postgresql.conf");
|
||||
let mut file = std::fs::File::create(conf_path)?;
|
||||
writeln!(file, "shared_buffers=65536")?;
|
||||
writeln!(file, "port=51055")?; // Nobody should be connecting
|
||||
writeln!(file, "shared_preload_libraries = 'neon'")?;
|
||||
|
||||
// Start postgres
|
||||
info!("starting postgres");
|
||||
let mut pg = Command::new(&self.pgbin)
|
||||
.args(["-D", pgdata])
|
||||
.spawn()
|
||||
.expect("cannot start postgres process");
|
||||
|
||||
// Stop it when it's ready
|
||||
info!("waiting for postgres");
|
||||
wait_for_postgres(&mut pg, Path::new(pgdata))?;
|
||||
pg.kill()?;
|
||||
info!("sent kill signal");
|
||||
pg.wait()?;
|
||||
info!("done prewarming");
|
||||
|
||||
// clean up
|
||||
let _ok = fs::remove_dir_all(pgdata);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Start Postgres as a child process and manage DBs/roles.
|
||||
/// After that this will hang waiting on the postmaster process to exit.
|
||||
#[instrument(skip_all)]
|
||||
|
||||
@@ -30,7 +30,6 @@ fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse {
|
||||
status: state.status,
|
||||
last_active: state.last_active,
|
||||
error: state.error.clone(),
|
||||
metrics: state.metrics.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -13,4 +13,3 @@ pub mod monitor;
|
||||
pub mod params;
|
||||
pub mod pg_helpers;
|
||||
pub mod spec;
|
||||
pub mod sync_sk;
|
||||
|
||||
@@ -1,98 +0,0 @@
|
||||
// Utils for running sync_safekeepers
|
||||
use anyhow::Result;
|
||||
use tracing::info;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
#[derive(Copy, Clone, Debug)]
|
||||
pub enum TimelineStatusResponse {
|
||||
NotFound,
|
||||
Ok(TimelineStatusOkResponse),
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Debug)]
|
||||
pub struct TimelineStatusOkResponse {
|
||||
flush_lsn: Lsn,
|
||||
commit_lsn: Lsn,
|
||||
}
|
||||
|
||||
/// Get a safekeeper's metadata for our timeline. The id is only used for logging
|
||||
pub async fn ping_safekeeper(
|
||||
id: String,
|
||||
config: tokio_postgres::Config,
|
||||
) -> Result<TimelineStatusResponse> {
|
||||
// TODO add retries
|
||||
|
||||
// Connect
|
||||
info!("connecting to {}", id);
|
||||
let (client, conn) = config.connect(tokio_postgres::NoTls).await?;
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = conn.await {
|
||||
eprintln!("connection error: {}", e);
|
||||
}
|
||||
});
|
||||
|
||||
// Query
|
||||
info!("querying {}", id);
|
||||
let result = client.simple_query("TIMELINE_STATUS").await?;
|
||||
|
||||
// Parse result
|
||||
info!("done with {}", id);
|
||||
if let postgres::SimpleQueryMessage::Row(row) = &result[0] {
|
||||
use std::str::FromStr;
|
||||
let response = TimelineStatusResponse::Ok(TimelineStatusOkResponse {
|
||||
flush_lsn: Lsn::from_str(row.get("flush_lsn").unwrap())?,
|
||||
commit_lsn: Lsn::from_str(row.get("commit_lsn").unwrap())?,
|
||||
});
|
||||
Ok(response)
|
||||
} else {
|
||||
// Timeline doesn't exist
|
||||
Ok(TimelineStatusResponse::NotFound)
|
||||
}
|
||||
}
|
||||
|
||||
/// Given a quorum of responses, check if safekeepers are synced at some Lsn
|
||||
pub fn check_if_synced(responses: Vec<TimelineStatusResponse>) -> Option<Lsn> {
|
||||
// Check if all responses are ok
|
||||
let ok_responses: Vec<TimelineStatusOkResponse> = responses
|
||||
.iter()
|
||||
.filter_map(|r| match r {
|
||||
TimelineStatusResponse::Ok(ok_response) => Some(ok_response),
|
||||
_ => None,
|
||||
})
|
||||
.cloned()
|
||||
.collect();
|
||||
if ok_responses.len() < responses.len() {
|
||||
info!(
|
||||
"not synced. Only {} out of {} know about this timeline",
|
||||
ok_responses.len(),
|
||||
responses.len()
|
||||
);
|
||||
return None;
|
||||
}
|
||||
|
||||
// Get the min and the max of everything
|
||||
let commit: Vec<Lsn> = ok_responses.iter().map(|r| r.commit_lsn).collect();
|
||||
let flush: Vec<Lsn> = ok_responses.iter().map(|r| r.flush_lsn).collect();
|
||||
let commit_max = commit.iter().max().unwrap();
|
||||
let commit_min = commit.iter().min().unwrap();
|
||||
let flush_max = flush.iter().max().unwrap();
|
||||
let flush_min = flush.iter().min().unwrap();
|
||||
|
||||
// Check that all values are equal
|
||||
if commit_min != commit_max {
|
||||
info!("not synced. {:?} {:?}", commit_min, commit_max);
|
||||
return None;
|
||||
}
|
||||
if flush_min != flush_max {
|
||||
info!("not synced. {:?} {:?}", flush_min, flush_max);
|
||||
return None;
|
||||
}
|
||||
|
||||
// Check that commit == flush
|
||||
if commit_max != flush_max {
|
||||
info!("not synced. {:?} {:?}", commit_max, flush_max);
|
||||
return None;
|
||||
}
|
||||
|
||||
Some(*commit_max)
|
||||
}
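Editor's note: the quorum rule used by the safekeeper sync check above can be illustrated in isolation. The following is a minimal, self-contained sketch, not part of the diff; it condenses the quorum count from check_safekeepers_synced_async and the agreement check from check_if_synced into one function, uses plain u64 values in place of the crate's Lsn type, and the function name is made up for illustration.

/// With `n` safekeepers, a quorum is `n / 2 + 1` responses, and the timeline
/// counts as synced only when every response agrees on the same commit LSN
/// and that value equals the flush LSN.
fn check_if_synced_sketch(responses: &[(u64, u64)], total: usize) -> Option<u64> {
    let quorum = total / 2 + 1;
    if responses.len() < quorum {
        return None; // not enough answers to decide
    }
    let (first_flush, first_commit) = responses[0];
    let all_equal = responses
        .iter()
        .all(|&(flush, commit)| flush == first_flush && commit == first_commit);
    // synced only if everyone agrees and commit == flush
    (all_equal && first_commit == first_flush).then_some(first_commit)
}

fn main() {
    // three safekeepers, all reporting the same (flush_lsn, commit_lsn) pair
    let responses = [(0x1000, 0x1000), (0x1000, 0x1000), (0x1000, 0x1000)];
    assert_eq!(check_if_synced_sketch(&responses, 3), Some(0x1000));
    // only one of three answered: below quorum, so undecided
    assert_eq!(check_if_synced_sketch(&responses[..1], 3), None);
}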
|
||||
@@ -564,7 +564,9 @@ impl Endpoint {
}
Err(e) => {
if attempt == MAX_ATTEMPTS {
return Err(e).context("timed out waiting to connect to compute_ctl HTTP");
return Err(e).context(
"timed out waiting to connect to compute_ctl HTTP; last error: {e}",
);
}
}
}
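Editor's note on the hunk above, not part of the diff: assuming the `context` here is anyhow's, it takes an already-built value, so the "{e}" inside a plain string literal is kept verbatim rather than interpolated. A common pattern for including runtime details lazily is `with_context` plus `format!`, sketched below with a made-up connect helper.

use anyhow::{Context, Result};

fn connect_once() -> std::io::Result<()> {
    // stand-in for the real HTTP connection attempt
    Err(std::io::Error::new(
        std::io::ErrorKind::ConnectionRefused,
        "connection refused",
    ))
}

fn connect_with_retries(max_attempts: u32) -> Result<()> {
    let mut last = None;
    for attempt in 1..=max_attempts {
        match connect_once() {
            Ok(()) => return Ok(()),
            Err(e) => last = Some((attempt, e)),
        }
    }
    let (attempt, e) = last.expect("max_attempts must be at least 1");
    // `with_context` builds the message only on the error path and can
    // interpolate runtime values such as the attempt counter.
    Err(e).with_context(|| {
        format!("timed out waiting to connect to compute_ctl HTTP (after {attempt} attempts)")
    })
}

fn main() {
    if let Err(err) = connect_with_retries(3) {
        // "{:#}" prints the context together with the underlying error
        eprintln!("{err:#}");
    }
}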
@@ -21,7 +21,6 @@ pub struct ComputeStatusResponse {
|
||||
#[serde(serialize_with = "rfc3339_serialize")]
|
||||
pub last_active: Option<DateTime<Utc>>,
|
||||
pub error: Option<String>,
|
||||
pub metrics: ComputeMetrics,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Serialize)]
|
||||
@@ -67,11 +66,10 @@ where
|
||||
}
|
||||
|
||||
/// Response of the /metrics.json API
|
||||
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
|
||||
#[derive(Clone, Debug, Default, Serialize)]
|
||||
pub struct ComputeMetrics {
|
||||
pub wait_for_spec_ms: u64,
|
||||
pub sync_safekeepers_ms: u64,
|
||||
pub sync_sk_check_ms: u64,
|
||||
pub basebackup_ms: u64,
|
||||
pub basebackup_bytes: u64,
|
||||
pub start_postgres_ms: u64,
|
||||
|
||||
@@ -5,7 +5,7 @@ use chrono::{DateTime, Utc};
|
||||
use rand::Rng;
|
||||
use serde::Serialize;
|
||||
|
||||
#[derive(Serialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
|
||||
#[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
|
||||
#[serde(tag = "type")]
|
||||
pub enum EventType {
|
||||
#[serde(rename = "absolute")]
|
||||
@@ -17,32 +17,6 @@ pub enum EventType {
|
||||
},
|
||||
}
|
||||
|
||||
impl EventType {
|
||||
pub fn absolute_time(&self) -> Option<&DateTime<Utc>> {
|
||||
use EventType::*;
|
||||
match self {
|
||||
Absolute { time } => Some(time),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn incremental_timerange(&self) -> Option<std::ops::Range<&DateTime<Utc>>> {
|
||||
// these can most likely be thought of as Range or RangeFull
|
||||
use EventType::*;
|
||||
match self {
|
||||
Incremental {
|
||||
start_time,
|
||||
stop_time,
|
||||
} => Some(start_time..stop_time),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_incremental(&self) -> bool {
|
||||
matches!(self, EventType::Incremental { .. })
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
|
||||
pub struct Event<Extra> {
|
||||
#[serde(flatten)]
|
||||
@@ -57,7 +31,7 @@ pub struct Event<Extra> {
|
||||
pub extra: Extra,
|
||||
}
|
||||
|
||||
pub fn idempotency_key(node_id: &str) -> String {
|
||||
pub fn idempotency_key(node_id: String) -> String {
|
||||
format!(
|
||||
"{}-{}-{:04}",
|
||||
Utc::now(),
|
||||
@@ -71,6 +45,6 @@ pub const CHUNK_SIZE: usize = 1000;
|
||||
// Just a wrapper around a slice of events
|
||||
// to serialize it as `{"events" : [ ] }
|
||||
#[derive(serde::Serialize)]
|
||||
pub struct EventChunk<'a, T: Clone> {
|
||||
pub events: std::borrow::Cow<'a, [T]>,
|
||||
pub struct EventChunk<'a, T> {
|
||||
pub events: &'a [T],
|
||||
}
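Editor's note: the comment above says the wrapper serializes as `{"events" : [ ] }`. Below is a minimal sketch of that behaviour, not part of the diff; the type and field contents are made up, and serde plus serde_json are assumed as dependencies.

use serde::Serialize;

#[derive(Serialize, Clone)]
struct DemoEvent {
    metric: &'static str,
    value: u64,
}

// Mirrors the Cow-based wrapper shape shown in the hunk above.
#[derive(Serialize)]
struct DemoChunk<'a, T: Clone> {
    events: std::borrow::Cow<'a, [T]>,
}

fn main() {
    let events = vec![DemoEvent { metric: "written_size", value: 42 }];
    let chunk = DemoChunk { events: std::borrow::Cow::from(&events[..]) };
    // prints: {"events":[{"metric":"written_size","value":42}]}
    println!("{}", serde_json::to_string(&chunk).unwrap());
}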
|
||||
|
||||
@@ -179,7 +179,7 @@ pub struct FeExecuteMessage {
#[derive(Debug)]
pub struct FeCloseMessage;

/// An error occurred while parsing or serializing raw stream into Postgres
/// An error occured while parsing or serializing raw stream into Postgres
/// messages.
#[derive(thiserror::Error, Debug)]
pub enum ProtocolError {
@@ -200,17 +200,13 @@ impl S3Bucket {
|
||||
)
|
||||
}
|
||||
|
||||
pub fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
|
||||
assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR);
|
||||
let path_string = path
|
||||
.get_path()
|
||||
.to_string_lossy()
|
||||
.trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR)
|
||||
.to_string();
|
||||
match &self.prefix_in_bucket {
|
||||
Some(prefix) => prefix.clone() + "/" + &path_string,
|
||||
None => path_string,
|
||||
fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
|
||||
let mut full_path = self.prefix_in_bucket.clone().unwrap_or_default();
|
||||
for segment in path.0.iter() {
|
||||
full_path.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
|
||||
full_path.push_str(segment.to_str().unwrap_or_default());
|
||||
}
|
||||
full_path
|
||||
}
|
||||
|
||||
async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> {
|
||||
@@ -431,12 +427,10 @@ impl RemoteStorage for S3Bucket {
|
||||
}
|
||||
|
||||
async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
|
||||
// if prefix is not none then download file `prefix/from`
|
||||
// if prefix is none then download file `from`
|
||||
self.download_object(GetObjectRequest {
|
||||
bucket: self.bucket_name.clone(),
|
||||
key: self.relative_path_to_s3_object(from),
|
||||
range: None,
|
||||
..GetObjectRequest::default()
|
||||
})
|
||||
.await
|
||||
}
|
||||
@@ -529,63 +523,3 @@ impl RemoteStorage for S3Bucket {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::num::NonZeroUsize;
|
||||
use std::path::Path;
|
||||
|
||||
use crate::{RemotePath, S3Bucket, S3Config};
|
||||
|
||||
#[test]
|
||||
fn relative_path() {
|
||||
let all_paths = vec!["", "some/path", "some/path/"];
|
||||
let all_paths: Vec<RemotePath> = all_paths
|
||||
.iter()
|
||||
.map(|x| RemotePath::new(Path::new(x)).expect("bad path"))
|
||||
.collect();
|
||||
let prefixes = [
|
||||
None,
|
||||
Some(""),
|
||||
Some("test/prefix"),
|
||||
Some("test/prefix/"),
|
||||
Some("/test/prefix/"),
|
||||
];
|
||||
let expected_outputs = vec![
|
||||
vec!["", "some/path", "some/path"],
|
||||
vec!["/", "/some/path", "/some/path"],
|
||||
vec![
|
||||
"test/prefix/",
|
||||
"test/prefix/some/path",
|
||||
"test/prefix/some/path",
|
||||
],
|
||||
vec![
|
||||
"test/prefix/",
|
||||
"test/prefix/some/path",
|
||||
"test/prefix/some/path",
|
||||
],
|
||||
vec![
|
||||
"test/prefix/",
|
||||
"test/prefix/some/path",
|
||||
"test/prefix/some/path",
|
||||
],
|
||||
];
|
||||
|
||||
for (prefix_idx, prefix) in prefixes.iter().enumerate() {
|
||||
let config = S3Config {
|
||||
bucket_name: "bucket".to_owned(),
|
||||
bucket_region: "region".to_owned(),
|
||||
prefix_in_bucket: prefix.map(str::to_string),
|
||||
endpoint: None,
|
||||
concurrency_limit: NonZeroUsize::new(100).unwrap(),
|
||||
max_keys_per_list_response: Some(5),
|
||||
};
|
||||
let storage = S3Bucket::new(&config).expect("remote storage init");
|
||||
for (test_path_idx, test_path) in all_paths.iter().enumerate() {
|
||||
let result = storage.relative_path_to_s3_object(test_path);
|
||||
let expected = expected_outputs[prefix_idx][test_path_idx];
|
||||
assert_eq!(result, expected);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19,7 +19,7 @@ static LOGGING_DONE: OnceCell<()> = OnceCell::new();

const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";

const BASE_PREFIX: &str = "test";
const BASE_PREFIX: &str = "test/";

/// Tests that S3 client can list all prefixes, even if the response come paginated and requires multiple S3 queries.
/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified.
@@ -24,29 +24,12 @@ pub async fn is_directory_empty(path: impl AsRef<Path>) -> anyhow::Result<bool>
|
||||
Ok(dir.next_entry().await?.is_none())
|
||||
}
|
||||
|
||||
pub fn ignore_not_found(e: io::Error) -> io::Result<()> {
|
||||
if e.kind() == io::ErrorKind::NotFound {
|
||||
Ok(())
|
||||
} else {
|
||||
Err(e)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn ignore_absent_files<F>(fs_operation: F) -> io::Result<()>
|
||||
where
|
||||
F: Fn() -> io::Result<()>,
|
||||
{
|
||||
fs_operation().or_else(ignore_not_found)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use std::path::PathBuf;
|
||||
|
||||
use crate::fs_ext::is_directory_empty;
|
||||
|
||||
use super::ignore_absent_files;
|
||||
|
||||
#[test]
|
||||
fn is_empty_dir() {
|
||||
use super::PathExt;
|
||||
@@ -92,21 +75,4 @@ mod test {
|
||||
std::fs::remove_file(&file_path).unwrap();
|
||||
assert!(is_directory_empty(file_path).await.is_err());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn ignore_absent_files_works() {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let dir_path = dir.path();
|
||||
|
||||
let file_path: PathBuf = dir_path.join("testfile");
|
||||
|
||||
ignore_absent_files(|| std::fs::remove_file(&file_path)).expect("should execute normally");
|
||||
|
||||
let f = std::fs::File::create(&file_path).unwrap();
|
||||
drop(f);
|
||||
|
||||
ignore_absent_files(|| std::fs::remove_file(&file_path)).expect("should execute normally");
|
||||
|
||||
assert!(!file_path.exists());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,7 +1,5 @@
|
||||
use std::ffi::OsStr;
|
||||
use std::{fmt, str::FromStr};
|
||||
|
||||
use anyhow::Context;
|
||||
use hex::FromHex;
|
||||
use rand::Rng;
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -215,18 +213,6 @@ pub struct TimelineId(Id);
|
||||
|
||||
id_newtype!(TimelineId);
|
||||
|
||||
impl TryFrom<Option<&OsStr>> for TimelineId {
|
||||
type Error = anyhow::Error;
|
||||
|
||||
fn try_from(value: Option<&OsStr>) -> Result<Self, Self::Error> {
|
||||
value
|
||||
.and_then(OsStr::to_str)
|
||||
.unwrap_or_default()
|
||||
.parse::<TimelineId>()
|
||||
.with_context(|| format!("Could not parse timeline id from {:?}", value))
|
||||
}
|
||||
}
|
||||
|
||||
/// Neon Tenant Id represents identifiar of a particular tenant.
|
||||
/// Is used for distinguishing requests and data belonging to different users.
|
||||
///
|
||||
|
||||
@@ -35,8 +35,6 @@ humantime-serde.workspace = true
|
||||
hyper.workspace = true
|
||||
itertools.workspace = true
|
||||
nix.workspace = true
|
||||
# hack to get the number of worker threads tokio uses
|
||||
num_cpus = { version = "1.15" }
|
||||
num-traits.workspace = true
|
||||
once_cell.workspace = true
|
||||
pin-project-lite.workspace = true
|
||||
|
||||
@@ -13,7 +13,6 @@ clap = { workspace = true, features = ["string"] }
|
||||
git-version.workspace = true
|
||||
pageserver = { path = ".." }
|
||||
postgres_ffi.workspace = true
|
||||
tokio.workspace = true
|
||||
utils.workspace = true
|
||||
svg_fmt.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -95,7 +95,7 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
|
||||
}
|
||||
|
||||
// Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH"
|
||||
async fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
|
||||
fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
|
||||
let file = FileBlockReader::new(VirtualFile::open(path)?);
|
||||
let summary_blk = file.read_blk(0)?;
|
||||
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
|
||||
@@ -129,7 +129,7 @@ async fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
|
||||
Ok(holes)
|
||||
}
|
||||
|
||||
pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
|
||||
pub(crate) fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
|
||||
let storage_path = &cmd.path;
|
||||
let max_holes = cmd.max_holes.unwrap_or(DEFAULT_MAX_HOLES);
|
||||
|
||||
@@ -160,7 +160,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
|
||||
parse_filename(&layer.file_name().into_string().unwrap())
|
||||
{
|
||||
if layer_file.is_delta {
|
||||
layer_file.holes = get_holes(&layer.path(), max_holes).await?;
|
||||
layer_file.holes = get_holes(&layer.path(), max_holes)?;
|
||||
n_deltas += 1;
|
||||
}
|
||||
layers.push(layer_file);
|
||||
|
||||
@@ -43,7 +43,8 @@ pub(crate) enum LayerCmd {
|
||||
},
|
||||
}
|
||||
|
||||
async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
|
||||
fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
|
||||
use pageserver::tenant::blob_io::BlobCursor;
|
||||
use pageserver::tenant::block_io::BlockReader;
|
||||
|
||||
let path = path.as_ref();
|
||||
@@ -68,7 +69,7 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
|
||||
true
|
||||
},
|
||||
)?;
|
||||
let cursor = BlockCursor::new(&file);
|
||||
let mut cursor = BlockCursor::new(&file);
|
||||
for (k, v) in all {
|
||||
let value = cursor.read_blob(v.pos())?;
|
||||
println!("key:{} value_len:{}", k, value.len());
|
||||
@@ -77,7 +78,7 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
|
||||
pub(crate) fn main(cmd: &LayerCmd) -> Result<()> {
|
||||
match cmd {
|
||||
LayerCmd::List { path } => {
|
||||
for tenant in fs::read_dir(path.join("tenants"))? {
|
||||
@@ -152,7 +153,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
|
||||
);
|
||||
|
||||
if layer_file.is_delta {
|
||||
read_delta_file(layer.path()).await?;
|
||||
read_delta_file(layer.path())?;
|
||||
} else {
|
||||
anyhow::bail!("not supported yet :(");
|
||||
}
|
||||
|
||||
@@ -72,13 +72,12 @@ struct AnalyzeLayerMapCmd {
|
||||
max_holes: Option<usize>,
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
fn main() -> anyhow::Result<()> {
|
||||
let cli = CliOpts::parse();
|
||||
|
||||
match cli.command {
|
||||
Commands::Layer(cmd) => {
|
||||
layers::main(&cmd).await?;
|
||||
layers::main(&cmd)?;
|
||||
}
|
||||
Commands::Metadata(cmd) => {
|
||||
handle_metadata(&cmd)?;
|
||||
@@ -87,7 +86,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
draw_timeline_dir::main()?;
|
||||
}
|
||||
Commands::AnalyzeLayerMap(cmd) => {
|
||||
layer_map_analyzer::main(&cmd).await?;
|
||||
layer_map_analyzer::main(&cmd)?;
|
||||
}
|
||||
Commands::PrintLayerFile(cmd) => {
|
||||
if let Err(e) = read_pg_control_file(&cmd.path) {
|
||||
@@ -95,7 +94,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
"Failed to read input file as a pg control one: {e:#}\n\
|
||||
Attempting to read it as layer file"
|
||||
);
|
||||
print_layerfile(&cmd.path).await?;
|
||||
print_layerfile(&cmd.path)?;
|
||||
}
|
||||
}
|
||||
};
|
||||
@@ -114,12 +113,12 @@ fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn print_layerfile(path: &Path) -> anyhow::Result<()> {
|
||||
fn print_layerfile(path: &Path) -> anyhow::Result<()> {
|
||||
// Basic initialization of things that don't change after startup
|
||||
virtual_file::init(10);
|
||||
page_cache::init(100);
|
||||
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
||||
dump_layerfile_from_path(path, true, &ctx).await
|
||||
dump_layerfile_from_path(path, true, &ctx)
|
||||
}
|
||||
|
||||
fn handle_metadata(
|
||||
|
||||
@@ -33,8 +33,7 @@ use crate::tenant::config::TenantConf;
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME};
|
||||
use crate::{
|
||||
IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX,
|
||||
TIMELINE_UNINIT_MARK_SUFFIX,
|
||||
IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_UNINIT_MARK_SUFFIX,
|
||||
};
|
||||
|
||||
pub mod defaults {
|
||||
@@ -602,17 +601,6 @@ impl PageServerConf {
|
||||
)
|
||||
}
|
||||
|
||||
pub fn timeline_delete_mark_file_path(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> PathBuf {
|
||||
path_with_suffix_extension(
|
||||
self.timeline_path(&tenant_id, &timeline_id),
|
||||
TIMELINE_DELETE_MARK_SUFFIX,
|
||||
)
|
||||
}
|
||||
|
||||
pub fn traces_path(&self) -> PathBuf {
|
||||
self.workdir.join("traces")
|
||||
}
|
||||
|
||||
@@ -7,23 +7,27 @@ use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
|
||||
use crate::tenant::{mgr, LogicalSizeCalculationCause};
|
||||
use anyhow;
|
||||
use chrono::{DateTime, Utc};
|
||||
use chrono::Utc;
|
||||
use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
|
||||
use pageserver_api::models::TenantState;
|
||||
use reqwest::Url;
|
||||
use serde::Serialize;
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, SystemTime};
|
||||
use std::time::Duration;
|
||||
use tracing::*;
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
const WRITTEN_SIZE: &str = "written_size";
|
||||
const SYNTHETIC_STORAGE_SIZE: &str = "synthetic_storage_size";
|
||||
const RESIDENT_SIZE: &str = "resident_size";
|
||||
const REMOTE_STORAGE_SIZE: &str = "remote_storage_size";
|
||||
const TIMELINE_LOGICAL_SIZE: &str = "timeline_logical_size";
|
||||
|
||||
const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Debug, Clone, Copy)]
|
||||
#[derive(Serialize, Debug)]
|
||||
struct Ids {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
tenant_id: TenantId,
|
||||
@@ -34,142 +38,10 @@ struct Ids {
|
||||
|
||||
/// Key that uniquely identifies the object, this metric describes.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||
struct MetricsKey {
|
||||
tenant_id: TenantId,
|
||||
timeline_id: Option<TimelineId>,
|
||||
metric: &'static str,
|
||||
}
|
||||
|
||||
impl MetricsKey {
|
||||
const fn absolute_values(self) -> AbsoluteValueFactory {
|
||||
AbsoluteValueFactory(self)
|
||||
}
|
||||
const fn incremental_values(self) -> IncrementalValueFactory {
|
||||
IncrementalValueFactory(self)
|
||||
}
|
||||
}
|
||||
|
||||
/// Helper type which each individual metric kind can return to produce only absolute values.
|
||||
struct AbsoluteValueFactory(MetricsKey);
|
||||
|
||||
impl AbsoluteValueFactory {
|
||||
fn at(self, time: DateTime<Utc>, val: u64) -> (MetricsKey, (EventType, u64)) {
|
||||
let key = self.0;
|
||||
(key, (EventType::Absolute { time }, val))
|
||||
}
|
||||
}
|
||||
|
||||
/// Helper type which each individual metric kind can return to produce only incremental values.
|
||||
struct IncrementalValueFactory(MetricsKey);
|
||||
|
||||
impl IncrementalValueFactory {
|
||||
#[allow(clippy::wrong_self_convention)]
|
||||
fn from_previous_up_to(
|
||||
self,
|
||||
prev_end: DateTime<Utc>,
|
||||
up_to: DateTime<Utc>,
|
||||
val: u64,
|
||||
) -> (MetricsKey, (EventType, u64)) {
|
||||
let key = self.0;
|
||||
// cannot assert prev_end < up_to because these are realtime clock based
|
||||
(
|
||||
key,
|
||||
(
|
||||
EventType::Incremental {
|
||||
start_time: prev_end,
|
||||
stop_time: up_to,
|
||||
},
|
||||
val,
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
fn key(&self) -> &MetricsKey {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
// the static part of a MetricsKey
|
||||
impl MetricsKey {
|
||||
/// Absolute value of [`Timeline::get_last_record_lsn`].
|
||||
///
|
||||
/// [`Timeline::get_last_record_lsn`]: crate::tenant::Timeline::get_last_record_lsn
|
||||
const fn written_size(tenant_id: TenantId, timeline_id: TimelineId) -> AbsoluteValueFactory {
|
||||
MetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: Some(timeline_id),
|
||||
metric: "written_size",
|
||||
}
|
||||
.absolute_values()
|
||||
}
|
||||
|
||||
/// Values will be the difference of the latest [`MetricsKey::written_size`] to what we
|
||||
/// previously sent, starting from the previously sent incremental time range ending at the
|
||||
/// latest absolute measurement.
|
||||
const fn written_size_delta(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> IncrementalValueFactory {
|
||||
MetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: Some(timeline_id),
|
||||
// the name here is correctly about data not size, because that is what is wanted by
|
||||
// downstream pipeline
|
||||
metric: "written_data_bytes_delta",
|
||||
}
|
||||
.incremental_values()
|
||||
}
|
||||
|
||||
/// Exact [`Timeline::get_current_logical_size`].
|
||||
///
|
||||
/// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
|
||||
const fn timeline_logical_size(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> AbsoluteValueFactory {
|
||||
MetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: Some(timeline_id),
|
||||
metric: "timeline_logical_size",
|
||||
}
|
||||
.absolute_values()
|
||||
}
|
||||
|
||||
/// [`Tenant::remote_size`]
|
||||
///
|
||||
/// [`Tenant::remote_size`]: crate::tenant::Tenant::remote_size
|
||||
const fn remote_storage_size(tenant_id: TenantId) -> AbsoluteValueFactory {
|
||||
MetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: None,
|
||||
metric: "remote_storage_size",
|
||||
}
|
||||
.absolute_values()
|
||||
}
|
||||
|
||||
/// Sum of [`Timeline::resident_physical_size`] for each `Tenant`.
|
||||
///
|
||||
/// [`Timeline::resident_physical_size`]: crate::tenant::Timeline::resident_physical_size
|
||||
const fn resident_size(tenant_id: TenantId) -> AbsoluteValueFactory {
|
||||
MetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: None,
|
||||
metric: "resident_size",
|
||||
}
|
||||
.absolute_values()
|
||||
}
|
||||
|
||||
/// [`Tenant::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`].
|
||||
///
|
||||
/// [`Tenant::cached_synthetic_size`]: crate::tenant::Tenant::cached_synthetic_size
|
||||
const fn synthetic_size(tenant_id: TenantId) -> AbsoluteValueFactory {
|
||||
MetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: None,
|
||||
metric: "synthetic_storage_size",
|
||||
}
|
||||
.absolute_values()
|
||||
}
|
||||
pub struct PageserverConsumptionMetricsKey {
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: Option<TimelineId>,
|
||||
pub metric: &'static str,
|
||||
}
|
||||
|
||||
/// Main thread that serves metrics collection
|
||||
@@ -207,7 +79,7 @@ pub async fn collect_metrics(
|
||||
.timeout(DEFAULT_HTTP_REPORTING_TIMEOUT)
|
||||
.build()
|
||||
.expect("Failed to create http client with timeout");
|
||||
let mut cached_metrics = HashMap::new();
|
||||
let mut cached_metrics: HashMap<PageserverConsumptionMetricsKey, u64> = HashMap::new();
|
||||
let mut prev_iteration_time: std::time::Instant = std::time::Instant::now();
|
||||
|
||||
loop {
|
||||
@@ -247,15 +119,15 @@ pub async fn collect_metrics(
|
||||
///
|
||||
/// TODO
|
||||
/// - refactor this function (chunking+sending part) to reuse it in proxy module;
|
||||
async fn collect_metrics_iteration(
|
||||
pub async fn collect_metrics_iteration(
|
||||
client: &reqwest::Client,
|
||||
cached_metrics: &mut HashMap<MetricsKey, (EventType, u64)>,
|
||||
cached_metrics: &mut HashMap<PageserverConsumptionMetricsKey, u64>,
|
||||
metric_collection_endpoint: &reqwest::Url,
|
||||
node_id: NodeId,
|
||||
ctx: &RequestContext,
|
||||
send_cached: bool,
|
||||
) {
|
||||
let mut current_metrics: Vec<(MetricsKey, (EventType, u64))> = Vec::new();
|
||||
let mut current_metrics: Vec<(PageserverConsumptionMetricsKey, u64)> = Vec::new();
|
||||
trace!(
|
||||
"starting collect_metrics_iteration. metric_collection_endpoint: {}",
|
||||
metric_collection_endpoint
|
||||
@@ -289,65 +161,99 @@ async fn collect_metrics_iteration(
|
||||
let mut tenant_resident_size = 0;
|
||||
|
||||
// iterate through list of timelines in tenant
|
||||
for timeline in tenant.list_timelines() {
|
||||
for timeline in tenant.list_timelines().iter() {
|
||||
// collect per-timeline metrics only for active timelines
|
||||
if timeline.is_active() {
|
||||
let timeline_written_size = u64::from(timeline.get_last_record_lsn());
|
||||
|
||||
let timeline_id = timeline.timeline_id;
|
||||
|
||||
match TimelineSnapshot::collect(&timeline, ctx) {
|
||||
Ok(Some(snap)) => {
|
||||
snap.to_metrics(
|
||||
current_metrics.push((
|
||||
PageserverConsumptionMetricsKey {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
Utc::now(),
|
||||
&mut current_metrics,
|
||||
cached_metrics,
|
||||
);
|
||||
}
|
||||
Ok(None) => {}
|
||||
Err(e) => {
|
||||
error!(
|
||||
"failed to get metrics values for tenant {tenant_id} timeline {}: {e:#?}",
|
||||
timeline.timeline_id
|
||||
);
|
||||
continue;
|
||||
}
|
||||
timeline_id: Some(timeline.timeline_id),
|
||||
metric: WRITTEN_SIZE,
|
||||
},
|
||||
timeline_written_size,
|
||||
));
|
||||
|
||||
let span = info_span!("collect_metrics_iteration", tenant_id = %timeline.tenant_id, timeline_id = %timeline.timeline_id);
|
||||
match span.in_scope(|| timeline.get_current_logical_size(ctx)) {
|
||||
// Only send timeline logical size when it is fully calculated.
|
||||
Ok((size, is_exact)) if is_exact => {
|
||||
current_metrics.push((
|
||||
PageserverConsumptionMetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: Some(timeline.timeline_id),
|
||||
metric: TIMELINE_LOGICAL_SIZE,
|
||||
},
|
||||
size,
|
||||
));
|
||||
}
|
||||
Ok((_, _)) => {}
|
||||
Err(err) => {
|
||||
error!(
|
||||
"failed to get current logical size for timeline {}: {err:?}",
|
||||
timeline.timeline_id
|
||||
);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
tenant_resident_size += timeline.resident_physical_size();
|
||||
let timeline_resident_size = timeline.get_resident_physical_size();
|
||||
tenant_resident_size += timeline_resident_size;
|
||||
}
|
||||
|
||||
current_metrics
|
||||
.push(MetricsKey::remote_storage_size(tenant_id).at(Utc::now(), tenant.remote_size()));
|
||||
match tenant.get_remote_size().await {
|
||||
Ok(tenant_remote_size) => {
|
||||
current_metrics.push((
|
||||
PageserverConsumptionMetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: None,
|
||||
metric: REMOTE_STORAGE_SIZE,
|
||||
},
|
||||
tenant_remote_size,
|
||||
));
|
||||
}
|
||||
Err(err) => {
|
||||
error!(
|
||||
"failed to get remote size for tenant {}: {err:?}",
|
||||
tenant_id
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
current_metrics
|
||||
.push(MetricsKey::resident_size(tenant_id).at(Utc::now(), tenant_resident_size));
|
||||
current_metrics.push((
|
||||
PageserverConsumptionMetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: None,
|
||||
metric: RESIDENT_SIZE,
|
||||
},
|
||||
tenant_resident_size,
|
||||
));
|
||||
|
||||
// Note that this metric is calculated in a separate bgworker
|
||||
// Here we only use cached value, which may lag behind the real latest one
|
||||
let synthetic_size = tenant.cached_synthetic_size();
|
||||
let tenant_synthetic_size = tenant.get_cached_synthetic_size();
|
||||
|
||||
if synthetic_size != 0 {
|
||||
if tenant_synthetic_size != 0 {
|
||||
// only send non-zeroes because otherwise these show up as errors in logs
|
||||
current_metrics
|
||||
.push(MetricsKey::synthetic_size(tenant_id).at(Utc::now(), synthetic_size));
|
||||
current_metrics.push((
|
||||
PageserverConsumptionMetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: None,
|
||||
metric: SYNTHETIC_STORAGE_SIZE,
|
||||
},
|
||||
tenant_synthetic_size,
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
// Filter metrics, unless we want to send all metrics, including cached ones.
|
||||
// See: https://github.com/neondatabase/neon/issues/3485
|
||||
if !send_cached {
|
||||
current_metrics.retain(|(curr_key, (kind, curr_val))| {
|
||||
if kind.is_incremental() {
|
||||
// incremental values (currently only written_size_delta) should not get any cache
|
||||
// deduplication because they will be used by upstream for "is still alive."
|
||||
true
|
||||
} else {
|
||||
match cached_metrics.get(curr_key) {
|
||||
Some((_, val)) => val != curr_val,
|
||||
None => true,
|
||||
}
|
||||
}
|
||||
current_metrics.retain(|(curr_key, curr_val)| match cached_metrics.get(curr_key) {
|
||||
Some(val) => val != curr_val,
|
||||
None => true,
|
||||
});
|
||||
}
|
||||
|
||||
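Editor's note: the hunk above only deduplicates absolute values against the cache of previously sent samples. The following is a minimal, self-contained sketch of that idea, not the pageserver's actual code; it uses plain string keys instead of MetricsKey/PageserverConsumptionMetricsKey, and the function and variable names are illustrative only.

use std::collections::HashMap;

// Drop samples whose value is unchanged from what was last reported for the same key.
fn dedup_absolute_values(current: &mut Vec<(String, u64)>, cached: &HashMap<String, u64>) {
    current.retain(|(key, val)| cached.get(key) != Some(val));
}

fn main() {
    let cached = HashMap::from([("resident_size".to_string(), 100u64)]);
    let mut current = vec![
        ("resident_size".to_string(), 100),      // unchanged -> dropped
        ("remote_storage_size".to_string(), 42), // not cached -> kept
    ];
    dedup_absolute_values(&mut current, &cached);
    assert_eq!(current.len(), 1);
}
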
@@ -362,16 +268,14 @@ async fn collect_metrics_iteration(

let mut chunk_to_send: Vec<Event<Ids>> = Vec::with_capacity(CHUNK_SIZE);

let node_id = node_id.to_string();

for chunk in chunks {
chunk_to_send.clear();

// enrich metrics with type,timestamp and idempotency key before sending
chunk_to_send.extend(chunk.iter().map(|(curr_key, (when, curr_val))| Event {
kind: *when,
chunk_to_send.extend(chunk.iter().map(|(curr_key, curr_val)| Event {
kind: EventType::Absolute { time: Utc::now() },
metric: curr_key.metric,
idempotency_key: idempotency_key(&node_id),
idempotency_key: idempotency_key(node_id.to_string()),
value: *curr_val,
extra: Ids {
tenant_id: curr_key.tenant_id,
@@ -379,14 +283,17 @@ async fn collect_metrics_iteration(
},
}));

let chunk_json = serde_json::value::to_raw_value(&EventChunk {
events: &chunk_to_send,
})
.expect("PageserverConsumptionMetric should not fail serialization");

const MAX_RETRIES: u32 = 3;

for attempt in 0..MAX_RETRIES {
let res = client
.post(metric_collection_endpoint.clone())
.json(&EventChunk {
events: (&chunk_to_send).into(),
})
.json(&chunk_json)
.send()
.await;

@@ -422,130 +329,6 @@ async fn collect_metrics_iteration(
}
}

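Editor's note: an illustrative sketch of the chunk-then-retry pattern visible in the hunk above, not the pageserver's implementation. The send step is abstracted as a closure so the sketch stays dependency-free; CHUNK_SIZE and MAX_RETRIES mirror the constants used above, and the error type is simplified to a String.

const CHUNK_SIZE: usize = 1000;
const MAX_RETRIES: u32 = 3;

// Split the collected samples into fixed-size chunks and retry each send a bounded number of times.
fn send_in_chunks<T>(samples: &[T], mut send: impl FnMut(&[T]) -> Result<(), String>) {
    for chunk in samples.chunks(CHUNK_SIZE) {
        for attempt in 0..MAX_RETRIES {
            match send(chunk) {
                Ok(()) => break,
                Err(e) if attempt + 1 < MAX_RETRIES => {
                    eprintln!("send failed (attempt {attempt}): {e}, retrying");
                }
                Err(e) => eprintln!("send failed after {MAX_RETRIES} attempts: {e}"),
            }
        }
    }
}

fn main() {
    let samples: Vec<u64> = (0..2500).collect();
    let mut calls = 0;
    send_in_chunks(&samples, |chunk| {
        calls += 1;
        println!("posting {} samples", chunk.len());
        Ok(())
    });
    assert_eq!(calls, 3); // 2500 samples -> three chunks of at most 1000
}
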
/// Internal type to make timeline metric production testable.
///
/// As this value type contains all of the information needed from a timeline to produce the
/// metrics, it can easily be created with different values in test.
struct TimelineSnapshot {
loaded_at: (Lsn, SystemTime),
last_record_lsn: Lsn,
current_exact_logical_size: Option<u64>,
}

impl TimelineSnapshot {
/// Collect the metrics from an actual timeline.
///
/// Fails currently only when [`Timeline::get_current_logical_size`] fails.
///
/// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
fn collect(
t: &Arc<crate::tenant::Timeline>,
ctx: &RequestContext,
) -> anyhow::Result<Option<Self>> {
use anyhow::Context;

if !t.is_active() {
// no collection for broken or stopping needed, we will still keep the cached values
// though at the caller.
Ok(None)
} else {
let loaded_at = t.loaded_at;
let last_record_lsn = t.get_last_record_lsn();

let current_exact_logical_size = {
let span = info_span!("collect_metrics_iteration", tenant_id = %t.tenant_id, timeline_id = %t.timeline_id);
let res = span
.in_scope(|| t.get_current_logical_size(ctx))
.context("get_current_logical_size");
match res? {
// Only send timeline logical size when it is fully calculated.
(size, is_exact) if is_exact => Some(size),
(_, _) => None,
}
};

Ok(Some(TimelineSnapshot {
loaded_at,
last_record_lsn,
current_exact_logical_size,
}))
}
}

/// Produce the timeline consumption metrics into the `metrics` argument.
fn to_metrics(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
now: DateTime<Utc>,
metrics: &mut Vec<(MetricsKey, (EventType, u64))>,
cache: &HashMap<MetricsKey, (EventType, u64)>,
) {
let timeline_written_size = u64::from(self.last_record_lsn);

let (key, written_size_now) =
MetricsKey::written_size(tenant_id, timeline_id).at(now, timeline_written_size);

// last_record_lsn can only go up, right now at least, TODO: #2592 or related
// features might change this.

let written_size_delta_key = MetricsKey::written_size_delta(tenant_id, timeline_id);

// use this when available, because in a stream of incremental values, it will be
// accurate where as when last_record_lsn stops moving, we will only cache the last
// one of those.
let last_stop_time = cache
.get(written_size_delta_key.key())
.map(|(until, _val)| {
until
.incremental_timerange()
.expect("never create EventType::Absolute for written_size_delta")
.end
});

// by default, use the last sent written_size as the basis for
// calculating the delta. if we don't yet have one, use the load time value.
let prev = cache
.get(&key)
.map(|(prev_at, prev)| {
// use the prev time from our last incremental update, or default to latest
// absolute update on the first round.
let prev_at = prev_at
.absolute_time()
.expect("never create EventType::Incremental for written_size");
let prev_at = last_stop_time.unwrap_or(prev_at);
(*prev_at, *prev)
})
.unwrap_or_else(|| {
// if we don't have a previous point of comparison, compare to the load time
// lsn.
let (disk_consistent_lsn, loaded_at) = &self.loaded_at;
(DateTime::from(*loaded_at), disk_consistent_lsn.0)
});

// written_size_bytes_delta
metrics.extend(
if let Some(delta) = written_size_now.1.checked_sub(prev.1) {
let up_to = written_size_now
.0
.absolute_time()
.expect("never create EventType::Incremental for written_size");
let key_value = written_size_delta_key.from_previous_up_to(prev.0, *up_to, delta);
Some(key_value)
} else {
None
},
);

// written_size
metrics.push((key, written_size_now));

if let Some(size) = self.current_exact_logical_size {
metrics.push(MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, size));
}
}
}

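Editor's note: a small sketch of the delta computation described in `to_metrics` above, under the assumption stated in its comments that the written size only grows. Types are deliberately simplified: the real code pairs each value with an EventType and tracks LSNs, while this sketch uses plain timestamps and integers.

use std::time::SystemTime;

// Returns the incremental sample (start, stop, delta) to report, or None if the
// counter appears to have moved backwards, in which case no delta is emitted.
fn written_size_delta(
    prev: Option<(SystemTime, u64)>, // last absolute value sent, if any
    loaded_at: (SystemTime, u64),    // fallback basis: value at timeline load time
    now: SystemTime,
    current: u64,
) -> Option<(SystemTime, SystemTime, u64)> {
    let (since, basis) = prev.unwrap_or(loaded_at);
    current.checked_sub(basis).map(|delta| (since, now, delta))
}

fn main() {
    let t0 = SystemTime::UNIX_EPOCH;
    let now = SystemTime::now();
    // First round: no previous sample, so the load-time value is the basis.
    assert_eq!(written_size_delta(None, (t0, 100), now, 150), Some((t0, now, 50)));
}
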
/// Caclculate synthetic size for each active tenant
pub async fn calculate_synthetic_size_worker(
synthetic_size_calculation_interval: Duration,
@@ -560,7 +343,7 @@ pub async fn calculate_synthetic_size_worker(
_ = task_mgr::shutdown_watcher() => {
return Ok(());
},
tick_at = ticker.tick() => {
tick_at = ticker.tick() => {

let tenants = match mgr::list_tenants().await {
Ok(tenants) => tenants,
@@ -596,149 +379,3 @@ pub async fn calculate_synthetic_size_worker(
}
}
}

#[cfg(test)]
mod tests {
use std::collections::HashMap;

use std::time::SystemTime;
use utils::{
id::{TenantId, TimelineId},
lsn::Lsn,
};

use crate::consumption_metrics::MetricsKey;

use super::TimelineSnapshot;
use chrono::{DateTime, Utc};

#[test]
fn startup_collected_timeline_metrics_before_advancing() {
let tenant_id = TenantId::generate();
let timeline_id = TimelineId::generate();

let mut metrics = Vec::new();
let cache = HashMap::new();

let initdb_lsn = Lsn(0x10000);
let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);

let snap = TimelineSnapshot {
loaded_at: (disk_consistent_lsn, SystemTime::now()),
last_record_lsn: disk_consistent_lsn,
current_exact_logical_size: Some(0x42000),
};

let now = DateTime::<Utc>::from(SystemTime::now());

snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);

assert_eq!(
metrics,
&[
MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
snap.loaded_at.1.into(),
now,
0
),
MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
]
);
}

#[test]
fn startup_collected_timeline_metrics_second_round() {
let tenant_id = TenantId::generate();
let timeline_id = TimelineId::generate();

let [now, before, init] = time_backwards();

let now = DateTime::<Utc>::from(now);
let before = DateTime::<Utc>::from(before);

let initdb_lsn = Lsn(0x10000);
let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);

let mut metrics = Vec::new();
let cache = HashMap::from([
MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0)
]);

let snap = TimelineSnapshot {
loaded_at: (disk_consistent_lsn, init),
last_record_lsn: disk_consistent_lsn,
current_exact_logical_size: Some(0x42000),
};

snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);

assert_eq!(
metrics,
&[
MetricsKey::written_size_delta(tenant_id, timeline_id)
.from_previous_up_to(before, now, 0),
MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
]
);
}

#[test]
fn startup_collected_timeline_metrics_nth_round_at_same_lsn() {
let tenant_id = TenantId::generate();
let timeline_id = TimelineId::generate();

let [now, just_before, before, init] = time_backwards();

let now = DateTime::<Utc>::from(now);
let just_before = DateTime::<Utc>::from(just_before);
let before = DateTime::<Utc>::from(before);

let initdb_lsn = Lsn(0x10000);
let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);

let mut metrics = Vec::new();
let cache = HashMap::from([
// at t=before was the last time the last_record_lsn changed
MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0),
// end time of this event is used for the next ones
MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
before,
just_before,
0,
),
]);

let snap = TimelineSnapshot {
loaded_at: (disk_consistent_lsn, init),
last_record_lsn: disk_consistent_lsn,
current_exact_logical_size: Some(0x42000),
};

snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);

assert_eq!(
metrics,
&[
MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
just_before,
now,
0
),
MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
]
);
}

fn time_backwards<const N: usize>() -> [std::time::SystemTime; N] {
let mut times = [std::time::SystemTime::UNIX_EPOCH; N];
times[0] = std::time::SystemTime::now();
for behind in 1..N {
times[behind] = times[0] - std::time::Duration::from_secs(behind as u64);
}

times
}
}

@@ -545,12 +545,12 @@ async fn collect_eviction_candidates(
// We could be better here, e.g., sum of all L0 layers + most recent L1 layer.
// That's what's typically used by the various background loops.
//
// The default can be overridden with a fixed value in the tenant conf.
// The default can be overriden with a fixed value in the tenant conf.
// A default override can be put in the default tenant conf in the pageserver.toml.
let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() {
debug!(
tenant_id=%tenant.tenant_id(),
overridden_size=s,
overriden_size=s,
"using overridden min resident size for tenant"
);
s

@@ -994,29 +994,31 @@ async fn timeline_gc_handler(
// Run compaction immediately on given timeline.
async fn timeline_compact_handler(
request: Request<Body>,
cancel: CancellationToken,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_id))?;

async {
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
timeline
.compact(&cancel, &ctx)
.await
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, ())
}
.instrument(info_span!("manual_compaction", %tenant_id, %timeline_id))
.await
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let result_receiver = mgr::immediate_compact(tenant_id, timeline_id, &ctx)
.await
.context("spawn compaction task")
.map_err(ApiError::InternalServerError)?;

let result: anyhow::Result<()> = result_receiver
.await
.context("receive compaction result")
.map_err(ApiError::InternalServerError)?;
result.map_err(ApiError::InternalServerError)?;

json_response(StatusCode::OK, ())
}

// Run checkpoint immediately on given timeline.
async fn timeline_checkpoint_handler(
request: Request<Body>,
cancel: CancellationToken,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
@@ -1029,13 +1031,13 @@ async fn timeline_checkpoint_handler(
.await
.map_err(ApiError::InternalServerError)?;
timeline
.compact(&cancel, &ctx)
.compact(&ctx)
.await
.map_err(ApiError::InternalServerError)?;

json_response(StatusCode::OK, ())
}
.instrument(info_span!("manual_checkpoint", %tenant_id, %timeline_id))
.instrument(info_span!("manual_checkpoint", tenant_id = %tenant_id, timeline_id = %timeline_id))
.await
}

@@ -109,8 +109,6 @@ pub const TEMP_FILE_SUFFIX: &str = "___temp";
|
||||
/// Full path: `tenants/<tenant_id>/timelines/<timeline_id>___uninit`.
|
||||
pub const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";
|
||||
|
||||
pub const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete";
|
||||
|
||||
/// A marker file to prevent pageserver from loading a certain tenant on restart.
|
||||
/// Different from [`TIMELINE_UNINIT_MARK_SUFFIX`] due to semantics of the corresponding
|
||||
/// `ignore` management API command, that expects the ignored tenant to be properly loaded
|
||||
@@ -125,30 +123,15 @@ pub fn is_temporary(path: &Path) -> bool {
|
||||
}
|
||||
}
|
||||
|
||||
fn ends_with_suffix(path: &Path, suffix: &str) -> bool {
|
||||
pub fn is_uninit_mark(path: &Path) -> bool {
|
||||
match path.file_name() {
|
||||
Some(name) => name.to_string_lossy().ends_with(suffix),
|
||||
Some(name) => name
|
||||
.to_string_lossy()
|
||||
.ends_with(TIMELINE_UNINIT_MARK_SUFFIX),
|
||||
None => false,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_uninit_mark(path: &Path) -> bool {
|
||||
ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX)
|
||||
}
|
||||
|
||||
pub fn is_delete_mark(path: &Path) -> bool {
|
||||
ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX)
|
||||
}
|
||||
|
||||
fn is_walkdir_io_not_found(e: &walkdir::Error) -> bool {
|
||||
if let Some(e) = e.io_error() {
|
||||
if e.kind() == std::io::ErrorKind::NotFound {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
/// During pageserver startup, we need to order operations not to exhaust tokio worker threads by
|
||||
/// blocking.
|
||||
///
|
||||
|
||||
@@ -6,6 +6,7 @@ use metrics::{
|
||||
IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
|
||||
};
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::models::TenantState;
|
||||
use strum::VariantNames;
|
||||
use strum_macros::{EnumVariantNames, IntoStaticStr};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
@@ -73,7 +74,7 @@ pub static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
// Buckets for background operations like compaction, GC, size calculation
|
||||
const STORAGE_OP_BUCKETS: &[f64] = &[0.010, 0.100, 1.0, 10.0, 100.0, 1000.0];
|
||||
|
||||
pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
pub static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"pageserver_storage_operations_seconds_global",
|
||||
"Time spent on storage operations",
|
||||
@@ -83,17 +84,18 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static READ_NUM_FS_LAYERS: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
static READ_NUM_FS_LAYERS: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"pageserver_read_num_fs_layers",
|
||||
"Number of persistent layers accessed for processing a read request, including those in the cache",
|
||||
&["tenant_id", "timeline_id"],
|
||||
vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 20.0, 50.0, 100.0],
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
// Metrics collected on operations on the storage repository.
|
||||
pub(crate) static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
pub static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_getpage_reconstruct_seconds",
|
||||
"Time spent in reconstruct_value (reconstruct a page from deltas)",
|
||||
@@ -102,7 +104,7 @@ pub(crate) static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
|
||||
pub static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_materialized_cache_hits_direct_total",
|
||||
"Number of cache hits from materialized page cache without redo",
|
||||
@@ -110,16 +112,17 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::n
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
static GET_RECONSTRUCT_DATA_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"pageserver_getpage_get_reconstruct_data_seconds",
|
||||
"Time spent in get_reconstruct_value_data",
|
||||
&["tenant_id", "timeline_id"],
|
||||
CRITICAL_OP_BUCKETS.into(),
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
|
||||
pub static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_materialized_cache_hits_total",
|
||||
"Number of cache hits from materialized page cache",
|
||||
@@ -243,10 +246,11 @@ pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| PageCacheS
|
||||
},
|
||||
});
|
||||
|
||||
pub(crate) static WAIT_LSN_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
static WAIT_LSN_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"pageserver_wait_lsn_seconds",
|
||||
"Time spent waiting for WAL to arrive",
|
||||
&["tenant_id", "timeline_id"],
|
||||
CRITICAL_OP_BUCKETS.into(),
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
@@ -280,7 +284,7 @@ static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
pub static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_remote_ondemand_downloaded_layers_total",
|
||||
"Total on-demand downloaded layers"
|
||||
@@ -288,7 +292,7 @@ pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::ne
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_BYTES: Lazy<IntCounter> = Lazy::new(|| {
|
||||
pub static REMOTE_ONDEMAND_DOWNLOADED_BYTES: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_remote_ondemand_downloaded_bytes_total",
|
||||
"Total bytes of layers on-demand downloaded",
|
||||
@@ -305,29 +309,16 @@ static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
.expect("failed to define current logical size metric")
|
||||
});
|
||||
|
||||
pub(crate) static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
pub static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_tenant_states_count",
|
||||
"Count of tenants per state",
|
||||
&["state"]
|
||||
&["tenant_id", "state"]
|
||||
)
|
||||
.expect("Failed to register pageserver_tenant_states_count metric")
|
||||
});
|
||||
|
||||
/// A set of broken tenants.
|
||||
///
|
||||
/// These are expected to be so rare that a set is fine. Set as in a new timeseries per each broken
|
||||
/// tenant.
|
||||
pub(crate) static BROKEN_TENANTS_SET: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_broken_tenants_count",
|
||||
"Set of broken tenants",
|
||||
&["tenant_id"]
|
||||
)
|
||||
.expect("Failed to register pageserver_tenant_states_count metric")
|
||||
});
|
||||
|
||||
pub(crate) static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
pub static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_tenant_synthetic_cached_size_bytes",
|
||||
"Synthetic size of each tenant in bytes",
|
||||
@@ -385,7 +376,7 @@ static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy<IntCounterVec> = Lazy::new(||
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
pub static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_unexpected_ondemand_downloads_count",
|
||||
"Number of unexpected on-demand downloads. \
|
||||
@@ -508,31 +499,23 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
|
||||
30.000, // 30000 ms
|
||||
];
|
||||
|
||||
/// Tracks time taken by fs operations near VirtualFile.
|
||||
///
|
||||
/// Operations:
|
||||
/// - open ([`std::fs::OpenOptions::open`])
|
||||
/// - close (dropping [`std::fs::File`])
|
||||
/// - close-by-replace (close by replacement algorithm)
|
||||
/// - read (`read_at`)
|
||||
/// - write (`write_at`)
|
||||
/// - seek (modify internal position or file length query)
|
||||
/// - fsync ([`std::fs::File::sync_all`])
|
||||
/// - metadata ([`std::fs::File::metadata`])
|
||||
pub(crate) static STORAGE_IO_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
const STORAGE_IO_TIME_OPERATIONS: &[&str] = &[
|
||||
"open", "close", "read", "write", "seek", "fsync", "gc", "metadata",
|
||||
];
|
||||
|
||||
const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"];
|
||||
|
||||
pub static STORAGE_IO_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"pageserver_io_operations_seconds",
|
||||
"Time spent in IO operations",
|
||||
&["operation"],
|
||||
&["operation", "tenant_id", "timeline_id"],
|
||||
STORAGE_IO_TIME_BUCKETS.into()
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"];
|
||||
|
||||
// Needed for the https://neonprod.grafana.net/d/5uK9tHL4k/picking-tenant-for-relocation?orgId=1
|
||||
pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
pub static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
register_int_gauge_vec!(
|
||||
"pageserver_io_operations_bytes_total",
|
||||
"Total amount of bytes read/written in IO operations",
|
||||
@@ -622,7 +605,7 @@ static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy<HistogramVec> = Lazy::new
|
||||
at a given instant. It gives you a better idea of the queue depth \
|
||||
than plotting the gauge directly, since operations may complete faster \
|
||||
than the sampling interval.",
|
||||
&["file_kind", "op_kind"],
|
||||
&["tenant_id", "timeline_id", "file_kind", "op_kind"],
|
||||
// The calls_unfinished gauge is an integer gauge, hence we have integer buckets.
|
||||
vec![0.0, 1.0, 2.0, 4.0, 6.0, 8.0, 10.0, 15.0, 20.0, 40.0, 60.0, 80.0, 100.0, 500.0],
|
||||
)
|
||||
@@ -679,18 +662,18 @@ impl RemoteOpFileKind {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) static REMOTE_OPERATION_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
pub static REMOTE_OPERATION_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"pageserver_remote_operation_seconds",
|
||||
"Time spent on remote storage operations. \
|
||||
Grouped by tenant, timeline, operation_kind and status. \
|
||||
Does not account for time spent waiting in remote timeline client's queues.",
|
||||
&["file_kind", "op_kind", "status"]
|
||||
&["tenant_id", "timeline_id", "file_kind", "op_kind", "status"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
pub static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_tenant_task_events",
|
||||
"Number of task start/stop/fail events.",
|
||||
@@ -699,7 +682,7 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
.expect("Failed to register tenant_task_events metric")
|
||||
});
|
||||
|
||||
pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
pub static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_background_loop_period_overrun_count",
|
||||
"Incremented whenever warn_when_period_overrun() logs a warning.",
|
||||
@@ -710,7 +693,7 @@ pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = La
|
||||
|
||||
// walreceiver metrics
|
||||
|
||||
pub(crate) static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
pub static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_walreceiver_started_connections_total",
|
||||
"Number of started walreceiver connections"
|
||||
@@ -718,7 +701,7 @@ pub(crate) static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
|
||||
pub static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
|
||||
register_int_gauge!(
|
||||
"pageserver_walreceiver_active_managers",
|
||||
"Number of active walreceiver managers"
|
||||
@@ -726,7 +709,7 @@ pub(crate) static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static WALRECEIVER_SWITCHES: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
pub static WALRECEIVER_SWITCHES: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_walreceiver_switches_total",
|
||||
"Number of walreceiver manager change_connection calls",
|
||||
@@ -735,7 +718,7 @@ pub(crate) static WALRECEIVER_SWITCHES: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
|
||||
pub static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_walreceiver_broker_updates_total",
|
||||
"Number of received broker updates in walreceiver"
|
||||
@@ -743,7 +726,7 @@ pub(crate) static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
pub static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_walreceiver_candidates_events_total",
|
||||
"Number of walreceiver candidate events",
|
||||
@@ -752,10 +735,10 @@ pub(crate) static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static WALRECEIVER_CANDIDATES_ADDED: Lazy<IntCounter> =
|
||||
pub static WALRECEIVER_CANDIDATES_ADDED: Lazy<IntCounter> =
|
||||
Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["add"]));
|
||||
|
||||
pub(crate) static WALRECEIVER_CANDIDATES_REMOVED: Lazy<IntCounter> =
|
||||
pub static WALRECEIVER_CANDIDATES_REMOVED: Lazy<IntCounter> =
|
||||
Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["remove"]));
|
||||
|
||||
// Metrics collected on WAL redo operations
|
||||
@@ -802,7 +785,7 @@ macro_rules! redo_bytes_histogram_count_buckets {
|
||||
};
|
||||
}
|
||||
|
||||
pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
pub static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_wal_redo_seconds",
|
||||
"Time spent on WAL redo",
|
||||
@@ -811,7 +794,7 @@ pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
pub static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_wal_redo_wait_seconds",
|
||||
"Time spent waiting for access to the Postgres WAL redo process",
|
||||
@@ -820,7 +803,7 @@ pub(crate) static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
|
||||
pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_wal_redo_records_histogram",
|
||||
"Histogram of number of records replayed per redo in the Postgres WAL redo process",
|
||||
@@ -829,7 +812,7 @@ pub(crate) static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
|
||||
pub static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_wal_redo_bytes_histogram",
|
||||
"Histogram of number of records replayed per redo sent to Postgres",
|
||||
@@ -838,8 +821,7 @@ pub(crate) static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
// FIXME: isn't this already included by WAL_REDO_RECORDS_HISTOGRAM which has _count?
|
||||
pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
|
||||
pub static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_replayed_wal_records_total",
|
||||
"Number of WAL records replayed in WAL redo process"
|
||||
@@ -915,6 +897,7 @@ impl StorageTimeMetrics {
|
||||
pub struct TimelineMetrics {
|
||||
tenant_id: String,
|
||||
timeline_id: String,
|
||||
pub get_reconstruct_data_time_histo: Histogram,
|
||||
pub flush_time_histo: StorageTimeMetrics,
|
||||
pub compact_time_histo: StorageTimeMetrics,
|
||||
pub create_images_time_histo: StorageTimeMetrics,
|
||||
@@ -923,7 +906,9 @@ pub struct TimelineMetrics {
|
||||
pub load_layer_map_histo: StorageTimeMetrics,
|
||||
pub garbage_collect_histo: StorageTimeMetrics,
|
||||
pub last_record_gauge: IntGauge,
|
||||
pub wait_lsn_time_histo: Histogram,
|
||||
pub resident_physical_size_gauge: UIntGauge,
|
||||
pub read_num_fs_layers: Histogram,
|
||||
/// copy of LayeredTimeline.current_logical_size
|
||||
pub current_logical_size_gauge: UIntGauge,
|
||||
pub num_persistent_files_created: IntCounter,
|
||||
@@ -940,6 +925,9 @@ impl TimelineMetrics {
|
||||
) -> Self {
|
||||
let tenant_id = tenant_id.to_string();
|
||||
let timeline_id = timeline_id.to_string();
|
||||
let get_reconstruct_data_time_histo = GET_RECONSTRUCT_DATA_TIME
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
let flush_time_histo =
|
||||
StorageTimeMetrics::new(StorageTimeOperation::LayerFlush, &tenant_id, &timeline_id);
|
||||
let compact_time_histo =
|
||||
@@ -960,6 +948,9 @@ impl TimelineMetrics {
|
||||
let last_record_gauge = LAST_RECORD_LSN
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
let wait_lsn_time_histo = WAIT_LSN_TIME
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
@@ -975,12 +966,16 @@ impl TimelineMetrics {
|
||||
let evictions = EVICTIONS
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
let read_num_fs_layers = READ_NUM_FS_LAYERS
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
let evictions_with_low_residence_duration =
|
||||
evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id);
|
||||
|
||||
TimelineMetrics {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
get_reconstruct_data_time_histo,
|
||||
flush_time_histo,
|
||||
compact_time_histo,
|
||||
create_images_time_histo,
|
||||
@@ -989,6 +984,7 @@ impl TimelineMetrics {
|
||||
garbage_collect_histo,
|
||||
load_layer_map_histo,
|
||||
last_record_gauge,
|
||||
wait_lsn_time_histo,
|
||||
resident_physical_size_gauge,
|
||||
current_logical_size_gauge,
|
||||
num_persistent_files_created,
|
||||
@@ -997,6 +993,7 @@ impl TimelineMetrics {
|
||||
evictions_with_low_residence_duration: std::sync::RwLock::new(
|
||||
evictions_with_low_residence_duration,
|
||||
),
|
||||
read_num_fs_layers,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1005,12 +1002,15 @@ impl Drop for TimelineMetrics {
|
||||
fn drop(&mut self) {
|
||||
let tenant_id = &self.tenant_id;
|
||||
let timeline_id = &self.timeline_id;
|
||||
let _ = GET_RECONSTRUCT_DATA_TIME.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = EVICTIONS.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = READ_NUM_FS_LAYERS.remove_label_values(&[tenant_id, timeline_id]);
|
||||
|
||||
self.evictions_with_low_residence_duration
|
||||
.write()
|
||||
@@ -1022,6 +1022,9 @@ impl Drop for TimelineMetrics {
|
||||
let _ =
|
||||
STORAGE_TIME_COUNT_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]);
|
||||
}
|
||||
for op in STORAGE_IO_TIME_OPERATIONS {
|
||||
let _ = STORAGE_IO_TIME.remove_label_values(&[op, tenant_id, timeline_id]);
|
||||
}
|
||||
|
||||
for op in STORAGE_IO_SIZE_OPERATIONS {
|
||||
let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, timeline_id]);
|
||||
@@ -1036,7 +1039,9 @@ impl Drop for TimelineMetrics {
|
||||
pub fn remove_tenant_metrics(tenant_id: &TenantId) {
|
||||
let tid = tenant_id.to_string();
|
||||
let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
|
||||
// we leave the BROKEN_TENANTS_SET entry if any
|
||||
for state in TenantState::VARIANTS {
|
||||
let _ = TENANT_STATE_METRIC.remove_label_values(&[&tid, state]);
|
||||
}
|
||||
}
|
||||
|
||||
use futures::Future;
|
||||
@@ -1051,7 +1056,9 @@ pub struct RemoteTimelineClientMetrics {
|
||||
tenant_id: String,
|
||||
timeline_id: String,
|
||||
remote_physical_size_gauge: Mutex<Option<UIntGauge>>,
|
||||
remote_operation_time: Mutex<HashMap<(&'static str, &'static str, &'static str), Histogram>>,
|
||||
calls_unfinished_gauge: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
|
||||
calls_started_hist: Mutex<HashMap<(&'static str, &'static str), Histogram>>,
|
||||
bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
|
||||
bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
|
||||
}
|
||||
@@ -1061,13 +1068,14 @@ impl RemoteTimelineClientMetrics {
|
||||
RemoteTimelineClientMetrics {
|
||||
tenant_id: tenant_id.to_string(),
|
||||
timeline_id: timeline_id.to_string(),
|
||||
remote_operation_time: Mutex::new(HashMap::default()),
|
||||
calls_unfinished_gauge: Mutex::new(HashMap::default()),
|
||||
calls_started_hist: Mutex::new(HashMap::default()),
|
||||
bytes_started_counter: Mutex::new(HashMap::default()),
|
||||
bytes_finished_counter: Mutex::new(HashMap::default()),
|
||||
remote_physical_size_gauge: Mutex::new(None),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn remote_physical_size_gauge(&self) -> UIntGauge {
|
||||
let mut guard = self.remote_physical_size_gauge.lock().unwrap();
|
||||
guard
|
||||
@@ -1081,17 +1089,26 @@ impl RemoteTimelineClientMetrics {
|
||||
})
|
||||
.clone()
|
||||
}
|
||||
|
||||
pub fn remote_operation_time(
|
||||
&self,
|
||||
file_kind: &RemoteOpFileKind,
|
||||
op_kind: &RemoteOpKind,
|
||||
status: &'static str,
|
||||
) -> Histogram {
|
||||
let mut guard = self.remote_operation_time.lock().unwrap();
|
||||
let key = (file_kind.as_str(), op_kind.as_str(), status);
|
||||
REMOTE_OPERATION_TIME
|
||||
.get_metric_with_label_values(&[key.0, key.1, key.2])
|
||||
.unwrap()
|
||||
let metric = guard.entry(key).or_insert_with(move || {
|
||||
REMOTE_OPERATION_TIME
|
||||
.get_metric_with_label_values(&[
|
||||
&self.tenant_id.to_string(),
|
||||
&self.timeline_id.to_string(),
|
||||
key.0,
|
||||
key.1,
|
||||
key.2,
|
||||
])
|
||||
.unwrap()
|
||||
});
|
||||
metric.clone()
|
||||
}
|
||||
|
||||
fn calls_unfinished_gauge(
|
||||
@@ -1119,10 +1136,19 @@ impl RemoteTimelineClientMetrics {
|
||||
file_kind: &RemoteOpFileKind,
|
||||
op_kind: &RemoteOpKind,
|
||||
) -> Histogram {
|
||||
let mut guard = self.calls_started_hist.lock().unwrap();
|
||||
let key = (file_kind.as_str(), op_kind.as_str());
|
||||
REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST
|
||||
.get_metric_with_label_values(&[key.0, key.1])
|
||||
.unwrap()
|
||||
let metric = guard.entry(key).or_insert_with(move || {
|
||||
REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST
|
||||
.get_metric_with_label_values(&[
|
||||
&self.tenant_id.to_string(),
|
||||
&self.timeline_id.to_string(),
|
||||
key.0,
|
||||
key.1,
|
||||
])
|
||||
.unwrap()
|
||||
});
|
||||
metric.clone()
|
||||
}
|
||||
|
||||
fn bytes_started_counter(
|
||||
@@ -1302,10 +1328,15 @@ impl Drop for RemoteTimelineClientMetrics {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
remote_physical_size_gauge,
|
||||
remote_operation_time,
|
||||
calls_unfinished_gauge,
|
||||
calls_started_hist,
|
||||
bytes_started_counter,
|
||||
bytes_finished_counter,
|
||||
} = self;
|
||||
for ((a, b, c), _) in remote_operation_time.get_mut().unwrap().drain() {
|
||||
let _ = REMOTE_OPERATION_TIME.remove_label_values(&[tenant_id, timeline_id, a, b, c]);
|
||||
}
|
||||
for ((a, b), _) in calls_unfinished_gauge.get_mut().unwrap().drain() {
|
||||
let _ = REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE.remove_label_values(&[
|
||||
tenant_id,
|
||||
@@ -1314,6 +1345,14 @@ impl Drop for RemoteTimelineClientMetrics {
|
||||
b,
|
||||
]);
|
||||
}
|
||||
for ((a, b), _) in calls_started_hist.get_mut().unwrap().drain() {
|
||||
let _ = REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST.remove_label_values(&[
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
a,
|
||||
b,
|
||||
]);
|
||||
}
|
||||
for ((a, b), _) in bytes_started_counter.get_mut().unwrap().drain() {
|
||||
let _ = REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER.remove_label_values(&[
|
||||
tenant_id,
|
||||
@@ -1395,51 +1434,15 @@ impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
|
||||
}
|
||||
|
||||
pub fn preinitialize_metrics() {
|
||||
// Python tests need these and on some we do alerting.
|
||||
//
|
||||
// FIXME(4813): make it so that we have no top level metrics as this fn will easily fall out of
|
||||
// order:
|
||||
// - global metrics reside in a Lazy<PageserverMetrics>
|
||||
// - access via crate::metrics::PS_METRICS.materialized_page_cache_hit.inc()
|
||||
// - could move the statics into TimelineMetrics::new()?
|
||||
// We want to alert on this metric increasing.
|
||||
// Initialize it eagerly, so that our alert rule can distinguish absence of the metric from metric value 0.
|
||||
assert_eq!(UNEXPECTED_ONDEMAND_DOWNLOADS.get(), 0);
|
||||
UNEXPECTED_ONDEMAND_DOWNLOADS.reset();
|
||||
|
||||
// counters
|
||||
[
|
||||
&MATERIALIZED_PAGE_CACHE_HIT,
|
||||
&MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
|
||||
&UNEXPECTED_ONDEMAND_DOWNLOADS,
|
||||
&WALRECEIVER_STARTED_CONNECTIONS,
|
||||
&WALRECEIVER_BROKER_UPDATES,
|
||||
&WALRECEIVER_CANDIDATES_ADDED,
|
||||
&WALRECEIVER_CANDIDATES_REMOVED,
|
||||
]
|
||||
.into_iter()
|
||||
.for_each(|c| {
|
||||
Lazy::force(c);
|
||||
});
|
||||
// Same as above for this metric, but, it's a Vec-type metric for which we don't know all the labels.
|
||||
BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT.reset();
|
||||
|
||||
// countervecs
|
||||
[&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
|
||||
.into_iter()
|
||||
.for_each(|c| {
|
||||
Lazy::force(c);
|
||||
});
|
||||
|
||||
// gauges
|
||||
WALRECEIVER_ACTIVE_MANAGERS.get();
|
||||
|
||||
// histograms
|
||||
[
|
||||
&READ_NUM_FS_LAYERS,
|
||||
&RECONSTRUCT_TIME,
|
||||
&WAIT_LSN_TIME,
|
||||
&WAL_REDO_TIME,
|
||||
&WAL_REDO_WAIT_TIME,
|
||||
&WAL_REDO_RECORDS_HISTOGRAM,
|
||||
&WAL_REDO_BYTES_HISTOGRAM,
|
||||
]
|
||||
.into_iter()
|
||||
.for_each(|h| {
|
||||
Lazy::force(h);
|
||||
});
|
||||
// Python tests need these.
|
||||
MATERIALIZED_PAGE_CACHE_HIT_DIRECT.get();
|
||||
MATERIALIZED_PAGE_CACHE_HIT.get();
|
||||
}
|
||||
|
||||
@@ -130,25 +130,11 @@ pub static WALRECEIVER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
|
||||
pub static BACKGROUND_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
|
||||
tokio::runtime::Builder::new_multi_thread()
|
||||
.thread_name("background op worker")
|
||||
// if you change the number of worker threads please change the constant below
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("Failed to create background op runtime")
|
||||
});
|
||||
|
||||
pub(crate) static BACKGROUND_RUNTIME_WORKER_THREADS: Lazy<usize> = Lazy::new(|| {
|
||||
// force init and thus panics
|
||||
let _ = BACKGROUND_RUNTIME.handle();
|
||||
// replicates tokio-1.28.1::loom::sys::num_cpus which is not available publicly
|
||||
// tokio would had already panicked for parsing errors or NotUnicode
|
||||
//
|
||||
// this will be wrong if any of the runtimes gets their worker threads configured to something
|
||||
// else, but that has not been needed in a long time.
|
||||
std::env::var("TOKIO_WORKER_THREADS")
|
||||
.map(|s| s.parse::<usize>().unwrap())
|
||||
.unwrap_or_else(|_e| usize::max(1, num_cpus::get()))
|
||||
});
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct PageserverTaskId(u64);
|
||||
|
||||
@@ -559,7 +545,7 @@ pub fn current_task_id() -> Option<PageserverTaskId> {
|
||||
pub async fn shutdown_watcher() {
|
||||
let token = SHUTDOWN_TOKEN
|
||||
.try_with(|t| t.clone())
|
||||
.expect("shutdown_watcher() called in an unexpected task or thread");
|
||||
.expect("shutdown_requested() called in an unexpected task or thread");
|
||||
|
||||
token.cancelled().await;
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
@@ -16,20 +16,30 @@ use crate::tenant::block_io::{BlockCursor, BlockReader};
|
||||
use std::cmp::min;
|
||||
use std::io::{Error, ErrorKind};
|
||||
|
||||
impl<R> BlockCursor<R>
|
||||
where
|
||||
R: BlockReader,
|
||||
{
|
||||
/// For reading
|
||||
pub trait BlobCursor {
|
||||
/// Read a blob into a new buffer.
|
||||
pub fn read_blob(&self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
|
||||
fn read_blob(&mut self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
|
||||
let mut buf = Vec::new();
|
||||
self.read_blob_into_buf(offset, &mut buf)?;
|
||||
Ok(buf)
|
||||
}
|
||||
|
||||
/// Read blob into the given buffer. Any previous contents in the buffer
|
||||
/// are overwritten.
|
||||
pub fn read_blob_into_buf(
|
||||
&self,
|
||||
fn read_blob_into_buf(
|
||||
&mut self,
|
||||
offset: u64,
|
||||
dstbuf: &mut Vec<u8>,
|
||||
) -> Result<(), std::io::Error>;
|
||||
}
|
||||
|
||||
impl<R> BlobCursor for BlockCursor<R>
|
||||
where
|
||||
R: BlockReader,
|
||||
{
|
||||
fn read_blob_into_buf(
|
||||
&mut self,
|
||||
offset: u64,
|
||||
dstbuf: &mut Vec<u8>,
|
||||
) -> Result<(), std::io::Error> {
|
||||
|
||||
@@ -80,7 +80,7 @@ where
|
||||
BlockCursor { reader }
|
||||
}
|
||||
|
||||
pub fn read_blk(&self, blknum: u32) -> Result<R::BlockLease, std::io::Error> {
|
||||
pub fn read_blk(&mut self, blknum: u32) -> Result<R::BlockLease, std::io::Error> {
|
||||
self.reader.read_blk(blknum)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -390,42 +390,39 @@ where
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub async fn dump(&self) -> Result<()> {
|
||||
let mut stack = Vec::new();
|
||||
pub fn dump(&self) -> Result<()> {
|
||||
self.dump_recurse(self.root_blk, &[], 0)
|
||||
}
|
||||
|
||||
stack.push((self.root_blk, String::new(), 0, 0, 0));
|
||||
fn dump_recurse(&self, blknum: u32, path: &[u8], depth: usize) -> Result<()> {
|
||||
let blk = self.reader.read_blk(self.start_blk + blknum)?;
|
||||
let buf: &[u8] = blk.as_ref();
|
||||
|
||||
while let Some((blknum, path, depth, child_idx, key_off)) = stack.pop() {
|
||||
let blk = self.reader.read_blk(self.start_blk + blknum)?;
|
||||
let buf: &[u8] = blk.as_ref();
|
||||
let node = OnDiskNode::<L>::deparse(buf)?;
|
||||
let node = OnDiskNode::<L>::deparse(buf)?;
|
||||
|
||||
if child_idx == 0 {
|
||||
print!("{:indent$}", "", indent = depth * 2);
|
||||
let path_prefix = stack
|
||||
.iter()
|
||||
.map(|(_blknum, path, ..)| path.as_str())
|
||||
.collect::<String>();
|
||||
println!(
|
||||
"blk #{blknum}: path {path_prefix}{path}: prefix {}, suffix_len {}",
|
||||
hex::encode(node.prefix),
|
||||
node.suffix_len
|
||||
);
|
||||
}
|
||||
print!("{:indent$}", "", indent = depth * 2);
|
||||
println!(
|
||||
"blk #{}: path {}: prefix {}, suffix_len {}",
|
||||
blknum,
|
||||
hex::encode(path),
|
||||
hex::encode(node.prefix),
|
||||
node.suffix_len
|
||||
);
|
||||
|
||||
if child_idx + 1 < node.num_children {
|
||||
let key_off = key_off + node.suffix_len as usize;
|
||||
stack.push((blknum, path.clone(), depth, child_idx + 1, key_off));
|
||||
}
|
||||
let mut idx = 0;
|
||||
let mut key_off = 0;
|
||||
while idx < node.num_children {
|
||||
let key = &node.keys[key_off..key_off + node.suffix_len as usize];
|
||||
let val = node.value(child_idx as usize);
|
||||
|
||||
let val = node.value(idx as usize);
|
||||
print!("{:indent$}", "", indent = depth * 2 + 2);
|
||||
println!("{}: {}", hex::encode(key), hex::encode(val.0));
|
||||
|
||||
if node.level > 0 {
|
||||
stack.push((val.to_blknum(), hex::encode(node.prefix), depth + 1, 0, 0));
|
||||
let child_path = [path, node.prefix].concat();
|
||||
self.dump_recurse(val.to_blknum(), &child_path, depth + 1)?;
|
||||
}
|
||||
idx += 1;
|
||||
key_off += node.suffix_len as usize;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -757,8 +754,8 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn basic() -> Result<()> {
|
||||
#[test]
|
||||
fn basic() -> Result<()> {
|
||||
let mut disk = TestDisk::new();
|
||||
let mut writer = DiskBtreeBuilder::<_, 6>::new(&mut disk);
|
||||
|
||||
@@ -778,7 +775,7 @@ mod tests {
|
||||
|
||||
let reader = DiskBtreeReader::new(0, root_offset, disk);
|
||||
|
||||
reader.dump().await?;
|
||||
reader.dump()?;
|
||||
|
||||
// Test the `get` function on all the keys.
|
||||
for (key, val) in all_data.iter() {
|
||||
@@ -838,8 +835,8 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn lots_of_keys() -> Result<()> {
|
||||
#[test]
|
||||
fn lots_of_keys() -> Result<()> {
|
||||
let mut disk = TestDisk::new();
|
||||
let mut writer = DiskBtreeBuilder::<_, 8>::new(&mut disk);
|
||||
|
||||
@@ -859,7 +856,7 @@ mod tests {
|
||||
|
||||
let reader = DiskBtreeReader::new(0, root_offset, disk);
|
||||
|
||||
reader.dump().await?;
|
||||
reader.dump()?;
|
||||
|
||||
use std::sync::Mutex;
|
||||
|
||||
@@ -997,8 +994,8 @@ mod tests {
|
||||
///
|
||||
/// This test contains a particular data set, see disk_btree_test_data.rs
|
||||
///
|
||||
#[tokio::test]
|
||||
async fn particular_data() -> Result<()> {
|
||||
#[test]
|
||||
fn particular_data() -> Result<()> {
|
||||
// Build a tree from it
|
||||
let mut disk = TestDisk::new();
|
||||
let mut writer = DiskBtreeBuilder::<_, 26>::new(&mut disk);
|
||||
@@ -1025,7 +1022,7 @@ mod tests {
|
||||
})?;
|
||||
assert_eq!(count, disk_btree_test_data::TEST_DATA.len());
|
||||
|
||||
reader.dump().await?;
|
||||
reader.dump()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -328,7 +328,7 @@ fn to_io_error(e: anyhow::Error, context: &str) -> io::Error {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::tenant::blob_io::BlobWriter;
|
||||
use crate::tenant::blob_io::{BlobCursor, BlobWriter};
|
||||
use crate::tenant::block_io::BlockCursor;
|
||||
use rand::{seq::SliceRandom, thread_rng, RngCore};
|
||||
use std::fs;
|
||||
@@ -420,7 +420,7 @@ mod tests {
|
||||
blobs.push((pos, data));
|
||||
}
|
||||
|
||||
let cursor = BlockCursor::new(&file);
|
||||
let mut cursor = BlockCursor::new(&file);
|
||||
for (pos, expected) in blobs {
|
||||
let actual = cursor.read_blob(pos)?;
|
||||
assert_eq!(actual, expected);
|
||||
|
||||
@@ -626,17 +626,17 @@ impl LayerMap {
|
||||
|
||||
/// debugging function to print out the contents of the layer map
|
||||
#[allow(unused)]
|
||||
pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
pub fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
println!("Begin dump LayerMap");
|
||||
|
||||
println!("open_layer:");
|
||||
if let Some(open_layer) = &self.open_layer {
|
||||
open_layer.dump(verbose, ctx).await?;
|
||||
open_layer.dump(verbose, ctx)?;
|
||||
}
|
||||
|
||||
println!("frozen_layers:");
|
||||
for frozen_layer in self.frozen_layers.iter() {
|
||||
frozen_layer.dump(verbose, ctx).await?;
|
||||
frozen_layer.dump(verbose, ctx)?;
|
||||
}
|
||||
|
||||
println!("historic_layers:");
|
||||
|
||||
@@ -9,11 +9,10 @@
|
||||
//! [`remote_timeline_client`]: super::remote_timeline_client
|
||||
|
||||
use std::fs::{File, OpenOptions};
|
||||
use std::io::{self, Write};
|
||||
use std::io::Write;
|
||||
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use thiserror::Error;
|
||||
use tracing::info_span;
|
||||
use utils::bin_ser::SerializeError;
|
||||
use utils::{
|
||||
@@ -268,24 +267,24 @@ pub fn save_metadata(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum LoadMetadataError {
|
||||
#[error(transparent)]
|
||||
Read(#[from] io::Error),
|
||||
|
||||
#[error(transparent)]
|
||||
Decode(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
pub fn load_metadata(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: &TenantId,
|
||||
timeline_id: &TimelineId,
|
||||
) -> Result<TimelineMetadata, LoadMetadataError> {
|
||||
) -> anyhow::Result<TimelineMetadata> {
|
||||
let metadata_path = conf.metadata_path(tenant_id, timeline_id);
|
||||
let metadata_bytes = std::fs::read(metadata_path)?;
|
||||
|
||||
Ok(TimelineMetadata::from_bytes(&metadata_bytes)?)
|
||||
let metadata_bytes = std::fs::read(&metadata_path).with_context(|| {
|
||||
format!(
|
||||
"Failed to read metadata bytes from path {}",
|
||||
metadata_path.display()
|
||||
)
|
||||
})?;
|
||||
TimelineMetadata::from_bytes(&metadata_bytes).with_context(|| {
|
||||
format!(
|
||||
"Failed to parse metadata bytes from path {}",
|
||||
metadata_path.display()
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -26,8 +26,6 @@ use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME};
|
||||
use utils::fs_ext::PathExt;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use super::timeline::delete::DeleteTimelineFlow;
|
||||
|
||||
/// The tenants known to the pageserver.
|
||||
/// The enum variants are used to distinguish the different states that the pageserver can be in.
|
||||
enum TenantsMap {
|
||||
@@ -423,10 +421,12 @@ pub enum DeleteTimelineError {
|
||||
pub async fn delete_timeline(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
_ctx: &RequestContext,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), DeleteTimelineError> {
|
||||
let tenant = get_tenant(tenant_id, true).await?;
|
||||
DeleteTimelineFlow::run(&tenant, timeline_id).await?;
|
||||
tenant
|
||||
.prepare_and_schedule_delete_timeline(timeline_id, ctx)
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -768,6 +768,55 @@ pub async fn immediate_gc(
|
||||
Ok(wait_task_done)
|
||||
}
|
||||
|
||||
pub async fn immediate_compact(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<tokio::sync::oneshot::Receiver<anyhow::Result<()>>, ApiError> {
|
||||
let guard = TENANTS.read().await;
|
||||
|
||||
let tenant = guard
|
||||
.get(&tenant_id)
|
||||
.map(Arc::clone)
|
||||
.with_context(|| format!("tenant {tenant_id}"))
|
||||
.map_err(|e| ApiError::NotFound(e.into()))?;
|
||||
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(|e| ApiError::NotFound(e.into()))?;
|
||||
|
||||
// Run in task_mgr to avoid race with tenant_detach operation
|
||||
let ctx = ctx.detached_child(TaskKind::Compaction, DownloadBehavior::Download);
|
||||
let (task_done, wait_task_done) = tokio::sync::oneshot::channel();
|
||||
task_mgr::spawn(
|
||||
&tokio::runtime::Handle::current(),
|
||||
TaskKind::Compaction,
|
||||
Some(tenant_id),
|
||||
Some(timeline_id),
|
||||
&format!(
|
||||
"timeline_compact_handler compaction run for tenant {tenant_id} timeline {timeline_id}"
|
||||
),
|
||||
false,
|
||||
async move {
|
||||
let result = timeline
|
||||
.compact(&ctx)
|
||||
.instrument(info_span!("manual_compact", %tenant_id, %timeline_id))
|
||||
.await;
|
||||
|
||||
match task_done.send(result) {
|
||||
Ok(_) => (),
|
||||
Err(result) => error!("failed to send compaction result: {result:?}"),
|
||||
}
|
||||
Ok(())
|
||||
},
|
||||
);
|
||||
|
||||
// drop the guard until after we've spawned the task so that timeline shutdown will wait for the task
|
||||
drop(guard);
|
||||
|
||||
Ok(wait_task_done)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::collections::HashMap;
|
||||
|
||||
@@ -514,7 +514,7 @@ impl RemoteTimelineClient {
|
||||
/// updated metadata.
|
||||
///
|
||||
/// The upload will be added to the queue immediately, but it
|
||||
/// won't be performed until all previously scheduled layer file
|
||||
/// won't be performed until all previosuly scheduled layer file
|
||||
/// upload operations have completed successfully. This is to
|
||||
/// ensure that when the index file claims that layers X, Y and Z
|
||||
/// exist in remote storage, they really do. To wait for the upload
|
||||
@@ -625,7 +625,7 @@ impl RemoteTimelineClient {
|
||||
/// Note: This schedules an index file upload before the deletions. The
|
||||
/// deletion won't actually be performed, until any previously scheduled
|
||||
/// upload operations, and the index file upload, have completed
|
||||
/// successfully.
|
||||
/// succesfully.
|
||||
pub fn schedule_layer_file_deletion(
|
||||
self: &Arc<Self>,
|
||||
names: &[LayerFileName],
|
||||
@@ -827,7 +827,7 @@ impl RemoteTimelineClient {
|
||||
)
|
||||
};
|
||||
|
||||
receiver.changed().await.context("upload queue shut down")?;
|
||||
receiver.changed().await?;
|
||||
|
||||
// Do not delete index part yet, it is needed for possible retry. If we remove it first
|
||||
// and retry will arrive to different pageserver there wont be any traces of it on remote storage
|
||||
@@ -855,23 +855,11 @@ impl RemoteTimelineClient {
|
||||
self.storage_impl.delete_objects(&remaining).await?;
|
||||
}
|
||||
|
||||
fail::fail_point!("timeline-delete-before-index-delete", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: timeline-delete-before-index-delete"
|
||||
))?
|
||||
});
|
||||
|
||||
let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME));
|
||||
|
||||
debug!("deleting index part");
|
||||
self.storage_impl.delete(&index_file_path).await?;
|
||||
|
||||
fail::fail_point!("timeline-delete-after-index-delete", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: timeline-delete-after-index-delete"
|
||||
))?
|
||||
});
|
||||
|
||||
info!(prefix=%timeline_storage_path, referenced=deletions_queued, not_referenced=%remaining.len(), "done deleting in timeline prefix, including index_part.json");
|
||||
|
||||
Ok(())
|
||||
@@ -1117,7 +1105,7 @@ impl RemoteTimelineClient {
|
||||
debug!("remote task {} completed successfully", task.op);
|
||||
}
|
||||
|
||||
// The task has completed successfully. Remove it from the in-progress list.
|
||||
// The task has completed succesfully. Remove it from the in-progress list.
|
||||
{
|
||||
let mut upload_queue_guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = match upload_queue_guard.deref_mut() {
|
||||
|
||||
@@ -223,45 +223,6 @@ mod tests {
|
||||
assert_eq!(part, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn v2_indexpart_is_parsed_with_deleted_at() {
|
||||
let example = r#"{
|
||||
"version":2,
|
||||
"timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
|
||||
"missing_layers":["This shouldn't fail deserialization"],
|
||||
"layer_metadata":{
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
|
||||
},
|
||||
"disk_consistent_lsn":"0/16960E8",
|
||||
"metadata_bytes":[112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
|
||||
"deleted_at": "2023-07-31T09:00:00.123"
|
||||
}"#;
|
||||
|
||||
let expected = IndexPart {
|
||||
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
|
||||
version: 2,
|
||||
timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
|
||||
layer_metadata: HashMap::from([
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
||||
file_size: 25600000,
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
||||
// serde_json should always parse this but this might be a double with jq for
|
||||
// example.
|
||||
file_size: 9007199254741001,
|
||||
})
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
|
||||
deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
|
||||
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap())
|
||||
};
|
||||
|
||||
let part = serde_json::from_str::<IndexPart>(example).unwrap();
|
||||
assert_eq!(part, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn empty_layers_are_parsed() {
|
||||
let empty_layers_json = r#"{
|
||||
|
||||
@@ -9,7 +9,7 @@ mod remote_layer;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::RequestContext;
|
||||
use crate::repository::Key;
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use anyhow::Result;
|
||||
@@ -34,7 +34,7 @@ use utils::{
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef};
|
||||
pub use delta_layer::{DeltaLayer, DeltaLayerWriter};
|
||||
pub use filename::{DeltaFileName, ImageFileName, LayerFileName};
|
||||
pub use image_layer::{ImageLayer, ImageLayerWriter};
|
||||
pub use inmemory_layer::InMemoryLayer;
|
||||
@@ -338,8 +338,7 @@ impl LayerAccessStats {
|
||||
/// All layers should implement a minimal `std::fmt::Debug` without tenant or
|
||||
/// timeline names, because those are known in the context of which the layers
|
||||
/// are used in (timeline).
|
||||
#[async_trait::async_trait]
|
||||
pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync + 'static {
|
||||
pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync {
|
||||
/// Range of keys that this layer covers
|
||||
fn get_key_range(&self) -> Range<Key>;
|
||||
|
||||
@@ -369,7 +368,7 @@ pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync + 'static {
|
||||
/// is available. If this returns ValueReconstructResult::Continue, look up
|
||||
/// the predecessor layer and call again with the same 'reconstruct_data' to
|
||||
/// collect more data.
|
||||
async fn get_value_reconstruct_data(
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
@@ -378,9 +377,15 @@ pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync + 'static {
|
||||
) -> Result<ValueReconstructResult>;
|
||||
|
||||
/// Dump summary of the contents of the layer to stdout
|
||||
async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()>;
|
||||
fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()>;
|
||||
}
|
||||
|
||||
/// Returned by [`PersistentLayer::iter`]
|
||||
pub type LayerIter<'i> = Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + 'i + Send>;
|
||||
|
||||
/// Returned by [`PersistentLayer::key_iter`]
|
||||
pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i + Send>;
|
||||
|
||||
/// Get a layer descriptor from a layer.
|
||||
pub trait AsLayerDesc {
|
||||
/// Get the layer descriptor.
|
||||
@@ -421,6 +426,15 @@ pub trait PersistentLayer: Layer + AsLayerDesc {
|
||||
// `None` for `RemoteLayer`.
|
||||
fn local_path(&self) -> Option<PathBuf>;
|
||||
|
||||
/// Iterate through all keys and values stored in the layer
|
||||
fn iter(&self, ctx: &RequestContext) -> Result<LayerIter<'_>>;
|
||||
|
||||
/// Iterate through all keys stored in the layer. Returns key, lsn and value size
|
||||
/// It is used only for compaction and so is currently implemented only for DeltaLayer
|
||||
fn key_iter(&self, _ctx: &RequestContext) -> Result<LayerKeyIter<'_>> {
|
||||
panic!("Not implemented")
|
||||
}
|
||||
|
||||
/// Permanently remove this layer from disk.
|
||||
fn delete_resident_layer_file(&self) -> Result<()>;
|
||||
|
||||
|
||||
@@ -31,7 +31,7 @@ use crate::config::PageServerConf;
|
||||
use crate::context::RequestContext;
|
||||
use crate::page_cache::{PageReadGuard, PAGE_SZ};
|
||||
use crate::repository::{Key, Value, KEY_SIZE};
|
||||
use crate::tenant::blob_io::{BlobWriter, WriteBlobWriter};
|
||||
use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter};
|
||||
use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockReader, FileBlockReader};
|
||||
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
|
||||
use crate::tenant::storage_layer::{
|
||||
@@ -61,8 +61,8 @@ use utils::{
|
||||
};
|
||||
|
||||
use super::{
|
||||
AsLayerDesc, DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, PathOrConf,
|
||||
PersistentLayerDesc,
|
||||
AsLayerDesc, DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, LayerIter,
|
||||
LayerKeyIter, PathOrConf, PersistentLayerDesc,
|
||||
};
|
||||
|
||||
///
|
||||
@@ -189,7 +189,7 @@ pub struct DeltaLayer {
|
||||
|
||||
access_stats: LayerAccessStats,
|
||||
|
||||
inner: OnceCell<Arc<DeltaLayerInner>>,
|
||||
inner: OnceCell<DeltaLayerInner>,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for DeltaLayer {
|
||||
@@ -223,10 +223,9 @@ impl std::fmt::Debug for DeltaLayerInner {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Layer for DeltaLayer {
|
||||
/// debugging function to print out the contents of the layer
|
||||
async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
println!(
|
||||
"----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} size {} ----",
|
||||
self.desc.tenant_id,
|
||||
@@ -256,12 +255,12 @@ impl Layer for DeltaLayer {
|
||||
file,
|
||||
);
|
||||
|
||||
tree_reader.dump().await?;
|
||||
tree_reader.dump()?;
|
||||
|
||||
let cursor = file.block_cursor();
|
||||
let mut cursor = file.block_cursor();
|
||||
|
||||
// A subroutine to dump a single blob
|
||||
let dump_blob = |blob_ref: BlobRef| -> anyhow::Result<String> {
|
||||
let mut dump_blob = |blob_ref: BlobRef| -> anyhow::Result<String> {
|
||||
let buf = cursor.read_blob(blob_ref.pos())?;
|
||||
let val = Value::des(&buf)?;
|
||||
let desc = match val {
|
||||
@@ -301,7 +300,7 @@ impl Layer for DeltaLayer {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn get_value_reconstruct_data(
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
@@ -343,7 +342,7 @@ impl Layer for DeltaLayer {
|
||||
})?;
|
||||
|
||||
// Ok, 'offsets' now contains the offsets of all the entries we need to read
|
||||
let cursor = file.block_cursor();
|
||||
let mut cursor = file.block_cursor();
|
||||
let mut buf = Vec::new();
|
||||
for (entry_lsn, pos) in offsets {
|
||||
cursor.read_blob_into_buf(pos, &mut buf).with_context(|| {
|
||||
@@ -424,6 +423,23 @@ impl PersistentLayer for DeltaLayer {
|
||||
Some(self.path())
|
||||
}
|
||||
|
||||
fn iter(&self, ctx: &RequestContext) -> Result<LayerIter<'_>> {
|
||||
let inner = self
|
||||
.load(LayerAccessKind::KeyIter, ctx)
|
||||
.context("load delta layer")?;
|
||||
Ok(match DeltaValueIter::new(inner) {
|
||||
Ok(iter) => Box::new(iter),
|
||||
Err(err) => Box::new(std::iter::once(Err(err))),
|
||||
})
|
||||
}
|
||||
|
||||
fn key_iter(&self, ctx: &RequestContext) -> Result<LayerKeyIter<'_>> {
|
||||
let inner = self.load(LayerAccessKind::KeyIter, ctx)?;
|
||||
Ok(Box::new(
|
||||
DeltaKeyIter::new(inner).context("Layer index is corrupted")?,
|
||||
))
|
||||
}
|
||||
|
||||
fn delete_resident_layer_file(&self) -> Result<()> {
|
||||
// delete underlying file
|
||||
fs::remove_file(self.path())?;
|
||||
@@ -493,11 +509,7 @@ impl DeltaLayer {
|
||||
/// Open the underlying file and read the metadata into memory, if it's
|
||||
/// not loaded already.
|
||||
///
|
||||
fn load(
|
||||
&self,
|
||||
access_kind: LayerAccessKind,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<&Arc<DeltaLayerInner>> {
|
||||
fn load(&self, access_kind: LayerAccessKind, ctx: &RequestContext) -> Result<&DeltaLayerInner> {
|
||||
self.access_stats
|
||||
.record_access(access_kind, ctx.task_kind());
|
||||
// Quick exit if already loaded
|
||||
@@ -506,7 +518,7 @@ impl DeltaLayer {
|
||||
.with_context(|| format!("Failed to load delta layer {}", self.path().display()))
|
||||
}
|
||||
|
||||
fn load_inner(&self) -> Result<Arc<DeltaLayerInner>> {
|
||||
fn load_inner(&self) -> Result<DeltaLayerInner> {
|
||||
let path = self.path();
|
||||
|
||||
let file = VirtualFile::open(&path)
|
||||
@@ -541,11 +553,11 @@ impl DeltaLayer {
|
||||
|
||||
debug!("loaded from {}", &path.display());
|
||||
|
||||
Ok(Arc::new(DeltaLayerInner {
|
||||
Ok(DeltaLayerInner {
|
||||
file,
|
||||
index_start_blk: actual_summary.index_start_blk,
|
||||
index_root_blk: actual_summary.index_root_blk,
|
||||
}))
|
||||
})
|
||||
}
|
||||
|
||||
/// Create a DeltaLayer struct representing an existing file on disk.
|
||||
@@ -610,24 +622,6 @@ impl DeltaLayer {
|
||||
&self.layer_name(),
|
||||
)
|
||||
}
|
||||
|
||||
/// Obtains all keys and value references stored in the layer
|
||||
///
|
||||
/// The value can be obtained via the [`ValueRef::load`] function.
|
||||
pub fn load_val_refs(&self, ctx: &RequestContext) -> Result<Vec<(Key, Lsn, ValueRef)>> {
|
||||
let inner = self
|
||||
.load(LayerAccessKind::KeyIter, ctx)
|
||||
.context("load delta layer")?;
|
||||
DeltaLayerInner::load_val_refs(inner).context("Layer index is corrupted")
|
||||
}
|
||||
|
||||
/// Loads all keys stored in the layer. Returns key, lsn and value size.
|
||||
pub fn load_keys(&self, ctx: &RequestContext) -> Result<Vec<(Key, Lsn, u64)>> {
|
||||
let inner = self
|
||||
.load(LayerAccessKind::KeyIter, ctx)
|
||||
.context("load delta layer keys")?;
|
||||
inner.load_keys().context("Layer index is corrupted")
|
||||
}
|
||||
}
|
||||
|
||||
/// A builder object for constructing a new delta layer.
|
||||
@@ -898,41 +892,121 @@ impl Drop for DeltaLayerWriter {
|
||||
}
|
||||
}
|
||||
|
||||
impl DeltaLayerInner {
|
||||
fn load_val_refs(this: &Arc<DeltaLayerInner>) -> Result<Vec<(Key, Lsn, ValueRef)>> {
|
||||
let file = &this.file;
|
||||
///
|
||||
/// Iterator over all key-value pairse stored in a delta layer
|
||||
///
|
||||
/// FIXME: This creates a Vector to hold the offsets of all key value pairs.
|
||||
/// That takes up quite a lot of memory. Should do this in a more streaming
|
||||
/// fashion.
|
||||
///
|
||||
struct DeltaValueIter<'a> {
|
||||
all_offsets: Vec<(DeltaKey, BlobRef)>,
|
||||
next_idx: usize,
|
||||
reader: BlockCursor<Adapter<'a>>,
|
||||
}
|
||||
|
||||
struct Adapter<'a>(&'a DeltaLayerInner);
|
||||
|
||||
impl<'a> BlockReader for Adapter<'a> {
|
||||
type BlockLease = PageReadGuard<'static>;
|
||||
|
||||
fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
|
||||
self.0.file.read_blk(blknum)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for DeltaValueIter<'a> {
|
||||
type Item = Result<(Key, Lsn, Value)>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
self.next_res().transpose()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> DeltaValueIter<'a> {
|
||||
fn new(inner: &'a DeltaLayerInner) -> Result<Self> {
|
||||
let file = &inner.file;
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
this.index_start_blk,
|
||||
this.index_root_blk,
|
||||
inner.index_start_blk,
|
||||
inner.index_root_blk,
|
||||
file,
|
||||
);
|
||||
|
||||
let mut all_offsets = Vec::<(Key, Lsn, ValueRef)>::new();
|
||||
let mut all_offsets: Vec<(DeltaKey, BlobRef)> = Vec::new();
|
||||
tree_reader.visit(
|
||||
&[0u8; DELTA_KEY_SIZE],
|
||||
VisitDirection::Forwards,
|
||||
|key, value| {
|
||||
let delta_key = DeltaKey::from_slice(key);
|
||||
let val_ref = ValueRef {
|
||||
blob_ref: BlobRef(value),
|
||||
reader: BlockCursor::new(Adapter(this.clone())),
|
||||
};
|
||||
all_offsets.push((delta_key.key(), delta_key.lsn(), val_ref));
|
||||
all_offsets.push((DeltaKey::from_slice(key), BlobRef(value)));
|
||||
true
|
||||
},
|
||||
)?;
|
||||
|
||||
Ok(all_offsets)
|
||||
let iter = DeltaValueIter {
|
||||
all_offsets,
|
||||
next_idx: 0,
|
||||
reader: BlockCursor::new(Adapter(inner)),
|
||||
};
|
||||
|
||||
Ok(iter)
|
||||
}
|
||||
fn load_keys(&self) -> Result<Vec<(Key, Lsn, u64)>> {
|
||||
let file = &self.file;
|
||||
|
||||
fn next_res(&mut self) -> Result<Option<(Key, Lsn, Value)>> {
|
||||
if self.next_idx < self.all_offsets.len() {
|
||||
let (delta_key, blob_ref) = &self.all_offsets[self.next_idx];
|
||||
|
||||
let key = delta_key.key();
|
||||
let lsn = delta_key.lsn();
|
||||
|
||||
let buf = self.reader.read_blob(blob_ref.pos())?;
|
||||
let val = Value::des(&buf)?;
|
||||
self.next_idx += 1;
|
||||
Ok(Some((key, lsn, val)))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
}
|
||||
///
|
||||
/// Iterator over all keys stored in a delta layer
|
||||
///
|
||||
/// FIXME: This creates a Vector to hold all keys.
|
||||
/// That takes up quite a lot of memory. Should do this in a more streaming
|
||||
/// fashion.
|
||||
///
|
||||
struct DeltaKeyIter {
|
||||
all_keys: Vec<(DeltaKey, u64)>,
|
||||
next_idx: usize,
|
||||
}
|
||||
|
||||
impl Iterator for DeltaKeyIter {
|
||||
type Item = (Key, Lsn, u64);
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
if self.next_idx < self.all_keys.len() {
|
||||
let (delta_key, size) = &self.all_keys[self.next_idx];
|
||||
|
||||
let key = delta_key.key();
|
||||
let lsn = delta_key.lsn();
|
||||
|
||||
self.next_idx += 1;
|
||||
Some((key, lsn, *size))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> DeltaKeyIter {
|
||||
fn new(inner: &'a DeltaLayerInner) -> Result<Self> {
|
||||
let file = &inner.file;
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
self.index_start_blk,
|
||||
self.index_root_blk,
|
||||
inner.index_start_blk,
|
||||
inner.index_root_blk,
|
||||
file,
|
||||
);
|
||||
|
||||
let mut all_keys: Vec<(Key, Lsn, u64)> = Vec::new();
|
||||
let mut all_keys: Vec<(DeltaKey, u64)> = Vec::new();
|
||||
tree_reader.visit(
|
||||
&[0u8; DELTA_KEY_SIZE],
|
||||
VisitDirection::Forwards,
|
||||
@@ -940,48 +1014,46 @@ impl DeltaLayerInner {
|
||||
let delta_key = DeltaKey::from_slice(key);
|
||||
let pos = BlobRef(value).pos();
|
||||
if let Some(last) = all_keys.last_mut() {
|
||||
if last.0 == delta_key.key() {
|
||||
if last.0.key() == delta_key.key() {
|
||||
return true;
|
||||
} else {
|
||||
// subtract offset of new key BLOB and first blob of this key
|
||||
// to get total size if values associated with this key
|
||||
let first_pos = last.2;
|
||||
last.2 = pos - first_pos;
|
||||
let first_pos = last.1;
|
||||
last.1 = pos - first_pos;
|
||||
}
|
||||
}
|
||||
all_keys.push((delta_key.key(), delta_key.lsn(), pos));
|
||||
all_keys.push((delta_key, pos));
|
||||
true
|
||||
},
|
||||
)?;
|
||||
if let Some(last) = all_keys.last_mut() {
|
||||
// Last key occupies all space till end of layer
|
||||
last.2 = std::fs::metadata(&file.file.path)?.len() - last.2;
|
||||
last.1 = std::fs::metadata(&file.file.path)?.len() - last.1;
|
||||
}
|
||||
Ok(all_keys)
|
||||
let iter = DeltaKeyIter {
|
||||
all_keys,
|
||||
next_idx: 0,
|
||||
};
|
||||
|
||||
Ok(iter)
|
||||
}
|
||||
}
|
||||
|
||||
/// Reference to an on-disk value
|
||||
pub struct ValueRef {
|
||||
blob_ref: BlobRef,
|
||||
reader: BlockCursor<Adapter>,
|
||||
}
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::DeltaKeyIter;
|
||||
use super::DeltaLayer;
|
||||
use super::DeltaValueIter;
|
||||
|
||||
impl ValueRef {
|
||||
/// Loads the value from disk
|
||||
pub fn load(&self) -> Result<Value> {
|
||||
let buf = self.reader.read_blob(self.blob_ref.pos())?;
|
||||
let val = Value::des(&buf)?;
|
||||
Ok(val)
|
||||
}
|
||||
}
|
||||
|
||||
struct Adapter(Arc<DeltaLayerInner>);
|
||||
|
||||
impl BlockReader for Adapter {
|
||||
type BlockLease = PageReadGuard<'static>;
|
||||
|
||||
fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
|
||||
self.0.file.read_blk(blknum)
|
||||
// We will soon need the iters to be send in the compaction code.
|
||||
// Cf https://github.com/neondatabase/neon/pull/4462#issuecomment-1587398883
|
||||
// Cf https://github.com/neondatabase/neon/issues/4471
|
||||
#[test]
|
||||
fn is_send() {
|
||||
fn assert_send<T: Send>() {}
|
||||
assert_send::<DeltaLayer>();
|
||||
assert_send::<DeltaValueIter>();
|
||||
assert_send::<DeltaKeyIter>();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -27,7 +27,7 @@ use crate::config::PageServerConf;
|
||||
use crate::context::RequestContext;
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
use crate::repository::{Key, KEY_SIZE};
|
||||
use crate::tenant::blob_io::{BlobWriter, WriteBlobWriter};
|
||||
use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter};
|
||||
use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
|
||||
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
|
||||
use crate::tenant::storage_layer::{
|
||||
@@ -38,7 +38,6 @@ use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use bytes::Bytes;
|
||||
use hex;
|
||||
use once_cell::sync::OnceCell;
|
||||
use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -48,6 +47,7 @@ use std::io::{Seek, SeekFrom};
|
||||
use std::ops::Range;
|
||||
use std::os::unix::prelude::FileExt;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::{RwLock, RwLockReadGuard};
|
||||
use tracing::*;
|
||||
|
||||
use utils::{
|
||||
@@ -57,7 +57,9 @@ use utils::{
|
||||
};
|
||||
|
||||
use super::filename::ImageFileName;
|
||||
use super::{AsLayerDesc, Layer, LayerAccessStatsReset, PathOrConf, PersistentLayerDesc};
|
||||
use super::{
|
||||
AsLayerDesc, Layer, LayerAccessStatsReset, LayerIter, PathOrConf, PersistentLayerDesc,
|
||||
};
|
||||
|
||||
///
|
||||
/// Header stored in the beginning of the file
|
||||
@@ -115,7 +117,7 @@ pub struct ImageLayer {
|
||||
|
||||
access_stats: LayerAccessStats,
|
||||
|
||||
inner: OnceCell<ImageLayerInner>,
|
||||
inner: RwLock<ImageLayerInner>,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for ImageLayer {
|
||||
@@ -132,27 +134,30 @@ impl std::fmt::Debug for ImageLayer {
|
||||
}
|
||||
|
||||
pub struct ImageLayerInner {
|
||||
/// If false, the 'index' has not been loaded into memory yet.
|
||||
loaded: bool,
|
||||
|
||||
// values copied from summary
|
||||
index_start_blk: u32,
|
||||
index_root_blk: u32,
|
||||
|
||||
/// Reader object for reading blocks from the file.
|
||||
file: FileBlockReader<VirtualFile>,
|
||||
/// Reader object for reading blocks from the file. (None if not loaded yet)
|
||||
file: Option<FileBlockReader<VirtualFile>>,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for ImageLayerInner {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("ImageLayerInner")
|
||||
.field("loaded", &self.loaded)
|
||||
.field("index_start_blk", &self.index_start_blk)
|
||||
.field("index_root_blk", &self.index_root_blk)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Layer for ImageLayer {
|
||||
/// debugging function to print out the contents of the layer
|
||||
async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
println!(
|
||||
"----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
|
||||
self.desc.tenant_id,
|
||||
@@ -169,11 +174,11 @@ impl Layer for ImageLayer {
|
||||
}
|
||||
|
||||
let inner = self.load(LayerAccessKind::Dump, ctx)?;
|
||||
let file = &inner.file;
|
||||
let file = inner.file.as_ref().unwrap();
|
||||
let tree_reader =
|
||||
DiskBtreeReader::<_, KEY_SIZE>::new(inner.index_start_blk, inner.index_root_blk, file);
|
||||
|
||||
tree_reader.dump().await?;
|
||||
tree_reader.dump()?;
|
||||
|
||||
tree_reader.visit(&[0u8; KEY_SIZE], VisitDirection::Forwards, |key, value| {
|
||||
println!("key: {} offset {}", hex::encode(key), value);
|
||||
@@ -184,7 +189,7 @@ impl Layer for ImageLayer {
|
||||
}
|
||||
|
||||
/// Look up given page in the file
|
||||
async fn get_value_reconstruct_data(
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
@@ -197,7 +202,7 @@ impl Layer for ImageLayer {
|
||||
|
||||
let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?;
|
||||
|
||||
let file = &inner.file;
|
||||
let file = inner.file.as_ref().unwrap();
|
||||
let tree_reader = DiskBtreeReader::new(inner.index_start_blk, inner.index_root_blk, file);
|
||||
|
||||
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
|
||||
@@ -253,6 +258,10 @@ impl PersistentLayer for ImageLayer {
|
||||
Some(self.path())
|
||||
}
|
||||
|
||||
fn iter(&self, _ctx: &RequestContext) -> Result<LayerIter<'_>> {
|
||||
unimplemented!();
|
||||
}
|
||||
|
||||
fn delete_resident_layer_file(&self) -> Result<()> {
|
||||
// delete underlying file
|
||||
fs::remove_file(self.path())?;
|
||||
@@ -312,26 +321,52 @@ impl ImageLayer {
|
||||
/// Open the underlying file and read the metadata into memory, if it's
|
||||
/// not loaded already.
|
||||
///
|
||||
fn load(&self, access_kind: LayerAccessKind, ctx: &RequestContext) -> Result<&ImageLayerInner> {
|
||||
fn load(
|
||||
&self,
|
||||
access_kind: LayerAccessKind,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<RwLockReadGuard<ImageLayerInner>> {
|
||||
self.access_stats
|
||||
.record_access(access_kind, ctx.task_kind());
|
||||
loop {
|
||||
if let Some(inner) = self.inner.get() {
|
||||
// Quick exit if already loaded
|
||||
let inner = self.inner.read().unwrap();
|
||||
if inner.loaded {
|
||||
return Ok(inner);
|
||||
}
|
||||
self.inner
|
||||
.get_or_try_init(|| self.load_inner())
|
||||
.with_context(|| format!("Failed to load image layer {}", self.path().display()))?;
|
||||
|
||||
// Need to open the file and load the metadata. Upgrade our lock to
|
||||
// a write lock. (Or rather, release and re-lock in write mode.)
|
||||
drop(inner);
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
if !inner.loaded {
|
||||
self.load_inner(&mut inner).with_context(|| {
|
||||
format!("Failed to load image layer {}", self.path().display())
|
||||
})?
|
||||
} else {
|
||||
// Another thread loaded it while we were not holding the lock.
|
||||
}
|
||||
|
||||
// We now have the file open and loaded. There's no function to do
|
||||
// that in the std library RwLock, so we have to release and re-lock
|
||||
// in read mode. (To be precise, the lock guard was moved in the
|
||||
// above call to `load_inner`, so it's already been released). And
|
||||
// while we do that, another thread could unload again, so we have
|
||||
// to re-check and retry if that happens.
|
||||
drop(inner);
|
||||
}
|
||||
}
|
||||
|
||||
fn load_inner(&self) -> Result<ImageLayerInner> {
|
||||
fn load_inner(&self, inner: &mut ImageLayerInner) -> Result<()> {
|
||||
let path = self.path();
|
||||
|
||||
// Open the file if it's not open already.
|
||||
let file = VirtualFile::open(&path)
|
||||
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
|
||||
let file = FileBlockReader::new(file);
|
||||
if inner.file.is_none() {
|
||||
let file = VirtualFile::open(&path)
|
||||
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
|
||||
inner.file = Some(FileBlockReader::new(file));
|
||||
}
|
||||
let file = inner.file.as_mut().unwrap();
|
||||
let summary_blk = file.read_blk(0)?;
|
||||
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
|
||||
|
||||
@@ -359,11 +394,10 @@ impl ImageLayer {
|
||||
}
|
||||
}
|
||||
|
||||
Ok(ImageLayerInner {
|
||||
index_start_blk: actual_summary.index_start_blk,
|
||||
index_root_blk: actual_summary.index_root_blk,
|
||||
file,
|
||||
})
|
||||
inner.index_start_blk = actual_summary.index_start_blk;
|
||||
inner.index_root_blk = actual_summary.index_root_blk;
|
||||
inner.loaded = true;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Create an ImageLayer struct representing an existing file on disk
|
||||
@@ -387,7 +421,12 @@ impl ImageLayer {
|
||||
), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
|
||||
lsn: filename.lsn,
|
||||
access_stats,
|
||||
inner: OnceCell::new(),
|
||||
inner: RwLock::new(ImageLayerInner {
|
||||
loaded: false,
|
||||
file: None,
|
||||
index_start_blk: 0,
|
||||
index_root_blk: 0,
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -414,7 +453,12 @@ impl ImageLayer {
|
||||
), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
|
||||
lsn: summary.lsn,
|
||||
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
|
||||
inner: OnceCell::new(),
|
||||
inner: RwLock::new(ImageLayerInner {
|
||||
file: None,
|
||||
loaded: false,
|
||||
index_start_blk: 0,
|
||||
index_root_blk: 0,
|
||||
}),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -575,7 +619,12 @@ impl ImageLayerWriterInner {
|
||||
desc,
|
||||
lsn: self.lsn,
|
||||
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
|
||||
inner: OnceCell::new(),
|
||||
inner: RwLock::new(ImageLayerInner {
|
||||
loaded: false,
|
||||
file: None,
|
||||
index_start_blk,
|
||||
index_root_blk,
|
||||
}),
|
||||
};
|
||||
|
||||
// fsync the file
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::RequestContext;
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::tenant::blob_io::BlobWriter;
|
||||
use crate::tenant::blob_io::{BlobCursor, BlobWriter};
|
||||
use crate::tenant::block_io::BlockReader;
|
||||
use crate::tenant::ephemeral_file::EphemeralFile;
|
||||
use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState};
|
||||
@@ -110,7 +110,6 @@ impl InMemoryLayer {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Layer for InMemoryLayer {
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
Key::MIN..Key::MAX
|
||||
@@ -133,7 +132,7 @@ impl Layer for InMemoryLayer {
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer
|
||||
async fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> {
|
||||
fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> {
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
let end_str = inner
|
||||
@@ -151,7 +150,7 @@ impl Layer for InMemoryLayer {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let cursor = inner.file.block_cursor();
|
||||
let mut cursor = inner.file.block_cursor();
|
||||
let mut buf = Vec::new();
|
||||
for (key, vec_map) in inner.index.iter() {
|
||||
for (lsn, pos) in vec_map.as_slice() {
|
||||
@@ -184,7 +183,7 @@ impl Layer for InMemoryLayer {
|
||||
}
|
||||
|
||||
/// Look up given value in the layer.
|
||||
async fn get_value_reconstruct_data(
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
@@ -196,7 +195,7 @@ impl Layer for InMemoryLayer {
|
||||
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
let reader = inner.file.block_cursor();
|
||||
let mut reader = inner.file.block_cursor();
|
||||
|
||||
// Scan the page versions backwards, starting from `lsn`.
|
||||
if let Some(vec_map) = inner.index.get(&key) {
|
||||
@@ -354,7 +353,7 @@ impl InMemoryLayer {
|
||||
|
||||
let mut buf = Vec::new();
|
||||
|
||||
let cursor = inner.file.block_cursor();
|
||||
let mut cursor = inner.file.block_cursor();
|
||||
|
||||
let mut keys: Vec<(&Key, &VecMap<Lsn, u64>)> = inner.index.iter().collect();
|
||||
keys.sort_by_key(|k| k.0);
|
||||
|
||||
@@ -20,8 +20,8 @@ use utils::{
|
||||
|
||||
use super::filename::{DeltaFileName, ImageFileName};
|
||||
use super::{
|
||||
AsLayerDesc, DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset,
|
||||
LayerResidenceStatus, PersistentLayer, PersistentLayerDesc,
|
||||
AsLayerDesc, DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset, LayerIter,
|
||||
LayerKeyIter, LayerResidenceStatus, PersistentLayer, PersistentLayerDesc,
|
||||
};
|
||||
|
||||
/// RemoteLayer is a not yet downloaded [`ImageLayer`] or
|
||||
@@ -65,9 +65,8 @@ impl std::fmt::Debug for RemoteLayer {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Layer for RemoteLayer {
|
||||
async fn get_value_reconstruct_data(
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
_key: Key,
|
||||
_lsn_range: Range<Lsn>,
|
||||
@@ -78,7 +77,7 @@ impl Layer for RemoteLayer {
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer
|
||||
async fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
|
||||
fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
|
||||
println!(
|
||||
"----- remote layer for ten {} tli {} keys {}-{} lsn {}-{} is_delta {} is_incremental {} size {} ----",
|
||||
self.desc.tenant_id,
|
||||
@@ -129,6 +128,14 @@ impl PersistentLayer for RemoteLayer {
|
||||
None
|
||||
}
|
||||
|
||||
fn iter(&self, _ctx: &RequestContext) -> Result<LayerIter<'_>> {
|
||||
bail!("cannot iterate a remote layer");
|
||||
}
|
||||
|
||||
fn key_iter(&self, _ctx: &RequestContext) -> Result<LayerKeyIter<'_>> {
|
||||
bail!("cannot iterate a remote layer");
|
||||
}
|
||||
|
||||
fn delete_resident_layer_file(&self) -> Result<()> {
|
||||
bail!("remote layer has no layer file");
|
||||
}
|
||||
|
||||
@@ -111,7 +111,7 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
Duration::from_secs(10)
|
||||
} else {
|
||||
// Run compaction
|
||||
if let Err(e) = tenant.compaction_iteration(&cancel, &ctx).await {
|
||||
if let Err(e) = tenant.compaction_iteration(&ctx).await {
|
||||
error!("Compaction failed, retrying in {:?}: {e:?}", wait_duration);
|
||||
wait_duration
|
||||
} else {
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
pub mod delete;
|
||||
mod eviction_task;
|
||||
pub mod layer_manager;
|
||||
mod logical_size;
|
||||
@@ -80,7 +79,6 @@ use crate::METADATA_FILE_NAME;
|
||||
use crate::ZERO_PAGE;
|
||||
use crate::{is_temporary, task_mgr};
|
||||
|
||||
use self::delete::DeleteTimelineFlow;
|
||||
pub(super) use self::eviction_task::EvictionTaskTenantState;
|
||||
use self::eviction_task::EvictionTaskTimelineState;
|
||||
use self::layer_manager::LayerManager;
|
||||
@@ -239,10 +237,11 @@ pub struct Timeline {
|
||||
|
||||
/// Layer removal lock.
|
||||
/// A lock to ensure that no layer of the timeline is removed concurrently by other tasks.
|
||||
/// This lock is acquired in [`Timeline::gc`] and [`Timeline::compact`].
|
||||
/// This is an `Arc<Mutex>` lock because we need an owned
|
||||
/// This lock is acquired in [`Timeline::gc`], [`Timeline::compact`],
|
||||
/// and [`Tenant::delete_timeline`]. This is an `Arc<Mutex>` lock because we need an owned
|
||||
/// lock guard in functions that will be spawned to tokio I/O pool (which requires `'static`).
|
||||
/// Note that [`DeleteTimelineFlow`] uses `delete_progress` field.
|
||||
///
|
||||
/// [`Tenant::delete_timeline`]: super::Tenant::delete_timeline
|
||||
pub(super) layer_removal_cs: Arc<tokio::sync::Mutex<()>>,
|
||||
|
||||
// Needed to ensure that we can't create a branch at a point that was already garbage collected
|
||||
@@ -284,7 +283,7 @@ pub struct Timeline {
|
||||
|
||||
/// Prevent two tasks from deleting the timeline at the same time. If held, the
|
||||
/// timeline is being deleted. If 'true', the timeline has already been deleted.
|
||||
pub delete_progress: Arc<tokio::sync::Mutex<DeleteTimelineFlow>>,
|
||||
pub delete_lock: Arc<tokio::sync::Mutex<bool>>,
|
||||
|
||||
eviction_task_timeline_state: tokio::sync::Mutex<EvictionTaskTimelineState>,
|
||||
|
||||
@@ -294,10 +293,6 @@ pub struct Timeline {
|
||||
/// Completion shared between all timelines loaded during startup; used to delay heavier
|
||||
/// background tasks until some logical sizes have been calculated.
|
||||
initial_logical_size_attempt: Mutex<Option<completion::Completion>>,
|
||||
|
||||
/// Load or creation time information about the disk_consistent_lsn and when the loading
|
||||
/// happened. Used for consumption metrics.
|
||||
pub(crate) loaded_at: (Lsn, SystemTime),
|
||||
}
|
||||
|
||||
pub struct WalReceiverInfo {
|
||||
@@ -339,7 +334,7 @@ pub struct GcInfo {
|
||||
#[derive(thiserror::Error)]
|
||||
pub enum PageReconstructError {
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
Other(#[from] anyhow::Error), // source and Display delegate to anyhow::Error
|
||||
|
||||
/// The operation would require downloading a layer that is missing locally.
|
||||
NeedsDownload(TenantTimelineId, LayerFileName),
|
||||
@@ -480,7 +475,7 @@ impl Timeline {
|
||||
img: cached_page_img,
|
||||
};
|
||||
|
||||
let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME.start_timer();
|
||||
let timer = self.metrics.get_reconstruct_data_time_histo.start_timer();
|
||||
self.get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx)
|
||||
.await?;
|
||||
timer.stop_and_record();
|
||||
@@ -528,7 +523,7 @@ impl Timeline {
|
||||
size
|
||||
}
|
||||
|
||||
pub fn resident_physical_size(&self) -> u64 {
|
||||
pub fn get_resident_physical_size(&self) -> u64 {
|
||||
self.metrics.resident_physical_size_gauge.get()
|
||||
}
|
||||
|
||||
@@ -560,7 +555,7 @@ impl Timeline {
|
||||
"wait_lsn cannot be called in WAL receiver"
|
||||
);
|
||||
|
||||
let _timer = crate::metrics::WAIT_LSN_TIME.start_timer();
|
||||
let _timer = self.metrics.wait_lsn_time_histo.start_timer();
|
||||
|
||||
match self
|
||||
.last_record_lsn
|
||||
@@ -616,46 +611,9 @@ impl Timeline {
|
||||
}
|
||||
|
||||
/// Outermost timeline compaction operation; downloads needed layers.
|
||||
pub async fn compact(
|
||||
self: &Arc<Self>,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
pub async fn compact(self: &Arc<Self>, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
const ROUNDS: usize = 2;
|
||||
|
||||
static CONCURRENT_COMPACTIONS: once_cell::sync::Lazy<tokio::sync::Semaphore> =
|
||||
once_cell::sync::Lazy::new(|| {
|
||||
let total_threads = *task_mgr::BACKGROUND_RUNTIME_WORKER_THREADS;
|
||||
let permits = usize::max(
|
||||
1,
|
||||
// while a lot of the work is done on spawn_blocking, we still do
|
||||
// repartitioning in the async context. this should give leave us some workers
|
||||
// unblocked to be blocked on other work, hopefully easing any outside visible
|
||||
// effects of restarts.
|
||||
//
|
||||
// 6/8 is a guess; previously we ran with unlimited 8 and more from
|
||||
// spawn_blocking.
|
||||
(total_threads * 3).checked_div(4).unwrap_or(0),
|
||||
);
|
||||
assert_ne!(permits, 0, "we will not be adding in permits later");
|
||||
assert!(
|
||||
permits < total_threads,
|
||||
"need threads avail for shorter work"
|
||||
);
|
||||
tokio::sync::Semaphore::new(permits)
|
||||
});
|
||||
|
||||
// this wait probably never needs any "long time spent" logging, because we already nag if
|
||||
// compaction task goes over it's period (20s) which is quite often in production.
|
||||
let _permit = tokio::select! {
|
||||
permit = CONCURRENT_COMPACTIONS.acquire() => {
|
||||
permit
|
||||
},
|
||||
_ = cancel.cancelled() => {
|
||||
return Ok(());
|
||||
}
|
||||
};
|
||||
|
||||
let last_record_lsn = self.get_last_record_lsn();
|
||||
|
||||
// Last record Lsn could be zero in case the timeline was just created
|
||||
@@ -713,9 +671,11 @@ impl Timeline {
|
||||
|
||||
let mut failed = 0;
|
||||
|
||||
let mut cancelled = pin!(task_mgr::shutdown_watcher());
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => anyhow::bail!("Cancelled while downloading remote layers"),
|
||||
_ = &mut cancelled => anyhow::bail!("Cancelled while downloading remote layers"),
|
||||
res = downloads.next() => {
|
||||
match res {
|
||||
Some(Ok(())) => {},
|
||||
@@ -930,7 +890,7 @@ impl Timeline {
|
||||
new_state,
|
||||
TimelineState::Stopping | TimelineState::Broken { .. }
|
||||
) {
|
||||
// drop the completion guard, if any; it might be holding off the completion
|
||||
// drop the copmletion guard, if any; it might be holding off the completion
|
||||
// forever needlessly
|
||||
self.initial_logical_size_attempt
|
||||
.lock()
|
||||
@@ -1365,10 +1325,9 @@ impl Timeline {
|
||||
pg_version: u32,
|
||||
initial_logical_size_can_start: Option<completion::Barrier>,
|
||||
initial_logical_size_attempt: Option<completion::Completion>,
|
||||
state: TimelineState,
|
||||
) -> Arc<Self> {
|
||||
let disk_consistent_lsn = metadata.disk_consistent_lsn();
|
||||
let (state, _) = watch::channel(state);
|
||||
let (state, _) = watch::channel(TimelineState::Loading);
|
||||
|
||||
let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0);
|
||||
let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(())));
|
||||
@@ -1408,8 +1367,6 @@ impl Timeline {
|
||||
last_freeze_at: AtomicLsn::new(disk_consistent_lsn.0),
|
||||
last_freeze_ts: RwLock::new(Instant::now()),
|
||||
|
||||
loaded_at: (disk_consistent_lsn, SystemTime::now()),
|
||||
|
||||
ancestor_timeline: ancestor,
|
||||
ancestor_lsn: metadata.ancestor_lsn(),
|
||||
|
||||
@@ -1461,7 +1418,7 @@ impl Timeline {
|
||||
eviction_task_timeline_state: tokio::sync::Mutex::new(
|
||||
EvictionTaskTimelineState::default(),
|
||||
),
|
||||
delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTimelineFlow::default())),
|
||||
delete_lock: Arc::new(tokio::sync::Mutex::new(false)),
|
||||
|
||||
initial_logical_size_can_start,
|
||||
initial_logical_size_attempt: Mutex::new(initial_logical_size_attempt),
|
||||
@@ -1606,7 +1563,7 @@ impl Timeline {
|
||||
if let Some(imgfilename) = ImageFileName::parse_str(&fname) {
|
||||
// create an ImageLayer struct for each image file.
|
||||
if imgfilename.lsn > disk_consistent_lsn {
|
||||
info!(
|
||||
warn!(
|
||||
"found future image layer {} on timeline {} disk_consistent_lsn is {}",
|
||||
imgfilename, self.timeline_id, disk_consistent_lsn
|
||||
);
|
||||
@@ -1638,7 +1595,7 @@ impl Timeline {
|
||||
// is 102, then it might not have been fully flushed to disk
|
||||
// before crash.
|
||||
if deltafilename.lsn_range.end > disk_consistent_lsn + 1 {
|
||||
info!(
|
||||
warn!(
|
||||
"found future delta layer {} on timeline {} disk_consistent_lsn is {}",
|
||||
deltafilename, self.timeline_id, disk_consistent_lsn
|
||||
);
|
||||
@@ -1780,7 +1737,7 @@ impl Timeline {
|
||||
match remote_layer_name {
|
||||
LayerFileName::Image(imgfilename) => {
|
||||
if imgfilename.lsn > up_to_date_disk_consistent_lsn {
|
||||
info!(
|
||||
warn!(
|
||||
"found future image layer {} on timeline {} remote_consistent_lsn is {}",
|
||||
imgfilename, self.timeline_id, up_to_date_disk_consistent_lsn
|
||||
);
|
||||
@@ -1805,7 +1762,7 @@ impl Timeline {
|
||||
// is 102, then it might not have been fully flushed to disk
|
||||
// before crash.
|
||||
if deltafilename.lsn_range.end > up_to_date_disk_consistent_lsn + 1 {
|
||||
info!(
|
||||
warn!(
|
||||
"found future delta layer {} on timeline {} remote_consistent_lsn is {}",
|
||||
deltafilename, self.timeline_id, up_to_date_disk_consistent_lsn
|
||||
);
|
||||
@@ -1926,15 +1883,6 @@ impl Timeline {
|
||||
}
|
||||
|
||||
fn try_spawn_size_init_task(self: &Arc<Self>, lsn: Lsn, ctx: &RequestContext) {
|
||||
let state = self.current_state();
|
||||
if matches!(
|
||||
state,
|
||||
TimelineState::Broken { .. } | TimelineState::Stopping
|
||||
) {
|
||||
// Can happen when timeline detail endpoint is used when deletion is ongoing (or its broken).
|
||||
return;
|
||||
}
|
||||
|
||||
let permit = match Arc::clone(&self.current_logical_size.initial_size_computation)
|
||||
.try_acquire_owned()
|
||||
{
|
||||
@@ -2304,9 +2252,8 @@ impl Timeline {
|
||||
let mut timeline_owned;
|
||||
let mut timeline = self;
|
||||
|
||||
let mut read_count = scopeguard::guard(0, |cnt| {
|
||||
crate::metrics::READ_NUM_FS_LAYERS.observe(cnt as f64)
|
||||
});
|
||||
let mut read_count =
|
||||
scopeguard::guard(0, |cnt| self.metrics.read_num_fs_layers.observe(cnt as f64));
|
||||
|
||||
// For debugging purposes, collect the path of layers that we traversed
|
||||
// through. It's included in the error message if we fail to find the key.
|
||||
@@ -2440,15 +2387,12 @@ impl Timeline {
|
||||
// Get all the data needed to reconstruct the page version from this layer.
|
||||
// But if we have an older cached page image, no need to go past that.
|
||||
let lsn_floor = max(cached_lsn + 1, start_lsn);
|
||||
result = match open_layer
|
||||
.get_value_reconstruct_data(
|
||||
key,
|
||||
lsn_floor..cont_lsn,
|
||||
reconstruct_state,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
{
|
||||
result = match open_layer.get_value_reconstruct_data(
|
||||
key,
|
||||
lsn_floor..cont_lsn,
|
||||
reconstruct_state,
|
||||
ctx,
|
||||
) {
|
||||
Ok(result) => result,
|
||||
Err(e) => return Err(PageReconstructError::from(e)),
|
||||
};
|
||||
@@ -2470,15 +2414,12 @@ impl Timeline {
|
||||
if cont_lsn > start_lsn {
|
||||
//info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display());
|
||||
let lsn_floor = max(cached_lsn + 1, start_lsn);
|
||||
result = match frozen_layer
|
||||
.get_value_reconstruct_data(
|
||||
key,
|
||||
lsn_floor..cont_lsn,
|
||||
reconstruct_state,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
{
|
||||
result = match frozen_layer.get_value_reconstruct_data(
|
||||
key,
|
||||
lsn_floor..cont_lsn,
|
||||
reconstruct_state,
|
||||
ctx,
|
||||
) {
|
||||
Ok(result) => result,
|
||||
Err(e) => return Err(PageReconstructError::from(e)),
|
||||
};
|
||||
@@ -2509,15 +2450,12 @@ impl Timeline {
|
||||
// Get all the data needed to reconstruct the page version from this layer.
|
||||
// But if we have an older cached page image, no need to go past that.
|
||||
let lsn_floor = max(cached_lsn + 1, lsn_floor);
|
||||
result = match layer
|
||||
.get_value_reconstruct_data(
|
||||
key,
|
||||
lsn_floor..cont_lsn,
|
||||
reconstruct_state,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
{
|
||||
result = match layer.get_value_reconstruct_data(
|
||||
key,
|
||||
lsn_floor..cont_lsn,
|
||||
reconstruct_state,
|
||||
ctx,
|
||||
) {
|
||||
Ok(result) => result,
|
||||
Err(e) => return Err(PageReconstructError::from(e)),
|
||||
};
|
||||
@@ -3513,13 +3451,7 @@ impl Timeline {
|
||||
let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
|
||||
let mut prev: Option<Key> = None;
|
||||
for (next_key, _next_lsn, _size) in itertools::process_results(
|
||||
deltas_to_compact.iter().map(|l| -> Result<_> {
|
||||
Ok(l.clone()
|
||||
.downcast_delta_layer()
|
||||
.expect("delta layer")
|
||||
.load_keys(ctx)?
|
||||
.into_iter())
|
||||
}),
|
||||
deltas_to_compact.iter().map(|l| l.key_iter(ctx)),
|
||||
|iter_iter| iter_iter.kmerge_by(|a, b| a.0 < b.0),
|
||||
)? {
|
||||
if let Some(prev_key) = prev {
|
||||
@@ -3555,31 +3487,25 @@ impl Timeline {
|
||||
// This iterator walks through all key-value pairs from all the layers
|
||||
// we're compacting, in key, LSN order.
|
||||
let all_values_iter = itertools::process_results(
|
||||
deltas_to_compact.iter().map(|l| -> Result<_> {
|
||||
Ok(l.clone()
|
||||
.downcast_delta_layer()
|
||||
.expect("delta layer")
|
||||
.load_val_refs(ctx)?
|
||||
.into_iter())
|
||||
}),
|
||||
deltas_to_compact.iter().map(|l| l.iter(ctx)),
|
||||
|iter_iter| {
|
||||
iter_iter.kmerge_by(|a, b| {
|
||||
let (a_key, a_lsn, _) = a;
|
||||
let (b_key, b_lsn, _) = b;
|
||||
(a_key, a_lsn) < (b_key, b_lsn)
|
||||
if let Ok((a_key, a_lsn, _)) = a {
|
||||
if let Ok((b_key, b_lsn, _)) = b {
|
||||
(a_key, a_lsn) < (b_key, b_lsn)
|
||||
} else {
|
||||
false
|
||||
}
|
||||
} else {
|
||||
true
|
||||
}
|
||||
})
|
||||
},
|
||||
)?;
|
||||
|
||||
// This iterator walks through all keys and is needed to calculate size used by each key
|
||||
let mut all_keys_iter = itertools::process_results(
|
||||
deltas_to_compact.iter().map(|l| -> Result<_> {
|
||||
Ok(l.clone()
|
||||
.downcast_delta_layer()
|
||||
.expect("delta layer")
|
||||
.load_keys(ctx)?
|
||||
.into_iter())
|
||||
}),
|
||||
deltas_to_compact.iter().map(|l| l.key_iter(ctx)),
|
||||
|iter_iter| {
|
||||
iter_iter.kmerge_by(|a, b| {
|
||||
let (a_key, a_lsn, _) = a;
|
||||
@@ -3641,8 +3567,8 @@ impl Timeline {
|
||||
let mut key_values_total_size = 0u64;
|
||||
let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key
|
||||
let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key
|
||||
for (key, lsn, value_ref) in all_values_iter {
|
||||
let value = value_ref.load()?;
|
||||
for x in all_values_iter {
|
||||
let (key, lsn, value) = x?;
|
||||
let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
|
||||
// We need to check key boundaries once we reach next key or end of layer with the same key
|
||||
if !same_key || lsn == dup_end_lsn {
|
||||
|
||||
@@ -1,576 +0,0 @@
use std::{
ops::{Deref, DerefMut},
sync::Arc,
};

use anyhow::Context;
use pageserver_api::models::TimelineState;
use tokio::sync::OwnedMutexGuard;
use tracing::{debug, error, info, instrument, warn, Instrument, Span};
use utils::{
crashsafe, fs_ext,
id::{TenantId, TimelineId},
};

use crate::{
config::PageServerConf,
task_mgr::{self, TaskKind},
tenant::{
metadata::TimelineMetadata,
remote_timeline_client::{
self, PersistIndexPartWithDeletedFlagError, RemoteTimelineClient,
},
CreateTimelineCause, DeleteTimelineError, Tenant,
},
InitializationOrder,
};

use super::Timeline;

/// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
// Stop the walreceiver first.
debug!("waiting for wal receiver to shutdown");
let maybe_started_walreceiver = { timeline.walreceiver.lock().unwrap().take() };
if let Some(walreceiver) = maybe_started_walreceiver {
walreceiver.stop().await;
}
debug!("wal receiver shutdown confirmed");

// Prevent new uploads from starting.
if let Some(remote_client) = timeline.remote_client.as_ref() {
let res = remote_client.stop();
match res {
Ok(()) => {}
Err(e) => match e {
remote_timeline_client::StopError::QueueUninitialized => {
// This case shouldn't happen currently because the
// load and attach code bails out if _any_ of the timeline fails to fetch its IndexPart.
// That is, before we declare the Tenant as Active.
// But we only allow calls to delete_timeline on Active tenants.
return Err(DeleteTimelineError::Other(anyhow::anyhow!("upload queue is uninitialized, likely the timeline was in Broken state prior to this call because it failed to fetch IndexPart during load or attach, check the logs")));
}
},
}
}

// Stop & wait for the remaining timeline tasks, including upload tasks.
// NB: This and other delete_timeline calls do not run as a task_mgr task,
// so, they are not affected by this shutdown_tasks() call.
info!("waiting for timeline tasks to shutdown");
task_mgr::shutdown_tasks(None, Some(timeline.tenant_id), Some(timeline.timeline_id)).await;

fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
Err(anyhow::anyhow!(
"failpoint: timeline-delete-before-index-deleted-at"
))?
});
Ok(())
}

/// Mark timeline as deleted in S3 so we won't pick it up next time
/// during attach or pageserver restart.
/// See comment in persist_index_part_with_deleted_flag.
async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
if let Some(remote_client) = timeline.remote_client.as_ref() {
match remote_client.persist_index_part_with_deleted_flag().await {
// If we (now, or already) marked it successfully as deleted, we can proceed
Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (),
// Bail out otherwise
//
// AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents
// two tasks from performing the deletion at the same time. The first task
// that starts deletion should run it to completion.
Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_))
| Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => {
return Err(DeleteTimelineError::Other(anyhow::anyhow!(e)));
}
}
}
Ok(())
}

// We delete local files first, so if pageserver restarts after local files deletion then remote deletion is not continued.
// This can be solved with inversion of these steps. But even if these steps are inverted then, when index_part.json
// gets deleted there is no way to distinguish between "this timeline is good, we just didnt upload it to remote"
// and "this timeline is deleted we should continue with removal of local state". So to avoid the ambiguity we use a mark file.
// After index part is deleted presence of this mark file indentifies that it was a deletion intention.
// So we can just remove the mark file.
async fn create_delete_mark(
conf: &PageServerConf,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Result<(), DeleteTimelineError> {
fail::fail_point!("timeline-delete-before-delete-mark", |_| {
Err(anyhow::anyhow!(
"failpoint: timeline-delete-before-delete-mark"
))?
});
let marker_path = conf.timeline_delete_mark_file_path(tenant_id, timeline_id);

// Note: we're ok to replace existing file.
let _ = std::fs::OpenOptions::new()
.write(true)
.create(true)
.open(&marker_path)
.with_context(|| format!("could not create delete marker file {marker_path:?}"))?;

crashsafe::fsync_file_and_parent(&marker_path).context("sync_mark")?;
Ok(())
}

/// Grab the layer_removal_cs lock, and actually perform the deletion.
///
/// This lock prevents prevents GC or compaction from running at the same time.
/// The GC task doesn't register itself with the timeline it's operating on,
/// so it might still be running even though we called `shutdown_tasks`.
///
/// Note that there are still other race conditions between
/// GC, compaction and timeline deletion. See
/// <https://github.com/neondatabase/neon/issues/2671>
///
/// No timeout here, GC & Compaction should be responsive to the
/// `TimelineState::Stopping` change.
async fn delete_local_layer_files(
conf: &PageServerConf,
tenant_id: TenantId,
timeline: &Timeline,
) -> anyhow::Result<()> {
info!("waiting for layer_removal_cs.lock()");
let layer_removal_guard = timeline.layer_removal_cs.lock().await;
info!("got layer_removal_cs.lock(), deleting layer files");

// NB: storage_sync upload tasks that reference these layers have been cancelled
// by the caller.

let local_timeline_directory = conf.timeline_path(&tenant_id, &timeline.timeline_id);

fail::fail_point!("timeline-delete-before-rm", |_| {
Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
});

// NB: This need not be atomic because the deleted flag in the IndexPart
// will be observed during tenant/timeline load. The deletion will be resumed there.
//
// For configurations without remote storage, we guarantee crash-safety by persising delete mark file.
//
// Note that here we do not bail out on std::io::ErrorKind::NotFound.
// This can happen if we're called a second time, e.g.,
// because of a previous failure/cancellation at/after
// failpoint timeline-delete-after-rm.
//
// It can also happen if we race with tenant detach, because,
// it doesn't grab the layer_removal_cs lock.
//
// For now, log and continue.
// warn! level is technically not appropriate for the
// first case because we should expect retries to happen.
// But the error is so rare, it seems better to get attention if it happens.
//
// Note that metadata removal is skipped, this is not technically needed,
// but allows to reuse timeline loading code during resumed deletion.
// (we always expect that metadata is in place when timeline is being loaded)

#[cfg(feature = "testing")]
let mut counter = 0;

// Timeline directory may not exist if we failed to delete mark file and request was retried.
if !local_timeline_directory.exists() {
return Ok(());
}

let metadata_path = conf.metadata_path(&tenant_id, &timeline.timeline_id);

for entry in walkdir::WalkDir::new(&local_timeline_directory).contents_first(true) {
#[cfg(feature = "testing")]
{
counter += 1;
if counter == 2 {
fail::fail_point!("timeline-delete-during-rm", |_| {
Err(anyhow::anyhow!("failpoint: timeline-delete-during-rm"))?
});
}
}

let entry = entry?;
if entry.path() == metadata_path {
debug!("found metadata, skipping");
continue;
}

if entry.path() == local_timeline_directory {
// Keeping directory because metedata file is still there
debug!("found timeline dir itself, skipping");
continue;
}

let metadata = match entry.metadata() {
Ok(metadata) => metadata,
Err(e) => {
if crate::is_walkdir_io_not_found(&e) {
warn!(
timeline_dir=?local_timeline_directory,
path=?entry.path().display(),
"got not found err while removing timeline dir, proceeding anyway"
);
continue;
}
anyhow::bail!(e);
}
};

let r = if metadata.is_dir() {
// There shouldnt be any directories inside timeline dir as of current layout.
tokio::fs::remove_dir(entry.path()).await
} else {
tokio::fs::remove_file(entry.path()).await
};

if let Err(e) = r {
if e.kind() == std::io::ErrorKind::NotFound {
warn!(
timeline_dir=?local_timeline_directory,
path=?entry.path().display(),
"got not found err while removing timeline dir, proceeding anyway"
);
continue;
}
anyhow::bail!(anyhow::anyhow!(
"Failed to remove: {}. Error: {e}",
entry.path().display()
));
}
}

info!("finished deleting layer files, releasing layer_removal_cs.lock()");
drop(layer_removal_guard);

fail::fail_point!("timeline-delete-after-rm", |_| {
Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
});

Ok(())
}

/// Removes remote layers and an index file after them.
async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<()> {
if let Some(remote_client) = &timeline.remote_client {
remote_client.delete_all().await.context("delete_all")?
};

Ok(())
}

// This function removs remaining traces of a timeline on disk.
// Namely: metadata file, timeline directory, delete mark.
// Note: io::ErrorKind::NotFound are ignored for metadata and timeline dir.
// delete mark should be present because it is the last step during deletion.
// (nothing can fail after its deletion)
async fn cleanup_remaining_timeline_fs_traces(
conf: &PageServerConf,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> anyhow::Result<()> {
// Remove local metadata
tokio::fs::remove_file(conf.metadata_path(&tenant_id, &timeline_id))
.await
.or_else(fs_ext::ignore_not_found)
.context("remove metadata")?;

fail::fail_point!("timeline-delete-after-rm-metadata", |_| {
Err(anyhow::anyhow!(
"failpoint: timeline-delete-after-rm-metadata"
))?
});

// Remove timeline dir
tokio::fs::remove_dir(conf.timeline_path(&tenant_id, &timeline_id))
.await
.or_else(fs_ext::ignore_not_found)
.context("timeline dir")?;

fail::fail_point!("timeline-delete-after-rm-dir", |_| {
Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm-dir"))?
});

// Remove delete mark
tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_id, timeline_id))
.await
.context("remove delete mark")
}

/// It is important that this gets called when DeletionGuard is being held.
/// For more context see comments in [`DeleteTimelineFlow::prepare`]
async fn remove_timeline_from_tenant(
tenant: &Tenant,
timeline_id: TimelineId,
_: &DeletionGuard, // using it as a witness
) -> anyhow::Result<()> {
// Remove the timeline from the map.
let mut timelines = tenant.timelines.lock().unwrap();
let children_exist = timelines
.iter()
.any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
// XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`.
// We already deleted the layer files, so it's probably best to panic.
// (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart)
if children_exist {
panic!("Timeline grew children while we removed layer files");
}

timelines
.remove(&timeline_id)
.expect("timeline that we were deleting was concurrently removed from 'timelines' map");

drop(timelines);

Ok(())
}

/// Orchestrates timeline shut down of all timeline tasks, removes its in-memory structures,
/// and deletes its data from both disk and s3.
/// The sequence of steps:
/// 1. Set deleted_at in remote index part.
/// 2. Create local mark file.
/// 3. Delete local files except metadata (it is simpler this way, to be able to reuse timeline initialization code that expects metadata)
/// 4. Delete remote layers
/// 5. Delete index part
/// 6. Delete meta, timeline directory
/// 7. Delete mark file
/// It is resumable from any step in case a crash/restart occurs.
/// There are three entrypoints to the process:
/// 1. [`DeleteTimelineFlow::run`] this is the main one called by a management api handler.
/// 2. [`DeleteTimelineFlow::resume_deletion`] is called during restarts when local metadata is still present
/// and we possibly neeed to continue deletion of remote files.
/// 3. [`DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`] is used when we deleted remote
/// index but still have local metadata, timeline directory and delete mark.
/// Note the only other place that messes around timeline delete mark is the logic that scans directory with timelines during tenant load.
#[derive(Default)]
pub enum DeleteTimelineFlow {
#[default]
NotStarted,
InProgress,
Finished,
}

impl DeleteTimelineFlow {
// These steps are run in the context of management api request handler.
// Long running steps are continued to run in the background.
// NB: If this fails half-way through, and is retried, the retry will go through
// all the same steps again. Make sure the code here is idempotent, and don't
// error out if some of the shutdown tasks have already been completed!
#[instrument(skip_all, fields(tenant_id=%tenant.tenant_id, %timeline_id))]
pub async fn run(
tenant: &Arc<Tenant>,
timeline_id: TimelineId,
) -> Result<(), DeleteTimelineError> {
let (timeline, mut guard) = Self::prepare(tenant, timeline_id)?;

guard.mark_in_progress()?;

stop_tasks(&timeline).await?;

set_deleted_in_remote_index(&timeline).await?;

create_delete_mark(tenant.conf, timeline.tenant_id, timeline.timeline_id).await?;

fail::fail_point!("timeline-delete-before-schedule", |_| {
Err(anyhow::anyhow!(
"failpoint: timeline-delete-before-schedule"
))?
});

Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);

Ok(())
}
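As a reading aid, a minimal sketch of how a management handler would drive the flow above; the handler shape is an assumption for illustration, only `DeleteTimelineFlow::run` and its signature come from the code itself.

```rust
// Hypothetical caller sketch; `Tenant`, `TimelineId` and `DeleteTimelineError`
// are the surrounding module's types, the handler function itself is illustrative.
use std::sync::Arc;

async fn delete_timeline_handler(
    tenant: Arc<Tenant>,
    timeline_id: TimelineId,
) -> Result<(), DeleteTimelineError> {
    // Runs steps 1-2 inline (deleted_at flag + delete mark) and schedules
    // the remaining local/remote cleanup as a background task.
    DeleteTimelineFlow::run(&tenant, timeline_id).await
}
```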

fn mark_in_progress(&mut self) -> anyhow::Result<()> {
match self {
Self::Finished => anyhow::bail!("Bug. Is in finished state"),
Self::InProgress { .. } => { /* We're in a retry */ }
Self::NotStarted => { /* Fresh start */ }
}

*self = Self::InProgress;

Ok(())
}

/// Shortcut to create Timeline in stopping state and spawn deletion task.
pub async fn resume_deletion(
tenant: Arc<Tenant>,
timeline_id: TimelineId,
local_metadata: &TimelineMetadata,
remote_client: Option<RemoteTimelineClient>,
init_order: Option<&InitializationOrder>,
) -> anyhow::Result<()> {
// Note: here we even skip populating layer map. Timeline is essentially uninitialized.
// RemoteTimelineClient is the only functioning part.
let timeline = tenant
.create_timeline_struct(
timeline_id,
local_metadata,
None, // Ancestor is not needed for deletion.
remote_client,
init_order,
// Important. We dont pass ancestor above because it can be missing.
// Thus we need to skip the validation here.
CreateTimelineCause::Delete,
)
.context("create_timeline_struct")?;

let mut guard = DeletionGuard(
Arc::clone(&timeline.delete_progress)
.try_lock_owned()
.expect("cannot happen because we're the only owner"),
);

// We meed to do this because when console retries delete request we shouldnt answer with 404
// because 404 means successful deletion.
{
let mut locked = tenant.timelines.lock().unwrap();
locked.insert(timeline_id, Arc::clone(&timeline));
}

guard.mark_in_progress()?;

// Note that delete mark can be missing on resume
// because we create delete mark after we set deleted_at in the index part.
create_delete_mark(tenant.conf, tenant.tenant_id, timeline_id).await?;

Self::schedule_background(guard, tenant.conf, tenant, timeline);

Ok(())
}

pub async fn cleanup_remaining_timeline_fs_traces(
tenant: &Tenant,
timeline_id: TimelineId,
) -> anyhow::Result<()> {
cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_id, timeline_id).await
}

fn prepare(
tenant: &Tenant,
timeline_id: TimelineId,
) -> Result<(Arc<Timeline>, DeletionGuard), DeleteTimelineError> {
// Note the interaction between this guard and deletion guard.
// Here we attempt to lock deletion guard when we're holding a lock on timelines.
// This is important because when you take into account `remove_timeline_from_tenant`
// we remove timeline from memory when we still hold the deletion guard.
// So here when timeline deletion is finished timeline wont be present in timelines map at all
// which makes the following sequence impossible:
// T1: get preempted right before the try_lock on `Timeline::delete_progress`
// T2: do a full deletion, acquire and drop `Timeline::delete_progress`
// T1: acquire deletion lock, do another `DeleteTimelineFlow::run`
// For more context see this discussion: `https://github.com/neondatabase/neon/pull/4552#discussion_r1253437346`
let timelines = tenant.timelines.lock().unwrap();

let timeline = match timelines.get(&timeline_id) {
Some(t) => t,
None => return Err(DeleteTimelineError::NotFound),
};

// Ensure that there are no child timelines **attached to that pageserver**,
// because detach removes files, which will break child branches
let children: Vec<TimelineId> = timelines
.iter()
.filter_map(|(id, entry)| {
if entry.get_ancestor_timeline_id() == Some(timeline_id) {
Some(*id)
} else {
None
}
})
.collect();

if !children.is_empty() {
return Err(DeleteTimelineError::HasChildren(children));
}

// Note that using try_lock here is important to avoid a deadlock.
// Here we take lock on timelines and then the deletion guard.
// At the end of the operation we're holding the guard and need to lock timelines map
// to remove the timeline from it.
// Always if you have two locks that are taken in different order this can result in a deadlock.
let delete_lock_guard = DeletionGuard(
Arc::clone(&timeline.delete_progress)
.try_lock_owned()
.map_err(|_| DeleteTimelineError::AlreadyInProgress)?,
);

timeline.set_state(TimelineState::Stopping);

Ok((Arc::clone(timeline), delete_lock_guard))
}
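The comment in `prepare` is the key idea: never block on the second lock while holding the first. A minimal, standalone sketch of the same try-lock pattern with tokio; the `Busy` error type and names are illustrative, not the pageserver's.

```rust
use std::sync::Arc;
use tokio::sync::Mutex;

#[derive(Debug)]
struct Busy;

// Acquire the "deletion progress" mutex without waiting. Because the caller is
// assumed to already hold another lock (the timelines map), try_lock_owned fails
// fast instead of blocking, so two tasks taking the locks in different orders
// cannot deadlock.
async fn try_enter(progress: Arc<Mutex<()>>) -> Result<tokio::sync::OwnedMutexGuard<()>, Busy> {
    progress.try_lock_owned().map_err(|_| Busy)
}

#[tokio::main]
async fn main() {
    let progress = Arc::new(Mutex::new(()));
    let first = try_enter(Arc::clone(&progress)).await.expect("free on first attempt");
    // A concurrent attempt fails immediately instead of waiting for `first`.
    assert!(try_enter(Arc::clone(&progress)).await.is_err());
    drop(first);
}
```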

fn schedule_background(
guard: DeletionGuard,
conf: &'static PageServerConf,
tenant: Arc<Tenant>,
timeline: Arc<Timeline>,
) {
let tenant_id = timeline.tenant_id;
let timeline_id = timeline.timeline_id;

task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
TaskKind::TimelineDeletionWorker,
Some(tenant_id),
Some(timeline_id),
"timeline_delete",
false,
async move {
if let Err(err) = Self::background(guard, conf, &tenant, &timeline).await {
error!("Error: {err:#}");
timeline.set_broken(format!("{err:#}"))
};
Ok(())
}
.instrument({
let span =
tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_id, timeline_id=%timeline_id);
span.follows_from(Span::current());
span
}),
);
}

async fn background(
mut guard: DeletionGuard,
conf: &PageServerConf,
tenant: &Tenant,
timeline: &Timeline,
) -> Result<(), DeleteTimelineError> {
delete_local_layer_files(conf, tenant.tenant_id, timeline).await?;

delete_remote_layers_and_index(timeline).await?;

pausable_failpoint!("in_progress_delete");

cleanup_remaining_timeline_fs_traces(conf, tenant.tenant_id, timeline.timeline_id).await?;

remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?;

*guard.0 = Self::Finished;

Ok(())
}
}

struct DeletionGuard(OwnedMutexGuard<DeleteTimelineFlow>);

impl Deref for DeletionGuard {
type Target = DeleteTimelineFlow;

fn deref(&self) -> &Self::Target {
&self.0
}
}

impl DerefMut for DeletionGuard {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.0
}
}
@@ -308,13 +308,8 @@ impl Timeline {
ctx: &RequestContext,
) -> ControlFlow<()> {
let mut state = self.eviction_task_timeline_state.lock().await;

// Only do the imitate_layer accesses approximately as often as the threshold. A little
// more frequently, to avoid this period racing with the threshold/period-th eviction iteration.
let inter_imitate_period = p.threshold.checked_sub(p.period).unwrap_or(p.threshold);

match state.last_layer_access_imitation {
Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
Some(ts) if ts.elapsed() < p.threshold => { /* no need to run */ }
_ => {
self.imitate_timeline_cached_layer_accesses(cancel, ctx)
.await;
@@ -337,7 +332,7 @@ impl Timeline {
};
let mut state = tenant.eviction_task_tenant_state.lock().await;
match state.last_layer_access_imitation {
Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
Some(ts) if ts.elapsed() < p.threshold => { /* no need to run */ }
_ => {
self.imitate_synthetic_size_calculation_worker(&tenant, ctx, cancel)
.await;
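The interval arithmetic in the hunk above is just a saturating subtraction on Durations: imitate layer accesses slightly more often than the eviction threshold so the imitation never races with the iteration that would evict. A small sketch with made-up numbers (not Neon's defaults):

```rust
use std::time::Duration;

fn main() {
    // Hypothetical eviction policy values, for illustration only.
    let threshold = Duration::from_secs(600); // evict layers unused for 10 minutes
    let period = Duration::from_secs(60);     // eviction task wakes up every minute

    // Run the imitation one period earlier than the threshold; fall back to the
    // threshold itself if the subtraction would underflow.
    let inter_imitate_period = threshold.checked_sub(period).unwrap_or(threshold);
    assert_eq!(inter_imitate_period, Duration::from_secs(540));
}
```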
@@ -2,9 +2,13 @@ use std::{collections::hash_map::Entry, fs, path::PathBuf, sync::Arc};

use anyhow::Context;
use tracing::{error, info, info_span, warn};
use utils::{crashsafe, fs_ext, id::TimelineId, lsn::Lsn};
use utils::{crashsafe, id::TimelineId, lsn::Lsn};

use crate::{context::RequestContext, import_datadir, tenant::Tenant};
use crate::{
context::RequestContext,
import_datadir,
tenant::{ignore_absent_files, Tenant},
};

use super::Timeline;

@@ -137,7 +141,7 @@ impl Drop for UninitializedTimeline<'_> {

pub(crate) fn cleanup_timeline_directory(uninit_mark: TimelineUninitMark) {
let timeline_path = &uninit_mark.timeline_path;
match fs_ext::ignore_absent_files(|| fs::remove_dir_all(timeline_path)) {
match ignore_absent_files(|| fs::remove_dir_all(timeline_path)) {
Ok(()) => {
info!("Timeline dir {timeline_path:?} removed successfully, removing the uninit mark")
}
@@ -181,7 +185,7 @@ impl TimelineUninitMark {
let uninit_mark_parent = uninit_mark_file
.parent()
.with_context(|| format!("Uninit mark file {uninit_mark_file:?} has no parent"))?;
fs_ext::ignore_absent_files(|| fs::remove_file(uninit_mark_file)).with_context(|| {
ignore_absent_files(|| fs::remove_file(uninit_mark_file)).with_context(|| {
format!("Failed to remove uninit mark file at path {uninit_mark_file:?}")
})?;
crashsafe::fsync(uninit_mark_parent).context("Failed to fsync uninit mark parent")?;
@@ -1123,7 +1123,7 @@ mod tests {
}

#[tokio::test]
async fn lsn_wal_over_threshold_current_candidate() -> anyhow::Result<()> {
async fn lsn_wal_over_threshhold_current_candidate() -> anyhow::Result<()> {
let harness = TenantHarness::create("lsn_wal_over_threshcurrent_candidate")?;
let mut state = dummy_state(&harness).await;
let current_lsn = Lsn(100_000).align();
@@ -1189,8 +1189,8 @@ mod tests {
}

#[tokio::test]
async fn timeout_connection_threshold_current_candidate() -> anyhow::Result<()> {
let harness = TenantHarness::create("timeout_connection_threshold_current_candidate")?;
async fn timeout_connection_threshhold_current_candidate() -> anyhow::Result<()> {
let harness = TenantHarness::create("timeout_connection_threshhold_current_candidate")?;
let mut state = dummy_state(&harness).await;
let current_lsn = Lsn(100_000).align();
let now = Utc::now().naive_utc();
@@ -1252,8 +1252,8 @@ mod tests {
}

#[tokio::test]
async fn timeout_wal_over_threshold_current_candidate() -> anyhow::Result<()> {
let harness = TenantHarness::create("timeout_wal_over_threshold_current_candidate")?;
async fn timeout_wal_over_threshhold_current_candidate() -> anyhow::Result<()> {
let harness = TenantHarness::create("timeout_wal_over_threshhold_current_candidate")?;
let mut state = dummy_state(&harness).await;
let current_lsn = Lsn(100_000).align();
let new_lsn = Lsn(100_100).align();
@@ -149,10 +149,12 @@ impl OpenFiles {
// old file.
//
if let Some(old_file) = slot_guard.file.take() {
// the normal path of dropping VirtualFile uses "close", use "close-by-replace" here to
// distinguish the two.
// We do not have information about tenant_id/timeline_id of evicted file.
// It is possible to store path together with file or use filepath crate,
// but as far as close() is not expected to be fast, it is not so critical to gather
// precise per-tenant statistic here.
STORAGE_IO_TIME
.with_label_values(&["close-by-replace"])
.with_label_values(&["close", "-", "-"])
.observe_closure_duration(|| drop(old_file));
}

@@ -206,7 +208,7 @@ impl VirtualFile {
}
let (handle, mut slot_guard) = get_open_files().find_victim_slot();
let file = STORAGE_IO_TIME
.with_label_values(&["open"])
.with_label_values(&["open", &tenant_id, &timeline_id])
.observe_closure_duration(|| open_options.open(path))?;

// Strip all options other than read and write.
@@ -269,7 +271,7 @@ impl VirtualFile {
// Found a cached file descriptor.
slot.recently_used.store(true, Ordering::Relaxed);
return Ok(STORAGE_IO_TIME
.with_label_values(&[op])
.with_label_values(&[op, &self.tenant_id, &self.timeline_id])
.observe_closure_duration(|| func(file)));
}
}
@@ -296,12 +298,12 @@ impl VirtualFile {

// Open the physical file
let file = STORAGE_IO_TIME
.with_label_values(&["open"])
.with_label_values(&["open", &self.tenant_id, &self.timeline_id])
.observe_closure_duration(|| self.open_options.open(&self.path))?;

// Perform the requested operation on it
let result = STORAGE_IO_TIME
.with_label_values(&[op])
.with_label_values(&[op, &self.tenant_id, &self.timeline_id])
.observe_closure_duration(|| func(&file));

// Store the File in the slot and update the handle in the VirtualFile
@@ -331,11 +333,13 @@ impl Drop for VirtualFile {
let mut slot_guard = slot.inner.write().unwrap();
if slot_guard.tag == handle.tag {
slot.recently_used.store(false, Ordering::Relaxed);
// there is also operation "close-by-replace" for closes done on eviction for
// comparison.
// Unlike files evicted by replacement algorithm, here
// we group close time by tenant_id/timeline_id.
// At allows to compare number/time of "normal" file closes
// with file eviction.
STORAGE_IO_TIME
.with_label_values(&["close"])
.observe_closure_duration(|| drop(slot_guard.file.take()));
.with_label_values(&["close", &self.tenant_id, &self.timeline_id])
.observe_closure_duration(|| slot_guard.file.take());
}
}
}
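The hunks above only change which labels are attached to the IO timer. A standalone sketch of the underlying pattern using the `prometheus` crate directly (Neon's `metrics` crate wraps a similar API; the metric name and exact macros here are assumptions for illustration):

```rust
use once_cell::sync::Lazy;
use prometheus::{register_histogram_vec, HistogramVec};

// One histogram family; adding tenant_id/timeline_id labels turns a single
// timer into a per-tenant, per-timeline breakdown.
static STORAGE_IO_TIME: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "demo_io_operations_seconds", // illustrative name, not the pageserver's
        "Time spent in IO operations",
        &["operation", "tenant_id", "timeline_id"]
    )
    .unwrap()
});

fn timed_open(
    path: &std::path::Path,
    tenant_id: &str,
    timeline_id: &str,
) -> std::io::Result<std::fs::File> {
    STORAGE_IO_TIME
        .with_label_values(&["open", tenant_id, timeline_id])
        .observe_closure_duration(|| std::fs::File::open(path))
}
```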
@@ -292,7 +292,7 @@ walprop_async_read(WalProposerConn *conn, char **buf, int *amount)
/*
* The docs for PQgetCopyData list the return values as: 0 if the copy is
* still in progress, but no "complete row" is available -1 if the copy is
* done -2 if an error occurred (> 0) if it was successful; that value is
* done -2 if an error occured (> 0) if it was successful; that value is
* the amount transferred.
*
* The protocol we use between walproposer and safekeeper means that we
@@ -353,7 +353,7 @@ walprop_async_write(WalProposerConn *conn, void const *buf, size_t size)
/*
* The docs for PQputcopyData list the return values as: 1 if the data was
* queued, 0 if it was not queued because of full buffers, or -1 if an
* error occurred
* error occured
*/
result = PQputCopyData(conn->pg_conn, buf, size);

@@ -788,7 +788,7 @@ ReconnectSafekeepers(void)

/*
* Performs the logic for advancing the state machine of the specified safekeeper,
* given that a certain set of events has occurred.
* given that a certain set of events has occured.
*/
static void
AdvancePollState(Safekeeper *sk, uint32 events)

@@ -23,7 +23,7 @@
* message header */

/*
* In the spirit of WL_SOCKET_READABLE and others, this corresponds to no events having occurred,
* In the spirit of WL_SOCKET_READABLE and others, this corresponds to no events having occured,
* because all WL_* events are given flags equal to some (1 << i), starting from i = 0
*/
#define WL_NO_EVENTS 0
@@ -317,7 +317,7 @@ typedef struct AppendResponse
/* this is a criterion for walproposer --sync mode exit */
XLogRecPtr commitLsn;
HotStandbyFeedback hs;
/* Feedback received from pageserver includes standby_status_update fields */
/* Feedback recieved from pageserver includes standby_status_update fields */
/* and custom neon feedback. */
/* This part of the message is extensible. */
PageserverFeedback rf;
poetry.lock (generated, 6 lines changed)
@@ -740,13 +740,13 @@ typing-extensions = ">=4.1.0"

[[package]]
name = "certifi"
version = "2023.7.22"
version = "2022.12.7"
description = "Python package for providing Mozilla's CA Bundle."
optional = false
python-versions = ">=3.6"
files = [
{file = "certifi-2023.7.22-py3-none-any.whl", hash = "sha256:92d6037539857d8206b8f6ae472e8b77db8058fec5937a1ef3f54304089edbb9"},
{file = "certifi-2023.7.22.tar.gz", hash = "sha256:539cc1d13202e33ca466e88b2807e29f4c13049d6d87031a3c110744495cb082"},
{file = "certifi-2022.12.7-py3-none-any.whl", hash = "sha256:4ad3232f5e926d6718ec31cfc1fcadfde020920e278684144551c91769c7bc18"},
{file = "certifi-2022.12.7.tar.gz", hash = "sha256:35824b4c3a97115964b408844d64aa14db1cc518f6562e8d7261699d1350a9e3"},
]

[[package]]
@@ -53,12 +53,6 @@ pub enum BackendType<'a, T> {
Postgres(Cow<'a, console::provider::mock::Api>, T),
/// Authentication via a web browser.
Link(Cow<'a, url::ApiUrl>),
/// Test backend.
Test(&'a dyn TestBackend),
}

pub trait TestBackend: Send + Sync + 'static {
fn wake_compute(&self) -> Result<CachedNodeInfo, console::errors::WakeComputeError>;
}

impl std::fmt::Display for BackendType<'_, ()> {
@@ -68,7 +62,6 @@ impl std::fmt::Display for BackendType<'_, ()> {
Console(endpoint, _) => fmt.debug_tuple("Console").field(&endpoint.url()).finish(),
Postgres(endpoint, _) => fmt.debug_tuple("Postgres").field(&endpoint.url()).finish(),
Link(url) => fmt.debug_tuple("Link").field(&url.as_str()).finish(),
Test(_) => fmt.debug_tuple("Test").finish(),
}
}
}
@@ -82,7 +75,6 @@ impl<T> BackendType<'_, T> {
Console(c, x) => Console(Cow::Borrowed(c), x),
Postgres(c, x) => Postgres(Cow::Borrowed(c), x),
Link(c) => Link(Cow::Borrowed(c)),
Test(x) => Test(*x),
}
}
}
@@ -97,7 +89,6 @@ impl<'a, T> BackendType<'a, T> {
Console(c, x) => Console(c, f(x)),
Postgres(c, x) => Postgres(c, f(x)),
Link(c) => Link(c),
Test(x) => Test(x),
}
}
}
@@ -111,7 +102,6 @@ impl<'a, T, E> BackendType<'a, Result<T, E>> {
Console(c, x) => x.map(|x| Console(c, x)),
Postgres(c, x) => x.map(|x| Postgres(c, x)),
Link(c) => Ok(Link(c)),
Test(x) => Ok(Test(x)),
}
}
}
@@ -157,7 +147,6 @@ impl BackendType<'_, ClientCredentials<'_>> {
Console(_, creds) => creds.project.clone(),
Postgres(_, creds) => creds.project.clone(),
Link(_) => Some("link".to_owned()),
Test(_) => Some("test".to_owned()),
}
}
/// Authenticate the client via the requested backend, possibly using credentials.
@@ -199,9 +188,6 @@ impl BackendType<'_, ClientCredentials<'_>> {
.await?
.map(CachedNodeInfo::new_uncached)
}
Test(_) => {
unreachable!("this function should never be called in the test backend")
}
};

info!("user successfully authenticated");
@@ -220,7 +206,6 @@ impl BackendType<'_, ClientCredentials<'_>> {
Console(api, creds) => api.wake_compute(extra, creds).map_ok(Some).await,
Postgres(api, creds) => api.wake_compute(extra, creds).map_ok(Some).await,
Link(_) => Ok(None),
Test(x) => x.wake_compute().map(Some),
}
}
}
@@ -1,11 +1,8 @@
use std::ops::ControlFlow;

use super::AuthSuccess;
use crate::{
auth::{self, AuthFlow, ClientCredentials},
compute,
console::{self, AuthInfo, CachedNodeInfo, ConsoleReqExtra},
proxy::handle_try_wake,
sasl, scram,
stream::PqStream,
};
@@ -51,16 +48,7 @@ pub(super) async fn authenticate(
}
};

info!("compute node's state has likely changed; requesting a wake-up");
let mut num_retries = 0;
let mut node = loop {
let wake_res = api.wake_compute(extra, creds).await;
match handle_try_wake(wake_res, num_retries)? {
ControlFlow::Continue(_) => num_retries += 1,
ControlFlow::Break(n) => break n,
}
info!(num_retries, "retrying wake compute");
};
let mut node = api.wake_compute(extra, creds).await?;
if let Some(keys) = scram_keys {
use tokio_postgres::config::AuthKeys;
node.config.auth_keys(AuthKeys::ScramSha256(keys));

@@ -48,14 +48,6 @@ impl ClientCredentials<'_> {
}

impl<'a> ClientCredentials<'a> {
#[cfg(test)]
pub fn new_noop() -> Self {
ClientCredentials {
user: "",
project: None,
}
}

pub fn parse(
params: &'a StartupMessageParams,
sni: Option<&str>,
@@ -14,7 +14,6 @@ pub mod errors {
use crate::{
error::{io_error, UserFacingError},
http,
proxy::ShouldRetry,
};
use thiserror::Error;

@@ -73,24 +72,6 @@ pub mod errors {
}
}

impl ShouldRetry for ApiError {
fn could_retry(&self) -> bool {
match self {
// retry some transport errors
Self::Transport(io) => io.could_retry(),
// retry some temporary failures because the compute was in a bad state
// (bad request can be returned when the endpoint was in transition)
Self::Console {
status: http::StatusCode::BAD_REQUEST | http::StatusCode::LOCKED,
..
} => true,
// retry server errors
Self::Console { status, .. } if status.is_server_error() => true,
_ => false,
}
}
}

impl From<reqwest::Error> for ApiError {
fn from(e: reqwest::Error) -> Self {
io_error(e).into()
@@ -1,9 +1,7 @@
use std::sync::Arc;

use anyhow::bail;
use futures::pin_mut;
use futures::StreamExt;
use hashbrown::HashMap;
use hyper::body::HttpBody;
use hyper::http::HeaderName;
use hyper::http::HeaderValue;
@@ -13,8 +11,6 @@ use serde_json::Map;
use serde_json::Value;
use tokio_postgres::types::Kind;
use tokio_postgres::types::Type;
use tokio_postgres::GenericClient;
use tokio_postgres::IsolationLevel;
use tokio_postgres::Row;
use url::Url;

@@ -27,21 +23,12 @@ struct QueryData {
params: Vec<serde_json::Value>,
}

#[derive(serde::Deserialize)]
#[serde(untagged)]
enum Payload {
Single(QueryData),
Batch(Vec<QueryData>),
}

pub const MAX_RESPONSE_SIZE: usize = 1024 * 1024; // 1 MB
const MAX_REQUEST_SIZE: u64 = 1024 * 1024; // 1 MB

static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output");
static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode");
static ALLOW_POOL: HeaderName = HeaderName::from_static("neon-pool-opt-in");
static TXN_ISOLATION_LEVEL: HeaderName = HeaderName::from_static("neon-batch-isolation-level");
static TXN_READ_ONLY: HeaderName = HeaderName::from_static("neon-batch-read-only");

static HEADER_VALUE_TRUE: HeaderValue = HeaderValue::from_static("true");
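For orientation, the untagged `Payload` enum above accepts two JSON body shapes: a single `QueryData` object or a bare array of them. A small illustration built with `serde_json` (query texts and parameter values are made up; isolation level and read-only mode come from the `neon-batch-*` headers declared above):

```rust
use serde_json::json;

fn main() {
    // Single statement: the request body is one QueryData object.
    let single = json!({
        "query": "select $1::int as x",
        "params": ["1"],
    });

    // Batch: an array of QueryData objects, executed inside one transaction.
    let batch = json!([
        { "query": "insert into t values ($1)", "params": ["42"] },
        { "query": "select count(*) from t", "params": [] },
    ]);

    println!("{single}\n{batch}");
}
```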
@@ -175,7 +162,7 @@ pub async fn handle(
request: Request<Body>,
sni_hostname: Option<String>,
conn_pool: Arc<GlobalConnPool>,
) -> anyhow::Result<(Value, HashMap<HeaderName, HeaderValue>)> {
) -> anyhow::Result<Value> {
//
// Determine the destination and connection params
//
@@ -190,23 +177,6 @@ pub async fn handle(
// Allow connection pooling only if explicitly requested
let allow_pool = headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE);

// isolation level and read only

let txn_isolation_level_raw = headers.get(&TXN_ISOLATION_LEVEL).cloned();
let txn_isolation_level = match txn_isolation_level_raw {
Some(ref x) => Some(match x.as_bytes() {
b"Serializable" => IsolationLevel::Serializable,
b"ReadUncommitted" => IsolationLevel::ReadUncommitted,
b"ReadCommitted" => IsolationLevel::ReadCommitted,
b"RepeatableRead" => IsolationLevel::RepeatableRead,
_ => bail!("invalid isolation level"),
}),
None => None,
};

let txn_read_only_raw = headers.get(&TXN_READ_ONLY).cloned();
let txn_read_only = txn_read_only_raw.as_ref() == Some(&HEADER_VALUE_TRUE);

let request_content_length = match request.body().size_hint().upper() {
Some(v) => v,
None => MAX_REQUEST_SIZE + 1,
@@ -222,70 +192,15 @@ pub async fn handle(
// Read the query and query params from the request body
//
let body = hyper::body::to_bytes(request.into_body()).await?;
let payload: Payload = serde_json::from_slice(&body)?;

let mut client = conn_pool.get(&conn_info, !allow_pool).await?;
let QueryData { query, params } = serde_json::from_slice(&body)?;
let query_params = json_to_pg_text(params)?;

//
// Now execute the query and return the result
//
let result = match payload {
Payload::Single(query) => query_to_json(&client, query, raw_output, array_mode)
.await
.map(|x| (x, HashMap::default())),
Payload::Batch(queries) => {
let mut results = Vec::new();
let mut builder = client.build_transaction();
if let Some(isolation_level) = txn_isolation_level {
builder = builder.isolation_level(isolation_level);
}
if txn_read_only {
builder = builder.read_only(true);
}
let transaction = builder.start().await?;
for query in queries {
let result = query_to_json(&transaction, query, raw_output, array_mode).await;
match result {
Ok(r) => results.push(r),
Err(e) => {
transaction.rollback().await?;
return Err(e);
}
}
}
transaction.commit().await?;
let mut headers = HashMap::default();
headers.insert(
TXN_READ_ONLY.clone(),
HeaderValue::try_from(txn_read_only.to_string())?,
);
if let Some(txn_isolation_level_raw) = txn_isolation_level_raw {
headers.insert(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level_raw);
}
Ok((json!({ "results": results }), headers))
}
};
let client = conn_pool.get(&conn_info, !allow_pool).await?;

if allow_pool {
// return connection to the pool
tokio::task::spawn(async move {
let _ = conn_pool.put(&conn_info, client).await;
});
}

result
}

async fn query_to_json<T: GenericClient>(
client: &T,
data: QueryData,
raw_output: bool,
array_mode: bool,
) -> anyhow::Result<Value> {
let query_params = json_to_pg_text(data.params)?;
let row_stream = client
.query_raw_txt::<String, _>(data.query, query_params)
.await?;
let row_stream = client.query_raw_txt(query, query_params).await?;

// Manually drain the stream into a vector to leave row_stream hanging
// around to get a command tag. Also check that the response is not too
@@ -341,6 +256,13 @@ async fn query_to_json<T: GenericClient>(
.map(|row| pg_text_row_to_json(row, raw_output, array_mode))
.collect::<Result<Vec<_>, _>>()?;

if allow_pool {
// return connection to the pool
tokio::task::spawn(async move {
let _ = conn_pool.put(&conn_info, client).await;
});
}

// resulting JSON format is based on the format of node-postgres result
Ok(json!({
"command": command_tag_name,
@@ -6,7 +6,6 @@ use crate::{
};
use bytes::{Buf, Bytes};
use futures::{Sink, Stream, StreamExt};
use hashbrown::HashMap;
use hyper::{
server::{
accept,
@@ -182,15 +181,13 @@ async fn ws_handler(

// Check if the request is a websocket upgrade request.
if hyper_tungstenite::is_upgrade_request(&request) {
info!(session_id = ?session_id, "performing websocket upgrade");

let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None)
.map_err(|e| ApiError::BadRequest(e.into()))?;

tokio::spawn(async move {
if let Err(e) = serve_websocket(websocket, config, &cancel_map, session_id, host).await
{
error!(session_id = ?session_id, "error in websocket connection: {e:?}");
error!("error in websocket connection: {e:?}");
}
});

@@ -206,7 +203,7 @@ async fn ws_handler(
Ok(_) => StatusCode::OK,
Err(_) => StatusCode::BAD_REQUEST,
};
let (json, headers) = match result {
let json = match result {
Ok(r) => r,
Err(e) => {
let message = format!("{:?}", e);
@@ -217,10 +214,7 @@ async fn ws_handler(
},
None => Value::Null,
};
(
json!({ "message": message, "code": code }),
HashMap::default(),
)
json!({ "message": message, "code": code })
}
};
json_response(status_code, json).map(|mut r| {
@@ -228,9 +222,6 @@ async fn ws_handler(
"Access-Control-Allow-Origin",
hyper::http::HeaderValue::from_static("*"),
);
for (k, v) in headers {
r.headers_mut().insert(k, v);
}
r
})
} else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS {
@@ -11,6 +11,7 @@ const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client";

const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);

///
/// Key that uniquely identifies the object, this metric describes.
/// Currently, endpoint_id is enough, but this may change later,
/// so keep it in a named struct.
@@ -18,7 +19,8 @@ const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
/// Both the proxy and the ingestion endpoint will live in the same region (or cell)
/// so while the project-id is unique across regions the whole pipeline will work correctly
/// because we enrich the event with project_id in the control-plane endpoint.
#[derive(Eq, Hash, PartialEq, Serialize, Debug, Clone)]
///
#[derive(Eq, Hash, PartialEq, Serialize, Debug)]
pub struct Ids {
pub endpoint_id: String,
pub branch_id: String,
@@ -147,7 +149,7 @@ async fn collect_metrics_iteration(
stop_time: *curr_time,
},
metric: PROXY_IO_BYTES_PER_CLIENT,
idempotency_key: idempotency_key(hostname),
idempotency_key: idempotency_key(hostname.to_owned()),
value,
extra: Ids {
endpoint_id: curr_key.endpoint_id.clone(),
@@ -165,11 +167,12 @@ async fn collect_metrics_iteration(
// Send metrics.
// Split into chunks of 1000 metrics to avoid exceeding the max request size
for chunk in metrics_to_send.chunks(CHUNK_SIZE) {
let chunk_json = serde_json::value::to_raw_value(&EventChunk { events: chunk })
.expect("ProxyConsumptionMetric should not fail serialization");

let res = client
.post(metric_collection_endpoint.clone())
.json(&EventChunk {
events: chunk.into(),
})
.json(&chunk_json)
.send()
.await;
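The chunking in the hunk above is the whole trick: split the collected events into fixed-size slices so no single POST exceeds the collector's request-size limit. A minimal sketch with a stand-in event type (the `Event` struct, field names, and the 1000-event chunk size are illustrative only):

```rust
#[derive(serde::Serialize)]
struct Event {
    metric: &'static str,
    value: u64,
}

const CHUNK_SIZE: usize = 1000; // matches the comment above; treat as illustrative

fn main() {
    let events: Vec<Event> = (0..2500)
        .map(|i| Event { metric: "proxy_io_bytes_per_client", value: i })
        .collect();
    // 2500 events become three requests: 1000 + 1000 + 500.
    for chunk in events.chunks(CHUNK_SIZE) {
        let body = serde_json::json!({ "events": chunk });
        // In the real code this body is POSTed to the metric collection endpoint.
        assert!(serde_json::to_string(&body).is_ok());
    }
}
```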
@@ -6,15 +6,18 @@ use crate::{
cancellation::{self, CancelMap},
compute::{self, PostgresConnection},
config::{ProxyConfig, TlsConfig},
console::{self, errors::WakeComputeError, messages::MetricsAuxInfo, Api},
console::{
self,
errors::{ApiError, WakeComputeError},
messages::MetricsAuxInfo,
},
stream::{PqStream, Stream},
};
use anyhow::{bail, Context};
use async_trait::async_trait;
use futures::TryFutureExt;
use metrics::{
exponential_buckets, register_histogram, register_int_counter_vec, Histogram, IntCounterVec,
};
use hyper::StatusCode;
use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
use once_cell::sync::Lazy;
use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
use std::{error::Error, io, ops::ControlFlow, sync::Arc};
@@ -28,37 +31,25 @@ use utils::measured_stream::MeasuredStream;

/// Number of times we should retry the `/proxy_wake_compute` http request.
/// Retry duration is BASE_RETRY_WAIT_DURATION * 1.5^n
pub const NUM_RETRIES_CONNECT: u32 = 10;
const NUM_RETRIES_CONNECT: u32 = 10;
const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2);
const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(100);

const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
const ERR_PROTO_VIOLATION: &str = "protocol violation";

static NUM_CONNECTIONS_ACCEPTED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
static NUM_CONNECTIONS_ACCEPTED_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"proxy_accepted_connections_total",
"Number of TCP client connections accepted.",
&["protocol"],
"Number of TCP client connections accepted."
)
.unwrap()
});

static NUM_CONNECTIONS_CLOSED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
static NUM_CONNECTIONS_CLOSED_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"proxy_closed_connections_total",
"Number of TCP client connections closed.",
&["protocol"],
)
.unwrap()
});

static COMPUTE_CONNECTION_LATENCY: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"proxy_compute_connection_latency_seconds",
"Time it took for proxy to establish a connection to the compute endpoint",
// largest bucket = 2^16 * 0.5ms = 32s
exponential_buckets(0.0005, 2.0, 16).unwrap(),
"Number of TCP client connections closed."
)
.unwrap()
});
@@ -146,13 +137,6 @@ pub enum ClientMode {

/// Abstracts the logic of handling TCP vs WS clients
impl ClientMode {
fn protocol_label(&self) -> &'static str {
match self {
ClientMode::Tcp => "tcp",
ClientMode::Websockets { .. } => "ws",
}
}

fn allow_cleartext(&self) -> bool {
match self {
ClientMode::Tcp => false,
@@ -191,17 +175,10 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
stream: S,
mode: ClientMode,
) -> anyhow::Result<()> {
info!(
protocol = mode.protocol_label(),
"handling interactive connection from client"
);

// The `closed` counter will increase when this future is destroyed.
NUM_CONNECTIONS_ACCEPTED_COUNTER
.with_label_values(&[mode.protocol_label()])
.inc();
NUM_CONNECTIONS_ACCEPTED_COUNTER.inc();
scopeguard::defer! {
NUM_CONNECTIONS_CLOSED_COUNTER.with_label_values(&[mode.protocol_label()]).inc();
NUM_CONNECTIONS_CLOSED_COUNTER.inc();
}

let tls = config.tls_config.as_ref();
@@ -347,6 +324,11 @@ async fn connect_to_compute_once(
.await
}

enum ConnectionState<E> {
Cached(console::CachedNodeInfo),
Invalid(compute::ConnCfg, E),
}

#[async_trait]
pub trait ConnectMechanism {
type Connection;
@@ -398,91 +380,88 @@ where
M::ConnectError: ShouldRetry + std::fmt::Debug,
M::Error: From<WakeComputeError>,
{
let _timer = COMPUTE_CONNECTION_LATENCY.start_timer();

mechanism.update_connect_config(&mut node_info.config);

// try once
let (config, err) = match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
Ok(res) => return Ok(res),
Err(e) => {
error!(error = ?e, "could not connect to compute node");
(invalidate_cache(node_info), e)
}
};
let mut num_retries = 0;
let mut state = ConnectionState::<M::ConnectError>::Cached(node_info);

let mut num_retries = 1;

// if we failed to connect, it's likely that the compute node was suspended, wake a new compute node
info!("compute node's state has likely changed; requesting a wake-up");
let node_info = loop {
let wake_res = match creds {
auth::BackendType::Console(api, creds) => api.wake_compute(extra, creds).await,
auth::BackendType::Postgres(api, creds) => api.wake_compute(extra, creds).await,
// nothing to do?
auth::BackendType::Link(_) => return Err(err.into()),
// test backend
auth::BackendType::Test(x) => x.wake_compute(),
};

match handle_try_wake(wake_res, num_retries)? {
// failed to wake up but we can continue to retry
ControlFlow::Continue(_) => {}
// successfully woke up a compute node and can break the wakeup loop
ControlFlow::Break(mut node_info) => {
node_info.config.reuse_password(&config);
mechanism.update_connect_config(&mut node_info.config);
break node_info;
}
}

let wait_duration = retry_after(num_retries);
num_retries += 1;

time::sleep(wait_duration).await;
info!(num_retries, "retrying wake compute");
};

// now that we have a new node, try connect to it repeatedly.
// this can error for a few reasons, for instance:
// * DNS connection settings haven't quite propagated yet
info!("wake_compute success. attempting to connect");
loop {
match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
Ok(res) => return Ok(res),
Err(e) => {
error!(error = ?e, "could not connect to compute node");
if !e.should_retry(num_retries) {
return Err(e.into());
match state {
ConnectionState::Invalid(config, err) => {
match try_wake(&config, extra, creds).await {
// we can't wake up the compute node
Ok(None) => return Err(err.into()),
// there was an error communicating with the control plane
Err(e) => return Err(e.into()),
// failed to wake up but we can continue to retry
Ok(Some(ControlFlow::Continue(()))) => {
state = ConnectionState::Invalid(config, err);
let wait_duration = retry_after(num_retries);
num_retries += 1;

info!(num_retries, "retrying wake compute");
time::sleep(wait_duration).await;
continue;
}
// successfully woke up a compute node and can break the wakeup loop
Ok(Some(ControlFlow::Break(mut node_info))) => {
mechanism.update_connect_config(&mut node_info.config);
state = ConnectionState::Cached(node_info)
}
}
}
ConnectionState::Cached(node_info) => {
match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
Ok(res) => return Ok(res),
Err(e) => {
error!(error = ?e, "could not connect to compute node");
if !e.should_retry(num_retries) {
return Err(e.into());
}

// after the first connect failure,
// we should invalidate the cache and wake up a new compute node
if num_retries == 0 {
state = ConnectionState::Invalid(invalidate_cache(node_info), e);
} else {
state = ConnectionState::Cached(node_info);
}

let wait_duration = retry_after(num_retries);
num_retries += 1;

info!(num_retries, "retrying wake compute");
time::sleep(wait_duration).await;
}
}
}
}

let wait_duration = retry_after(num_retries);
num_retries += 1;

time::sleep(wait_duration).await;
info!(num_retries, "retrying connect_once");
}
}

/// Attempts to wake up the compute node.
/// * Returns Ok(Continue(e)) if there was an error waking but retries are acceptable
/// * Returns Ok(Break(node)) if the wakeup succeeded
/// * Returns Err(e) if there was an error
pub fn handle_try_wake(
result: Result<console::CachedNodeInfo, WakeComputeError>,
num_retries: u32,
) -> Result<ControlFlow<console::CachedNodeInfo, WakeComputeError>, WakeComputeError> {
match result {
Err(err) => match &err {
WakeComputeError::ApiError(api) if api.should_retry(num_retries) => {
Ok(ControlFlow::Continue(err))
}
_ => Err(err),
},
// Ready to try again.
Ok(new) => Ok(ControlFlow::Break(new)),
/// * Returns Ok(Some(true)) if there was an error waking but retries are acceptable
/// * Returns Ok(Some(false)) if the wakeup succeeded
/// * Returns Ok(None) or Err(e) if there was an error
async fn try_wake(
config: &compute::ConnCfg,
extra: &console::ConsoleReqExtra<'_>,
creds: &auth::BackendType<'_, auth::ClientCredentials<'_>>,
) -> Result<Option<ControlFlow<console::CachedNodeInfo>>, WakeComputeError> {
info!("compute node's state has likely changed; requesting a wake-up");
match creds.wake_compute(extra).await {
// retry wake if the compute was in an invalid state
Err(WakeComputeError::ApiError(ApiError::Console {
status: StatusCode::BAD_REQUEST,
..
})) => Ok(Some(ControlFlow::Continue(()))),
// Update `node_info` and try again.
Ok(Some(mut new)) => {
new.config.reuse_password(config);
Ok(Some(ControlFlow::Break(new)))
}
Err(e) => Err(e),
Ok(None) => Ok(None),
}
}
@@ -490,6 +469,8 @@ pub trait ShouldRetry {
|
||||
fn could_retry(&self) -> bool;
|
||||
fn should_retry(&self, num_retries: u32) -> bool {
|
||||
match self {
|
||||
// retry all errors at least once
|
||||
_ if num_retries == 0 => true,
|
||||
_ if num_retries >= NUM_RETRIES_CONNECT => false,
|
||||
err => err.could_retry(),
|
||||
}
|
||||
@@ -541,9 +522,14 @@ impl ShouldRetry for compute::ConnectionError {
|
||||
}
|
||||
}
|
||||
|
||||
fn retry_after(num_retries: u32) -> time::Duration {
    // 1.5 seems to be an ok growth factor heuristic
    BASE_RETRY_WAIT_DURATION.mul_f64(1.5_f64.powi(num_retries as i32))
pub fn retry_after(num_retries: u32) -> time::Duration {
    match num_retries {
        0 => time::Duration::ZERO,
        _ => {
            // 3/2 = 1.5 which seems to be an ok growth factor heuristic
            BASE_RETRY_WAIT_DURATION * 3_u32.pow(num_retries) / 2_u32.pow(num_retries)
        }
    }
}
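A minimal Python sketch of the backoff schedule introduced above, assuming a base wait of 100 ms (the value of BASE_RETRY_WAIT_DURATION is not shown in this diff). It illustrates why the connect_compute_total_wait test further down expects the cumulative wait over ten attempts to land between 10 and 12 seconds.

# Sketch only: BASE_RETRY_WAIT_SECONDS = 0.1 is an assumption, not a value from this diff.
BASE_RETRY_WAIT_SECONDS = 0.1

def retry_after(num_retries: int) -> float:
    # First retry is immediate; afterwards the wait grows by a factor of 3/2 per attempt.
    if num_retries == 0:
        return 0.0
    return BASE_RETRY_WAIT_SECONDS * 3**num_retries / 2**num_retries

total = sum(retry_after(n) for n in range(10))
print(f"total wait over 10 attempts: {total:.2f}s")  # roughly 11.2s with a 100 ms base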
/// Finish client connection initialization: confirm auth success, send params, etc.
|
||||
|
||||
@@ -1,10 +1,6 @@
|
||||
//! A group of high-level tests for connection establishing logic and auth.
|
||||
//!
|
||||
use super::*;
|
||||
use crate::auth::backend::TestBackend;
|
||||
use crate::auth::ClientCredentials;
|
||||
use crate::console::{CachedNodeInfo, NodeInfo};
|
||||
use crate::{auth, http, sasl, scram};
|
||||
use crate::{auth, sasl, scram};
|
||||
use async_trait::async_trait;
|
||||
use rstest::rstest;
|
||||
use tokio_postgres::config::SslMode;
|
||||
@@ -302,230 +298,9 @@ async fn scram_auth_mock() -> anyhow::Result<()> {
|
||||
#[test]
|
||||
fn connect_compute_total_wait() {
|
||||
let mut total_wait = tokio::time::Duration::ZERO;
|
||||
for num_retries in 1..10 {
|
||||
for num_retries in 0..10 {
|
||||
total_wait += retry_after(num_retries);
|
||||
}
|
||||
assert!(total_wait < tokio::time::Duration::from_secs(12));
|
||||
assert!(total_wait > tokio::time::Duration::from_secs(10));
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
enum ConnectAction {
|
||||
Wake,
|
||||
WakeFail,
|
||||
WakeRetry,
|
||||
Connect,
|
||||
Retry,
|
||||
Fail,
|
||||
}
|
||||
|
||||
struct TestConnectMechanism {
|
||||
counter: Arc<std::sync::Mutex<usize>>,
|
||||
sequence: Vec<ConnectAction>,
|
||||
}
|
||||
|
||||
impl TestConnectMechanism {
|
||||
fn verify(&self) {
|
||||
let counter = self.counter.lock().unwrap();
|
||||
assert_eq!(
|
||||
*counter,
|
||||
self.sequence.len(),
|
||||
"sequence does not proceed to the end"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
impl TestConnectMechanism {
|
||||
fn new(sequence: Vec<ConnectAction>) -> Self {
|
||||
Self {
|
||||
counter: Arc::new(std::sync::Mutex::new(0)),
|
||||
sequence,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct TestConnection;
|
||||
|
||||
#[derive(Debug)]
|
||||
struct TestConnectError {
|
||||
retryable: bool,
|
||||
}
|
||||
|
||||
impl std::fmt::Display for TestConnectError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{:?}", self)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::error::Error for TestConnectError {}
|
||||
|
||||
impl ShouldRetry for TestConnectError {
|
||||
fn could_retry(&self) -> bool {
|
||||
self.retryable
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl ConnectMechanism for TestConnectMechanism {
|
||||
type Connection = TestConnection;
|
||||
type ConnectError = TestConnectError;
|
||||
type Error = anyhow::Error;
|
||||
|
||||
async fn connect_once(
|
||||
&self,
|
||||
_node_info: &console::CachedNodeInfo,
|
||||
_timeout: time::Duration,
|
||||
) -> Result<Self::Connection, Self::ConnectError> {
|
||||
let mut counter = self.counter.lock().unwrap();
|
||||
let action = self.sequence[*counter];
|
||||
*counter += 1;
|
||||
match action {
|
||||
ConnectAction::Connect => Ok(TestConnection),
|
||||
ConnectAction::Retry => Err(TestConnectError { retryable: true }),
|
||||
ConnectAction::Fail => Err(TestConnectError { retryable: false }),
|
||||
x => panic!("expecting action {:?}, connect is called instead", x),
|
||||
}
|
||||
}
|
||||
|
||||
fn update_connect_config(&self, _conf: &mut compute::ConnCfg) {}
|
||||
}
|
||||
|
||||
impl TestBackend for TestConnectMechanism {
|
||||
fn wake_compute(&self) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
|
||||
let mut counter = self.counter.lock().unwrap();
|
||||
let action = self.sequence[*counter];
|
||||
*counter += 1;
|
||||
match action {
|
||||
ConnectAction::Wake => Ok(helper_create_cached_node_info()),
|
||||
ConnectAction::WakeFail => {
|
||||
let err = console::errors::ApiError::Console {
|
||||
status: http::StatusCode::FORBIDDEN,
|
||||
text: "TEST".into(),
|
||||
};
|
||||
assert!(!err.could_retry());
|
||||
Err(console::errors::WakeComputeError::ApiError(err))
|
||||
}
|
||||
ConnectAction::WakeRetry => {
|
||||
let err = console::errors::ApiError::Console {
|
||||
status: http::StatusCode::INTERNAL_SERVER_ERROR,
|
||||
text: "TEST".into(),
|
||||
};
|
||||
assert!(err.could_retry());
|
||||
Err(console::errors::WakeComputeError::ApiError(err))
|
||||
}
|
||||
x => panic!("expecting action {:?}, wake_compute is called instead", x),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn helper_create_cached_node_info() -> CachedNodeInfo {
|
||||
let node = NodeInfo {
|
||||
config: compute::ConnCfg::new(),
|
||||
aux: Default::default(),
|
||||
allow_self_signed_compute: false,
|
||||
};
|
||||
CachedNodeInfo::new_uncached(node)
|
||||
}
|
||||
|
||||
fn helper_create_connect_info(
|
||||
mechanism: &TestConnectMechanism,
|
||||
) -> (
|
||||
CachedNodeInfo,
|
||||
console::ConsoleReqExtra<'static>,
|
||||
auth::BackendType<'_, ClientCredentials<'static>>,
|
||||
) {
|
||||
let cache = helper_create_cached_node_info();
|
||||
let extra = console::ConsoleReqExtra {
|
||||
session_id: uuid::Uuid::new_v4(),
|
||||
application_name: Some("TEST"),
|
||||
};
|
||||
let creds = auth::BackendType::Test(mechanism);
|
||||
(cache, extra, creds)
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn connect_to_compute_success() {
|
||||
use ConnectAction::*;
|
||||
let mechanism = TestConnectMechanism::new(vec![Connect]);
|
||||
let (cache, extra, creds) = helper_create_connect_info(&mechanism);
|
||||
connect_to_compute(&mechanism, cache, &extra, &creds)
|
||||
.await
|
||||
.unwrap();
|
||||
mechanism.verify();
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn connect_to_compute_retry() {
|
||||
use ConnectAction::*;
|
||||
let mechanism = TestConnectMechanism::new(vec![Retry, Wake, Retry, Connect]);
|
||||
let (cache, extra, creds) = helper_create_connect_info(&mechanism);
|
||||
connect_to_compute(&mechanism, cache, &extra, &creds)
|
||||
.await
|
||||
.unwrap();
|
||||
mechanism.verify();
|
||||
}
|
||||
|
||||
/// Test that we don't retry if the error is not retryable.
|
||||
#[tokio::test]
|
||||
async fn connect_to_compute_non_retry_1() {
|
||||
use ConnectAction::*;
|
||||
let mechanism = TestConnectMechanism::new(vec![Retry, Wake, Retry, Fail]);
|
||||
let (cache, extra, creds) = helper_create_connect_info(&mechanism);
|
||||
connect_to_compute(&mechanism, cache, &extra, &creds)
|
||||
.await
|
||||
.unwrap_err();
|
||||
mechanism.verify();
|
||||
}
|
||||
|
||||
/// Even for non-retryable errors, we should retry at least once.
|
||||
#[tokio::test]
|
||||
async fn connect_to_compute_non_retry_2() {
|
||||
use ConnectAction::*;
|
||||
let mechanism = TestConnectMechanism::new(vec![Fail, Wake, Retry, Connect]);
|
||||
let (cache, extra, creds) = helper_create_connect_info(&mechanism);
|
||||
connect_to_compute(&mechanism, cache, &extra, &creds)
|
||||
.await
|
||||
.unwrap();
|
||||
mechanism.verify();
|
||||
}
|
||||
|
||||
/// Retry for at most `NUM_RETRIES_CONNECT` times.
|
||||
#[tokio::test]
|
||||
async fn connect_to_compute_non_retry_3() {
|
||||
assert_eq!(NUM_RETRIES_CONNECT, 10);
|
||||
use ConnectAction::*;
|
||||
let mechanism = TestConnectMechanism::new(vec![
|
||||
Retry, Wake, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry,
|
||||
/* the 11th time */ Retry,
|
||||
]);
|
||||
let (cache, extra, creds) = helper_create_connect_info(&mechanism);
|
||||
connect_to_compute(&mechanism, cache, &extra, &creds)
|
||||
.await
|
||||
.unwrap_err();
|
||||
mechanism.verify();
|
||||
}
|
||||
|
||||
/// Should retry wake compute.
|
||||
#[tokio::test]
|
||||
async fn wake_retry() {
|
||||
use ConnectAction::*;
|
||||
let mechanism = TestConnectMechanism::new(vec![Retry, WakeRetry, Wake, Connect]);
|
||||
let (cache, extra, creds) = helper_create_connect_info(&mechanism);
|
||||
connect_to_compute(&mechanism, cache, &extra, &creds)
|
||||
.await
|
||||
.unwrap();
|
||||
mechanism.verify();
|
||||
}
|
||||
|
||||
/// Wake failed with a non-retryable error.
|
||||
#[tokio::test]
|
||||
async fn wake_non_retry() {
|
||||
use ConnectAction::*;
|
||||
let mechanism = TestConnectMechanism::new(vec![Retry, WakeFail]);
|
||||
let (cache, extra, creds) = helper_create_connect_info(&mechanism);
|
||||
connect_to_compute(&mechanism, cache, &extra, &creds)
|
||||
.await
|
||||
.unwrap_err();
|
||||
mechanism.verify();
|
||||
}
|
||||
|
||||
@@ -234,10 +234,7 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
|
||||
listen_pg_addr_tenant_only
|
||||
);
|
||||
let listener = tcp_listener::bind(listen_pg_addr_tenant_only.clone()).map_err(|e| {
|
||||
error!(
|
||||
"failed to bind to address {}: {}",
|
||||
listen_pg_addr_tenant_only, e
|
||||
);
|
||||
error!("failed to bind to address {}: {}", conf.listen_pg_addr, e);
|
||||
e
|
||||
})?;
|
||||
Some(listener)
|
||||
|
||||
@@ -11,7 +11,6 @@ use crate::auth::check_permission;
|
||||
use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage};
|
||||
|
||||
use crate::metrics::{TrafficMetrics, PG_QUERIES_FINISHED, PG_QUERIES_RECEIVED};
|
||||
use crate::timeline::TimelineError;
|
||||
use crate::wal_service::ConnectionId;
|
||||
use crate::{GlobalTimelines, SafeKeeperConf};
|
||||
use postgres_backend::QueryError;
|
||||
@@ -46,7 +45,6 @@ enum SafekeeperPostgresCommand {
|
||||
StartWalPush,
|
||||
StartReplication { start_lsn: Lsn },
|
||||
IdentifySystem,
|
||||
TimelineStatus,
|
||||
JSONCtrl { cmd: AppendLogicalMessage },
|
||||
}
|
||||
|
||||
@@ -66,8 +64,6 @@ fn parse_cmd(cmd: &str) -> anyhow::Result<SafekeeperPostgresCommand> {
|
||||
Ok(SafekeeperPostgresCommand::StartReplication { start_lsn })
|
||||
} else if cmd.starts_with("IDENTIFY_SYSTEM") {
|
||||
Ok(SafekeeperPostgresCommand::IdentifySystem)
|
||||
} else if cmd.starts_with("TIMELINE_STATUS") {
|
||||
Ok(SafekeeperPostgresCommand::TimelineStatus)
|
||||
} else if cmd.starts_with("JSON_CTRL") {
|
||||
let cmd = cmd.strip_prefix("JSON_CTRL").context("invalid prefix")?;
|
||||
Ok(SafekeeperPostgresCommand::JSONCtrl {
|
||||
@@ -82,7 +78,6 @@ fn cmd_to_string(cmd: &SafekeeperPostgresCommand) -> &str {
|
||||
match cmd {
|
||||
SafekeeperPostgresCommand::StartWalPush => "START_WAL_PUSH",
|
||||
SafekeeperPostgresCommand::StartReplication { .. } => "START_REPLICATION",
|
||||
SafekeeperPostgresCommand::TimelineStatus => "TIMELINE_STATUS",
|
||||
SafekeeperPostgresCommand::IdentifySystem => "IDENTIFY_SYSTEM",
|
||||
SafekeeperPostgresCommand::JSONCtrl { .. } => "JSON_CTRL",
|
||||
}
|
||||
@@ -224,7 +219,6 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
|
||||
.await
|
||||
}
|
||||
SafekeeperPostgresCommand::IdentifySystem => self.handle_identify_system(pgb).await,
|
||||
SafekeeperPostgresCommand::TimelineStatus => self.handle_timeline_status(pgb).await,
|
||||
SafekeeperPostgresCommand::JSONCtrl { ref cmd } => {
|
||||
handle_json_ctrl(self, pgb, cmd).await
|
||||
}
|
||||
@@ -269,38 +263,6 @@ impl SafekeeperPostgresHandler {
|
||||
check_permission(claims, tenant_id)
|
||||
}
|
||||
|
||||
async fn handle_timeline_status<IO: AsyncRead + AsyncWrite + Unpin>(
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
) -> Result<(), QueryError> {
|
||||
// Get timeline, handling "not found" error
|
||||
let tli = match GlobalTimelines::get(self.ttid) {
|
||||
Ok(tli) => Ok(Some(tli)),
|
||||
Err(TimelineError::NotFound(_)) => Ok(None),
|
||||
Err(e) => Err(QueryError::Other(e.into())),
|
||||
}?;
|
||||
|
||||
// Write row description
|
||||
pgb.write_message_noflush(&BeMessage::RowDescription(&[
|
||||
RowDescriptor::text_col(b"flush_lsn"),
|
||||
RowDescriptor::text_col(b"commit_lsn"),
|
||||
]))?;
|
||||
|
||||
// Write row if timeline exists
|
||||
if let Some(tli) = tli {
|
||||
let (inmem, _state) = tli.get_state().await;
|
||||
let flush_lsn = tli.get_flush_lsn().await;
|
||||
let commit_lsn = inmem.commit_lsn;
|
||||
pgb.write_message_noflush(&BeMessage::DataRow(&[
|
||||
Some(flush_lsn.to_string().as_bytes()),
|
||||
Some(commit_lsn.to_string().as_bytes()),
|
||||
]))?;
|
||||
}
|
||||
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"TIMELINE_STATUS"))?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Handle IDENTIFY_SYSTEM replication command
|
||||
///
|
||||
|
||||
@@ -1,80 +0,0 @@
|
||||
#! /usr/bin/env python3
|
||||
# Script to generate ext_index.json metadata file
|
||||
# that stores content of the control files and location of extension archives
|
||||
# for all extensions in extensions subdir.
|
||||
import argparse
|
||||
import json
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
"""
|
||||
# ext_index.json example:
|
||||
{
|
||||
"public_extensions": [
|
||||
"anon"
|
||||
],
|
||||
"library_index": {
|
||||
"anon": "anon",
|
||||
"kq_imcx": "kq_imcx"
|
||||
// would be more complicated for something like postgis where multiple library names all map to postgis
|
||||
},
|
||||
"extension_data": {
|
||||
"kq_imcx": {
|
||||
"control_data": {
|
||||
"kq_imcx.control": "# This file is generated content from add_postgresql_extension.\n# No point in modifying it, it will be overwritten anyway.\n\n# Default version, always set\ndefault_version = '0.1'\n\n# Module pathname generated from target shared library name. Use\n# MODULE_PATHNAME in script file.\nmodule_pathname = '$libdir/kq_imcx.so'\n\n# Comment for extension. Set using COMMENT option. Can be set in\n# script file as well.\ncomment = 'ketteQ In-Memory Calendar Extension (IMCX)'\n\n# Encoding for script file. Set using ENCODING option.\n#encoding = ''\n\n# Required extensions. Set using REQUIRES option (multi-valued).\n#requires = ''\ntrusted = true\n"
|
||||
},
|
||||
"archive_path": "5648391853/v15/extensions/kq_imcx.tar.zst"
|
||||
},
|
||||
"anon": {
|
||||
"control_data": {
|
||||
"anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
|
||||
},
|
||||
"archive_path": "5648391853/v15/extensions/anon.tar.zst"
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="generate ext_index.json")
|
||||
parser.add_argument("pg_version", type=str, choices=["v14", "v15"], help="pg_version")
|
||||
parser.add_argument("BUILD_TAG", type=str, help="BUILD_TAG for this compute image")
|
||||
parser.add_argument("--public_extensions", type=str, help="list of public extensions")
|
||||
args = parser.parse_args()
|
||||
pg_version = args.pg_version
|
||||
BUILD_TAG = args.BUILD_TAG
|
||||
public_ext_list = args.public_extensions.split(",")
|
||||
|
||||
ext_index = {}
|
||||
library_index = {}
|
||||
EXT_PATH = Path("extensions")
|
||||
for extension in EXT_PATH.iterdir():
|
||||
if extension.is_dir():
|
||||
control_data = {}
|
||||
for control_file in extension.glob("*.control"):
|
||||
if control_file.suffix != ".control":
|
||||
continue
|
||||
with open(control_file, "r") as f:
|
||||
control_data[control_file.name] = f.read()
|
||||
ext_index[extension.name] = {
|
||||
"control_data": control_data,
|
||||
"archive_path": f"{BUILD_TAG}/{pg_version}/extensions/{extension.name}.tar.zst",
|
||||
}
|
||||
elif extension.suffix == ".zst":
|
||||
file_list = (
|
||||
str(subprocess.check_output(["tar", "tf", str(extension)]), "utf-8")
|
||||
.strip()
|
||||
.split("\n")
|
||||
)
|
||||
for file in file_list:
|
||||
if file.endswith(".so") and file.startswith("lib/"):
|
||||
lib_name = file[4:-3]
|
||||
library_index[lib_name] = extension.name.replace(".tar.zst", "")
|
||||
|
||||
all_data = {
|
||||
"public_extensions": public_ext_list,
|
||||
"library_index": library_index,
|
||||
"extension_data": ext_index,
|
||||
}
|
||||
with open("ext_index.json", "w") as f:
|
||||
json.dump(all_data, f)
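For reference, the generator above is driven entirely by its positional and optional arguments; a hedged example invocation follows (the script file name here is an assumption, only the arguments come from the argparse definition above):

import subprocess

# Assumed file name; the script expects an ./extensions/ directory and writes ./ext_index.json.
subprocess.run(
    ["python3", "generate_ext_index.py", "v15", "5648391853", "--public_extensions=anon"],
    check=True,
)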
@@ -40,13 +40,10 @@ def parse_metrics(text: str, name: str = "") -> Metrics:
|
||||
return metrics
|
||||
|
||||
|
||||
def histogram(prefix_without_trailing_underscore: str) -> List[str]:
    assert not prefix_without_trailing_underscore.endswith("_")
    return [f"{prefix_without_trailing_underscore}_{x}" for x in ["bucket", "count", "sum"]]
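A quick illustration of what the helper above expands to; the expected list follows directly from the function body:

assert histogram("pageserver_wait_lsn_seconds") == [
    "pageserver_wait_lsn_seconds_bucket",
    "pageserver_wait_lsn_seconds_count",
    "pageserver_wait_lsn_seconds_sum",
]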
PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = (
|
||||
"pageserver_remote_timeline_client_calls_unfinished",
|
||||
*[f"pageserver_remote_timeline_client_calls_started_{x}" for x in ["bucket", "count", "sum"]],
|
||||
*[f"pageserver_remote_operation_seconds_{x}" for x in ["bucket", "count", "sum"]],
|
||||
"pageserver_remote_physical_size",
|
||||
"pageserver_remote_timeline_client_bytes_started_total",
|
||||
"pageserver_remote_timeline_client_bytes_finished_total",
|
||||
@@ -70,29 +67,34 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
|
||||
"pageserver_getpage_reconstruct_seconds_count",
|
||||
"pageserver_getpage_reconstruct_seconds_sum",
|
||||
*[f"pageserver_basebackup_query_seconds_{x}" for x in ["bucket", "count", "sum"]],
|
||||
*histogram("pageserver_read_num_fs_layers"),
|
||||
*histogram("pageserver_getpage_get_reconstruct_data_seconds"),
|
||||
*histogram("pageserver_wait_lsn_seconds"),
|
||||
*histogram("pageserver_remote_operation_seconds"),
|
||||
*histogram("pageserver_remote_timeline_client_calls_started"),
|
||||
*histogram("pageserver_io_operations_seconds"),
|
||||
"pageserver_tenant_states_count",
|
||||
)
|
||||
|
||||
PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
|
||||
"pageserver_current_logical_size",
|
||||
"pageserver_resident_physical_size",
|
||||
"pageserver_getpage_get_reconstruct_data_seconds_bucket",
|
||||
"pageserver_getpage_get_reconstruct_data_seconds_count",
|
||||
"pageserver_getpage_get_reconstruct_data_seconds_sum",
|
||||
"pageserver_io_operations_bytes_total",
|
||||
"pageserver_io_operations_seconds_bucket",
|
||||
"pageserver_io_operations_seconds_count",
|
||||
"pageserver_io_operations_seconds_sum",
|
||||
"pageserver_last_record_lsn",
|
||||
"pageserver_read_num_fs_layers_bucket",
|
||||
"pageserver_read_num_fs_layers_count",
|
||||
"pageserver_read_num_fs_layers_sum",
|
||||
"pageserver_smgr_query_seconds_bucket",
|
||||
"pageserver_smgr_query_seconds_count",
|
||||
"pageserver_smgr_query_seconds_sum",
|
||||
"pageserver_storage_operations_seconds_count_total",
|
||||
"pageserver_storage_operations_seconds_sum_total",
|
||||
"pageserver_wait_lsn_seconds_bucket",
|
||||
"pageserver_wait_lsn_seconds_count",
|
||||
"pageserver_wait_lsn_seconds_sum",
|
||||
"pageserver_created_persistent_files_total",
|
||||
"pageserver_written_persistent_bytes_total",
|
||||
"pageserver_tenant_states_count",
|
||||
"pageserver_evictions_total",
|
||||
"pageserver_evictions_with_low_residence_duration_total",
|
||||
*PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
|
||||
# pageserver_broken_tenants_count is a leaked "metric" which is "cleared" on restart or reload
|
||||
)
|
||||
|
||||
@@ -542,7 +542,7 @@ class S3Storage:
|
||||
access_key: str
|
||||
secret_key: str
|
||||
endpoint: Optional[str] = None
|
||||
prefix_in_bucket: Optional[str] = ""
|
||||
prefix_in_bucket: Optional[str] = None
|
||||
|
||||
def access_env_vars(self) -> Dict[str, str]:
|
||||
return {
|
||||
@@ -1504,7 +1504,6 @@ class NeonCli(AbstractNeonCli):
|
||||
safekeepers: Optional[List[int]] = None,
|
||||
tenant_id: Optional[TenantId] = None,
|
||||
lsn: Optional[Lsn] = None,
|
||||
branch_name: Optional[str] = None,
|
||||
) -> "subprocess.CompletedProcess[str]":
|
||||
args = [
|
||||
"endpoint",
|
||||
@@ -1518,11 +1517,8 @@ class NeonCli(AbstractNeonCli):
|
||||
args.append(f"--lsn={lsn}")
|
||||
args.extend(["--pg-port", str(pg_port)])
|
||||
args.extend(["--http-port", str(http_port)])
|
||||
|
||||
if safekeepers is not None:
|
||||
args.extend(["--safekeepers", (",".join(map(str, safekeepers)))])
|
||||
if branch_name is not None:
|
||||
args.extend(["--branch-name", branch_name])
|
||||
if endpoint_id is not None:
|
||||
args.append(endpoint_id)
|
||||
|
||||
|
||||
@@ -194,17 +194,14 @@ def wait_for_upload_queue_empty(
|
||||
|
||||
|
||||
def wait_timeline_detail_404(
|
||||
pageserver_http: PageserverHttpClient,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
iterations: int,
|
||||
pageserver_http: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId
|
||||
):
|
||||
last_exc = None
|
||||
for _ in range(iterations):
|
||||
for _ in range(2):
|
||||
time.sleep(0.250)
|
||||
try:
|
||||
data = pageserver_http.timeline_detail(tenant_id, timeline_id)
|
||||
log.info(f"detail {data}")
|
||||
log.error(f"detail {data}")
|
||||
except PageserverApiException as e:
|
||||
log.debug(e)
|
||||
if e.status_code == 404:
|
||||
@@ -219,8 +216,7 @@ def timeline_delete_wait_completed(
|
||||
pageserver_http: PageserverHttpClient,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
iterations: int = 20,
|
||||
**delete_args,
|
||||
):
|
||||
pageserver_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id, **delete_args)
|
||||
wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, iterations)
|
||||
wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id)
|
||||
|
||||
@@ -61,7 +61,6 @@ def test_startup_simple(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenc
|
||||
durations = {
|
||||
"wait_for_spec_ms": f"{i}_wait_for_spec",
|
||||
"sync_safekeepers_ms": f"{i}_sync_safekeepers",
|
||||
"sync_sk_check_ms": f"{i}_sync_sk_check",
|
||||
"basebackup_ms": f"{i}_basebackup",
|
||||
"start_postgres_ms": f"{i}_start_postgres",
|
||||
"config_ms": f"{i}_config",
|
||||
|
||||
@@ -123,7 +123,7 @@ def test_config_with_unknown_keys_is_bad_request(negative_env: NegativeTests):
|
||||
@pytest.mark.parametrize("content_type", [None, "application/json"])
|
||||
def test_empty_body(positive_env: NeonEnv, content_type: Optional[str]):
|
||||
"""
|
||||
For backwards-compatibility: if we send an empty body,
|
||||
For backwards-compatiblity: if we send an empty body,
|
||||
the request should be accepted and the config should be the default config.
|
||||
"""
|
||||
env = positive_env
|
||||
|
||||
@@ -4,7 +4,7 @@ import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Any, List, Optional
|
||||
from typing import Any, Optional
|
||||
|
||||
import pytest
|
||||
import toml # TODO: replace with tomllib for Python >= 3.11
|
||||
@@ -14,6 +14,7 @@ from fixtures.neon_fixtures import (
|
||||
NeonEnvBuilder,
|
||||
PgBin,
|
||||
PortDistributor,
|
||||
parse_project_git_version_output,
|
||||
)
|
||||
from fixtures.pageserver.http import PageserverHttpClient
|
||||
from fixtures.pageserver.utils import (
|
||||
@@ -62,6 +63,7 @@ def test_create_snapshot(
|
||||
neon_env_builder.pg_version = pg_version
|
||||
neon_env_builder.num_safekeepers = 3
|
||||
neon_env_builder.enable_local_fs_remote_storage()
|
||||
neon_env_builder.preserve_database_files = True
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
endpoint = env.endpoints.create_start("main")
|
||||
@@ -257,15 +259,36 @@ def prepare_snapshot(
|
||||
shutil.rmtree(repo_dir / "pgdatadirs")
|
||||
os.mkdir(repo_dir / "endpoints")
|
||||
|
||||
# Remove wal-redo temp directory if it exists. Newer pageserver versions don't create
|
||||
# them anymore, but old versions did.
|
||||
for tenant in (repo_dir / "tenants").glob("*"):
|
||||
wal_redo_dir = tenant / "wal-redo-datadir.___temp"
|
||||
if wal_redo_dir.exists() and wal_redo_dir.is_dir():
|
||||
shutil.rmtree(wal_redo_dir)
|
||||
|
||||
# Update paths and ports in config files
|
||||
pageserver_toml = repo_dir / "pageserver.toml"
|
||||
pageserver_config = toml.load(pageserver_toml)
|
||||
pageserver_config["remote_storage"]["local_path"] = str(repo_dir / "local_fs_remote_storage")
|
||||
for param in ("listen_http_addr", "listen_pg_addr", "broker_endpoint"):
|
||||
pageserver_config[param] = port_distributor.replace_with_new_port(pageserver_config[param])
|
||||
pageserver_config["listen_http_addr"] = port_distributor.replace_with_new_port(
|
||||
pageserver_config["listen_http_addr"]
|
||||
)
|
||||
pageserver_config["listen_pg_addr"] = port_distributor.replace_with_new_port(
|
||||
pageserver_config["listen_pg_addr"]
|
||||
)
|
||||
# since storage_broker these are overridden by neon_local during pageserver
|
||||
# start; remove both to prevent unknown options during etcd ->
|
||||
# storage_broker migration. TODO: remove once broker is released
|
||||
pageserver_config.pop("broker_endpoint", None)
|
||||
pageserver_config.pop("broker_endpoints", None)
|
||||
etcd_broker_endpoints = [f"http://localhost:{port_distributor.get_port()}/"]
|
||||
if get_neon_version(neon_binpath) == "49da498f651b9f3a53b56c7c0697636d880ddfe0":
|
||||
pageserver_config["broker_endpoints"] = etcd_broker_endpoints # old etcd version
|
||||
|
||||
# We don't use authentication in compatibility tests
|
||||
# so just remove authentication related settings.
|
||||
# Older pageserver versions had just one `auth_type` setting. Now there
|
||||
# are separate settings for pg and http ports. We don't use authentication
|
||||
# in compatibility tests so just remove authentication related settings.
|
||||
pageserver_config.pop("auth_type", None)
|
||||
pageserver_config.pop("pg_auth_type", None)
|
||||
pageserver_config.pop("http_auth_type", None)
|
||||
|
||||
@@ -277,16 +300,31 @@ def prepare_snapshot(
|
||||
|
||||
snapshot_config_toml = repo_dir / "config"
|
||||
snapshot_config = toml.load(snapshot_config_toml)
|
||||
for param in ("listen_http_addr", "listen_pg_addr"):
|
||||
snapshot_config["pageserver"][param] = port_distributor.replace_with_new_port(
|
||||
snapshot_config["pageserver"][param]
|
||||
)
|
||||
snapshot_config["broker"]["listen_addr"] = port_distributor.replace_with_new_port(
|
||||
snapshot_config["broker"]["listen_addr"]
|
||||
|
||||
# Provide up/downgrade etcd <-> storage_broker to make forward/backward
|
||||
# compatibility test happy. TODO: leave only the new part once broker is released.
|
||||
if get_neon_version(neon_binpath) == "49da498f651b9f3a53b56c7c0697636d880ddfe0":
|
||||
# old etcd version
|
||||
snapshot_config["etcd_broker"] = {
|
||||
"etcd_binary_path": shutil.which("etcd"),
|
||||
"broker_endpoints": etcd_broker_endpoints,
|
||||
}
|
||||
snapshot_config.pop("broker", None)
|
||||
else:
|
||||
# new storage_broker version
|
||||
broker_listen_addr = f"127.0.0.1:{port_distributor.get_port()}"
|
||||
snapshot_config["broker"] = {"listen_addr": broker_listen_addr}
|
||||
snapshot_config.pop("etcd_broker", None)
|
||||
|
||||
snapshot_config["pageserver"]["listen_http_addr"] = port_distributor.replace_with_new_port(
|
||||
snapshot_config["pageserver"]["listen_http_addr"]
|
||||
)
|
||||
snapshot_config["pageserver"]["listen_pg_addr"] = port_distributor.replace_with_new_port(
|
||||
snapshot_config["pageserver"]["listen_pg_addr"]
|
||||
)
|
||||
for sk in snapshot_config["safekeepers"]:
|
||||
for param in ("http_port", "pg_port", "pg_tenant_only_port"):
|
||||
sk[param] = port_distributor.replace_with_new_port(sk[param])
|
||||
sk["http_port"] = port_distributor.replace_with_new_port(sk["http_port"])
|
||||
sk["pg_port"] = port_distributor.replace_with_new_port(sk["pg_port"])
|
||||
|
||||
if pg_distrib_dir:
|
||||
snapshot_config["pg_distrib_dir"] = str(pg_distrib_dir)
|
||||
@@ -312,6 +350,12 @@ def prepare_snapshot(
|
||||
), f"there're files referencing `test_create_snapshot/repo`, this path should be replaced with {repo_dir}:\n{rv.stdout}"
|
||||
|
||||
|
||||
# get git SHA of neon binary
|
||||
def get_neon_version(neon_binpath: Path):
|
||||
out = subprocess.check_output([neon_binpath / "neon_local", "--version"]).decode("utf-8")
|
||||
return parse_project_git_version_output(out)
|
||||
|
||||
|
||||
def check_neon_works(
|
||||
repo_dir: Path,
|
||||
neon_target_binpath: Path,
|
||||
@@ -337,6 +381,7 @@ def check_neon_works(
|
||||
config.pg_version = pg_version
|
||||
config.initial_tenant = snapshot_config["default_tenant_id"]
|
||||
config.pg_distrib_dir = pg_distrib_dir
|
||||
config.preserve_database_files = True
|
||||
|
||||
# Use the "target" binaries to launch the storage nodes
|
||||
config_target = config
|
||||
@@ -393,14 +438,6 @@ def check_neon_works(
|
||||
test_output_dir / "dump-from-wal.filediff",
|
||||
)
|
||||
|
||||
# TODO: Run pg_amcheck unconditionally after the next release
|
||||
try:
|
||||
pg_bin.run(["psql", connstr, "--command", "CREATE EXTENSION IF NOT EXISTS amcheck"])
|
||||
except subprocess.CalledProcessError:
|
||||
log.info("Extension amcheck is not available, skipping pg_amcheck")
|
||||
else:
|
||||
pg_bin.run_capture(["pg_amcheck", connstr, "--install-missing", "--verbose"])
|
||||
|
||||
# Check that we can interact with the data
|
||||
pg_bin.run_capture(["pgbench", "--time=10", "--progress=2", connstr])
|
||||
|
||||
@@ -408,15 +445,10 @@ def check_neon_works(
|
||||
assert not initial_dump_differs, "initial dump differs"
|
||||
|
||||
|
||||
def dump_differs(
|
||||
first: Path, second: Path, output: Path, allowed_diffs: Optional[List[str]] = None
|
||||
) -> bool:
|
||||
def dump_differs(first: Path, second: Path, output: Path) -> bool:
|
||||
"""
|
||||
Runs diff(1) command on two SQL dumps and write the output to the given output file.
|
||||
The function supports allowed diffs, if the diff is in the allowed_diffs list, it's not considered as a difference.
|
||||
See the example of it in https://github.com/neondatabase/neon/pull/4425/files#diff-15c5bfdd1d5cc1411b9221091511a60dd13a9edf672bdfbb57dd2ef8bb7815d6
|
||||
|
||||
Returns True if the dumps differ and the produced diff is not allowed, False otherwise (in most cases we want it to return False).
|
||||
Returns True if the dumps differ, False otherwise.
|
||||
"""
|
||||
|
||||
with output.open("w") as stdout:
|
||||
@@ -434,30 +466,51 @@ def dump_differs(
|
||||
|
||||
differs = res.returncode != 0
|
||||
|
||||
allowed_diffs = allowed_diffs or []
|
||||
if differs and len(allowed_diffs) > 0:
|
||||
for allowed_diff in allowed_diffs:
|
||||
with tempfile.NamedTemporaryFile(mode="w") as tmp:
|
||||
tmp.write(allowed_diff)
|
||||
tmp.flush()
|
||||
# TODO: Remove after https://github.com/neondatabase/neon/pull/4425 is merged, and a couple of releases are made
|
||||
if differs:
|
||||
with tempfile.NamedTemporaryFile(mode="w") as tmp:
|
||||
tmp.write(PR4425_ALLOWED_DIFF)
|
||||
tmp.flush()
|
||||
|
||||
allowed = subprocess.run(
|
||||
[
|
||||
"diff",
|
||||
"--unified", # Make diff output more readable
|
||||
r"--ignore-matching-lines=^---", # Ignore diff headers
|
||||
r"--ignore-matching-lines=^\+\+\+", # Ignore diff headers
|
||||
"--ignore-matching-lines=^@@", # Ignore diff blocks location
|
||||
"--ignore-matching-lines=^ *$", # Ignore lines with only spaces
|
||||
"--ignore-matching-lines=^ --.*", # Ignore SQL comments in diff
|
||||
"--ignore-blank-lines",
|
||||
str(output),
|
||||
str(tmp.name),
|
||||
],
|
||||
)
|
||||
allowed = subprocess.run(
|
||||
[
|
||||
"diff",
|
||||
"--unified", # Make diff output more readable
|
||||
r"--ignore-matching-lines=^---", # Ignore diff headers
|
||||
r"--ignore-matching-lines=^\+\+\+", # Ignore diff headers
|
||||
"--ignore-matching-lines=^@@", # Ignore diff blocks location
|
||||
"--ignore-matching-lines=^ *$", # Ignore lines with only spaces
|
||||
"--ignore-matching-lines=^ --.*", # Ignore the " --" lines for compatibility with PG14
|
||||
"--ignore-blank-lines",
|
||||
str(output),
|
||||
str(tmp.name),
|
||||
],
|
||||
)
|
||||
|
||||
differs = allowed.returncode != 0
|
||||
if not differs:
|
||||
break
|
||||
differs = allowed.returncode != 0
|
||||
|
||||
return differs
|
||||
|
||||
|
||||
PR4425_ALLOWED_DIFF = """
|
||||
--- /tmp/test_output/test_backward_compatibility[release-pg15]/compatibility_snapshot/dump.sql 2023-06-08 18:12:45.000000000 +0000
|
||||
+++ /tmp/test_output/test_backward_compatibility[release-pg15]/dump.sql 2023-06-13 07:25:35.211733653 +0000
|
||||
@@ -13,12 +13,20 @@
|
||||
|
||||
CREATE ROLE cloud_admin;
|
||||
ALTER ROLE cloud_admin WITH SUPERUSER INHERIT CREATEROLE CREATEDB LOGIN REPLICATION BYPASSRLS;
|
||||
+CREATE ROLE neon_superuser;
|
||||
+ALTER ROLE neon_superuser WITH NOSUPERUSER INHERIT CREATEROLE CREATEDB NOLOGIN NOREPLICATION NOBYPASSRLS;
|
||||
|
||||
--
|
||||
-- User Configurations
|
||||
--
|
||||
|
||||
|
||||
+--
|
||||
+-- Role memberships
|
||||
+--
|
||||
+
|
||||
+GRANT pg_read_all_data TO neon_superuser GRANTED BY cloud_admin;
|
||||
+GRANT pg_write_all_data TO neon_superuser GRANTED BY cloud_admin;
|
||||
"""
@@ -136,6 +136,8 @@ def test_gc_index_upload(neon_env_builder: NeonEnvBuilder, remote_storage_kind:
|
||||
for sample in ps_metrics.query_all(
|
||||
name="pageserver_remote_operation_seconds_count",
|
||||
filter={
|
||||
"tenant_id": str(tenant_id),
|
||||
"timeline_id": str(timeline_id),
|
||||
"file_kind": str(file_kind),
|
||||
"op_kind": str(op_kind),
|
||||
},
|
||||
|
||||
@@ -14,6 +14,10 @@ from fixtures.neon_fixtures import NeonEnvBuilder, PgBin
|
||||
def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
# These warnings are expected, when the pageserver is restarted abruptly
|
||||
env.pageserver.allowed_errors.append(".*found future image layer.*")
|
||||
env.pageserver.allowed_errors.append(".*found future delta layer.*")
|
||||
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
|
||||
# Use aggressive GC and checkpoint settings, so that we also exercise GC during the test
|
||||
|
||||
@@ -140,6 +140,8 @@ def test_metric_collection(
|
||||
for sample in ps_metrics.query_all(
|
||||
name="pageserver_remote_operation_seconds_count",
|
||||
filter={
|
||||
"tenant_id": str(tenant_id),
|
||||
"timeline_id": str(timeline_id),
|
||||
"file_kind": str(file_kind),
|
||||
"op_kind": str(op_kind),
|
||||
},
|
||||
|
||||
92
test_runner/regress/test_multixact_conc.py
Normal file
@@ -0,0 +1,92 @@
|
||||
import random
|
||||
import threading
|
||||
from typing import List
|
||||
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content
|
||||
from fixtures.utils import query_scalar
|
||||
|
||||
|
||||
#
|
||||
# Test multixact state after branching
|
||||
# Now this test is very minimalistic -
|
||||
# it only checks next_multixact_id field in restored pg_control,
|
||||
# since we don't have functions to check multixact internals.
|
||||
#
|
||||
def test_multixact_conc(neon_simple_env: NeonEnv, test_output_dir):
|
||||
env = neon_simple_env
|
||||
env.neon_cli.create_branch("test_multixact", "empty")
|
||||
endpoint = env.endpoints.create_start("test_multixact")
|
||||
|
||||
log.info("postgres is running on 'test_multixact' branch")
|
||||
|
||||
n_records = 100
|
||||
n_threads = 5
|
||||
n_iters = 1000
|
||||
n_restarts = 10
|
||||
|
||||
cur = endpoint.connect().cursor()
|
||||
cur.execute(
|
||||
f"""
|
||||
CREATE TABLE t1(pk int primary key, val integer);
|
||||
INSERT INTO t1 values (generate_series(1, {n_records}), 0);
|
||||
"""
|
||||
)
|
||||
|
||||
next_multixact_id_old = query_scalar(
|
||||
cur, "SELECT next_multixact_id FROM pg_control_checkpoint()"
|
||||
)
|
||||
|
||||
# Lock entries using parallel connections in a round-robin fashion.
|
||||
def do_updates():
|
||||
conn = endpoint.connect(autocommit=False)
|
||||
for i in range(n_iters):
|
||||
pk = random.randrange(1, n_records)
|
||||
conn.cursor().execute(f"update t1 set val=val+1 where pk={pk}")
|
||||
conn.cursor().execute("select * from t1 for key share")
|
||||
conn.commit()
|
||||
conn.close()
|
||||
|
||||
for iter in range(n_restarts):
|
||||
threads: List[threading.Thread] = []
|
||||
for i in range(n_threads):
|
||||
threads.append(threading.Thread(target=do_updates, args=(), daemon=False))
|
||||
threads[-1].start()
|
||||
|
||||
for thread in threads:
|
||||
thread.join()
|
||||
|
||||
# Restart endpoint
|
||||
endpoint.stop()
|
||||
endpoint.start()
|
||||
|
||||
conn = endpoint.connect()
|
||||
cur = conn.cursor()
|
||||
cur.execute("select count(*) from t1")
|
||||
assert cur.fetchone() == (n_records,)
|
||||
|
||||
# force wal flush
|
||||
cur.execute("checkpoint")
|
||||
|
||||
cur.execute(
|
||||
"SELECT next_multixact_id, pg_current_wal_insert_lsn() FROM pg_control_checkpoint()"
|
||||
)
|
||||
res = cur.fetchone()
|
||||
assert res is not None
|
||||
next_multixact_id = res[0]
|
||||
lsn = res[1]
|
||||
|
||||
# Ensure that we did lock some tuples
|
||||
assert int(next_multixact_id) > int(next_multixact_id_old)
|
||||
|
||||
# Branch at this point
|
||||
env.neon_cli.create_branch("test_multixact_new", "test_multixact", ancestor_start_lsn=lsn)
|
||||
endpoint_new = env.endpoints.create_start("test_multixact_new")
|
||||
|
||||
log.info("postgres is running on 'test_multixact_new' branch")
|
||||
next_multixact_id_new = endpoint_new.safe_psql(
|
||||
"SELECT next_multixact_id FROM pg_control_checkpoint()"
|
||||
)[0][0]
|
||||
|
||||
# Check that we restored pg_controlfile correctly
|
||||
assert next_multixact_id_new == next_multixact_id
|
||||
@@ -16,13 +16,11 @@ def test_neon_cli_basics(neon_env_builder: NeonEnvBuilder, port_distributor: Por
|
||||
endpoint_id="ep-basic-main", pg_port=pg_port, http_port=http_port
|
||||
)
|
||||
|
||||
branch_name = "migration-check"
|
||||
|
||||
env.neon_cli.create_branch(new_branch_name=branch_name)
|
||||
env.neon_cli.create_branch(new_branch_name="migration_check")
|
||||
pg_port = port_distributor.get_port()
|
||||
http_port = port_distributor.get_port()
|
||||
env.neon_cli.endpoint_start(
|
||||
f"ep-{branch_name}", pg_port, http_port, branch_name=branch_name
|
||||
endpoint_id="ep-migration_check", pg_port=pg_port, http_port=http_port
|
||||
)
|
||||
finally:
|
||||
env.neon_cli.stop()
|
||||
|
||||
@@ -27,16 +27,15 @@ from fixtures.types import Lsn
|
||||
from fixtures.utils import query_scalar, wait_until
|
||||
|
||||
|
||||
def get_num_downloaded_layers(client: PageserverHttpClient):
|
||||
"""
|
||||
This assumes that the pageserver only has a single tenant.
|
||||
"""
|
||||
def get_num_downloaded_layers(client: PageserverHttpClient, tenant_id, timeline_id):
|
||||
value = client.get_metric_value(
|
||||
"pageserver_remote_operation_seconds_count",
|
||||
{
|
||||
"file_kind": "layer",
|
||||
"op_kind": "download",
|
||||
"status": "success",
|
||||
"tenant_id": tenant_id,
|
||||
"timeline_id": timeline_id,
|
||||
},
|
||||
)
|
||||
if value is None:
|
||||
@@ -58,8 +57,7 @@ def test_ondemand_download_large_rel(
|
||||
test_name="test_ondemand_download_large_rel",
|
||||
)
|
||||
|
||||
# thinking about using a shared environment? the test assumes that global
|
||||
# metrics are for single tenant.
|
||||
##### First start, insert secret data and upload it to the remote storage
|
||||
env = neon_env_builder.init_start(
|
||||
initial_tenant_conf={
|
||||
# disable background GC
|
||||
@@ -131,7 +129,7 @@ def test_ondemand_download_large_rel(
|
||||
# safekeepers, that have now been shut down.
|
||||
endpoint = env.endpoints.create_start("main", lsn=current_lsn)
|
||||
|
||||
before_downloads = get_num_downloaded_layers(client)
|
||||
before_downloads = get_num_downloaded_layers(client, tenant_id, timeline_id)
|
||||
assert before_downloads != 0, "basebackup should download non-zero layers on-demand"
|
||||
|
||||
# Probe in the middle of the table. There's a high chance that the beginning
|
||||
@@ -142,7 +140,7 @@ def test_ondemand_download_large_rel(
|
||||
with endpoint.cursor() as cur:
|
||||
assert query_scalar(cur, "select count(*) from tbl where id = 500000") == 1
|
||||
|
||||
after_downloads = get_num_downloaded_layers(client)
|
||||
after_downloads = get_num_downloaded_layers(client, tenant_id, timeline_id)
|
||||
log.info(f"layers downloaded before {before_downloads} and after {after_downloads}")
|
||||
assert after_downloads > before_downloads
|
||||
|
||||
@@ -161,11 +159,13 @@ def test_ondemand_download_timetravel(
|
||||
test_name="test_ondemand_download_timetravel",
|
||||
)
|
||||
|
||||
# thinking about using a shared environment? the test assumes that global
|
||||
# metrics are for single tenant.
|
||||
##### First start, insert data and upload it to the remote storage
|
||||
env = neon_env_builder.init_start()
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
|
||||
env = neon_env_builder.init_start(
|
||||
initial_tenant_conf={
|
||||
# Override defaults, to create more layers
|
||||
tenant, _ = env.neon_cli.create_tenant(
|
||||
conf={
|
||||
# Disable background GC & compaction
|
||||
# We don't want GC, that would break the assertion about num downloads.
|
||||
# We don't want background compaction, we force a compaction every time we do explicit checkpoint.
|
||||
@@ -178,7 +178,7 @@ def test_ondemand_download_timetravel(
|
||||
"compaction_target_size": f"{1 * 1024 ** 2}", # 1 MB
|
||||
}
|
||||
)
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
env.initial_tenant = tenant
|
||||
|
||||
endpoint = env.endpoints.create_start("main")
|
||||
|
||||
@@ -283,7 +283,7 @@ def test_ondemand_download_timetravel(
|
||||
== table_len
|
||||
)
|
||||
|
||||
after_downloads = get_num_downloaded_layers(client)
|
||||
after_downloads = get_num_downloaded_layers(client, tenant_id, timeline_id)
|
||||
num_layers_downloaded.append(after_downloads)
|
||||
log.info(f"num_layers_downloaded[-1]={num_layers_downloaded[-1]}")
|
||||
|
||||
@@ -324,8 +324,11 @@ def test_download_remote_layers_api(
|
||||
)
|
||||
|
||||
##### First start, insert data and upload it to the remote storage
|
||||
env = neon_env_builder.init_start(
|
||||
initial_tenant_conf={
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
# Override defaults, to create more layers
|
||||
tenant, _ = env.neon_cli.create_tenant(
|
||||
conf={
|
||||
# Disable background GC & compaction
|
||||
# We don't want GC, that would break the assertion about num downloads.
|
||||
# We don't want background compaction, we force a compaction every time we do explicit checkpoint.
|
||||
@@ -338,6 +341,7 @@ def test_download_remote_layers_api(
|
||||
"compaction_target_size": f"{1 * 1024 ** 2}", # 1 MB
|
||||
}
|
||||
)
|
||||
env.initial_tenant = tenant
|
||||
|
||||
endpoint = env.endpoints.create_start("main")
|
||||
|
||||
@@ -485,6 +489,8 @@ def test_compaction_downloads_on_demand_without_image_creation(
|
||||
test_name="test_compaction_downloads_on_demand_without_image_creation",
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
conf = {
|
||||
# Disable background GC & compaction
|
||||
"gc_period": "0s",
|
||||
@@ -500,8 +506,6 @@ def test_compaction_downloads_on_demand_without_image_creation(
|
||||
# pitr_interval and gc_horizon are not interesting because we dont run gc
|
||||
}
|
||||
|
||||
env = neon_env_builder.init_start(initial_tenant_conf=stringify(conf))
|
||||
|
||||
def downloaded_bytes_and_count(pageserver_http: PageserverHttpClient) -> Tuple[int, int]:
|
||||
m = pageserver_http.get_metrics()
|
||||
# these are global counters
|
||||
@@ -513,12 +517,11 @@ def test_compaction_downloads_on_demand_without_image_creation(
|
||||
assert count < 2**53 and count.is_integer(), "count should still be safe integer-in-f64"
|
||||
return (int(total_bytes), int(count))
|
||||
|
||||
# Override defaults, to create more layers
|
||||
tenant_id, timeline_id = env.neon_cli.create_tenant(conf=stringify(conf))
|
||||
env.initial_tenant = tenant_id
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
|
||||
tenant_id = env.initial_tenant
|
||||
timeline_id = env.initial_timeline
|
||||
assert timeline_id is not None
|
||||
|
||||
with env.endpoints.create_start("main") as endpoint:
|
||||
# no particular reason to create the layers like this, but we are sure
|
||||
# not to hit the image_creation_threshold here.
|
||||
@@ -574,6 +577,8 @@ def test_compaction_downloads_on_demand_with_image_creation(
|
||||
test_name="test_compaction_downloads_on_demand",
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
conf = {
|
||||
# Disable background GC & compaction
|
||||
"gc_period": "0s",
|
||||
@@ -588,11 +593,9 @@ def test_compaction_downloads_on_demand_with_image_creation(
|
||||
# pitr_interval and gc_horizon are not interesting because we dont run gc
|
||||
}
|
||||
|
||||
env = neon_env_builder.init_start(initial_tenant_conf=stringify(conf))
|
||||
tenant_id = env.initial_tenant
|
||||
timeline_id = env.initial_timeline
|
||||
assert timeline_id is not None
|
||||
|
||||
# Override defaults, to create more layers
|
||||
tenant_id, timeline_id = env.neon_cli.create_tenant(conf=stringify(conf))
|
||||
env.initial_tenant = tenant_id
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
|
||||
endpoint = env.endpoints.create_start("main")
|
||||
@@ -661,6 +664,10 @@ def test_compaction_downloads_on_demand_with_image_creation(
|
||||
assert dict(kinds_after) == {"Delta": 4, "Image": 1}
|
||||
|
||||
|
||||
def stringify(conf: Dict[str, Any]) -> Dict[str, str]:
|
||||
return dict(map(lambda x: (x[0], str(x[1])), conf.items()))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
|
||||
def test_ondemand_download_failure_to_replace(
|
||||
neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
|
||||
@@ -684,12 +691,15 @@ def test_ondemand_download_failure_to_replace(
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
tenant_id = env.initial_tenant
|
||||
timeline_id = env.initial_timeline
|
||||
assert timeline_id is not None
|
||||
tenant_id, timeline_id = env.neon_cli.create_tenant()
|
||||
|
||||
env.initial_tenant = tenant_id
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
|
||||
lsn = Lsn(pageserver_http.timeline_detail(tenant_id, timeline_id)["last_record_lsn"])
|
||||
|
||||
wait_for_upload(pageserver_http, tenant_id, timeline_id, lsn)
|
||||
|
||||
# remove layers so that they will be redownloaded
|
||||
pageserver_http.tenant_detach(tenant_id)
|
||||
pageserver_http.tenant_attach(tenant_id)
|
||||
@@ -700,10 +710,8 @@ def test_ondemand_download_failure_to_replace(
|
||||
# requesting details with non-incremental size should trigger a download of the only layer
|
||||
# this will need to be adjusted if an index for logical sizes is ever implemented
|
||||
with pytest.raises(PageserverApiException):
|
||||
# PageserverApiException is expected because of the failpoint (timeline_detail building does something)
|
||||
# ReadTimeout can happen on our busy CI, but it should not, because there is no more busylooping
|
||||
# but should it be added back, we would wait for 15s here.
|
||||
pageserver_http.timeline_detail(tenant_id, timeline_id, True, timeout=15)
|
||||
# error message is not useful
|
||||
pageserver_http.timeline_detail(tenant_id, timeline_id, True, timeout=2)
|
||||
|
||||
actual_message = ".* ERROR .*layermap-replace-notfound"
|
||||
assert env.pageserver.log_contains(actual_message) is not None
|
||||
@@ -716,7 +724,3 @@ def test_ondemand_download_failure_to_replace(
|
||||
env.pageserver.allowed_errors.append(".* ERROR .*Task 'initial size calculation'")
|
||||
|
||||
# if the above returned, then we didn't have a livelock, and all is well
|
||||
|
||||
|
||||
def stringify(conf: Dict[str, Any]) -> Dict[str, str]:
|
||||
return dict(map(lambda x: (x[0], str(x[1])), conf.items()))
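A small worked example of the stringify helper above, since tenant config overrides are passed as strings in these tests:

assert stringify({"gc_period": "0s", "checkpoint_distance": 128 * 1024}) == {
    "gc_period": "0s",
    "checkpoint_distance": "131072",
}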
@@ -72,6 +72,10 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
|
||||
def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder):
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
# These warnings are expected, when the pageserver is restarted abruptly
|
||||
env.pageserver.allowed_errors.append(".*found future image layer.*")
|
||||
env.pageserver.allowed_errors.append(".*found future delta layer.*")
|
||||
|
||||
# Use a tiny checkpoint distance, to create a lot of layers quickly.
|
||||
# That allows us to stress the compaction and layer flushing logic more.
|
||||
tenant, _ = env.neon_cli.create_tenant(
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import json
|
||||
import subprocess
|
||||
from typing import Any, List, Optional, Tuple
|
||||
from typing import Any, List, Optional
|
||||
|
||||
import psycopg2
|
||||
import pytest
|
||||
@@ -260,73 +260,3 @@ def test_sql_over_http_output_options(static_proxy: NeonProxy):
|
||||
|
||||
rows = q("select 1 as n, 'a' as s, '{1,2,3}'::int4[] as arr", True, True)["rows"]
|
||||
assert rows == [["1", "a", "{1,2,3}"]]
|
||||
|
||||
|
||||
def test_sql_over_http_batch(static_proxy: NeonProxy):
|
||||
static_proxy.safe_psql("create role http with login password 'http' superuser")
|
||||
|
||||
def qq(queries: List[Tuple[str, Optional[List[Any]]]], read_only: bool = False) -> Any:
|
||||
connstr = f"postgresql://http:http@{static_proxy.domain}:{static_proxy.proxy_port}/postgres"
|
||||
response = requests.post(
|
||||
f"https://{static_proxy.domain}:{static_proxy.external_http_port}/sql",
|
||||
data=json.dumps(list(map(lambda x: {"query": x[0], "params": x[1] or []}, queries))),
|
||||
headers={
|
||||
"Content-Type": "application/sql",
|
||||
"Neon-Connection-String": connstr,
|
||||
"Neon-Batch-Isolation-Level": "Serializable",
|
||||
"Neon-Batch-Read-Only": "true" if read_only else "false",
|
||||
},
|
||||
verify=str(static_proxy.test_output_dir / "proxy.crt"),
|
||||
)
|
||||
assert response.status_code == 200
|
||||
return response.json()["results"], response.headers
|
||||
|
||||
result, headers = qq(
|
||||
[
|
||||
("select 42 as answer", None),
|
||||
("select $1 as answer", [42]),
|
||||
("select $1 * 1 as answer", [42]),
|
||||
("select $1::int[] as answer", [[1, 2, 3]]),
|
||||
("select $1::json->'a' as answer", [{"a": {"b": 42}}]),
|
||||
("select * from pg_class limit 1", None),
|
||||
("create table t(id serial primary key, val int)", None),
|
||||
("insert into t(val) values (10), (20), (30) returning id", None),
|
||||
("select * from t", None),
|
||||
("drop table t", None),
|
||||
]
|
||||
)
|
||||
|
||||
assert headers["Neon-Batch-Isolation-Level"] == "Serializable"
|
||||
assert headers["Neon-Batch-Read-Only"] == "false"
|
||||
|
||||
assert result[0]["rows"] == [{"answer": 42}]
|
||||
assert result[1]["rows"] == [{"answer": "42"}]
|
||||
assert result[2]["rows"] == [{"answer": 42}]
|
||||
assert result[3]["rows"] == [{"answer": [1, 2, 3]}]
|
||||
assert result[4]["rows"] == [{"answer": {"b": 42}}]
|
||||
assert len(result[5]["rows"]) == 1
|
||||
res = result[6]
|
||||
assert res["command"] == "CREATE"
|
||||
assert res["rowCount"] is None
|
||||
res = result[7]
|
||||
assert res["command"] == "INSERT"
|
||||
assert res["rowCount"] == 3
|
||||
assert res["rows"] == [{"id": 1}, {"id": 2}, {"id": 3}]
|
||||
res = result[8]
|
||||
assert res["command"] == "SELECT"
|
||||
assert res["rowCount"] == 3
|
||||
res = result[9]
|
||||
assert res["command"] == "DROP"
|
||||
assert res["rowCount"] is None
|
||||
assert len(result) == 10
|
||||
|
||||
result, headers = qq(
|
||||
[
|
||||
("select 42 as answer", None),
|
||||
],
|
||||
True,
|
||||
)
|
||||
assert headers["Neon-Batch-Isolation-Level"] == "Serializable"
|
||||
assert headers["Neon-Batch-Read-Only"] == "true"
|
||||
|
||||
assert result[0]["rows"] == [{"answer": 42}]
|
||||
|
||||
@@ -15,6 +15,10 @@ def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder):
|
||||
env = neon_env_builder.init_start()
|
||||
env.pageserver.is_testing_enabled_or_skip()
|
||||
|
||||
# These warnings are expected, when the pageserver is restarted abruptly
|
||||
env.pageserver.allowed_errors.append(".*found future delta layer.*")
|
||||
env.pageserver.allowed_errors.append(".*found future image layer.*")
|
||||
|
||||
# Create a branch for us
|
||||
env.neon_cli.create_branch("test_pageserver_recovery", "main")
|
||||
|
||||
|
||||
@@ -378,10 +378,12 @@ def test_remote_timeline_client_calls_started_metric(
|
||||
test_name="test_remote_timeline_client_metrics",
|
||||
)
|
||||
|
||||
# thinking about using a shared environment? the test assumes that global
|
||||
# metrics are for single tenant.
|
||||
env = neon_env_builder.init_start(
|
||||
initial_tenant_conf={
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
# create tenant with config that will deterministically allow
|
||||
# compaction and gc
|
||||
tenant_id, timeline_id = env.neon_cli.create_tenant(
|
||||
conf={
|
||||
# small checkpointing and compaction targets to ensure we generate many upload operations
|
||||
"checkpoint_distance": f"{128 * 1024}",
|
||||
"compaction_threshold": "1",
|
||||
@@ -396,10 +398,6 @@ def test_remote_timeline_client_calls_started_metric(
|
||||
}
|
||||
)
|
||||
|
||||
tenant_id = env.initial_tenant
|
||||
assert env.initial_timeline is not None
|
||||
timeline_id: TimelineId = env.initial_timeline
|
||||
|
||||
client = env.pageserver.http_client()
|
||||
|
||||
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
|
||||
@@ -421,7 +419,6 @@ def test_remote_timeline_client_calls_started_metric(
|
||||
"VACUUM foo",
|
||||
]
|
||||
)
|
||||
assert timeline_id is not None
|
||||
wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
|
||||
|
||||
calls_started: Dict[Tuple[str, str], List[int]] = {
|
||||
@@ -431,14 +428,13 @@ def test_remote_timeline_client_calls_started_metric(
|
||||
}
|
||||
|
||||
def fetch_calls_started():
|
||||
assert timeline_id is not None
|
||||
for (file_kind, op_kind), observations in calls_started.items():
|
||||
val = client.get_metric_value(
|
||||
name="pageserver_remote_timeline_client_calls_started_count",
|
||||
filter={
|
||||
"file_kind": str(file_kind),
|
||||
"op_kind": str(op_kind),
|
||||
},
|
||||
val = client.get_remote_timeline_client_metric(
|
||||
"pageserver_remote_timeline_client_calls_started_count",
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
file_kind,
|
||||
op_kind,
|
||||
)
|
||||
assert val is not None, f"expecting metric to be present: {file_kind} {op_kind}"
|
||||
val = int(val)
|
||||
@@ -522,8 +518,12 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
|
||||
test_name="test_timeline_deletion_with_files_stuck_in_upload_queue",
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start(
|
||||
initial_tenant_conf={
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
# create tenant with config that will determinstically allow
|
||||
# compaction and gc
|
||||
tenant_id, timeline_id = env.neon_cli.create_tenant(
|
||||
conf={
|
||||
# small checkpointing and compaction targets to ensure we generate many operations
|
||||
"checkpoint_distance": f"{64 * 1024}",
|
||||
"compaction_threshold": "1",
|
||||
@@ -535,10 +535,6 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
|
||||
"pitr_interval": "0s",
|
||||
}
|
||||
)
|
||||
tenant_id = env.initial_tenant
|
||||
assert env.initial_timeline is not None
|
||||
timeline_id: TimelineId = env.initial_timeline
|
||||
|
||||
timeline_path = env.timeline_dir(tenant_id, timeline_id)
|
||||
|
||||
client = env.pageserver.http_client()
|
||||
@@ -791,8 +787,12 @@ def test_compaction_delete_before_upload(
|
||||
test_name="test_compaction_delete_before_upload",
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start(
|
||||
initial_tenant_conf={
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
# create tenant with config that will deterministically allow
|
||||
# compaction and disables gc
|
||||
tenant_id, timeline_id = env.neon_cli.create_tenant(
|
||||
conf={
|
||||
# Set a small compaction threshold
|
||||
"compaction_threshold": "3",
|
||||
# Disable GC
|
||||
@@ -802,10 +802,6 @@ def test_compaction_delete_before_upload(
|
||||
}
|
||||
)
|
||||
|
||||
tenant_id = env.initial_tenant
|
||||
assert env.initial_timeline is not None
|
||||
timeline_id: TimelineId = env.initial_timeline
|
||||
|
||||
client = env.pageserver.http_client()
|
||||
|
||||
with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
|
||||
|
||||
@@ -2,7 +2,6 @@ import asyncio
import random
import time
from threading import Thread
from typing import List, Optional

import asyncpg
import pytest
@@ -22,7 +21,6 @@ from fixtures.pageserver.utils import (
)
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import query_scalar, wait_until
from prometheus_client.samples import Sample


def do_gc_target(
@@ -856,89 +854,3 @@ def ensure_test_data(data_id: int, data: str, endpoint: Endpoint):
        assert (
            query_scalar(cur, f"SELECT secret FROM test WHERE id = {data_id};") == data
        ), "Should have timeline data back"


@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
def test_metrics_while_ignoring_broken_tenant_and_reloading(
    neon_env_builder: NeonEnvBuilder,
    remote_storage_kind: RemoteStorageKind,
):
    neon_env_builder.enable_remote_storage(
        remote_storage_kind=remote_storage_kind,
        test_name="test_metrics_while_ignoring_broken_tenant_and_reloading",
    )

    env = neon_env_builder.init_start()

    client = env.pageserver.http_client()
    env.pageserver.allowed_errors.append(
        r".* Changing Active tenant to Broken state, reason: broken from test"
    )

    def only_int(samples: List[Sample]) -> Optional[int]:
        if len(samples) == 1:
            return int(samples[0].value)
        assert len(samples) == 0
        return None

    wait_until_tenant_state(client, env.initial_tenant, "Active", 10, 0.5)

    client.tenant_break(env.initial_tenant)

    found_broken = False
    active, broken, broken_set = ([], [], [])
    for _ in range(10):
        m = client.get_metrics()
        active = m.query_all("pageserver_tenant_states_count", {"state": "Active"})
        broken = m.query_all("pageserver_tenant_states_count", {"state": "Broken"})
        broken_set = m.query_all(
            "pageserver_broken_tenants_count", {"tenant_id": str(env.initial_tenant)}
        )
        found_broken = only_int(active) == 0 and only_int(broken) == 1 and only_int(broken_set) == 1

        if found_broken:
            break
        log.info(f"active: {active}, broken: {broken}, broken_set: {broken_set}")
        time.sleep(0.5)
    assert (
        found_broken
    ), f"tenant shows up as broken; active={active}, broken={broken}, broken_set={broken_set}"

    client.tenant_ignore(env.initial_tenant)

    found_broken = False
    broken, broken_set = ([], [])
    for _ in range(10):
        m = client.get_metrics()
        broken = m.query_all("pageserver_tenant_states_count", {"state": "Broken"})
        broken_set = m.query_all(
            "pageserver_broken_tenants_count", {"tenant_id": str(env.initial_tenant)}
        )
        found_broken = only_int(broken) == 0 and only_int(broken_set) == 1

        if found_broken:
            break
        time.sleep(0.5)
    assert (
        found_broken
    ), f"broken should still be in set, but it is not in the tenant state count: broken={broken}, broken_set={broken_set}"

    client.tenant_load(env.initial_tenant)

    found_active = False
    active, broken_set = ([], [])
    for _ in range(10):
        m = client.get_metrics()
        active = m.query_all("pageserver_tenant_states_count", {"state": "Active"})
        broken_set = m.query_all(
            "pageserver_broken_tenants_count", {"tenant_id": str(env.initial_tenant)}
        )
        found_active = only_int(active) == 1 and len(broken_set) == 0

        if found_active:
            break
        time.sleep(0.5)

    assert (
        found_active
    ), f"reloaded tenant should be active, and broken tenant set item removed: active={active}, broken_set={broken_set}"
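The three polling loops in the test above share one shape: fetch the metrics page, query a couple of gauges, and retry until a predicate holds. A hypothetical refactor sketch (not part of this diff) using the `wait_until` fixture already imported in this module; the helper name and predicate style are assumptions:

def wait_for_tenant_metrics(predicate, iterations=10, interval=0.5):
    # Poll the pageserver metrics until the given predicate is satisfied.
    def check():
        m = client.get_metrics()
        assert predicate(m), "metric predicate not satisfied yet"

    # wait_until(number, interval, func) comes from fixtures.utils
    wait_until(iterations, interval, check)

# Example predicate for the "broken" phase:
# lambda m: only_int(m.query_all("pageserver_tenant_states_count", {"state": "Broken"})) == 1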
@@ -213,9 +213,6 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder):

    # Test (a subset of) pageserver global metrics
    for metric in PAGESERVER_GLOBAL_METRICS:
        if metric.startswith("pageserver_remote"):
            continue

        ps_samples = ps_metrics.query_all(metric, {})
        assert len(ps_samples) > 0, f"expected at least one sample for {metric}"
        for sample in ps_samples:
@@ -383,8 +380,10 @@ def test_pageserver_with_empty_tenants(
    ps_metrics = client.get_metrics()
    broken_tenants_metric_filter = {
        "tenant_id": str(tenant_without_timelines_dir),
        "state": "Broken",
    }
    active_tenants_metric_filter = {
        "tenant_id": str(tenant_with_empty_timelines),
        "state": "Active",
    }

@@ -400,7 +399,7 @@ def test_pageserver_with_empty_tenants(

    tenant_broken_count = int(
        ps_metrics.query_one(
            "pageserver_broken_tenants_count", filter=broken_tenants_metric_filter
            "pageserver_tenant_states_count", filter=broken_tenants_metric_filter
        ).value
    )

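Illustration only, not part of the commit: the hunk switches the broken-tenant check from the dedicated `pageserver_broken_tenants_count` metric to the per-state gauge `pageserver_tenant_states_count`, filtered on `"state": "Broken"`. A minimal usage sketch with the same `query_one` call shown above; the expected value of 1 is an assumed expectation, not quoted from the test:

# One tenant is expected to be reported Broken under the states gauge.
tenant_broken_count = int(
    ps_metrics.query_one(
        "pageserver_tenant_states_count", filter=broken_tenants_metric_filter
    ).value
)
assert tenant_broken_count == 1  # assumption for illustration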
@@ -38,12 +38,6 @@ def test_threshold_based_eviction(
    env = neon_env_builder.init_start()
    env.pageserver.allowed_errors.append(metrics_refused_log_line)

    # these can happen whenever we run consumption metrics collection
    env.pageserver.allowed_errors.append(r".*failed to calculate logical size at \S+: cancelled")
    env.pageserver.allowed_errors.append(
        r".*failed to calculate synthetic size for tenant \S+: failed to calculate some logical_sizes"
    )

    tenant_id, timeline_id = env.initial_tenant, env.initial_timeline
    assert isinstance(timeline_id, TimelineId)

@@ -1,4 +1,3 @@
import enum
import os
import queue
import shutil
@@ -12,12 +11,9 @@ from fixtures.log_helper import log
from fixtures.neon_fixtures import (
    NeonEnv,
    NeonEnvBuilder,
    PgBin,
    RemoteStorageKind,
    S3Storage,
    available_remote_storages,
    last_flush_lsn_upload,
    wait_for_last_flush_lsn,
)
from fixtures.pageserver.http import PageserverApiException
from fixtures.pageserver.utils import (
@@ -121,183 +117,59 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
        ps_http.timeline_detail(env.initial_tenant, leaf_timeline_id)


class Check(enum.Enum):
    RETRY_WITHOUT_RESTART = enum.auto()
    RETRY_WITH_RESTART = enum.auto()


DELETE_FAILPOINTS = [
    "timeline-delete-before-index-deleted-at",
    "timeline-delete-before-schedule",
    "timeline-delete-before-rm",
    "timeline-delete-during-rm",
    "timeline-delete-after-rm",
    "timeline-delete-before-index-delete",
    "timeline-delete-after-index-delete",
    "timeline-delete-after-rm-metadata",
    "timeline-delete-after-rm-dir",
]


def combinations():
    result = []

    remotes = [RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3]
    if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE"):
        remotes.append(RemoteStorageKind.REAL_S3)

    for remote_storage_kind in remotes:
        for delete_failpoint in DELETE_FAILPOINTS:
            if remote_storage_kind == RemoteStorageKind.NOOP and delete_failpoint in (
                "timeline-delete-before-index-delete",
                "timeline-delete-after-index-delete",
            ):
                # the above failpoints are not relevant for a config without remote storage
                continue

            result.append((remote_storage_kind, delete_failpoint))
    return result


# cover the two cases: remote storage configured vs not configured
@pytest.mark.parametrize("remote_storage_kind, failpoint", combinations())
@pytest.mark.parametrize("check", list(Check))
def test_delete_timeline_exercise_crash_safety_failpoints(
    neon_env_builder: NeonEnvBuilder,
    remote_storage_kind: RemoteStorageKind,
    failpoint: str,
    check: Check,
    pg_bin: PgBin,
@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
def test_delete_timeline_post_rm_failure(
    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
):
    """
    If there is a failure during deletion at one of the associated failpoints (or a crash restart happens at that point),
    the delete operation should be retryable and should be successfully resumed.

    We iterate over the failpoint list, switching to the next failpoint each time.

    1. Set settings to generate many layers
    2. Create branch.
    3. Insert something
    4. Go with the test.
    5. Iterate over failpoints
    6. Execute delete for each failpoint
    7. Ensure failpoint is hit
    8. Retry or restart without the failpoint and check the result (see the condensed sketch after this hunk).
    If there is a failure after removing the timeline directory, the delete operation
    should be retryable.
    """

    if remote_storage_kind is not None:
        neon_env_builder.enable_remote_storage(
            remote_storage_kind, "test_delete_timeline_exercise_crash_safety_failpoints"
            remote_storage_kind, "test_delete_timeline_post_rm_failure"
        )

    env = neon_env_builder.init_start(
        initial_tenant_conf={
            "gc_period": "0s",
            "compaction_period": "0s",
            "checkpoint_distance": f"{1024 ** 2}",
            "image_creation_threshold": "100",
        }
    )
    env = neon_env_builder.init_start()
    assert env.initial_timeline

    env.pageserver.allowed_errors.append(".*Error: failpoint: timeline-delete-after-rm")
    env.pageserver.allowed_errors.append(".*Ignoring state update Stopping for broken timeline")

    ps_http = env.pageserver.http_client()

    timeline_id = env.neon_cli.create_timeline("delete")
    with env.endpoints.create_start("delete") as endpoint:
        # generate enough layers
        pg_bin.run(["pgbench", "-i", "-I dtGvp", "-s1", endpoint.connstr()])
        if remote_storage_kind is RemoteStorageKind.NOOP:
            wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, timeline_id)
        else:
            last_flush_lsn_upload(env, endpoint, env.initial_tenant, timeline_id)
    failpoint_name = "timeline-delete-after-rm"
    ps_http.configure_failpoints((failpoint_name, "return"))

    env.pageserver.allowed_errors.append(f".*{timeline_id}.*failpoint: {failpoint}")
    # This appears when we stop the flush loop during deletion and the pageserver is then stopped
    env.pageserver.allowed_errors.append(
        ".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited"
    ps_http.timeline_delete(env.initial_tenant, env.initial_timeline)
    wait_until_timeline_state(
        pageserver_http=ps_http,
        tenant_id=env.initial_tenant,
        timeline_id=env.initial_timeline,
        expected_state="Broken",
        iterations=2,  # effectively try immediately and retry once in one second
    )
    # This happens when we fail before scheduling the background operation.
    # The timeline is left in Stopping state and the retry tries to stop it again.

    # FIXME: #4719
    # timeline_info["state"]["Broken"]["reason"] == "failpoint: timeline-delete-after-rm"

    at_failpoint_log_message = f".*{env.initial_timeline}.*at failpoint {failpoint_name}.*"
    env.pageserver.allowed_errors.append(at_failpoint_log_message)
    env.pageserver.allowed_errors.append(
        ".*Ignoring new state, equal to the existing one: Stopping"
        f".*DELETE.*{env.initial_timeline}.*InternalServerError.*{failpoint_name}"
    )
    # This happens when we retry delete requests for broken timelines
    env.pageserver.allowed_errors.append(".*Ignoring state update Stopping for broken timeline")
    # This happens when timeline remains are cleaned up during loading
    env.pageserver.allowed_errors.append(".*Timeline dir entry become invalid.*")
    # In one of the branches we poll for the tenant to become active. Polls can generate this log message:
    env.pageserver.allowed_errors.append(f".*Tenant {env.initial_tenant} is not active*")

    ps_http.configure_failpoints((failpoint, "return"))
    # retry without the failpoint, it should succeed
    ps_http.configure_failpoints((failpoint_name, "off"))

    iterations = 20 if remote_storage_kind is RemoteStorageKind.REAL_S3 else 4

    # These failpoints fire before the background task is spawned,
    # so they result in an API request failure.
    if failpoint in (
        "timeline-delete-before-index-deleted-at",
        "timeline-delete-before-schedule",
    ):
        with pytest.raises(PageserverApiException, match=failpoint):
            ps_http.timeline_delete(env.initial_tenant, timeline_id)

    else:
        ps_http.timeline_delete(env.initial_tenant, timeline_id)
        timeline_info = wait_until_timeline_state(
            pageserver_http=ps_http,
            tenant_id=env.initial_tenant,
            timeline_id=timeline_id,
            expected_state="Broken",
            iterations=iterations,
        )

        reason = timeline_info["state"]["Broken"]["reason"]
        log.info(f"timeline broken: {reason}")

        # the failpoint may not be the only error in the stack
        assert reason.endswith(f"failpoint: {failpoint}"), reason

    if check is Check.RETRY_WITH_RESTART:
        env.pageserver.stop()
        env.pageserver.start()

        wait_until_tenant_active(ps_http, env.initial_tenant, iterations=iterations)

        if failpoint == "timeline-delete-before-index-deleted-at":
            # We crashed before persisting this to remote storage; we need to retry the delete request
            timeline_delete_wait_completed(ps_http, env.initial_tenant, timeline_id)
        else:
            # The pageserver should have resumed deletion after restart.
            wait_timeline_detail_404(
                ps_http, env.initial_tenant, timeline_id, iterations=iterations
            )
    elif check is Check.RETRY_WITHOUT_RESTART:
        # this should succeed
        # this also checks that delete can be retried even when the timeline is in Broken state
        ps_http.configure_failpoints((failpoint, "off"))

        timeline_delete_wait_completed(
            ps_http, env.initial_tenant, timeline_id, iterations=iterations
        )

    # Check that the remote storage is empty
    if remote_storage_kind is RemoteStorageKind.MOCK_S3:
        assert_prefix_empty(
            neon_env_builder,
            prefix="/".join(
                (
                    "tenants",
                    str(env.initial_tenant),
                    "timelines",
                    str(timeline_id),
                )
            ),
        )

    timeline_dir = env.timeline_dir(env.initial_tenant, timeline_id)
    # Check that the local directory is gone
    assert not timeline_dir.exists()
    # Check that no delete mark is present
    assert not (timeline_dir.parent / f"{timeline_id}.___deleted").exists()
    # this should succeed
    # this also checks that delete can be retried even when the timeline is in Broken state
    timeline_delete_wait_completed(ps_http, env.initial_tenant, env.initial_timeline)
    env.pageserver.allowed_errors.append(
        f".*{env.initial_timeline}.*timeline directory not found, proceeding anyway.*"
    )


@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
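Condensed sketch of the flow exercised by the failpoint test above (for illustration only; the helper name and parameters are hypothetical, while the individual calls all appear in the hunk): arm a failpoint, let the delete hit it and mark the timeline Broken, then disarm the failpoint and retry until deletion completes.

def exercise_failpoint(ps_http, tenant_id, timeline_id, failpoint: str, iterations: int):
    # Arm the failpoint so the delete fails partway through.
    ps_http.configure_failpoints((failpoint, "return"))
    ps_http.timeline_delete(tenant_id, timeline_id)
    # The timeline should end up Broken rather than deleted.
    wait_until_timeline_state(
        pageserver_http=ps_http,
        tenant_id=tenant_id,
        timeline_id=timeline_id,
        expected_state="Broken",
        iterations=iterations,
    )
    # Disarm and retry: deletion must be resumable.
    ps_http.configure_failpoints((failpoint, "off"))
    timeline_delete_wait_completed(ps_http, tenant_id, timeline_id, iterations=iterations)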
@@ -455,7 +327,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
    )

    ps_http.timeline_delete(env.initial_tenant, leaf_timeline_id)
    timeline_info = wait_until_timeline_state(
    wait_until_timeline_state(
        pageserver_http=ps_http,
        tenant_id=env.initial_tenant,
        timeline_id=leaf_timeline_id,
@@ -463,7 +335,8 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
        iterations=2,  # effectively try immediately and retry once in one second
    )

    assert timeline_info["state"]["Broken"]["reason"] == "failpoint: timeline-delete-before-rm"
    # FIXME: #4719
    # timeline_info["state"]["Broken"]["reason"] == "failpoint: timeline-delete-after-rm"

    assert leaf_timeline_path.exists(), "the failpoint didn't work"

@@ -569,7 +442,7 @@ def test_concurrent_timeline_delete_stuck_on(
    try:
        log.info("first call start")
        timeline_delete_wait_completed(
            ps_http, env.initial_tenant, child_timeline_id, timeout=20
            ps_http, env.initial_tenant, child_timeline_id, timeout=10
        )
        log.info("first call success")
        result_queue.put("success")
@@ -683,7 +556,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder):
    wait_until(50, 0.1, first_request_finished)

    # check that the timeline is gone
    wait_timeline_detail_404(ps_http, env.initial_tenant, child_timeline_id, iterations=2)
    wait_timeline_detail_404(ps_http, env.initial_tenant, child_timeline_id)


@pytest.mark.parametrize(
@@ -715,7 +588,6 @@ def test_timeline_delete_works_for_remote_smoke(
    assert tenant_id == env.initial_tenant
    assert main_timeline_id == env.initial_timeline

    assert env.initial_timeline is not None
    timeline_ids = [env.initial_timeline]
    for i in range(2):
        branch_timeline_id = env.neon_cli.create_branch(f"new{i}", "main")

2 vendor/postgres-v14 vendored
Submodule vendor/postgres-v14 updated: da3885c34d...12c5dc8281
2 vendor/postgres-v15 vendored
Submodule vendor/postgres-v15 updated: 770c6dffc5...e3fbfc4d14
4 vendor/revisions.json vendored
@@ -1,4 +1,4 @@
{
    "postgres-v15": "770c6dffc5ef6aac05bf049693877fb377eea6fc",
    "postgres-v14": "da3885c34db312afd555802be2ce985fafd1d8ad"
    "postgres-v15": "e3fbfc4d143b2d3c3c1813ce747f8af35aa9405e",
    "postgres-v14": "12c5dc8281d20b5bd636e1097eea80a7bc609591"
}
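The submodule bumps and vendor/revisions.json must move together: the JSON pins the same commits the vendor/postgres-v14 and vendor/postgres-v15 submodules are checked out at. A minimal consistency-check sketch (not part of this diff; the script itself is hypothetical, the paths and JSON keys come from the hunks above):

import json
import subprocess

# Verify that the pins in vendor/revisions.json match the checked-out submodules.
with open("vendor/revisions.json") as f:
    revisions = json.load(f)

for name, expected_sha in revisions.items():  # e.g. "postgres-v14", "postgres-v15"
    actual_sha = subprocess.check_output(
        ["git", "-C", f"vendor/{name}", "rev-parse", "HEAD"], text=True
    ).strip()
    assert actual_sha == expected_sha, f"{name}: {actual_sha} != {expected_sha}"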