Add multixact test reproducing the problem with duplicates caused by incorrect offset calculation

2026-01-23 21:30:36 +00:00 · 2023-07-21 22:40:47 +03:00
101 changed files with 1618 additions and 4599 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -21,5 +21,4 @@
 !workspace_hack/
 !neon_local/
 !scripts/ninstall.sh
-!scripts/combine_control_files.py
 !vm-cgconfig.conf
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -209,4 +209,4 @@ runs:
      uses: ./.github/actions/allure-report-store
      with:
        report-dir: /tmp/test_output/allure/results
-        unique-key: ${{ inputs.build_type }}-${{ inputs.pg_version }}
+        unique-key: ${{ inputs.build_type }}
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -955,15 +955,22 @@ jobs:
        version: [ v14, v15 ]

    env:
-      EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
+      # During the transition period we extract public extensions from the compute-node image and custom extensions from the extensions image.
+      # Later, all extensions will be moved to the extensions image.
+      EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:latest
+      COMPUTE_NODE_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:latest
      AWS_ACCESS_KEY_ID: ${{ github.ref_name == 'release' && secrets.AWS_ACCESS_KEY_PROD || secrets.AWS_ACCESS_KEY_DEV }}
      AWS_SECRET_ACCESS_KEY: ${{ github.ref_name == 'release' && secrets.AWS_SECRET_KEY_PROD || secrets.AWS_SECRET_KEY_DEV }}
-      S3_BUCKETS: ${{ github.ref_name == 'release' && vars.S3_EXTENSIONS_BUCKETS_PROD || vars.S3_EXTENSIONS_BUCKETS_DEV }}
+      S3_BUCKETS: |
+        ${{ github.ref_name == 'release' &&
+          'neon-prod-extensions-ap-southeast-1 neon-prod-extensions-eu-central-1 neon-prod-extensions-us-east-1 neon-prod-extensions-us-east-2 neon-prod-extensions-us-west-2' ||
+          'neon-dev-extensions-eu-central-1 neon-dev-extensions-eu-west-1 neon-dev-extensions-us-east-2' }}

    steps:
      - name: Pull postgres-extensions image
        run: |
          docker pull ${EXTENSIONS_IMAGE}
+          docker pull ${COMPUTE_NODE_IMAGE}

      - name: Create postgres-extensions container
        id: create-container
@@ -971,23 +978,46 @@ jobs:
          EID=$(docker create ${EXTENSIONS_IMAGE} true)
          echo "EID=${EID}" >> $GITHUB_OUTPUT

+          CID=$(docker create ${COMPUTE_NODE_IMAGE} true)
+          echo "CID=${CID}" >> $GITHUB_OUTPUT
+
      - name: Extract postgres-extensions from container
        run: |
-          rm -rf ./extensions-to-upload # Just in case
-          mkdir -p extensions-to-upload
+          rm -rf ./extensions-to-upload ./custom-extensions # Just in case

-          docker cp ${{ steps.create-container.outputs.EID }}:/extensions/ ./extensions-to-upload/
-          docker cp ${{ steps.create-container.outputs.EID }}:/ext_index.json ./extensions-to-upload/
+          # The compute image has a slightly different directory layout
+          mkdir -p extensions-to-upload/share
+          docker cp ${{ steps.create-container.outputs.CID }}:/usr/local/share/extension ./extensions-to-upload/share/extension
+          docker cp ${{ steps.create-container.outputs.CID }}:/usr/local/lib             ./extensions-to-upload/lib
+
+          # Delete Neon extensions (they are always present on the compute-node image)
+          rm -rf ./extensions-to-upload/share/extension/neon*
+          rm -rf ./extensions-to-upload/lib/neon*
+
+          # Delete leftovers from the extension build step
+          rm -rf ./extensions-to-upload/lib/pgxs
+          rm -rf ./extensions-to-upload/lib/pkgconfig
+
+          docker cp ${{ steps.create-container.outputs.EID }}:/extensions ./custom-extensions
+          for EXT_NAME in $(ls ./custom-extensions); do
+            mkdir -p ./extensions-to-upload/${EXT_NAME}/share
+
+            mv ./custom-extensions/${EXT_NAME}/share/extension ./extensions-to-upload/${EXT_NAME}/share/extension
+            mv ./custom-extensions/${EXT_NAME}/lib             ./extensions-to-upload/${EXT_NAME}/lib
+          done

      - name: Upload postgres-extensions to S3
+        # TODO: Reenable step after switching to the new extensions format (tar-gzipped + index.json)
+        if: false
        run: |
-          for BUCKET in $(echo ${S3_BUCKETS:-[]} | jq --raw-output '.[]'); do
+          for BUCKET in $(echo ${S3_BUCKETS}); do
            aws s3 cp --recursive --only-show-errors ./extensions-to-upload s3://${BUCKET}/${{ needs.tag.outputs.build-tag }}/${{ matrix.version }}
          done

      - name: Cleanup
-        if: ${{ always() && steps.create-container.outputs.EID }}
+        if: ${{ always() && (steps.create-container.outputs.CID || steps.create-container.outputs.EID) }}
        run: |
+          docker rm ${{ steps.create-container.outputs.CID }} || true
          docker rm ${{ steps.create-container.outputs.EID }} || true

  deploy:
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2506,7 +2506,6 @@ dependencies = [
 "pageserver",
 "postgres_ffi",
 "svg_fmt",
- "tokio",
 "utils",
 "workspace_hack",
 ]
@@ -2545,7 +2544,6 @@ dependencies = [
 "metrics",
 "nix",
 "num-traits",
- "num_cpus",
 "once_cell",
 "pageserver_api",
 "pin-project-lite",
@@ -2654,6 +2652,16 @@ dependencies = [
 "windows-sys 0.45.0",
 ]

+[[package]]
+name = "pbkdf2"
+version = "0.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f0ca0b5a68607598bf3bad68f32227a8164f6254833f84eafaac409cd6746c31"
+dependencies = [
+ "digest",
+ "hmac",
+]
+
 [[package]]
 name = "peeking_take_while"
 version = "0.1.2"
@@ -2772,7 +2780,7 @@ dependencies = [
 [[package]]
 name = "postgres"
 version = "0.19.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -2785,7 +2793,7 @@ dependencies = [
 [[package]]
 name = "postgres-native-tls"
 version = "0.5.0"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
 dependencies = [
 "native-tls",
 "tokio",
@@ -2796,7 +2804,7 @@ dependencies = [
 [[package]]
 name = "postgres-protocol"
 version = "0.6.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
 dependencies = [
 "base64 0.20.0",
 "byteorder",
@@ -2814,7 +2822,7 @@ dependencies = [
 [[package]]
 name = "postgres-types"
 version = "0.2.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -3030,7 +3038,6 @@ dependencies = [
 "chrono",
 "clap",
 "consumption_metrics",
- "fallible-iterator",
 "futures",
 "git-version",
 "hashbrown 0.13.2",
@@ -3048,9 +3055,9 @@ dependencies = [
 "once_cell",
 "opentelemetry",
 "parking_lot 0.12.1",
+ "pbkdf2",
 "pin-project-lite",
 "postgres-native-tls",
- "postgres-protocol",
 "postgres_backend",
 "pq_proto",
 "prometheus",
@@ -3074,7 +3081,6 @@ dependencies = [
 "thiserror",
 "tls-listener",
 "tokio",
- "tokio-native-tls",
 "tokio-postgres",
 "tokio-postgres-rustls",
 "tokio-rustls 0.23.4",
@@ -4306,7 +4312,7 @@ dependencies = [
 [[package]]
 name = "tokio-postgres"
 version = "0.7.7"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
 dependencies = [
 "async-trait",
 "byteorder",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -88,6 +88,7 @@ opentelemetry = "0.19.0"
 opentelemetry-otlp = { version = "0.12.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
 opentelemetry-semantic-conventions = "0.11.0"
 parking_lot = "0.12"
+pbkdf2 = "0.12.1"
 pin-project-lite = "0.2"
 prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
 prost = "0.11"
@@ -143,11 +144,11 @@ env_logger = "0.10"
 log = "0.4"

 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
-postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
+postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }

 ## Other git libraries
 heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
@@ -182,7 +183,7 @@ tonic-build = "0.9"

 # This is only needed for proxy's tests.
 # TODO: we should probably fork `tokio-postgres-rustls` instead.
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }

 ################# Binary contents sections

--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -13,7 +13,7 @@ FROM debian:bullseye-slim AS build-deps
 RUN apt update &&  \
    apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
    zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \
-    libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd
+    libicu-dev libxslt1-dev liblz4-dev libzstd-dev

 #########################################################################################
 #
@@ -77,7 +77,6 @@ ENV PATH "/usr/local/pgsql/bin:$PATH"
 RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postgis.tar.gz && \
    echo "9a2a219da005a1730a39d1959a1c7cec619b1efb009b65be80ffc25bad299068 postgis.tar.gz" | sha256sum --check && \
    mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . && \
-    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
    ./autogen.sh && \
    ./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -90,28 +89,17 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postg
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer.control && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control && \
-    mkdir -p /extensions/postgis && \
-    cp /usr/local/pgsql/share/extension/postgis.control /extensions/postgis && \
-    cp /usr/local/pgsql/share/extension/postgis_raster.control /extensions/postgis && \
-    cp /usr/local/pgsql/share/extension/postgis_sfcgal.control /extensions/postgis && \
-    cp /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control /extensions/postgis && \
-    cp /usr/local/pgsql/share/extension/postgis_topology.control /extensions/postgis && \
-    cp /usr/local/pgsql/share/extension/address_standardizer.control /extensions/postgis && \
-    cp /usr/local/pgsql/share/extension/address_standardizer_data_us.control /extensions/postgis
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control

 RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \
    echo "cac297c07d34460887c4f3b522b35c470138760fe358e351ad1db4edb6ee306e pgrouting.tar.gz" | sha256sum --check && \
    mkdir pgrouting-src && cd pgrouting-src && tar xvzf ../pgrouting.tar.gz --strip-components=1 -C . && \
-    mkdir build && cd build && \
+    mkdir build && \
+    cd build && \
    cmake -DCMAKE_BUILD_TYPE=Release .. && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control && \
-    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
-    cp /usr/local/pgsql/share/extension/pgrouting.control /extensions/postgis && \
-    sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
-    comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/postgis.tar.zst -T -
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control

 #########################################################################################
 #
@@ -431,16 +419,12 @@ RUN apt-get update && \
    wget https://github.com/ketteq-neon/postgres-exts/archive/e0bd1a9d9313d7120c1b9c7bb15c48c0dede4c4e.tar.gz -O kq_imcx.tar.gz && \
    echo "dc93a97ff32d152d32737ba7e196d9687041cda15e58ab31344c2f2de8855336 kq_imcx.tar.gz" | sha256sum --check && \
    mkdir kq_imcx-src && cd kq_imcx-src && tar xvzf ../kq_imcx.tar.gz --strip-components=1 -C . && \
-    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
-    mkdir build && cd build && \
+    mkdir build && \
+    cd build && \
    cmake -DCMAKE_BUILD_TYPE=Release .. && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control && \
-    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
-    mkdir -p /extensions/kq_imcx && cp /usr/local/pgsql/share/extension/kq_imcx.control /extensions/kq_imcx && \
-    sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
-    comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/kq_imcx.tar.zst -T -
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control

 #########################################################################################
 #
@@ -551,8 +535,10 @@ FROM build-deps AS pg-embedding-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 ENV PATH "/usr/local/pgsql/bin/:$PATH"
-RUN wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/0.3.1.tar.gz -O pg_embedding.tar.gz && \
-    echo "c4ae84eef36fa8ec5868f6e061f39812f19ee5ba3604d428d40935685c7be512 pg_embedding.tar.gz" | sha256sum --check && \
+# eeb3ba7c3a60c95b2604dd543c64b2f1bb4a3703 made on 15/07/2023
+# There is no release tag yet
+RUN wget https://github.com/neondatabase/pg_embedding/archive/eeb3ba7c3a60c95b2604dd543c64b2f1bb4a3703.tar.gz -O pg_embedding.tar.gz && \
+    echo "030846df723652f99a8689ce63b66fa0c23477a7fd723533ab8a6b28ab70730f pg_embedding.tar.gz" | sha256sum --check && \
    mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -567,17 +553,16 @@ RUN wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/0.3.1.ta
 FROM build-deps AS pg-anon-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

+# Kaniko doesn't allow `${from#/usr/local/pgsql/}`, so we use `${from:17}` instead (17 is the length of the `/usr/local/pgsql/` prefix)
 ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgresql_anonymizer-1.1.0.tar.gz -O pg_anon.tar.gz && \
    echo "08b09d2ff9b962f96c60db7e6f8e79cf7253eb8772516998fc35ece08633d3ad pg_anon.tar.gz" | sha256sum --check && \
    mkdir pg_anon-src && cd pg_anon-src && tar xvzf ../pg_anon.tar.gz --strip-components=1 -C . && \
-    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
+    find /usr/local/pgsql -type f | sort  > /before.txt && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \
-    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
-    mkdir -p /extensions/anon && cp /usr/local/pgsql/share/extension/anon.control /extensions/anon && \
-    sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
-    comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/anon.tar.zst -T -
+    find /usr/local/pgsql -type f | sort  > /after.txt && \
+    /bin/bash -c 'for from in $(comm -13 /before.txt /after.txt); do to=/extensions/anon/${from:17} && mkdir -p $(dirname ${to}) && cp -a ${from} ${to}; done'

 #########################################################################################
 #
@@ -769,23 +754,16 @@ RUN rm /usr/local/pgsql/lib/lib*.a
 # Extenstion only
 #
 #########################################################################################
-FROM python:3.9-slim-bullseye AS generate-ext-index
-ARG PG_VERSION
-ARG BUILD_TAG
-RUN apt update && apt install -y zstd
-
-# copy the control files here
-COPY --from=kq-imcx-pg-build /extensions/ /extensions/
-COPY --from=pg-anon-pg-build /extensions/ /extensions/
-COPY --from=postgis-build /extensions/ /extensions/
-COPY scripts/combine_control_files.py ./combine_control_files.py
-RUN python3 ./combine_control_files.py ${PG_VERSION} ${BUILD_TAG} --public_extensions="anon,postgis"
-
 FROM scratch AS postgres-extensions
 # After the transition this layer will include all extensitons.
-# As for now, it's only a couple for testing purposses
-COPY --from=generate-ext-index /extensions/*.tar.zst /extensions/
-COPY --from=generate-ext-index /ext_index.json /ext_index.json
+# For now, it only contains the new custom ones
+#
+# # Default extensions
+# COPY --from=postgres-cleanup-layer /usr/local/pgsql/share/extension /usr/local/pgsql/share/extension
+# COPY --from=postgres-cleanup-layer /usr/local/pgsql/lib             /usr/local/pgsql/lib
+# Custom extensions
+COPY --from=pg-anon-pg-build /extensions/anon/lib/ /extensions/anon/lib
+COPY --from=pg-anon-pg-build /extensions/anon/share/extension /extensions/anon/share/extension

 #########################################################################################
 #
--- a/2
+++ b/2
@@ -108,8 +108,6 @@ postgres-%: postgres-configure-% \
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_buffercache install
 	+@echo "Compiling pageinspect $*"
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install
-	+@echo "Compiling amcheck $*"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/amcheck install

 .PHONY: postgres-clean-%
 postgres-clean-%:
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -193,13 +193,6 @@ fn main() -> Result<()> {
    if !spec_set {
        // No spec provided, hang waiting for it.
        info!("no compute spec provided, waiting");
-
-        // TODO this can stall startups in the unlikely event that we bind
-        //      this compute node while it's busy prewarming. It's not too
-        //      bad because it's just 100ms and unlikely, but it's an
-        //      avoidable problem.
-        compute.prewarm_postgres()?;
-
        let mut state = compute.state.lock().unwrap();
        while state.status != ComputeStatus::ConfigurationPending {
            state = compute.state_changed.wait(state).unwrap();
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -8,11 +8,9 @@ use std::sync::{Condvar, Mutex};

 use anyhow::{Context, Result};
 use chrono::{DateTime, Utc};
-use futures::stream::FuturesUnordered;
-use futures::StreamExt;
 use postgres::{Client, NoTls};
 use tokio_postgres;
-use tracing::{error, info, instrument, warn};
+use tracing::{info, instrument, warn};
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;

@@ -23,7 +21,6 @@ use utils::measured_stream::MeasuredReader;
 use crate::config;
 use crate::pg_helpers::*;
 use crate::spec::*;
-use crate::sync_sk::{check_if_synced, ping_safekeeper};

 /// Compute node info shared across several `compute_ctl` threads.
 pub struct ComputeNode {
@@ -89,7 +86,6 @@ pub struct ParsedSpec {
    pub tenant_id: TenantId,
    pub timeline_id: TimelineId,
    pub pageserver_connstr: String,
-    pub safekeeper_connstrings: Vec<String>,
    pub storage_auth_token: Option<String>,
 }

@@ -107,21 +103,6 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
            .clone()
            .or_else(|| spec.cluster.settings.find("neon.pageserver_connstring"))
            .ok_or("pageserver connstr should be provided")?;
-        let safekeeper_connstrings = if spec.safekeeper_connstrings.is_empty() {
-            if matches!(spec.mode, ComputeMode::Primary) {
-                spec.cluster
-                    .settings
-                    .find("neon.safekeepers")
-                    .ok_or("safekeeper connstrings should be provided")?
-                    .split(',')
-                    .map(|str| str.to_string())
-                    .collect()
-            } else {
-                vec![]
-            }
-        } else {
-            spec.safekeeper_connstrings.clone()
-        };
        let storage_auth_token = spec.storage_auth_token.clone();
        let tenant_id: TenantId = if let Some(tenant_id) = spec.tenant_id {
            tenant_id
@@ -147,7 +128,6 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
        Ok(ParsedSpec {
            spec,
            pageserver_connstr,
-            safekeeper_connstrings,
            storage_auth_token,
            tenant_id,
            timeline_id,
@@ -329,102 +309,6 @@ impl ComputeNode {
        Ok(())
    }

-    pub async fn check_safekeepers_synced_async(
-        &self,
-        compute_state: &ComputeState,
-    ) -> Result<Option<Lsn>> {
-        // Construct a connection config for each safekeeper
-        let pspec: ParsedSpec = compute_state
-            .pspec
-            .as_ref()
-            .expect("spec must be set")
-            .clone();
-        let sk_connstrs: Vec<String> = pspec.safekeeper_connstrings.clone();
-        let sk_configs = sk_connstrs.into_iter().map(|connstr| {
-            // Format connstr
-            let id = connstr.clone();
-            let connstr = format!("postgresql://no_user@{}", connstr);
-            let options = format!(
-                "-c timeline_id={} tenant_id={}",
-                pspec.timeline_id, pspec.tenant_id
-            );
-
-            // Construct client
-            let mut config = tokio_postgres::Config::from_str(&connstr).unwrap();
-            config.options(&options);
-            if let Some(storage_auth_token) = pspec.storage_auth_token.clone() {
-                config.password(storage_auth_token);
-            }
-
-            (id, config)
-        });
-
-        // Create task set to query all safekeepers
-        let mut tasks = FuturesUnordered::new();
-        let quorum = sk_configs.len() / 2 + 1;
-        for (id, config) in sk_configs {
-            let timeout = tokio::time::Duration::from_millis(100);
-            let task = tokio::time::timeout(timeout, ping_safekeeper(id, config));
-            tasks.push(tokio::spawn(task));
-        }
-
-        // Get a quorum of responses or errors
-        let mut responses = Vec::new();
-        let mut join_errors = Vec::new();
-        let mut task_errors = Vec::new();
-        let mut timeout_errors = Vec::new();
-        while let Some(response) = tasks.next().await {
-            match response {
-                Ok(Ok(Ok(r))) => responses.push(r),
-                Ok(Ok(Err(e))) => task_errors.push(e),
-                Ok(Err(e)) => timeout_errors.push(e),
-                Err(e) => join_errors.push(e),
-            };
-            if responses.len() >= quorum {
-                break;
-            }
-            if join_errors.len() + task_errors.len() + timeout_errors.len() >= quorum {
-                break;
-            }
-        }
-
-        // In case of error, log and fail the check, but don't crash.
-        // We're playing it safe because these errors could be transient
-        // and we don't yet retry. Also being careful here allows us to
-        // be backwards compatible with safekeepers that don't have the
-        // TIMELINE_STATUS API yet.
-        if responses.len() < quorum {
-            error!(
-                "failed sync safekeepers check {:?} {:?} {:?}",
-                join_errors, task_errors, timeout_errors
-            );
-            return Ok(None);
-        }
-
-        Ok(check_if_synced(responses))
-    }
-
-    // Fast path for sync_safekeepers. If they're already synced we get the lsn
-    // in one roundtrip. If not, we should do a full sync_safekeepers.
-    pub fn check_safekeepers_synced(&self, compute_state: &ComputeState) -> Result<Option<Lsn>> {
-        let start_time = Utc::now();
-
-        // Run actual work with new tokio runtime
-        let rt = tokio::runtime::Builder::new_current_thread()
-            .enable_all()
-            .build()
-            .expect("failed to create rt");
-        let result = rt.block_on(self.check_safekeepers_synced_async(compute_state));
-
-        // Record runtime
-        self.state.lock().unwrap().metrics.sync_sk_check_ms = Utc::now()
-            .signed_duration_since(start_time)
-            .to_std()
-            .unwrap()
-            .as_millis() as u64;
-        result
-    }
-
    // Run `postgres` in a special mode with `--sync-safekeepers` argument
    // and return the reported LSN back to the caller.
    #[instrument(skip_all)]
@@ -487,14 +371,10 @@ impl ComputeNode {
        // cannot sync safekeepers.
        let lsn = match spec.mode {
            ComputeMode::Primary => {
-                info!("checking if safekeepers are synced");
-                let lsn = if let Ok(Some(lsn)) = self.check_safekeepers_synced(compute_state) {
-                    lsn
-                } else {
-                    info!("starting safekeepers syncing");
-                    self.sync_safekeepers(pspec.storage_auth_token.clone())
-                        .with_context(|| "failed to sync safekeepers")?
-                };
+                info!("starting safekeepers syncing");
+                let lsn = self
+                    .sync_safekeepers(pspec.storage_auth_token.clone())
+                    .with_context(|| "failed to sync safekeepers")?;
                info!("safekeepers synced at LSN {}", lsn);
                lsn
            }
@@ -532,50 +412,6 @@ impl ComputeNode {
        Ok(())
    }

-    /// Start and stop a postgres process to warm up the VM for startup.
-    pub fn prewarm_postgres(&self) -> Result<()> {
-        info!("prewarming");
-
-        // Create pgdata
-        let pgdata = &format!("{}.warmup", self.pgdata);
-        create_pgdata(pgdata)?;
-
-        // Run initdb to completion
-        info!("running initdb");
-        let initdb_bin = Path::new(&self.pgbin).parent().unwrap().join("initdb");
-        Command::new(initdb_bin)
-            .args(["-D", pgdata])
-            .output()
-            .expect("cannot start initdb process");
-
-        // Write conf
-        use std::io::Write;
-        let conf_path = Path::new(pgdata).join("postgresql.conf");
-        let mut file = std::fs::File::create(conf_path)?;
-        writeln!(file, "shared_buffers=65536")?;
-        writeln!(file, "port=51055")?; // Nobody should be connecting
-        writeln!(file, "shared_preload_libraries = 'neon'")?;
-
-        // Start postgres
-        info!("starting postgres");
-        let mut pg = Command::new(&self.pgbin)
-            .args(["-D", pgdata])
-            .spawn()
-            .expect("cannot start postgres process");
-
-        // Stop it when it's ready
-        info!("waiting for postgres");
-        wait_for_postgres(&mut pg, Path::new(pgdata))?;
-        pg.kill()?;
-        info!("sent kill signal");
-        pg.wait()?;
-        info!("done prewarming");
-
-        // clean up
-        let _ok = fs::remove_dir_all(pgdata);
-        Ok(())
-    }
-
    /// Start Postgres as a child process and manage DBs/roles.
    /// After that this will hang waiting on the postmaster process to exit.
    #[instrument(skip_all)]
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -13,4 +13,3 @@ pub mod monitor;
 pub mod params;
 pub mod pg_helpers;
 pub mod spec;
-pub mod sync_sk;
--- a/compute_tools/src/sync_sk.rs
+++ b/compute_tools/src/sync_sk.rs
@@ -1,98 +0,0 @@
-// Utils for running sync_safekeepers
-use anyhow::Result;
-use tracing::info;
-use utils::lsn::Lsn;
-
-#[derive(Copy, Clone, Debug)]
-pub enum TimelineStatusResponse {
-    NotFound,
-    Ok(TimelineStatusOkResponse),
-}
-
-#[derive(Copy, Clone, Debug)]
-pub struct TimelineStatusOkResponse {
-    flush_lsn: Lsn,
-    commit_lsn: Lsn,
-}
-
-/// Get a safekeeper's metadata for our timeline. The id is only used for logging
-pub async fn ping_safekeeper(
-    id: String,
-    config: tokio_postgres::Config,
-) -> Result<TimelineStatusResponse> {
-    // TODO add retries
-
-    // Connect
-    info!("connecting to {}", id);
-    let (client, conn) = config.connect(tokio_postgres::NoTls).await?;
-    tokio::spawn(async move {
-        if let Err(e) = conn.await {
-            eprintln!("connection error: {}", e);
-        }
-    });
-
-    // Query
-    info!("querying {}", id);
-    let result = client.simple_query("TIMELINE_STATUS").await?;
-
-    // Parse result
-    info!("done with {}", id);
-    if let postgres::SimpleQueryMessage::Row(row) = &result[0] {
-        use std::str::FromStr;
-        let response = TimelineStatusResponse::Ok(TimelineStatusOkResponse {
-            flush_lsn: Lsn::from_str(row.get("flush_lsn").unwrap())?,
-            commit_lsn: Lsn::from_str(row.get("commit_lsn").unwrap())?,
-        });
-        Ok(response)
-    } else {
-        // Timeline doesn't exist
-        Ok(TimelineStatusResponse::NotFound)
-    }
-}
-
-/// Given a quorum of responses, check if safekeepers are synced at some Lsn
-pub fn check_if_synced(responses: Vec<TimelineStatusResponse>) -> Option<Lsn> {
-    // Check if all responses are ok
-    let ok_responses: Vec<TimelineStatusOkResponse> = responses
-        .iter()
-        .filter_map(|r| match r {
-            TimelineStatusResponse::Ok(ok_response) => Some(ok_response),
-            _ => None,
-        })
-        .cloned()
-        .collect();
-    if ok_responses.len() < responses.len() {
-        info!(
-            "not synced. Only {} out of {} know about this timeline",
-            ok_responses.len(),
-            responses.len()
-        );
-        return None;
-    }
-
-    // Get the min and the max of everything
-    let commit: Vec<Lsn> = ok_responses.iter().map(|r| r.commit_lsn).collect();
-    let flush: Vec<Lsn> = ok_responses.iter().map(|r| r.flush_lsn).collect();
-    let commit_max = commit.iter().max().unwrap();
-    let commit_min = commit.iter().min().unwrap();
-    let flush_max = flush.iter().max().unwrap();
-    let flush_min = flush.iter().min().unwrap();
-
-    // Check that all values are equal
-    if commit_min != commit_max {
-        info!("not synced. {:?} {:?}", commit_min, commit_max);
-        return None;
-    }
-    if flush_min != flush_max {
-        info!("not synced. {:?} {:?}", flush_min, flush_max);
-        return None;
-    }
-
-    // Check that commit == flush
-    if commit_max != flush_max {
-        info!("not synced. {:?} {:?}", commit_max, flush_max);
-        return None;
-    }
-
-    Some(*commit_max)
-}
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -564,7 +564,9 @@ impl Endpoint {
                }
                Err(e) => {
                    if attempt == MAX_ATTEMPTS {
-                        return Err(e).context("timed out waiting to connect to compute_ctl HTTP");
+                        // Give up: `e` (the last failure) stays attached as the cause,
+                        // so the message does not need to repeat it.
+                        return Err(e).context("timed out waiting to connect to compute_ctl HTTP");
                    }
                }
            }
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -70,7 +70,6 @@ where
 pub struct ComputeMetrics {
    pub wait_for_spec_ms: u64,
    pub sync_safekeepers_ms: u64,
-    pub sync_sk_check_ms: u64,
    pub basebackup_ms: u64,
    pub basebackup_bytes: u64,
    pub start_postgres_ms: u64,
--- a/libs/consumption_metrics/src/lib.rs
+++ b/libs/consumption_metrics/src/lib.rs
@@ -5,7 +5,7 @@ use chrono::{DateTime, Utc};
 use rand::Rng;
 use serde::Serialize;

-#[derive(Serialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
+#[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
 #[serde(tag = "type")]
 pub enum EventType {
    #[serde(rename = "absolute")]
@@ -17,32 +17,6 @@ pub enum EventType {
    },
 }

-impl EventType {
-    pub fn absolute_time(&self) -> Option<&DateTime<Utc>> {
-        use EventType::*;
-        match self {
-            Absolute { time } => Some(time),
-            _ => None,
-        }
-    }
-
-    pub fn incremental_timerange(&self) -> Option<std::ops::Range<&DateTime<Utc>>> {
-        // these can most likely be thought of as Range or RangeFull
-        use EventType::*;
-        match self {
-            Incremental {
-                start_time,
-                stop_time,
-            } => Some(start_time..stop_time),
-            _ => None,
-        }
-    }
-
-    pub fn is_incremental(&self) -> bool {
-        matches!(self, EventType::Incremental { .. })
-    }
-}
-
 #[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
 pub struct Event<Extra> {
    #[serde(flatten)]
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -179,7 +179,7 @@ pub struct FeExecuteMessage {
 #[derive(Debug)]
 pub struct FeCloseMessage;

-/// An error occurred while parsing or serializing raw stream into Postgres
+/// An error occurred while parsing or serializing raw stream into Postgres
 /// messages.
 #[derive(thiserror::Error, Debug)]
 pub enum ProtocolError {
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -200,17 +200,13 @@ impl S3Bucket {
        )
    }

-    pub fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
-        assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR);
-        let path_string = path
-            .get_path()
-            .to_string_lossy()
-            .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR)
-            .to_string();
-        match &self.prefix_in_bucket {
-            Some(prefix) => prefix.clone() + "/" + &path_string,
-            None => path_string,
+    fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
+        let mut full_path = self.prefix_in_bucket.clone().unwrap_or_default();
+        for segment in path.0.iter() {
+            full_path.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
+            full_path.push_str(segment.to_str().unwrap_or_default());
        }
+        full_path
    }

    async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> {
@@ -431,12 +427,10 @@ impl RemoteStorage for S3Bucket {
    }

    async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
-        // if prefix is not none then download file `prefix/from`
-        // if prefix is none then download file `from`
        self.download_object(GetObjectRequest {
            bucket: self.bucket_name.clone(),
            key: self.relative_path_to_s3_object(from),
-            range: None,
+            ..GetObjectRequest::default()
        })
        .await
    }
@@ -529,63 +523,3 @@ impl RemoteStorage for S3Bucket {
        Ok(())
    }
 }
-
-#[cfg(test)]
-mod tests {
-    use std::num::NonZeroUsize;
-    use std::path::Path;
-
-    use crate::{RemotePath, S3Bucket, S3Config};
-
-    #[test]
-    fn relative_path() {
-        let all_paths = vec!["", "some/path", "some/path/"];
-        let all_paths: Vec<RemotePath> = all_paths
-            .iter()
-            .map(|x| RemotePath::new(Path::new(x)).expect("bad path"))
-            .collect();
-        let prefixes = [
-            None,
-            Some(""),
-            Some("test/prefix"),
-            Some("test/prefix/"),
-            Some("/test/prefix/"),
-        ];
-        let expected_outputs = vec![
-            vec!["", "some/path", "some/path"],
-            vec!["/", "/some/path", "/some/path"],
-            vec![
-                "test/prefix/",
-                "test/prefix/some/path",
-                "test/prefix/some/path",
-            ],
-            vec![
-                "test/prefix/",
-                "test/prefix/some/path",
-                "test/prefix/some/path",
-            ],
-            vec![
-                "test/prefix/",
-                "test/prefix/some/path",
-                "test/prefix/some/path",
-            ],
-        ];
-
-        for (prefix_idx, prefix) in prefixes.iter().enumerate() {
-            let config = S3Config {
-                bucket_name: "bucket".to_owned(),
-                bucket_region: "region".to_owned(),
-                prefix_in_bucket: prefix.map(str::to_string),
-                endpoint: None,
-                concurrency_limit: NonZeroUsize::new(100).unwrap(),
-                max_keys_per_list_response: Some(5),
-            };
-            let storage = S3Bucket::new(&config).expect("remote storage init");
-            for (test_path_idx, test_path) in all_paths.iter().enumerate() {
-                let result = storage.relative_path_to_s3_object(test_path);
-                let expected = expected_outputs[prefix_idx][test_path_idx];
-                assert_eq!(result, expected);
-            }
-        }
-    }
-}
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -19,7 +19,7 @@ static LOGGING_DONE: OnceCell<()> = OnceCell::new();

 const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";

-const BASE_PREFIX: &str = "test";
+const BASE_PREFIX: &str = "test/";

 /// Tests that S3 client can list all prefixes, even if the response come paginated and requires multiple S3 queries.
 /// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified.
--- a/libs/utils/src/fs_ext.rs
+++ b/libs/utils/src/fs_ext.rs
@@ -24,29 +24,12 @@ pub async fn is_directory_empty(path: impl AsRef<Path>) -> anyhow::Result<bool>
    Ok(dir.next_entry().await?.is_none())
 }

-pub fn ignore_not_found(e: io::Error) -> io::Result<()> {
-    if e.kind() == io::ErrorKind::NotFound {
-        Ok(())
-    } else {
-        Err(e)
-    }
-}
-
-pub fn ignore_absent_files<F>(fs_operation: F) -> io::Result<()>
-where
-    F: Fn() -> io::Result<()>,
-{
-    fs_operation().or_else(ignore_not_found)
-}
-
 #[cfg(test)]
 mod test {
    use std::path::PathBuf;

    use crate::fs_ext::is_directory_empty;

-    use super::ignore_absent_files;
-
    #[test]
    fn is_empty_dir() {
        use super::PathExt;
@@ -92,21 +75,4 @@ mod test {
        std::fs::remove_file(&file_path).unwrap();
        assert!(is_directory_empty(file_path).await.is_err());
    }
-
-    #[test]
-    fn ignore_absent_files_works() {
-        let dir = tempfile::tempdir().unwrap();
-        let dir_path = dir.path();
-
-        let file_path: PathBuf = dir_path.join("testfile");
-
-        ignore_absent_files(|| std::fs::remove_file(&file_path)).expect("should execute normally");
-
-        let f = std::fs::File::create(&file_path).unwrap();
-        drop(f);
-
-        ignore_absent_files(|| std::fs::remove_file(&file_path)).expect("should execute normally");
-
-        assert!(!file_path.exists());
-    }
 }
--- a/libs/utils/src/id.rs
+++ b/libs/utils/src/id.rs
@@ -1,7 +1,5 @@
-use std::ffi::OsStr;
 use std::{fmt, str::FromStr};

-use anyhow::Context;
 use hex::FromHex;
 use rand::Rng;
 use serde::{Deserialize, Serialize};
@@ -215,18 +213,6 @@ pub struct TimelineId(Id);

 id_newtype!(TimelineId);

-impl TryFrom<Option<&OsStr>> for TimelineId {
-    type Error = anyhow::Error;
-
-    fn try_from(value: Option<&OsStr>) -> Result<Self, Self::Error> {
-        value
-            .and_then(OsStr::to_str)
-            .unwrap_or_default()
-            .parse::<TimelineId>()
-            .with_context(|| format!("Could not parse timeline id from {:?}", value))
-    }
-}
-
 /// Neon Tenant Id represents identifiar of a particular tenant.
 /// Is used for distinguishing requests and data belonging to different users.
 ///
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -35,8 +35,6 @@ humantime-serde.workspace = true
 hyper.workspace = true
 itertools.workspace = true
 nix.workspace = true
-# hack to get the number of worker threads tokio uses
-num_cpus = { version = "1.15" }
 num-traits.workspace = true
 once_cell.workspace = true
 pin-project-lite.workspace = true
--- a/pageserver/ctl/Cargo.toml
+++ b/pageserver/ctl/Cargo.toml
@@ -13,7 +13,6 @@ clap = { workspace = true, features = ["string"] }
 git-version.workspace = true
 pageserver = { path = ".." }
 postgres_ffi.workspace = true
-tokio.workspace = true
 utils.workspace = true
 svg_fmt.workspace = true
 workspace_hack.workspace = true
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -95,7 +95,7 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
 }

 // Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH"
-async fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
+fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
    let file = FileBlockReader::new(VirtualFile::open(path)?);
    let summary_blk = file.read_blk(0)?;
    let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
@@ -129,7 +129,7 @@ async fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
    Ok(holes)
 }

-pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
+pub(crate) fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
    let storage_path = &cmd.path;
    let max_holes = cmd.max_holes.unwrap_or(DEFAULT_MAX_HOLES);

@@ -160,7 +160,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
                    parse_filename(&layer.file_name().into_string().unwrap())
                {
                    if layer_file.is_delta {
-                        layer_file.holes = get_holes(&layer.path(), max_holes).await?;
+                        layer_file.holes = get_holes(&layer.path(), max_holes)?;
                        n_deltas += 1;
                    }
                    layers.push(layer_file);
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -43,7 +43,8 @@ pub(crate) enum LayerCmd {
    },
 }

-async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
+fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
+    use pageserver::tenant::blob_io::BlobCursor;
    use pageserver::tenant::block_io::BlockReader;

    let path = path.as_ref();
@@ -77,7 +78,7 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
    Ok(())
 }

-pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
+pub(crate) fn main(cmd: &LayerCmd) -> Result<()> {
    match cmd {
        LayerCmd::List { path } => {
            for tenant in fs::read_dir(path.join("tenants"))? {
@@ -152,7 +153,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
                        );

                        if layer_file.is_delta {
-                            read_delta_file(layer.path()).await?;
+                            read_delta_file(layer.path())?;
                        } else {
                            anyhow::bail!("not supported yet :(");
                        }
--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -72,13 +72,12 @@ struct AnalyzeLayerMapCmd {
    max_holes: Option<usize>,
 }

-#[tokio::main]
-async fn main() -> anyhow::Result<()> {
+fn main() -> anyhow::Result<()> {
    let cli = CliOpts::parse();

    match cli.command {
        Commands::Layer(cmd) => {
-            layers::main(&cmd).await?;
+            layers::main(&cmd)?;
        }
        Commands::Metadata(cmd) => {
            handle_metadata(&cmd)?;
@@ -87,7 +86,7 @@ async fn main() -> anyhow::Result<()> {
            draw_timeline_dir::main()?;
        }
        Commands::AnalyzeLayerMap(cmd) => {
-            layer_map_analyzer::main(&cmd).await?;
+            layer_map_analyzer::main(&cmd)?;
        }
        Commands::PrintLayerFile(cmd) => {
            if let Err(e) = read_pg_control_file(&cmd.path) {
@@ -95,7 +94,7 @@ async fn main() -> anyhow::Result<()> {
                    "Failed to read input file as a pg control one: {e:#}\n\
                    Attempting to read it as layer file"
                );
-                print_layerfile(&cmd.path).await?;
+                print_layerfile(&cmd.path)?;
            }
        }
    };
@@ -114,12 +113,12 @@ fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> {
    Ok(())
 }

-async fn print_layerfile(path: &Path) -> anyhow::Result<()> {
+fn print_layerfile(path: &Path) -> anyhow::Result<()> {
    // Basic initialization of things that don't change after startup
    virtual_file::init(10);
    page_cache::init(100);
    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
-    dump_layerfile_from_path(path, true, &ctx).await
+    dump_layerfile_from_path(path, true, &ctx)
 }

 fn handle_metadata(
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -33,8 +33,7 @@ use crate::tenant::config::TenantConf;
 use crate::tenant::config::TenantConfOpt;
 use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME};
 use crate::{
-    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX,
-    TIMELINE_UNINIT_MARK_SUFFIX,
+    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_UNINIT_MARK_SUFFIX,
 };

 pub mod defaults {
@@ -602,17 +601,6 @@ impl PageServerConf {
        )
    }

-    pub fn timeline_delete_mark_file_path(
-        &self,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-    ) -> PathBuf {
-        path_with_suffix_extension(
-            self.timeline_path(&tenant_id, &timeline_id),
-            TIMELINE_DELETE_MARK_SUFFIX,
-        )
-    }
-
    pub fn traces_path(&self) -> PathBuf {
        self.workdir.join("traces")
    }
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -7,7 +7,7 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
 use crate::tenant::{mgr, LogicalSizeCalculationCause};
 use anyhow;
-use chrono::{DateTime, Utc};
+use chrono::Utc;
 use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
 use pageserver_api::models::TenantState;
 use reqwest::Url;
@@ -18,6 +18,12 @@ use std::time::Duration;
 use tracing::*;
 use utils::id::{NodeId, TenantId, TimelineId};

+const WRITTEN_SIZE: &str = "written_size";
+const SYNTHETIC_STORAGE_SIZE: &str = "synthetic_storage_size";
+const RESIDENT_SIZE: &str = "resident_size";
+const REMOTE_STORAGE_SIZE: &str = "remote_storage_size";
+const TIMELINE_LOGICAL_SIZE: &str = "timeline_logical_size";
+
 const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);

 #[serde_as]
@@ -38,121 +44,6 @@ pub struct PageserverConsumptionMetricsKey {
    pub metric: &'static str,
 }

-impl PageserverConsumptionMetricsKey {
-    const fn absolute_values(self) -> AbsoluteValueFactory {
-        AbsoluteValueFactory(self)
-    }
-    const fn incremental_values(self) -> IncrementalValueFactory {
-        IncrementalValueFactory(self)
-    }
-}
-
-/// Helper type which each individual metric kind can return to produce only absolute values.
-struct AbsoluteValueFactory(PageserverConsumptionMetricsKey);
-
-impl AbsoluteValueFactory {
-    fn now(self, val: u64) -> (PageserverConsumptionMetricsKey, (EventType, u64)) {
-        let key = self.0;
-        let time = Utc::now();
-        (key, (EventType::Absolute { time }, val))
-    }
-}
-
-/// Helper type which each individual metric kind can return to produce only incremental values.
-struct IncrementalValueFactory(PageserverConsumptionMetricsKey);
-
-impl IncrementalValueFactory {
-    #[allow(clippy::wrong_self_convention)]
-    fn from_previous_up_to(
-        self,
-        prev_end: DateTime<Utc>,
-        up_to: DateTime<Utc>,
-        val: u64,
-    ) -> (PageserverConsumptionMetricsKey, (EventType, u64)) {
-        let key = self.0;
-        // cannot assert prev_end < up_to because these are realtime clock based
-        (
-            key,
-            (
-                EventType::Incremental {
-                    start_time: prev_end,
-                    stop_time: up_to,
-                },
-                val,
-            ),
-        )
-    }
-
-    fn key(&self) -> &PageserverConsumptionMetricsKey {
-        &self.0
-    }
-}
-
-// the static part of a PageserverConsumptionMetricsKey
-impl PageserverConsumptionMetricsKey {
-    const fn written_size(tenant_id: TenantId, timeline_id: TimelineId) -> AbsoluteValueFactory {
-        PageserverConsumptionMetricsKey {
-            tenant_id,
-            timeline_id: Some(timeline_id),
-            metric: "written_size",
-        }
-        .absolute_values()
-    }
-
-    /// Values will be the difference of the latest written_size (last_record_lsn) to what we
-    /// previously sent.
-    const fn written_size_delta(
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-    ) -> IncrementalValueFactory {
-        PageserverConsumptionMetricsKey {
-            tenant_id,
-            timeline_id: Some(timeline_id),
-            metric: "written_size_bytes_delta",
-        }
-        .incremental_values()
-    }
-
-    const fn timeline_logical_size(
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-    ) -> AbsoluteValueFactory {
-        PageserverConsumptionMetricsKey {
-            tenant_id,
-            timeline_id: Some(timeline_id),
-            metric: "timeline_logical_size",
-        }
-        .absolute_values()
-    }
-
-    const fn remote_storage_size(tenant_id: TenantId) -> AbsoluteValueFactory {
-        PageserverConsumptionMetricsKey {
-            tenant_id,
-            timeline_id: None,
-            metric: "remote_storage_size",
-        }
-        .absolute_values()
-    }
-
-    const fn resident_size(tenant_id: TenantId) -> AbsoluteValueFactory {
-        PageserverConsumptionMetricsKey {
-            tenant_id,
-            timeline_id: None,
-            metric: "resident_size",
-        }
-        .absolute_values()
-    }
-
-    const fn synthetic_size(tenant_id: TenantId) -> AbsoluteValueFactory {
-        PageserverConsumptionMetricsKey {
-            tenant_id,
-            timeline_id: None,
-            metric: "synthetic_storage_size",
-        }
-        .absolute_values()
-    }
-}
-
 /// Main thread that serves metrics collection
 pub async fn collect_metrics(
    metric_collection_endpoint: &Url,
@@ -188,7 +79,7 @@ pub async fn collect_metrics(
        .timeout(DEFAULT_HTTP_REPORTING_TIMEOUT)
        .build()
        .expect("Failed to create http client with timeout");
-    let mut cached_metrics = HashMap::new();
+    let mut cached_metrics: HashMap<PageserverConsumptionMetricsKey, u64> = HashMap::new();
    let mut prev_iteration_time: std::time::Instant = std::time::Instant::now();

    loop {
@@ -230,13 +121,13 @@ pub async fn collect_metrics(
 /// - refactor this function (chunking+sending part) to reuse it in proxy module;
 pub async fn collect_metrics_iteration(
    client: &reqwest::Client,
-    cached_metrics: &mut HashMap<PageserverConsumptionMetricsKey, (EventType, u64)>,
+    cached_metrics: &mut HashMap<PageserverConsumptionMetricsKey, u64>,
    metric_collection_endpoint: &reqwest::Url,
    node_id: NodeId,
    ctx: &RequestContext,
    send_cached: bool,
 ) {
-    let mut current_metrics: Vec<(PageserverConsumptionMetricsKey, (EventType, u64))> = Vec::new();
+    let mut current_metrics: Vec<(PageserverConsumptionMetricsKey, u64)> = Vec::new();
    trace!(
        "starting collect_metrics_iteration. metric_collection_endpoint: {}",
        metric_collection_endpoint
@@ -275,80 +166,27 @@ pub async fn collect_metrics_iteration(
            if timeline.is_active() {
                let timeline_written_size = u64::from(timeline.get_last_record_lsn());

-                let (key, written_size_now) =
-                    PageserverConsumptionMetricsKey::written_size(tenant_id, timeline.timeline_id)
-                        .now(timeline_written_size);
-
-                // last_record_lsn can only go up, right now at least, TODO: #2592 or related
-                // features might change this.
-
-                let written_size_delta_key = PageserverConsumptionMetricsKey::written_size_delta(
-                    tenant_id,
-                    timeline.timeline_id,
-                );
-
-                // use this when available, because in a stream of incremental values, it will be
-                // accurate where as when last_record_lsn stops moving, we will only cache the last
-                // one of those.
-                let last_stop_time =
-                    cached_metrics
-                        .get(written_size_delta_key.key())
-                        .map(|(until, _val)| {
-                            until
-                                .incremental_timerange()
-                                .expect("never create EventType::Absolute for written_size_delta")
-                                .end
-                        });
-
-                // by default, use the last sent written_size as the basis for
-                // calculating the delta. if we don't yet have one, use the load time value.
-                let prev = cached_metrics
-                    .get(&key)
-                    .map(|(prev_at, prev)| {
-                        // use the prev time from our last incremental update, or default to latest
-                        // absolute update on the first round.
-                        let prev_at = prev_at
-                            .absolute_time()
-                            .expect("never create EventType::Incremental for written_size");
-                        let prev_at = last_stop_time.unwrap_or(prev_at);
-                        (*prev_at, *prev)
-                    })
-                    .unwrap_or_else(|| {
-                        // if we don't have a previous point of comparison, compare to the load time
-                        // lsn.
-                        let (disk_consistent_lsn, loaded_at) = &timeline.loaded_at;
-                        (DateTime::from(*loaded_at), disk_consistent_lsn.0)
-                    });
-
-                // written_size_delta_bytes
-                current_metrics.extend(
-                    if let Some(delta) = written_size_now.1.checked_sub(prev.1) {
-                        let up_to = written_size_now
-                            .0
-                            .absolute_time()
-                            .expect("never create EventType::Incremental for written_size");
-                        let key_value =
-                            written_size_delta_key.from_previous_up_to(prev.0, *up_to, delta);
-                        Some(key_value)
-                    } else {
-                        None
+                current_metrics.push((
+                    PageserverConsumptionMetricsKey {
+                        tenant_id,
+                        timeline_id: Some(timeline.timeline_id),
+                        metric: WRITTEN_SIZE,
                    },
-                );
-
-                // written_size
-                current_metrics.push((key, written_size_now));
+                    timeline_written_size,
+                ));

                let span = info_span!("collect_metrics_iteration", tenant_id = %timeline.tenant_id, timeline_id = %timeline.timeline_id);
                match span.in_scope(|| timeline.get_current_logical_size(ctx)) {
                    // Only send timeline logical size when it is fully calculated.
                    Ok((size, is_exact)) if is_exact => {
-                        current_metrics.push(
-                            PageserverConsumptionMetricsKey::timeline_logical_size(
+                        current_metrics.push((
+                            PageserverConsumptionMetricsKey {
                                tenant_id,
-                                timeline.timeline_id,
-                            )
-                            .now(size),
-                        );
+                                timeline_id: Some(timeline.timeline_id),
+                                metric: TIMELINE_LOGICAL_SIZE,
+                            },
+                            size,
+                        ));
                    }
                    Ok((_, _)) => {}
                    Err(err) => {
@@ -367,10 +205,14 @@ pub async fn collect_metrics_iteration(

        match tenant.get_remote_size().await {
            Ok(tenant_remote_size) => {
-                current_metrics.push(
-                    PageserverConsumptionMetricsKey::remote_storage_size(tenant_id)
-                        .now(tenant_remote_size),
-                );
+                current_metrics.push((
+                    PageserverConsumptionMetricsKey {
+                        tenant_id,
+                        timeline_id: None,
+                        metric: REMOTE_STORAGE_SIZE,
+                    },
+                    tenant_remote_size,
+                ));
            }
            Err(err) => {
                error!(
@@ -380,9 +222,14 @@ pub async fn collect_metrics_iteration(
            }
        }

-        current_metrics.push(
-            PageserverConsumptionMetricsKey::resident_size(tenant_id).now(tenant_resident_size),
-        );
+        current_metrics.push((
+            PageserverConsumptionMetricsKey {
+                tenant_id,
+                timeline_id: None,
+                metric: RESIDENT_SIZE,
+            },
+            tenant_resident_size,
+        ));

        // Note that this metric is calculated in a separate bgworker
        // Here we only use cached value, which may lag behind the real latest one
@@ -390,27 +237,23 @@ pub async fn collect_metrics_iteration(

        if tenant_synthetic_size != 0 {
            // only send non-zeroes because otherwise these show up as errors in logs
-            current_metrics.push(
-                PageserverConsumptionMetricsKey::synthetic_size(tenant_id)
-                    .now(tenant_synthetic_size),
-            );
+            current_metrics.push((
+                PageserverConsumptionMetricsKey {
+                    tenant_id,
+                    timeline_id: None,
+                    metric: SYNTHETIC_STORAGE_SIZE,
+                },
+                tenant_synthetic_size,
+            ));
        }
    }

    // Filter metrics, unless we want to send all metrics, including cached ones.
    // See: https://github.com/neondatabase/neon/issues/3485
    if !send_cached {
-        current_metrics.retain(|(curr_key, (kind, curr_val))| {
-            if kind.is_incremental() {
-                // incremental values (currently only written_size_delta) should not get any cache
-                // deduplication because they will be used by upstream for "is still alive."
-                true
-            } else {
-                match cached_metrics.get(curr_key) {
-                    Some((_, val)) => val != curr_val,
-                    None => true,
-                }
-            }
+        current_metrics.retain(|(curr_key, curr_val)| match cached_metrics.get(curr_key) {
+            Some(val) => val != curr_val,
+            None => true,
        });
    }

@@ -429,8 +272,8 @@ pub async fn collect_metrics_iteration(
        chunk_to_send.clear();

        // enrich metrics with type,timestamp and idempotency key before sending
-        chunk_to_send.extend(chunk.iter().map(|(curr_key, (when, curr_val))| Event {
-            kind: *when,
+        chunk_to_send.extend(chunk.iter().map(|(curr_key, curr_val)| Event {
+            kind: EventType::Absolute { time: Utc::now() },
            metric: curr_key.metric,
            idempotency_key: idempotency_key(node_id.to_string()),
            value: *curr_val,
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -545,12 +545,12 @@ async fn collect_eviction_candidates(
        // We could be better here, e.g., sum of all L0 layers + most recent L1 layer.
        // That's what's typically used by the various background loops.
        //
-        // The default can be overridden with a fixed value in the tenant conf.
+        // The default can be overriden with a fixed value in the tenant conf.
        // A default override can be put in the default tenant conf in the pageserver.toml.
        let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() {
            debug!(
                tenant_id=%tenant.tenant_id(),
-                overridden_size=s,
+                overriden_size=s,
                "using overridden min resident size for tenant"
            );
            s
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -994,29 +994,31 @@ async fn timeline_gc_handler(
 // Run compaction immediately on given timeline.
 //
 // Parses the `tenant_id` and `timeline_id` request parameters, checks permission for that
 // tenant, runs compaction and replies `200 OK` with an empty JSON body.
 //
 // NOTE(review): diff hunk. The `-` side compacts inline via `timeline.compact(&cancel, &ctx)`
 // inside a `manual_compaction` span. The `+` side delegates to `mgr::immediate_compact`, waits
 // on the receiver it returns, and no longer uses the cancellation token (hence `_cancel`).
 async fn timeline_compact_handler(
    request: Request<Body>,
-    cancel: CancellationToken,
+    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_id))?;

-    async {
-        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-        let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
-        timeline
-            .compact(&cancel, &ctx)
-            .await
-            .map_err(ApiError::InternalServerError)?;
-        json_response(StatusCode::OK, ())
-    }
-    .instrument(info_span!("manual_compaction", %tenant_id, %timeline_id))
-    .await
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
    // A failure of `immediate_compact` itself is reported as an internal server error.
+    let result_receiver = mgr::immediate_compact(tenant_id, timeline_id, &ctx)
+        .await
+        .context("spawn compaction task")
+        .map_err(ApiError::InternalServerError)?;
+
    // Two layers of failure: the first `?` covers receiving a result at all, the second covers
    // the `anyhow::Result` that the compaction itself produced.
+    let result: anyhow::Result<()> = result_receiver
+        .await
+        .context("receive compaction result")
+        .map_err(ApiError::InternalServerError)?;
+    result.map_err(ApiError::InternalServerError)?;
+
+    json_response(StatusCode::OK, ())
 }

 // Run checkpoint immediately on given timeline.
 async fn timeline_checkpoint_handler(
    request: Request<Body>,
-    cancel: CancellationToken,
+    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
@@ -1029,13 +1031,13 @@ async fn timeline_checkpoint_handler(
            .await
            .map_err(ApiError::InternalServerError)?;
        timeline
-            .compact(&cancel, &ctx)
+            .compact(&ctx)
            .await
            .map_err(ApiError::InternalServerError)?;

        json_response(StatusCode::OK, ())
    }
-    .instrument(info_span!("manual_checkpoint", %tenant_id, %timeline_id))
+    .instrument(info_span!("manual_checkpoint", tenant_id = %tenant_id, timeline_id = %timeline_id))
    .await
 }

--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -109,8 +109,6 @@ pub const TEMP_FILE_SUFFIX: &str = "___temp";
 /// Full path: `tenants/<tenant_id>/timelines/<timeline_id>___uninit`.
 pub const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";

-pub const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete";
-
 /// A marker file to prevent pageserver from loading a certain tenant on restart.
 /// Different from [`TIMELINE_UNINIT_MARK_SUFFIX`] due to semantics of the corresponding
 /// `ignore` management API command, that expects the ignored tenant to be properly loaded
@@ -125,30 +123,15 @@ pub fn is_temporary(path: &Path) -> bool {
    }
 }

-fn ends_with_suffix(path: &Path, suffix: &str) -> bool {
+pub fn is_uninit_mark(path: &Path) -> bool {
    // True when the final path component ends with the suffix being checked
    // (`TIMELINE_UNINIT_MARK_SUFFIX` on the `+` side); a path without a file name yields false.
    // The name is compared after a lossy UTF-8 conversion.
    match path.file_name() {
-        Some(name) => name.to_string_lossy().ends_with(suffix),
+        Some(name) => name
+            .to_string_lossy()
+            .ends_with(TIMELINE_UNINIT_MARK_SUFFIX),
        None => false,
    }
 }

-pub fn is_uninit_mark(path: &Path) -> bool {
-    ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX)
-}
-
-pub fn is_delete_mark(path: &Path) -> bool {
-    ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX)
-}
-
-fn is_walkdir_io_not_found(e: &walkdir::Error) -> bool {
-    if let Some(e) = e.io_error() {
-        if e.kind() == std::io::ErrorKind::NotFound {
-            return true;
-        }
-    }
-    false
-}
-
 /// During pageserver startup, we need to order operations not to exhaust tokio worker threads by
 /// blocking.
 ///
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -6,6 +6,7 @@ use metrics::{
    IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
 };
 use once_cell::sync::Lazy;
+use pageserver_api::models::TenantState;
 use strum::VariantNames;
 use strum_macros::{EnumVariantNames, IntoStaticStr};
 use utils::id::{TenantId, TimelineId};
@@ -73,7 +74,7 @@ pub static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
 // Buckets for background operations like compaction, GC, size calculation
 const STORAGE_OP_BUCKETS: &[f64] = &[0.010, 0.100, 1.0, 10.0, 100.0, 1000.0];

-pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
+pub static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_storage_operations_seconds_global",
        "Time spent on storage operations",
@@ -83,17 +84,18 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static READ_NUM_FS_LAYERS: Lazy<Histogram> = Lazy::new(|| {
-    register_histogram!(
+static READ_NUM_FS_LAYERS: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
        "pageserver_read_num_fs_layers",
        "Number of persistent layers accessed for processing a read request, including those in the cache",
        // One series per (tenant_id, timeline_id): `TimelineMetrics::new` looks it up with these
        // two label values and `Drop for TimelineMetrics` removes it again.
+        &["tenant_id", "timeline_id"],
        vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 20.0, 50.0, 100.0],
    )
    .expect("failed to define a metric")
 });

 // Metrics collected on operations on the storage repository.
-pub(crate) static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
+pub static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_getpage_reconstruct_seconds",
        "Time spent in reconstruct_value (reconstruct a page from deltas)",
@@ -102,7 +104,7 @@ pub(crate) static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
+pub static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_materialized_cache_hits_direct_total",
        "Number of cache hits from materialized page cache without redo",
@@ -110,16 +112,17 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::n
    .expect("failed to define a metric")
 });

-pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy<Histogram> = Lazy::new(|| {
-    register_histogram!(
+static GET_RECONSTRUCT_DATA_TIME: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
        "pageserver_getpage_get_reconstruct_data_seconds",
        "Time spent in get_reconstruct_value_data",
        // One series per (tenant_id, timeline_id): `TimelineMetrics::new` stores it as
        // `get_reconstruct_data_time_histo` and `Drop for TimelineMetrics` removes it again.
+        &["tenant_id", "timeline_id"],
        CRITICAL_OP_BUCKETS.into(),
    )
    .expect("failed to define a metric")
 });

-pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
+pub static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_materialized_cache_hits_total",
        "Number of cache hits from materialized page cache",
@@ -243,10 +246,11 @@ pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| PageCacheS
    },
 });

-pub(crate) static WAIT_LSN_TIME: Lazy<Histogram> = Lazy::new(|| {
-    register_histogram!(
+static WAIT_LSN_TIME: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
        "pageserver_wait_lsn_seconds",
        "Time spent waiting for WAL to arrive",
+        &["tenant_id", "timeline_id"],
        CRITICAL_OP_BUCKETS.into(),
    )
    .expect("failed to define a metric")
@@ -280,7 +284,7 @@ static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::new(|| {
+pub static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_remote_ondemand_downloaded_layers_total",
        "Total on-demand downloaded layers"
@@ -288,7 +292,7 @@ pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::ne
    .unwrap()
 });

-pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_BYTES: Lazy<IntCounter> = Lazy::new(|| {
+pub static REMOTE_ONDEMAND_DOWNLOADED_BYTES: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_remote_ondemand_downloaded_bytes_total",
        "Total bytes of layers on-demand downloaded",
@@ -305,29 +309,16 @@ static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    .expect("failed to define current logical size metric")
 });

-pub(crate) static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
+pub static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_tenant_states_count",
        "Count of tenants per state",
        // NOTE(review): with the `tenant_id` label on the `+` side there is one series per
        // (tenant, state); `remove_tenant_metrics` removes them for every entry of
        // `TenantState::VARIANTS`. The help text above still reads as a per-state count.
-        &["state"]
+        &["tenant_id", "state"]
    )
    .expect("Failed to register pageserver_tenant_states_count metric")
 });

-/// A set of broken tenants.
-///
-/// These are expected to be so rare that a set is fine. Set as in a new timeseries per each broken
-/// tenant.
-pub(crate) static BROKEN_TENANTS_SET: Lazy<UIntGaugeVec> = Lazy::new(|| {
-    register_uint_gauge_vec!(
-        "pageserver_broken_tenants_count",
-        "Set of broken tenants",
-        &["tenant_id"]
-    )
-    .expect("Failed to register pageserver_tenant_states_count metric")
-});
-
-pub(crate) static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
+pub static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_tenant_synthetic_cached_size_bytes",
        "Synthetic size of each tenant in bytes",
@@ -385,7 +376,7 @@ static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy<IntCounterVec> = Lazy::new(||
    .expect("failed to define a metric")
 });

-pub(crate) static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(|| {
+pub static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_unexpected_ondemand_downloads_count",
        "Number of unexpected on-demand downloads. \
@@ -508,31 +499,23 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
    30.000,   // 30000 ms
 ];

-/// Tracks time taken by fs operations near VirtualFile.
-///
-/// Operations:
-/// - open ([`std::fs::OpenOptions::open`])
-/// - close (dropping [`std::fs::File`])
-/// - close-by-replace (close by replacement algorithm)
-/// - read (`read_at`)
-/// - write (`write_at`)
-/// - seek (modify internal position or file length query)
-/// - fsync ([`std::fs::File::sync_all`])
-/// - metadata ([`std::fs::File::metadata`])
-pub(crate) static STORAGE_IO_TIME: Lazy<HistogramVec> = Lazy::new(|| {
 // `operation` label values of STORAGE_IO_TIME. `Drop for TimelineMetrics` iterates this list to
 // remove the per-timeline series, so a series recorded under a name missing here is not
 // removed by that loop.
 // NOTE(review): the removed doc comment lists `close-by-replace`, which is absent here, and this
 // list contains `gc`, which that doc comment did not mention -- confirm against the call sites.
+const STORAGE_IO_TIME_OPERATIONS: &[&str] = &[
+    "open", "close", "read", "write", "seek", "fsync", "gc", "metadata",
+];
+
 // Operation names used by `Drop for TimelineMetrics` when removing STORAGE_IO_SIZE series.
+const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"];
+
+pub static STORAGE_IO_TIME: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_io_operations_seconds",
        "Time spent in IO operations",
        // Label order matters: the removal loop passes `[op, tenant_id, timeline_id]`.
-        &["operation"],
+        &["operation", "tenant_id", "timeline_id"],
        STORAGE_IO_TIME_BUCKETS.into()
    )
    .expect("failed to define a metric")
 });

-const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"];
-
-// Needed for the https://neonprod.grafana.net/d/5uK9tHL4k/picking-tenant-for-relocation?orgId=1
-pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
+pub static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
    register_int_gauge_vec!(
        "pageserver_io_operations_bytes_total",
        "Total amount of bytes read/written in IO operations",
@@ -622,7 +605,7 @@ static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy<HistogramVec> = Lazy::new
         at a given instant. It gives you a better idea of the queue depth \
         than plotting the gauge directly, since operations may complete faster \
         than the sampling interval.",
-        &["file_kind", "op_kind"],
+        &["tenant_id", "timeline_id", "file_kind", "op_kind"],
        // The calls_unfinished gauge is an integer gauge, hence we have integer buckets.
        vec![0.0, 1.0, 2.0, 4.0, 6.0, 8.0, 10.0, 15.0, 20.0, 40.0, 60.0, 80.0, 100.0, 500.0],
    )
@@ -679,18 +662,18 @@ impl RemoteOpFileKind {
    }
 }

-pub(crate) static REMOTE_OPERATION_TIME: Lazy<HistogramVec> = Lazy::new(|| {
+pub static REMOTE_OPERATION_TIME: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_remote_operation_seconds",
        "Time spent on remote storage operations. \
        Grouped by tenant, timeline, operation_kind and status. \
        Does not account for time spent waiting in remote timeline client's queues.",
        // Label order must match the callers: on the `+` side
        // `RemoteTimelineClientMetrics::remote_operation_time` passes
        // tenant_id, timeline_id, file_kind, op_kind, status in exactly this order.
-        &["file_kind", "op_kind", "status"]
+        &["tenant_id", "timeline_id", "file_kind", "op_kind", "status"]
    )
    .expect("failed to define a metric")
 });

-pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
+pub static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_tenant_task_events",
        "Number of task start/stop/fail events.",
@@ -699,7 +682,7 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
    .expect("Failed to register tenant_task_events metric")
 });

-pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
+pub static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_background_loop_period_overrun_count",
        "Incremented whenever warn_when_period_overrun() logs a warning.",
@@ -710,7 +693,7 @@ pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = La

 // walreceiver metrics

-pub(crate) static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(|| {
+pub static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_walreceiver_started_connections_total",
        "Number of started walreceiver connections"
@@ -718,7 +701,7 @@ pub(crate) static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(
    .expect("failed to define a metric")
 });

-pub(crate) static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
+pub static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
    register_int_gauge!(
        "pageserver_walreceiver_active_managers",
        "Number of active walreceiver managers"
@@ -726,7 +709,7 @@ pub(crate) static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static WALRECEIVER_SWITCHES: Lazy<IntCounterVec> = Lazy::new(|| {
+pub static WALRECEIVER_SWITCHES: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_walreceiver_switches_total",
        "Number of walreceiver manager change_connection calls",
@@ -735,7 +718,7 @@ pub(crate) static WALRECEIVER_SWITCHES: Lazy<IntCounterVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
+pub static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_walreceiver_broker_updates_total",
        "Number of received broker updates in walreceiver"
@@ -743,7 +726,7 @@ pub(crate) static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
+pub static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_walreceiver_candidates_events_total",
        "Number of walreceiver candidate events",
@@ -752,10 +735,10 @@ pub(crate) static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new
    .expect("failed to define a metric")
 });

-pub(crate) static WALRECEIVER_CANDIDATES_ADDED: Lazy<IntCounter> =
+pub static WALRECEIVER_CANDIDATES_ADDED: Lazy<IntCounter> =
    // The child counter of WALRECEIVER_CANDIDATES_EVENTS for the label value "add".
    Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["add"]));

-pub(crate) static WALRECEIVER_CANDIDATES_REMOVED: Lazy<IntCounter> =
+pub static WALRECEIVER_CANDIDATES_REMOVED: Lazy<IntCounter> =
    // The child counter of WALRECEIVER_CANDIDATES_EVENTS for the label value "remove".
    Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["remove"]));

 // Metrics collected on WAL redo operations
@@ -802,7 +785,7 @@ macro_rules! redo_bytes_histogram_count_buckets {
    };
 }

-pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
+pub static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_wal_redo_seconds",
        "Time spent on WAL redo",
@@ -811,7 +794,7 @@ pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
+pub static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_wal_redo_wait_seconds",
        "Time spent waiting for access to the Postgres WAL redo process",
@@ -820,7 +803,7 @@ pub(crate) static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
+pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_wal_redo_records_histogram",
        "Histogram of number of records replayed per redo in the Postgres WAL redo process",
@@ -829,7 +812,7 @@ pub(crate) static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
+pub static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_wal_redo_bytes_histogram",
        "Histogram of number of records replayed per redo sent to Postgres",
@@ -838,8 +821,7 @@ pub(crate) static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-// FIXME: isn't this already included by WAL_REDO_RECORDS_HISTOGRAM which has _count?
-pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
+pub static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_replayed_wal_records_total",
        "Number of WAL records replayed in WAL redo process"
@@ -915,6 +897,7 @@ impl StorageTimeMetrics {
 pub struct TimelineMetrics {
    tenant_id: String,
    timeline_id: String,
+    pub get_reconstruct_data_time_histo: Histogram,
    pub flush_time_histo: StorageTimeMetrics,
    pub compact_time_histo: StorageTimeMetrics,
    pub create_images_time_histo: StorageTimeMetrics,
@@ -923,7 +906,9 @@ pub struct TimelineMetrics {
    pub load_layer_map_histo: StorageTimeMetrics,
    pub garbage_collect_histo: StorageTimeMetrics,
    pub last_record_gauge: IntGauge,
+    pub wait_lsn_time_histo: Histogram,
    pub resident_physical_size_gauge: UIntGauge,
+    pub read_num_fs_layers: Histogram,
    /// copy of LayeredTimeline.current_logical_size
    pub current_logical_size_gauge: UIntGauge,
    pub num_persistent_files_created: IntCounter,
@@ -940,6 +925,9 @@ impl TimelineMetrics {
    ) -> Self {
        let tenant_id = tenant_id.to_string();
        let timeline_id = timeline_id.to_string();
+        let get_reconstruct_data_time_histo = GET_RECONSTRUCT_DATA_TIME
+            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
+            .unwrap();
        let flush_time_histo =
            StorageTimeMetrics::new(StorageTimeOperation::LayerFlush, &tenant_id, &timeline_id);
        let compact_time_histo =
@@ -960,6 +948,9 @@ impl TimelineMetrics {
        let last_record_gauge = LAST_RECORD_LSN
            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
+        let wait_lsn_time_histo = WAIT_LSN_TIME
+            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
+            .unwrap();
        let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
@@ -975,12 +966,16 @@ impl TimelineMetrics {
        let evictions = EVICTIONS
            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
+        let read_num_fs_layers = READ_NUM_FS_LAYERS
+            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
+            .unwrap();
        let evictions_with_low_residence_duration =
            evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id);

        TimelineMetrics {
            tenant_id,
            timeline_id,
+            get_reconstruct_data_time_histo,
            flush_time_histo,
            compact_time_histo,
            create_images_time_histo,
@@ -989,6 +984,7 @@ impl TimelineMetrics {
            garbage_collect_histo,
            load_layer_map_histo,
            last_record_gauge,
+            wait_lsn_time_histo,
            resident_physical_size_gauge,
            current_logical_size_gauge,
            num_persistent_files_created,
@@ -997,6 +993,7 @@ impl TimelineMetrics {
            evictions_with_low_residence_duration: std::sync::RwLock::new(
                evictions_with_low_residence_duration,
            ),
+            read_num_fs_layers,
        }
    }
 }
@@ -1005,12 +1002,15 @@ impl Drop for TimelineMetrics {
    fn drop(&mut self) {
        let tenant_id = &self.tenant_id;
        let timeline_id = &self.timeline_id;
+        let _ = GET_RECONSTRUCT_DATA_TIME.remove_label_values(&[tenant_id, timeline_id]);
        let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
+        let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]);
        let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
        let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
        let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
        let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
        let _ = EVICTIONS.remove_label_values(&[tenant_id, timeline_id]);
+        let _ = READ_NUM_FS_LAYERS.remove_label_values(&[tenant_id, timeline_id]);

        self.evictions_with_low_residence_duration
            .write()
@@ -1022,6 +1022,9 @@ impl Drop for TimelineMetrics {
            let _ =
                STORAGE_TIME_COUNT_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]);
        }
+        for op in STORAGE_IO_TIME_OPERATIONS {
+            let _ = STORAGE_IO_TIME.remove_label_values(&[op, tenant_id, timeline_id]);
+        }

        for op in STORAGE_IO_SIZE_OPERATIONS {
            let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, timeline_id]);
@@ -1036,7 +1039,9 @@ impl Drop for TimelineMetrics {
 // Removes the metric series labelled with `tenant_id`: the synthetic-size gauge and, on the
 // `+` side, one TENANT_STATE_METRIC series per `TenantState::VARIANTS` entry.
 // The result of every `remove_label_values` call is deliberately discarded (`let _ =`).
 pub fn remove_tenant_metrics(tenant_id: &TenantId) {
    let tid = tenant_id.to_string();
    let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
-    // we leave the BROKEN_TENANTS_SET entry if any
+    for state in TenantState::VARIANTS {
+        let _ = TENANT_STATE_METRIC.remove_label_values(&[&tid, state]);
+    }
 }

 use futures::Future;
@@ -1051,7 +1056,9 @@ pub struct RemoteTimelineClientMetrics {
    tenant_id: String,
    timeline_id: String,
    remote_physical_size_gauge: Mutex<Option<UIntGauge>>,
+    remote_operation_time: Mutex<HashMap<(&'static str, &'static str, &'static str), Histogram>>,
    calls_unfinished_gauge: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
+    calls_started_hist: Mutex<HashMap<(&'static str, &'static str), Histogram>>,
    bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
    bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
 }
@@ -1061,13 +1068,14 @@ impl RemoteTimelineClientMetrics {
        RemoteTimelineClientMetrics {
            tenant_id: tenant_id.to_string(),
            timeline_id: timeline_id.to_string(),
+            remote_operation_time: Mutex::new(HashMap::default()),
            calls_unfinished_gauge: Mutex::new(HashMap::default()),
+            calls_started_hist: Mutex::new(HashMap::default()),
            bytes_started_counter: Mutex::new(HashMap::default()),
            bytes_finished_counter: Mutex::new(HashMap::default()),
            remote_physical_size_gauge: Mutex::new(None),
        }
    }
-
    pub fn remote_physical_size_gauge(&self) -> UIntGauge {
        let mut guard = self.remote_physical_size_gauge.lock().unwrap();
        guard
@@ -1081,17 +1089,26 @@ impl RemoteTimelineClientMetrics {
            })
            .clone()
    }
-
    /// Returns the REMOTE_OPERATION_TIME histogram for `(file_kind, op_kind, status)`.
    ///
    /// NOTE(review): diff hunk. The `-` side looks the histogram up globally with three labels.
    /// The `+` side adds this client's tenant and timeline ids as labels and caches the handle
    /// in `self.remote_operation_time`, keyed by the same triple; the `Drop` impl drains that
    /// map to remove the series.
    ///
    /// Panics if the cache mutex is poisoned or if the label lookup fails (both are `unwrap`ped).
    pub fn remote_operation_time(
        &self,
        file_kind: &RemoteOpFileKind,
        op_kind: &RemoteOpKind,
        status: &'static str,
    ) -> Histogram {
+        let mut guard = self.remote_operation_time.lock().unwrap();
        let key = (file_kind.as_str(), op_kind.as_str(), status);
-        REMOTE_OPERATION_TIME
-            .get_metric_with_label_values(&[key.0, key.1, key.2])
-            .unwrap()
        // Only the first call for a given key reaches the registry; later calls clone the cached
        // handle. `tenant_id` / `timeline_id` are already `String`s, so `to_string()` copies them.
+        let metric = guard.entry(key).or_insert_with(move || {
+            REMOTE_OPERATION_TIME
+                .get_metric_with_label_values(&[
+                    &self.tenant_id.to_string(),
+                    &self.timeline_id.to_string(),
+                    key.0,
+                    key.1,
+                    key.2,
+                ])
+                .unwrap()
+        });
+        metric.clone()
    }

    fn calls_unfinished_gauge(
@@ -1119,10 +1136,19 @@ impl RemoteTimelineClientMetrics {
        file_kind: &RemoteOpFileKind,
        op_kind: &RemoteOpKind,
    ) -> Histogram {
+        let mut guard = self.calls_started_hist.lock().unwrap();
        let key = (file_kind.as_str(), op_kind.as_str());
-        REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST
-            .get_metric_with_label_values(&[key.0, key.1])
-            .unwrap()
+        let metric = guard.entry(key).or_insert_with(move || {
+            REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST
+                .get_metric_with_label_values(&[
+                    &self.tenant_id.to_string(),
+                    &self.timeline_id.to_string(),
+                    key.0,
+                    key.1,
+                ])
+                .unwrap()
+        });
+        metric.clone()
    }

    fn bytes_started_counter(
@@ -1302,10 +1328,15 @@ impl Drop for RemoteTimelineClientMetrics {
            tenant_id,
            timeline_id,
            remote_physical_size_gauge,
+            remote_operation_time,
            calls_unfinished_gauge,
+            calls_started_hist,
            bytes_started_counter,
            bytes_finished_counter,
        } = self;
+        for ((a, b, c), _) in remote_operation_time.get_mut().unwrap().drain() {
+            let _ = REMOTE_OPERATION_TIME.remove_label_values(&[tenant_id, timeline_id, a, b, c]);
+        }
        for ((a, b), _) in calls_unfinished_gauge.get_mut().unwrap().drain() {
            let _ = REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE.remove_label_values(&[
                tenant_id,
@@ -1314,6 +1345,14 @@ impl Drop for RemoteTimelineClientMetrics {
                b,
            ]);
        }
+        for ((a, b), _) in calls_started_hist.get_mut().unwrap().drain() {
+            let _ = REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST.remove_label_values(&[
+                tenant_id,
+                timeline_id,
+                a,
+                b,
+            ]);
+        }
        for ((a, b), _) in bytes_started_counter.get_mut().unwrap().drain() {
            let _ = REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER.remove_label_values(&[
                tenant_id,
@@ -1395,51 +1434,15 @@ impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
 }

 /// Touches selected metrics so that they exist before their first real use.
 ///
 /// NOTE(review): diff hunk. The `-` side forces a list of counters, one counter vec, one gauge
 /// and a list of histograms. The `+` side only touches UNEXPECTED_ONDEMAND_DOWNLOADS,
 /// BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT and the two materialized-page-cache hit counters.
 ///
 /// On the `+` side this panics if UNEXPECTED_ONDEMAND_DOWNLOADS is already non-zero.
 pub fn preinitialize_metrics() {
-    // Python tests need these and on some we do alerting.
-    //
-    // FIXME(4813): make it so that we have no top level metrics as this fn will easily fall out of
-    // order:
-    // - global metrics reside in a Lazy<PageserverMetrics>
-    //   - access via crate::metrics::PS_METRICS.materialized_page_cache_hit.inc()
-    // - could move the statics into TimelineMetrics::new()?
+    // We want to alert on this metric increasing.
+    // Initialize it eagerly, so that our alert rule can distinguish absence of the metric from metric value 0.
+    assert_eq!(UNEXPECTED_ONDEMAND_DOWNLOADS.get(), 0);
+    UNEXPECTED_ONDEMAND_DOWNLOADS.reset();

-    // counters
-    [
-        &MATERIALIZED_PAGE_CACHE_HIT,
-        &MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
-        &UNEXPECTED_ONDEMAND_DOWNLOADS,
-        &WALRECEIVER_STARTED_CONNECTIONS,
-        &WALRECEIVER_BROKER_UPDATES,
-        &WALRECEIVER_CANDIDATES_ADDED,
-        &WALRECEIVER_CANDIDATES_REMOVED,
-    ]
-    .into_iter()
-    .for_each(|c| {
-        Lazy::force(c);
-    });
+    // Same as above for this metric, but, it's a Vec-type metric for which we don't know all the labels.
+    BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT.reset();

-    // countervecs
-    [&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
-        .into_iter()
-        .for_each(|c| {
-            Lazy::force(c);
-        });
-
-    // gauges
-    WALRECEIVER_ACTIVE_MANAGERS.get();
-
-    // histograms
-    [
-        &READ_NUM_FS_LAYERS,
-        &RECONSTRUCT_TIME,
-        &WAIT_LSN_TIME,
-        &WAL_REDO_TIME,
-        &WAL_REDO_WAIT_TIME,
-        &WAL_REDO_RECORDS_HISTOGRAM,
-        &WAL_REDO_BYTES_HISTOGRAM,
-    ]
-    .into_iter()
-    .for_each(|h| {
-        Lazy::force(h);
-    });
    // The returned values are discarded; the calls exist only to touch the two `Lazy` statics.
+    // Python tests need these.
+    MATERIALIZED_PAGE_CACHE_HIT_DIRECT.get();
+    MATERIALIZED_PAGE_CACHE_HIT.get();
 }
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -130,25 +130,11 @@ pub static WALRECEIVER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
 // Lazily built multi-threaded tokio runtime whose worker threads are named
 // "background op worker". No explicit worker-thread count is set on the builder.
 // Building happens on first access and panics with the `expect` message on failure.
 pub static BACKGROUND_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
    tokio::runtime::Builder::new_multi_thread()
        .thread_name("background op worker")
-        // if you change the number of worker threads please change the constant below
        .enable_all()
        .build()
        .expect("Failed to create background op runtime")
 });

-pub(crate) static BACKGROUND_RUNTIME_WORKER_THREADS: Lazy<usize> = Lazy::new(|| {
-    // force init and thus panics
-    let _ = BACKGROUND_RUNTIME.handle();
-    // replicates tokio-1.28.1::loom::sys::num_cpus which is not available publicly
-    // tokio would had already panicked for parsing errors or NotUnicode
-    //
-    // this will be wrong if any of the runtimes gets their worker threads configured to something
-    // else, but that has not been needed in a long time.
-    std::env::var("TOKIO_WORKER_THREADS")
-        .map(|s| s.parse::<usize>().unwrap())
-        .unwrap_or_else(|_e| usize::max(1, num_cpus::get()))
-});
-
 // Copyable newtype around a `u64`; the inner value is private to this module.
 // NOTE(review): presumably the identifier handed out per pageserver task -- confirm at the
 // construction site, which is outside this hunk.
 #[derive(Debug, Clone, Copy)]
 pub struct PageserverTaskId(u64);

@@ -559,7 +545,7 @@ pub fn current_task_id() -> Option<PageserverTaskId> {
 pub async fn shutdown_watcher() {
    let token = SHUTDOWN_TOKEN
        .try_with(|t| t.clone())
-        .expect("shutdown_watcher() called in an unexpected task or thread");
+        .expect("shutdown_requested() called in an unexpected task or thread");

    token.cancelled().await;
 }
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -18,8 +18,8 @@ use remote_storage::DownloadError;
 use remote_storage::GenericRemoteStorage;
 use storage_broker::BrokerClientChannel;
 use tokio::sync::watch;
+use tokio::sync::OwnedMutexGuard;
 use tokio::task::JoinSet;
-use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::completion;
 use utils::crashsafe::path_with_suffix_extension;
@@ -28,6 +28,7 @@ use std::cmp::min;
 use std::collections::hash_map::Entry;
 use std::collections::BTreeSet;
 use std::collections::HashMap;
+use std::ffi::OsStr;
 use std::fs;
 use std::fs::File;
 use std::fs::OpenOptions;
@@ -46,7 +47,6 @@ use std::sync::{Mutex, RwLock};
 use std::time::{Duration, Instant};

 use self::config::TenantConf;
-use self::metadata::LoadMetadataError;
 use self::metadata::TimelineMetadata;
 use self::remote_timeline_client::RemoteTimelineClient;
 use self::timeline::uninit::TimelineUninitMark;
@@ -64,12 +64,12 @@ use crate::tenant::config::TenantConfOpt;
 use crate::tenant::metadata::load_metadata;
 use crate::tenant::remote_timeline_client::index::IndexPart;
 use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart;
+use crate::tenant::remote_timeline_client::PersistIndexPartWithDeletedFlagError;
 use crate::tenant::storage_layer::DeltaLayer;
 use crate::tenant::storage_layer::ImageLayer;
 use crate::tenant::storage_layer::Layer;
 use crate::InitializationOrder;

-use crate::tenant::timeline::delete::DeleteTimelineFlow;
 use crate::tenant::timeline::uninit::cleanup_timeline_directory;
 use crate::virtual_file::VirtualFile;
 use crate::walredo::PostgresRedoManager;
@@ -265,14 +265,6 @@ pub enum GetTimelineError {
    },
 }

-#[derive(Debug, thiserror::Error)]
-pub enum LoadLocalTimelineError {
-    #[error("FailedToLoad")]
-    Load(#[source] anyhow::Error),
-    #[error("FailedToResumeDeletion")]
-    ResumeDeletion(#[source] anyhow::Error),
-}
-
 #[derive(Debug, thiserror::Error)]
 pub enum DeleteTimelineError {
    #[error("NotFound")]
@@ -326,6 +318,14 @@ impl std::fmt::Display for WaitToBecomeActiveError {
    }
 }

+struct DeletionGuard(OwnedMutexGuard<bool>);
+
+impl DeletionGuard {
+    fn is_deleted(&self) -> bool {
+        *self.0
+    }
+}
+
 #[derive(thiserror::Error, Debug)]
 pub enum CreateTimelineError {
    #[error("a timeline with the given ID already exists")]
@@ -336,16 +336,6 @@ pub enum CreateTimelineError {
    Other(#[from] anyhow::Error),
 }

-struct TenantDirectoryScan {
-    sorted_timelines_to_load: Vec<(TimelineId, TimelineMetadata)>,
-    timelines_to_resume_deletion: Vec<(TimelineId, Option<TimelineMetadata>)>,
-}
-
-enum CreateTimelineCause {
-    Load,
-    Delete,
-}
-
 impl Tenant {
    /// Yet another helper for timeline initialization.
    /// Contains the common part of `load_local_timeline` and `load_remote_timeline`.
@@ -384,7 +374,6 @@ impl Tenant {
            ancestor.clone(),
            remote_client,
            init_order,
-            CreateTimelineCause::Load,
        )?;
        let new_disk_consistent_lsn = timeline.get_disk_consistent_lsn();
        anyhow::ensure!(
@@ -813,14 +802,11 @@ impl Tenant {
        tenant
    }

-    fn scan_and_sort_timelines_dir(self: Arc<Tenant>) -> anyhow::Result<TenantDirectoryScan> {
-        let mut timelines_to_load: HashMap<TimelineId, TimelineMetadata> = HashMap::new();
-        // Note timelines_to_resume_deletion needs to be separate because it can be not sortable
-        // from the point of `tree_sort_timelines`. I e some parents can be missing because deletion
-        // completed in non topological order (for example because parent has smaller number of layer files in it)
-        let mut timelines_to_resume_deletion: Vec<(TimelineId, Option<TimelineMetadata>)> = vec![];
-
+    pub fn scan_and_sort_timelines_dir(
+        self: Arc<Tenant>,
+    ) -> anyhow::Result<Vec<(TimelineId, TimelineMetadata)>> {
        let timelines_dir = self.conf.timelines_path(&self.tenant_id);
+        let mut timelines_to_load: HashMap<TimelineId, TimelineMetadata> = HashMap::new();

        for entry in
            std::fs::read_dir(&timelines_dir).context("list timelines directory for tenant")?
@@ -848,13 +834,16 @@ impl Tenant {
                    );
                    continue;
                }
-
                let timeline_uninit_mark_file = &timeline_dir;
                info!(
                    "Found an uninit mark file {}, removing the timeline and its uninit mark",
                    timeline_uninit_mark_file.display()
                );
-                let timeline_id = TimelineId::try_from(timeline_uninit_mark_file.file_stem())
+                let timeline_id = timeline_uninit_mark_file
+                    .file_stem()
+                    .and_then(OsStr::to_str)
+                    .unwrap_or_default()
+                    .parse::<TimelineId>()
                    .with_context(|| {
                        format!(
                            "Could not parse timeline id out of the timeline uninit mark name {}",
@@ -867,47 +856,6 @@ impl Tenant {
                {
                    error!("Failed to clean up uninit marked timeline: {e:?}");
                }
-            } else if crate::is_delete_mark(&timeline_dir) {
-                // If metadata exists, load as usual, continue deletion
-                let timeline_id =
-                    TimelineId::try_from(timeline_dir.file_stem()).with_context(|| {
-                        format!(
-                            "Could not parse timeline id out of the timeline uninit mark name {}",
-                            timeline_dir.display()
-                        )
-                    })?;
-
-                match load_metadata(self.conf, &self.tenant_id, &timeline_id) {
-                    Ok(metadata) => {
-                        timelines_to_resume_deletion.push((timeline_id, Some(metadata)))
-                    }
-                    Err(e) => match &e {
-                        LoadMetadataError::Read(r) => {
-                            if r.kind() != io::ErrorKind::NotFound {
-                                return Err(anyhow::anyhow!(e)).with_context(|| {
-                                    format!("Failed to load metadata for timeline_id {timeline_id}")
-                                });
-                            }
-
-                            // If metadata doesnt exist it means that we've crashed without
-                            // completing cleanup_remaining_timeline_fs_traces in DeleteTimelineFlow.
-                            // So save timeline_id for later call to `DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`.
-                            // We cant do it here because the method is async so we'd need block_on
-                            // and here we're in spawn_blocking. cleanup_remaining_timeline_fs_traces uses fs operations
-                            // so that basically results in a cycle:
-                            // spawn_blocking
-                            // - block_on
-                            //   - spawn_blocking
-                            // which can lead to running out of threads in blocing pool.
-                            timelines_to_resume_deletion.push((timeline_id, None));
-                        }
-                        _ => {
-                            return Err(anyhow::anyhow!(e)).with_context(|| {
-                                format!("Failed to load metadata for timeline_id {timeline_id}")
-                            })
-                        }
-                    },
-                }
            } else {
                if !timeline_dir.exists() {
                    warn!(
@@ -916,8 +864,12 @@ impl Tenant {
                    );
                    continue;
                }
-                let timeline_id =
-                    TimelineId::try_from(timeline_dir.file_name()).with_context(|| {
+                let timeline_id = timeline_dir
+                    .file_name()
+                    .and_then(OsStr::to_str)
+                    .unwrap_or_default()
+                    .parse::<TimelineId>()
+                    .with_context(|| {
                        format!(
                            "Could not parse timeline id out of the timeline dir name {}",
                            timeline_dir.display()
@@ -939,14 +891,6 @@ impl Tenant {
                    continue;
                }

-                let timeline_delete_mark_file = self
-                    .conf
-                    .timeline_delete_mark_file_path(self.tenant_id, timeline_id);
-                if timeline_delete_mark_file.exists() {
-                    // Cleanup should be done in `is_delete_mark` branch above
-                    continue;
-                }
-
                let file_name = entry.file_name();
                if let Ok(timeline_id) =
                    file_name.to_str().unwrap_or_default().parse::<TimelineId>()
@@ -966,10 +910,7 @@ impl Tenant {

        // Sort the array of timeline IDs into tree-order, so that parent comes before
        // all its children.
-        tree_sort_timelines(timelines_to_load).map(|sorted_timelines| TenantDirectoryScan {
-            sorted_timelines_to_load: sorted_timelines,
-            timelines_to_resume_deletion,
-        })
+        tree_sort_timelines(timelines_to_load)
    }

    ///
@@ -995,7 +936,7 @@ impl Tenant {
        let span = info_span!("blocking");
        let cloned = Arc::clone(self);

-        let scan = tokio::task::spawn_blocking(move || {
+        let sorted_timelines: Vec<(_, _)> = tokio::task::spawn_blocking(move || {
            let _g = span.entered();
            cloned.scan_and_sort_timelines_dir()
        })
@@ -1006,60 +947,10 @@ impl Tenant {
        // FIXME original collect_timeline_files contained one more check:
        //    1. "Timeline has no ancestor and no layer files"

-        // Process loadable timelines first
-        for (timeline_id, local_metadata) in scan.sorted_timelines_to_load {
-            if let Err(e) = self
-                .load_local_timeline(timeline_id, local_metadata, init_order, ctx, false)
+        for (timeline_id, local_metadata) in sorted_timelines {
+            self.load_local_timeline(timeline_id, local_metadata, init_order, ctx)
                .await
-            {
-                match e {
-                    LoadLocalTimelineError::Load(source) => {
-                        return Err(anyhow::anyhow!(source)
-                            .context("Failed to load local timeline: {timeline_id}"))
-                    }
-                    LoadLocalTimelineError::ResumeDeletion(source) => {
-                        // Make sure resumed deletion wont fail loading for entire tenant.
-                        error!("Failed to resume timeline deletion: {source:#}")
-                    }
-                }
-            }
-        }
-
-        // Resume deletion ones with deleted_mark
-        for (timeline_id, maybe_local_metadata) in scan.timelines_to_resume_deletion {
-            match maybe_local_metadata {
-                None => {
-                    // See comment in `scan_and_sort_timelines_dir`.
-                    if let Err(e) =
-                        DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces(self, timeline_id)
-                            .await
-                    {
-                        warn!(
-                            "cannot clean up deleted timeline dir timeline_id: {} error: {:#}",
-                            timeline_id, e
-                        );
-                    }
-                }
-                Some(local_metadata) => {
-                    if let Err(e) = self
-                        .load_local_timeline(timeline_id, local_metadata, init_order, ctx, true)
-                        .await
-                    {
-                        match e {
-                            LoadLocalTimelineError::Load(source) => {
-                                // We tried to load deleted timeline, this is a bug.
-                                return Err(anyhow::anyhow!(source).context(
-                                "This is a bug. We tried to load deleted timeline which is wrong and loading failed. Timeline: {timeline_id}"
-                            ));
-                            }
-                            LoadLocalTimelineError::ResumeDeletion(source) => {
-                                // Make sure resumed deletion wont fail loading for entire tenant.
-                                error!("Failed to resume timeline deletion: {source:#}")
-                            }
-                        }
-                    }
-                }
-            }
+                .with_context(|| format!("load local timeline {timeline_id}"))?;
        }

        trace!("Done");
@@ -1077,8 +968,7 @@ impl Tenant {
        local_metadata: TimelineMetadata,
        init_order: Option<&InitializationOrder>,
        ctx: &RequestContext,
-        found_delete_mark: bool,
-    ) -> Result<(), LoadLocalTimelineError> {
+    ) -> anyhow::Result<()> {
        span::debug_assert_current_span_has_tenant_id();

        let remote_client = self.remote_storage.as_ref().map(|remote_storage| {
@@ -1090,6 +980,14 @@ impl Tenant {
            )
        });

+        let ancestor = if let Some(ancestor_timeline_id) = local_metadata.ancestor_timeline() {
+            let ancestor_timeline = self.get_timeline(ancestor_timeline_id, false)
+                .with_context(|| anyhow::anyhow!("cannot find ancestor timeline {ancestor_timeline_id} for timeline {timeline_id}"))?;
+            Some(ancestor_timeline)
+        } else {
+            None
+        };
+
        let (remote_startup_data, remote_client) = match remote_client {
            Some(remote_client) => match remote_client.download_index_file().await {
                Ok(index_part) => {
@@ -1109,29 +1007,45 @@ impl Tenant {
                            info!("is_deleted is set on remote, resuming removal of timeline data originally done by timeline deletion handler");

                            remote_client
-                                .init_upload_queue_stopped_to_continue_deletion(&index_part)
-                                .context("init queue stopped")
-                                .map_err(LoadLocalTimelineError::ResumeDeletion)?;
+                                .init_upload_queue_stopped_to_continue_deletion(&index_part)?;

-                            DeleteTimelineFlow::resume_deletion(
+                            let timeline = self
+                                .create_timeline_struct(
+                                    timeline_id,
+                                    &local_metadata,
+                                    ancestor,
+                                    Some(remote_client),
+                                    init_order,
+                                )
+                                .context("create_timeline_struct")?;
+
+                            let guard = DeletionGuard(
+                                Arc::clone(&timeline.delete_lock)
+                                    .try_lock_owned()
+                                    .expect("cannot happen because we're the only owner"),
+                            );
+
+                            // Note: here we even skip populating layer map. Timeline is essentially uninitialized.
+                            // RemoteTimelineClient is the only functioning part.
+                            timeline.set_state(TimelineState::Stopping);
+                            // We need to do this because when the console retries the delete request we shouldn't answer with 404,
+                            // because 404 means successful deletion.
+                            // FIXME consider TimelineState::Deleting.
+                            let mut locked = self.timelines.lock().unwrap();
+                            locked.insert(timeline_id, Arc::clone(&timeline));
+
+                            Tenant::schedule_delete_timeline(
                                Arc::clone(self),
                                timeline_id,
-                                &local_metadata,
-                                Some(remote_client),
-                                init_order,
-                            )
-                            .await
-                            .context("resume deletion")
-                            .map_err(LoadLocalTimelineError::ResumeDeletion)?;
+                                timeline,
+                                guard,
+                            );

                            return Ok(());
                        }
                    };

-                    let remote_metadata = index_part
-                        .parse_metadata()
-                        .context("parse_metadata")
-                        .map_err(LoadLocalTimelineError::Load)?;
+                    let remote_metadata = index_part.parse_metadata().context("parse_metadata")?;
                    (
                        Some(RemoteStartupData {
                            index_part,
@@ -1141,54 +1055,12 @@ impl Tenant {
                    )
                }
                Err(DownloadError::NotFound) => {
-                    info!("no index file was found on the remote, found_delete_mark: {found_delete_mark}");
-
-                    if found_delete_mark {
-                        // We could've resumed at a point where remote index was deleted, but metadata file wasnt.
-                        // Cleanup:
-                        return DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces(
-                            self,
-                            timeline_id,
-                        )
-                        .await
-                        .context("cleanup_remaining_timeline_fs_traces")
-                        .map_err(LoadLocalTimelineError::ResumeDeletion);
-                    }
-
-                    // We're loading fresh timeline that didnt yet make it into remote.
+                    info!("no index file was found on the remote");
                    (None, Some(remote_client))
                }
-                Err(e) => return Err(LoadLocalTimelineError::Load(anyhow::Error::new(e))),
+                Err(e) => return Err(anyhow::anyhow!(e)),
            },
-            None => {
-                // No remote client
-                if found_delete_mark {
-                    // There is no remote client, we found local metadata.
-                    // Continue cleaning up local disk.
-                    DeleteTimelineFlow::resume_deletion(
-                        Arc::clone(self),
-                        timeline_id,
-                        &local_metadata,
-                        None,
-                        init_order,
-                    )
-                    .await
-                    .context("resume deletion")
-                    .map_err(LoadLocalTimelineError::ResumeDeletion)?;
-                    return Ok(());
-                }
-
-                (None, remote_client)
-            }
-        };
-
-        let ancestor = if let Some(ancestor_timeline_id) = local_metadata.ancestor_timeline() {
-            let ancestor_timeline = self.get_timeline(ancestor_timeline_id, false)
-                .with_context(|| anyhow::anyhow!("cannot find ancestor timeline {ancestor_timeline_id} for timeline {timeline_id}"))
-                .map_err(LoadLocalTimelineError::Load)?;
-            Some(ancestor_timeline)
-        } else {
-            None
+            None => (None, remote_client),
        };

        self.timeline_init_and_sync(
@@ -1202,7 +1074,6 @@ impl Tenant {
            ctx,
        )
        .await
-        .map_err(LoadLocalTimelineError::Load)
    }

    pub fn tenant_id(&self) -> TenantId {
@@ -1464,11 +1335,7 @@ impl Tenant {
    /// This function is periodically called by compactor task.
    /// Also it can be explicitly requested per timeline through page server
    /// api's 'compact' command.
-    pub async fn compaction_iteration(
-        &self,
-        cancel: &CancellationToken,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    pub async fn compaction_iteration(&self, ctx: &RequestContext) -> anyhow::Result<()> {
        anyhow::ensure!(
            self.is_active(),
            "Cannot run compaction iteration on inactive tenant"
@@ -1496,7 +1363,7 @@ impl Tenant {

        for (timeline_id, timeline) in &timelines_to_compact {
            timeline
-                .compact(cancel, ctx)
+                .compact(ctx)
                .instrument(info_span!("compact_timeline", %timeline_id))
                .await?;
        }
@@ -1563,6 +1430,269 @@ impl Tenant {
        }
    }

+    /// Shuts down a timeline's tasks, removes its in-memory structures, and deletes its
+    /// data from both disk and s3.
+    async fn delete_timeline(
+        &self,
+        timeline_id: TimelineId,
+        timeline: Arc<Timeline>,
+        guard: DeletionGuard,
+    ) -> anyhow::Result<()> {
+        {
+            // Grab the layer_removal_cs lock, and actually perform the deletion.
+            //
+            // This lock prevents GC or compaction from running at the same time.
+            // The GC task doesn't register itself with the timeline it's operating on,
+            // so it might still be running even though we called `shutdown_tasks`.
+            //
+            // Note that there are still other race conditions between
+            // GC, compaction and timeline deletion. See
+            // https://github.com/neondatabase/neon/issues/2671
+            //
+            // No timeout here, GC & Compaction should be responsive to the
+            // `TimelineState::Stopping` change.
+            info!("waiting for layer_removal_cs.lock()");
+            let layer_removal_guard = timeline.layer_removal_cs.lock().await;
+            info!("got layer_removal_cs.lock(), deleting layer files");
+
+            // NB: remote_timeline_client upload tasks that reference these layers have been cancelled
+            //     by the caller.
+
+            let local_timeline_directory = self
+                .conf
+                .timeline_path(&self.tenant_id, &timeline.timeline_id);
+
+            fail::fail_point!("timeline-delete-before-rm", |_| {
+                Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
+            });
+
+            // NB: This need not be atomic because the deleted flag in the IndexPart
+            // will be observed during tenant/timeline load. The deletion will be resumed there.
+            //
+            // For configurations without remote storage, we tolerate that we're not crash-safe here.
+            // The timeline may come up Active but with missing layer files, in such setups.
+            // See https://github.com/neondatabase/neon/pull/3919#issuecomment-1531726720
+            match std::fs::remove_dir_all(&local_timeline_directory) {
+                Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
+                    // This can happen if we're called a second time, e.g.,
+                    // because of a previous failure/cancellation at/after
+                    // failpoint timeline-delete-after-rm.
+                    //
+                    // It can also happen if we race with tenant detach, because,
+                    // it doesn't grab the layer_removal_cs lock.
+                    //
+                    // For now, log and continue.
+                    // warn! level is technically not appropriate for the
+                    // first case because we should expect retries to happen.
+                    // But the error is so rare, it seems better to get attention if it happens.
+                    let tenant_state = self.current_state();
+                    warn!(
+                        timeline_dir=?local_timeline_directory,
+                        ?tenant_state,
+                        "timeline directory not found, proceeding anyway"
+                    );
+                    // continue with the rest of the deletion
+                }
+                res => res.with_context(|| {
+                    format!(
+                        "Failed to remove local timeline directory '{}'",
+                        local_timeline_directory.display()
+                    )
+                })?,
+            }
+
+            info!("finished deleting layer files, releasing layer_removal_cs.lock()");
+            drop(layer_removal_guard);
+        }
+
+        fail::fail_point!("timeline-delete-after-rm", |_| {
+            Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
+        });
+
+        if let Some(remote_client) = &timeline.remote_client {
+            remote_client.delete_all().await.context("delete_all")?
+        };
+
+        pausable_failpoint!("in_progress_delete");
+
+        {
+            // Remove the timeline from the map.
+            let mut timelines = self.timelines.lock().unwrap();
+            let children_exist = timelines
+                .iter()
+                .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
+            // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`.
+            // We already deleted the layer files, so it's probably best to panic.
+            // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart)
+            if children_exist {
+                panic!("Timeline grew children while we removed layer files");
+            }
+
+            timelines.remove(&timeline_id).expect(
+                "timeline that we were deleting was concurrently removed from 'timelines' map",
+            );
+
+            drop(timelines);
+        }
+
+        drop(guard);
+
+        Ok(())
+    }
+
+    /// Removes timeline-related in-memory data and schedules removal from remote storage.
+    #[instrument(skip(self, _ctx))]
+    pub async fn prepare_and_schedule_delete_timeline(
+        self: Arc<Self>,
+        timeline_id: TimelineId,
+        _ctx: &RequestContext,
+    ) -> Result<(), DeleteTimelineError> {
+        debug_assert_current_span_has_tenant_and_timeline_id();
+
+        // Transition the timeline into TimelineState::Stopping.
+        // This should prevent new operations from starting.
+        //
+        // Also grab the Timeline's delete_lock to prevent another deletion from starting.
+        let timeline;
+        let delete_lock_guard;
+        {
+            let mut timelines = self.timelines.lock().unwrap();
+
+            // Ensure that there are no child timelines **attached to that pageserver**,
+            // because detach removes files, which will break child branches
+            let children: Vec<TimelineId> = timelines
+                .iter()
+                .filter_map(|(id, entry)| {
+                    if entry.get_ancestor_timeline_id() == Some(timeline_id) {
+                        Some(*id)
+                    } else {
+                        None
+                    }
+                })
+                .collect();
+
+            if !children.is_empty() {
+                return Err(DeleteTimelineError::HasChildren(children));
+            }
+
+            let timeline_entry = match timelines.entry(timeline_id) {
+                Entry::Occupied(e) => e,
+                Entry::Vacant(_) => return Err(DeleteTimelineError::NotFound),
+            };
+
+            timeline = Arc::clone(timeline_entry.get());
+
+            // Prevent two tasks from trying to delete the timeline at the same time.
+            delete_lock_guard = DeletionGuard(
+                Arc::clone(&timeline.delete_lock)
+                    .try_lock_owned()
+                    .map_err(|_| DeleteTimelineError::AlreadyInProgress)?,
+            );
+
+            // If another task finished the deletion just before we acquired the lock,
+            // return success.
+            if delete_lock_guard.is_deleted() {
+                return Ok(());
+            }
+
+            timeline.set_state(TimelineState::Stopping);
+
+            drop(timelines);
+        }
+
+        // Now that the Timeline is in Stopping state, request all the related tasks to
+        // shut down.
+        //
+        // NB: If this fails half-way through, and is retried, the retry will go through
+        // all the same steps again. Make sure the code here is idempotent, and don't
+        // error out if some of the shutdown tasks have already been completed!
+
+        // Stop the walreceiver first.
+        debug!("waiting for wal receiver to shutdown");
+        let maybe_started_walreceiver = { timeline.walreceiver.lock().unwrap().take() };
+        if let Some(walreceiver) = maybe_started_walreceiver {
+            walreceiver.stop().await;
+        }
+        debug!("wal receiver shutdown confirmed");
+
+        // Prevent new uploads from starting.
+        if let Some(remote_client) = timeline.remote_client.as_ref() {
+            let res = remote_client.stop();
+            match res {
+                Ok(()) => {}
+                Err(e) => match e {
+                    remote_timeline_client::StopError::QueueUninitialized => {
+                        // This case shouldn't happen currently because the
+                        // load and attach code bails out if _any_ of the timeline fails to fetch its IndexPart.
+                        // That is, before we declare the Tenant as Active.
+                        // But we only allow calls to delete_timeline on Active tenants.
+                        return Err(DeleteTimelineError::Other(anyhow::anyhow!("upload queue is uninitialized, likely the timeline was in Broken state prior to this call because it failed to fetch IndexPart during load or attach, check the logs")));
+                    }
+                },
+            }
+        }
+
+        // Stop & wait for the remaining timeline tasks, including upload tasks.
+        // NB: This and other delete_timeline calls do not run as a task_mgr task,
+        //     so, they are not affected by this shutdown_tasks() call.
+        info!("waiting for timeline tasks to shutdown");
+        task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id)).await;
+
+        // Mark timeline as deleted in S3 so we won't pick it up next time
+        // during attach or pageserver restart.
+        // See comment in persist_index_part_with_deleted_flag.
+        if let Some(remote_client) = timeline.remote_client.as_ref() {
+            match remote_client.persist_index_part_with_deleted_flag().await {
+                // If we (now, or already) marked it successfully as deleted, we can proceed
+                Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (),
+                // Bail out otherwise
+                //
+                // AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents
+                // two tasks from performing the deletion at the same time. The first task
+                // that starts deletion should run it to completion.
+                Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_))
+                | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => {
+                    return Err(DeleteTimelineError::Other(anyhow::anyhow!(e)));
+                }
+            }
+        }
+        self.schedule_delete_timeline(timeline_id, timeline, delete_lock_guard);
+
+        Ok(())
+    }
+
+    fn schedule_delete_timeline(
+        self: Arc<Self>,
+        timeline_id: TimelineId,
+        timeline: Arc<Timeline>,
+        guard: DeletionGuard,
+    ) {
+        let tenant_id = self.tenant_id;
+        let timeline_clone = Arc::clone(&timeline);
+
+        task_mgr::spawn(
+            task_mgr::BACKGROUND_RUNTIME.handle(),
+            TaskKind::TimelineDeletionWorker,
+            Some(self.tenant_id),
+            Some(timeline_id),
+            "timeline_delete",
+            false,
+            async move {
+                if let Err(err) = self.delete_timeline(timeline_id, timeline, guard).await {
+                    error!("Error: {err:#}");
+                    timeline_clone.set_broken(err.to_string())
+                };
+                Ok(())
+            }
+            .instrument({
+                let span =
+                    tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_id, timeline_id=%timeline_id);
+                span.follows_from(Span::current());
+                span
+            }),
+        );
+    }
+
    pub fn current_state(&self) -> TenantState {
        self.state.borrow().clone()
    }
@@ -2019,10 +2149,6 @@ impl Tenant {
    /// The returned Timeline is in Loading state. The caller is responsible for
    /// initializing any on-disk state, and for inserting the Timeline to the 'timelines'
    /// map.
-    ///
-    /// `validate_ancestor == false` is used when a timeline is created for deletion
-    /// and we might not have the ancestor present anymore which is fine for to be
-    /// deleted timelines.
    fn create_timeline_struct(
        &self,
        new_timeline_id: TimelineId,
@@ -2030,26 +2156,19 @@ impl Tenant {
        ancestor: Option<Arc<Timeline>>,
        remote_client: Option<RemoteTimelineClient>,
        init_order: Option<&InitializationOrder>,
-        cause: CreateTimelineCause,
    ) -> anyhow::Result<Arc<Timeline>> {
-        let state = match cause {
-            CreateTimelineCause::Load => {
-                let ancestor_id = new_metadata.ancestor_timeline();
-                anyhow::ensure!(
-                    ancestor_id == ancestor.as_ref().map(|t| t.timeline_id),
-                    "Timeline's {new_timeline_id} ancestor {ancestor_id:?} was not found"
-                );
-                TimelineState::Loading
-            }
-            CreateTimelineCause::Delete => TimelineState::Stopping,
-        };
+        if let Some(ancestor_timeline_id) = new_metadata.ancestor_timeline() {
+            anyhow::ensure!(
+                ancestor.is_some(),
+                "Timeline's {new_timeline_id} ancestor {ancestor_timeline_id} was not found"
+            )
+        }

        let initial_logical_size_can_start = init_order.map(|x| &x.initial_logical_size_can_start);
        let initial_logical_size_attempt = init_order.map(|x| &x.initial_logical_size_attempt);

        let pg_version = new_metadata.pg_version();
-
-        let timeline = Timeline::new(
+        Ok(Timeline::new(
            self.conf,
            Arc::clone(&self.tenant_conf),
            new_metadata,
@@ -2061,10 +2180,7 @@ impl Tenant {
            pg_version,
            initial_logical_size_can_start.cloned(),
            initial_logical_size_attempt.cloned(),
-            state,
-        );
-
-        Ok(timeline)
+        ))
    }

    fn new(
@@ -2078,53 +2194,28 @@ impl Tenant {
        let (state, mut rx) = watch::channel(state);

        tokio::spawn(async move {
+            let mut current_state: &'static str = From::from(&*rx.borrow_and_update());
            let tid = tenant_id.to_string();
-
-            fn inspect_state(state: &TenantState) -> ([&'static str; 1], bool) {
-                ([state.into()], matches!(state, TenantState::Broken { .. }))
-            }
-
-            let mut tuple = inspect_state(&rx.borrow_and_update());
-
-            let is_broken = tuple.1;
-            let mut counted_broken = if !is_broken {
-                // the tenant might be ignored and reloaded, so first remove any previous set
-                // element. it most likely has already been scraped, as these are manual operations
-                // right now. most likely we will add it back very soon.
-                drop(crate::metrics::BROKEN_TENANTS_SET.remove_label_values(&[&tid]));
-                false
-            } else {
-                // add the id to the set right away, there should not be any updates on the channel
-                // after
-                crate::metrics::BROKEN_TENANTS_SET
-                    .with_label_values(&[&tid])
-                    .set(1);
-                true
-            };
-
+            TENANT_STATE_METRIC
+                .with_label_values(&[&tid, current_state])
+                .inc();
            loop {
-                let labels = &tuple.0;
-                let current = TENANT_STATE_METRIC.with_label_values(labels);
-                current.inc();
+                match rx.changed().await {
+                    Ok(()) => {
+                        let new_state: &'static str = From::from(&*rx.borrow_and_update());
+                        TENANT_STATE_METRIC
+                            .with_label_values(&[&tid, current_state])
+                            .dec();
+                        TENANT_STATE_METRIC
+                            .with_label_values(&[&tid, new_state])
+                            .inc();

-                if rx.changed().await.is_err() {
-                    // tenant has been dropped; decrement the counter because a tenant with that
-                    // state is no longer in tenant map, but allow any broken set item to exist
-                    // still.
-                    current.dec();
-                    break;
-                }
-
-                current.dec();
-                tuple = inspect_state(&rx.borrow_and_update());
-
-                let is_broken = tuple.1;
-                if is_broken && !counted_broken {
-                    counted_broken = true;
-                    // insert the tenant_id (back) into the set
-                    crate::metrics::BROKEN_TENANTS_SET
-                        .with_label_values(&[&tid])
-                        .inc();
+                        current_state = new_state;
+                    }
+                    Err(_sender_dropped_error) => {
+                        info!("Tenant dropped the state updates sender, quitting waiting for tenant state change");
+                        return;
+                    }
                }
            }
        });
@@ -2731,14 +2822,7 @@ impl Tenant {
        };

        let timeline_struct = self
-            .create_timeline_struct(
-                new_timeline_id,
-                new_metadata,
-                ancestor,
-                remote_client,
-                None,
-                CreateTimelineCause::Load,
-            )
+            .create_timeline_struct(new_timeline_id, new_metadata, ancestor, remote_client, None)
            .context("Failed to create timeline data structure")?;

        timeline_struct.init_empty_layer_map(start_lsn);
@@ -3126,7 +3210,7 @@ impl Drop for Tenant {
    }
 }
 /// Dump contents of a layer file to stdout.
-pub async fn dump_layerfile_from_path(
+pub fn dump_layerfile_from_path(
    path: &Path,
    verbose: bool,
    ctx: &RequestContext,
@@ -3140,22 +3224,27 @@ pub async fn dump_layerfile_from_path(
    file.read_exact_at(&mut header_buf, 0)?;

    match u16::from_be_bytes(header_buf) {
-        crate::IMAGE_FILE_MAGIC => {
-            ImageLayer::new_for_path(path, file)?
-                .dump(verbose, ctx)
-                .await?
-        }
-        crate::DELTA_FILE_MAGIC => {
-            DeltaLayer::new_for_path(path, file)?
-                .dump(verbose, ctx)
-                .await?
-        }
+        crate::IMAGE_FILE_MAGIC => ImageLayer::new_for_path(path, file)?.dump(verbose, ctx)?,
+        crate::DELTA_FILE_MAGIC => DeltaLayer::new_for_path(path, file)?.dump(verbose, ctx)?,
        magic => bail!("unrecognized magic identifier: {:?}", magic),
    }

    Ok(())
 }

+/// Runs `fs_operation` and treats a "not found" failure as success.
+///
+/// Intended for cleanup-style filesystem calls where the target being absent
+/// already is the desired end state.
+///
+/// The closure is invoked exactly once, so `FnOnce` is sufficient; this also
+/// accepts closures that consume captured values.
+///
+/// Returns `Ok(())` if the operation succeeds or fails with
+/// `io::ErrorKind::NotFound`; any other error is returned unchanged.
+fn ignore_absent_files<F>(fs_operation: F) -> io::Result<()>
+where
+    F: FnOnce() -> io::Result<()>,
+{
+    match fs_operation() {
+        // Nothing to do: the file is already gone.
+        Err(e) if e.kind() == io::ErrorKind::NotFound => Ok(()),
+        // Success and every other kind of error pass through untouched.
+        other => other,
+    }
+}
+
 #[cfg(test)]
 pub mod harness {
    use bytes::{Bytes, BytesMut};
@@ -3352,7 +3441,6 @@ mod tests {
    use hex_literal::hex;
    use once_cell::sync::Lazy;
    use rand::{thread_rng, Rng};
-    use tokio_util::sync::CancellationToken;

    static TEST_KEY: Lazy<Key> =
        Lazy::new(|| Key::from_slice(&hex!("112222222233333333444444445500000001")));
@@ -3834,9 +3922,9 @@ mod tests {
            .await
            .err()
            .expect("should fail");
-        // get all the stack with all .context, not only the last one
+        // get all the stack with all .context, not only the last one
        let message = format!("{err:#}");
-        let expected = "failed to load metadata";
+        let expected = "Failed to parse metadata bytes from path";
        assert!(
            message.contains(expected),
            "message '{message}' expected to contain {expected}"
@@ -3853,8 +3941,7 @@ mod tests {
        }
        assert!(
            found_error_message,
-            "didn't find the corrupted metadata error in {}",
-            message
+            "didn't find the corrupted metadata error"
        );

        Ok(())
@@ -3875,7 +3962,7 @@ mod tests {
        drop(writer);

        tline.freeze_and_flush().await?;
-        tline.compact(&CancellationToken::new(), &ctx).await?;
+        tline.compact(&ctx).await?;

        let writer = tline.writer().await;
        writer
@@ -3885,7 +3972,7 @@ mod tests {
        drop(writer);

        tline.freeze_and_flush().await?;
-        tline.compact(&CancellationToken::new(), &ctx).await?;
+        tline.compact(&ctx).await?;

        let writer = tline.writer().await;
        writer
@@ -3895,7 +3982,7 @@ mod tests {
        drop(writer);

        tline.freeze_and_flush().await?;
-        tline.compact(&CancellationToken::new(), &ctx).await?;
+        tline.compact(&ctx).await?;

        let writer = tline.writer().await;
        writer
@@ -3905,7 +3992,7 @@ mod tests {
        drop(writer);

        tline.freeze_and_flush().await?;
-        tline.compact(&CancellationToken::new(), &ctx).await?;
+        tline.compact(&ctx).await?;

        assert_eq!(
            tline.get(*TEST_KEY, Lsn(0x10), &ctx).await?,
@@ -3974,7 +4061,7 @@ mod tests {
                .update_gc_info(Vec::new(), cutoff, Duration::ZERO, &ctx)
                .await?;
            tline.freeze_and_flush().await?;
-            tline.compact(&CancellationToken::new(), &ctx).await?;
+            tline.compact(&ctx).await?;
            tline.gc().await?;
        }

@@ -4051,7 +4138,7 @@ mod tests {
                .update_gc_info(Vec::new(), cutoff, Duration::ZERO, &ctx)
                .await?;
            tline.freeze_and_flush().await?;
-            tline.compact(&CancellationToken::new(), &ctx).await?;
+            tline.compact(&ctx).await?;
            tline.gc().await?;
        }

@@ -4139,7 +4226,7 @@ mod tests {
                .update_gc_info(Vec::new(), cutoff, Duration::ZERO, &ctx)
                .await?;
            tline.freeze_and_flush().await?;
-            tline.compact(&CancellationToken::new(), &ctx).await?;
+            tline.compact(&ctx).await?;
            tline.gc().await?;
        }

--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -16,19 +16,29 @@ use crate::tenant::block_io::{BlockCursor, BlockReader};
 use std::cmp::min;
 use std::io::{Error, ErrorKind};

-impl<R> BlockCursor<R>
-where
-    R: BlockReader,
-{
+/// For reading
+pub trait BlobCursor {
    /// Read a blob into a new buffer.
-    pub fn read_blob(&mut self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
+    fn read_blob(&mut self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
        let mut buf = Vec::new();
        self.read_blob_into_buf(offset, &mut buf)?;
        Ok(buf)
    }
+
    /// Read blob into the given buffer. Any previous contents in the buffer
    /// are overwritten.
-    pub fn read_blob_into_buf(
+    fn read_blob_into_buf(
+        &mut self,
+        offset: u64,
+        dstbuf: &mut Vec<u8>,
+    ) -> Result<(), std::io::Error>;
+}
+
+impl<R> BlobCursor for BlockCursor<R>
+where
+    R: BlockReader,
+{
+    fn read_blob_into_buf(
        &mut self,
        offset: u64,
        dstbuf: &mut Vec<u8>,
--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -390,42 +390,39 @@ where
    }

    #[allow(dead_code)]
-    pub async fn dump(&self) -> Result<()> {
-        let mut stack = Vec::new();
+    pub fn dump(&self) -> Result<()> {
+        self.dump_recurse(self.root_blk, &[], 0)
+    }

-        stack.push((self.root_blk, String::new(), 0, 0, 0));
+    fn dump_recurse(&self, blknum: u32, path: &[u8], depth: usize) -> Result<()> {
+        let blk = self.reader.read_blk(self.start_blk + blknum)?;
+        let buf: &[u8] = blk.as_ref();

-        while let Some((blknum, path, depth, child_idx, key_off)) = stack.pop() {
-            let blk = self.reader.read_blk(self.start_blk + blknum)?;
-            let buf: &[u8] = blk.as_ref();
-            let node = OnDiskNode::<L>::deparse(buf)?;
+        let node = OnDiskNode::<L>::deparse(buf)?;

-            if child_idx == 0 {
-                print!("{:indent$}", "", indent = depth * 2);
-                let path_prefix = stack
-                    .iter()
-                    .map(|(_blknum, path, ..)| path.as_str())
-                    .collect::<String>();
-                println!(
-                    "blk #{blknum}: path {path_prefix}{path}: prefix {}, suffix_len {}",
-                    hex::encode(node.prefix),
-                    node.suffix_len
-                );
-            }
+        print!("{:indent$}", "", indent = depth * 2);
+        println!(
+            "blk #{}: path {}: prefix {}, suffix_len {}",
+            blknum,
+            hex::encode(path),
+            hex::encode(node.prefix),
+            node.suffix_len
+        );

-            if child_idx + 1 < node.num_children {
-                let key_off = key_off + node.suffix_len as usize;
-                stack.push((blknum, path.clone(), depth, child_idx + 1, key_off));
-            }
+        let mut idx = 0;
+        let mut key_off = 0;
+        while idx < node.num_children {
            let key = &node.keys[key_off..key_off + node.suffix_len as usize];
-            let val = node.value(child_idx as usize);
-
+            let val = node.value(idx as usize);
            print!("{:indent$}", "", indent = depth * 2 + 2);
            println!("{}: {}", hex::encode(key), hex::encode(val.0));

            if node.level > 0 {
-                stack.push((val.to_blknum(), hex::encode(node.prefix), depth + 1, 0, 0));
+                let child_path = [path, node.prefix].concat();
+                self.dump_recurse(val.to_blknum(), &child_path, depth + 1)?;
            }
+            idx += 1;
+            key_off += node.suffix_len as usize;
        }
        Ok(())
    }
@@ -757,8 +754,8 @@ mod tests {
        }
    }

-    #[tokio::test]
-    async fn basic() -> Result<()> {
+    #[test]
+    fn basic() -> Result<()> {
        let mut disk = TestDisk::new();
        let mut writer = DiskBtreeBuilder::<_, 6>::new(&mut disk);

@@ -778,7 +775,7 @@ mod tests {

        let reader = DiskBtreeReader::new(0, root_offset, disk);

-        reader.dump().await?;
+        reader.dump()?;

        // Test the `get` function on all the keys.
        for (key, val) in all_data.iter() {
@@ -838,8 +835,8 @@ mod tests {
        Ok(())
    }

-    #[tokio::test]
-    async fn lots_of_keys() -> Result<()> {
+    #[test]
+    fn lots_of_keys() -> Result<()> {
        let mut disk = TestDisk::new();
        let mut writer = DiskBtreeBuilder::<_, 8>::new(&mut disk);

@@ -859,7 +856,7 @@ mod tests {

        let reader = DiskBtreeReader::new(0, root_offset, disk);

-        reader.dump().await?;
+        reader.dump()?;

        use std::sync::Mutex;

@@ -997,8 +994,8 @@ mod tests {
    ///
    /// This test contains a particular data set, see disk_btree_test_data.rs
    ///
-    #[tokio::test]
-    async fn particular_data() -> Result<()> {
+    #[test]
+    fn particular_data() -> Result<()> {
        // Build a tree from it
        let mut disk = TestDisk::new();
        let mut writer = DiskBtreeBuilder::<_, 26>::new(&mut disk);
@@ -1025,7 +1022,7 @@ mod tests {
        })?;
        assert_eq!(count, disk_btree_test_data::TEST_DATA.len());

-        reader.dump().await?;
+        reader.dump()?;

        Ok(())
    }
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -328,7 +328,7 @@ fn to_io_error(e: anyhow::Error, context: &str) -> io::Error {
 #[cfg(test)]
 mod tests {
    use super::*;
-    use crate::tenant::blob_io::BlobWriter;
+    use crate::tenant::blob_io::{BlobCursor, BlobWriter};
    use crate::tenant::block_io::BlockCursor;
    use rand::{seq::SliceRandom, thread_rng, RngCore};
    use std::fs;
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -626,17 +626,17 @@ impl LayerMap {

    /// debugging function to print out the contents of the layer map
    #[allow(unused)]
-    pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
+    pub fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
        println!("Begin dump LayerMap");

        println!("open_layer:");
        if let Some(open_layer) = &self.open_layer {
-            open_layer.dump(verbose, ctx).await?;
+            open_layer.dump(verbose, ctx)?;
        }

        println!("frozen_layers:");
        for frozen_layer in self.frozen_layers.iter() {
-            frozen_layer.dump(verbose, ctx).await?;
+            frozen_layer.dump(verbose, ctx)?;
        }

        println!("historic_layers:");
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -9,11 +9,10 @@
 //! [`remote_timeline_client`]: super::remote_timeline_client

 use std::fs::{File, OpenOptions};
-use std::io::{self, Write};
+use std::io::Write;

 use anyhow::{bail, ensure, Context};
 use serde::{Deserialize, Serialize};
-use thiserror::Error;
 use tracing::info_span;
 use utils::bin_ser::SerializeError;
 use utils::{
@@ -268,24 +267,24 @@ pub fn save_metadata(
    Ok(())
 }

-#[derive(Error, Debug)]
-pub enum LoadMetadataError {
-    #[error(transparent)]
-    Read(#[from] io::Error),
-
-    #[error(transparent)]
-    Decode(#[from] anyhow::Error),
-}
-
 pub fn load_metadata(
    conf: &'static PageServerConf,
    tenant_id: &TenantId,
    timeline_id: &TimelineId,
-) -> Result<TimelineMetadata, LoadMetadataError> {
+) -> anyhow::Result<TimelineMetadata> {
    let metadata_path = conf.metadata_path(tenant_id, timeline_id);
-    let metadata_bytes = std::fs::read(metadata_path)?;
-
-    Ok(TimelineMetadata::from_bytes(&metadata_bytes)?)
+    let metadata_bytes = std::fs::read(&metadata_path).with_context(|| {
+        format!(
+            "Failed to read metadata bytes from path {}",
+            metadata_path.display()
+        )
+    })?;
+    TimelineMetadata::from_bytes(&metadata_bytes).with_context(|| {
+        format!(
+            "Failed to parse metadata bytes from path {}",
+            metadata_path.display()
+        )
+    })
 }

 #[cfg(test)]
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -26,8 +26,6 @@ use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME};
 use utils::fs_ext::PathExt;
 use utils::id::{TenantId, TimelineId};

-use super::timeline::delete::DeleteTimelineFlow;
-
 /// The tenants known to the pageserver.
 /// The enum variants are used to distinguish the different states that the pageserver can be in.
 enum TenantsMap {
@@ -423,10 +421,12 @@ pub enum DeleteTimelineError {
 pub async fn delete_timeline(
    tenant_id: TenantId,
    timeline_id: TimelineId,
-    _ctx: &RequestContext,
+    ctx: &RequestContext,
 ) -> Result<(), DeleteTimelineError> {
    let tenant = get_tenant(tenant_id, true).await?;
-    DeleteTimelineFlow::run(&tenant, timeline_id).await?;
+    tenant
+        .prepare_and_schedule_delete_timeline(timeline_id, ctx)
+        .await?;
    Ok(())
 }

@@ -768,6 +768,55 @@ pub async fn immediate_gc(
    Ok(wait_task_done)
 }

+/// Starts a compaction of the given timeline and returns a channel for its result.
+///
+/// The tenant is looked up in `TENANTS` and the timeline via `get_timeline`; if
+/// either lookup fails, the error is returned as `ApiError::NotFound`. The
+/// compaction itself runs in a task spawned with `task_mgr::spawn` on the current
+/// tokio runtime, using a detached child of `ctx`. The outcome of
+/// `timeline.compact` is delivered through the returned oneshot receiver.
+pub async fn immediate_compact(
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+    ctx: &RequestContext,
+) -> Result<tokio::sync::oneshot::Receiver<anyhow::Result<()>>, ApiError> {
+    // This read guard is kept alive until the task has been spawned (see the
+    // explicit `drop` at the end of the function).
+    let guard = TENANTS.read().await;
+
+    let tenant = guard
+        .get(&tenant_id)
+        .map(Arc::clone)
+        .with_context(|| format!("tenant {tenant_id}"))
+        .map_err(|e| ApiError::NotFound(e.into()))?;
+
+    // NOTE(review): the second argument presumably restricts the lookup to active
+    // timelines; confirm against `Tenant::get_timeline`.
+    let timeline = tenant
+        .get_timeline(timeline_id, true)
+        .map_err(|e| ApiError::NotFound(e.into()))?;
+
+    // Run in task_mgr to avoid race with tenant_detach operation
+    let ctx = ctx.detached_child(TaskKind::Compaction, DownloadBehavior::Download);
+    let (task_done, wait_task_done) = tokio::sync::oneshot::channel();
+    task_mgr::spawn(
+        &tokio::runtime::Handle::current(),
+        TaskKind::Compaction,
+        Some(tenant_id),
+        Some(timeline_id),
+        &format!(
+            "timeline_compact_handler compaction run for tenant {tenant_id} timeline {timeline_id}"
+        ),
+        // NOTE(review): the meaning of this flag is defined by `task_mgr::spawn`,
+        // which is not visible here; confirm before changing it.
+        false,
+        async move {
+            let result = timeline
+                .compact(&ctx)
+                .instrument(info_span!("manual_compact", %tenant_id, %timeline_id))
+                .await;
+
+            // `send` hands the value back as `Err` when it cannot be delivered
+            // (the receiving side is gone); log it, the task still ends with `Ok`.
+            match task_done.send(result) {
+                Ok(_) => (),
+                Err(result) => error!("failed to send compaction result: {result:?}"),
+            }
+            Ok(())
+        },
+    );
+
+    // Hold the guard until after we've spawned the task, so that timeline shutdown
+    // will wait for the task.
+    drop(guard);
+
+    Ok(wait_task_done)
+}
+
 #[cfg(test)]
 mod tests {
    use std::collections::HashMap;
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -514,7 +514,7 @@ impl RemoteTimelineClient {
    /// updated metadata.
    ///
    /// The upload will be added to the queue immediately, but it
-    /// won't be performed until all previously scheduled layer file
+    /// won't be performed until all previously scheduled layer file
    /// upload operations have completed successfully.  This is to
    /// ensure that when the index file claims that layers X, Y and Z
    /// exist in remote storage, they really do. To wait for the upload
@@ -625,7 +625,7 @@ impl RemoteTimelineClient {
    /// Note: This schedules an index file upload before the deletions.  The
    /// deletion won't actually be performed, until any previously scheduled
    /// upload operations, and the index file upload, have completed
-    /// successfully.
+    /// successfully.
    pub fn schedule_layer_file_deletion(
        self: &Arc<Self>,
        names: &[LayerFileName],
@@ -827,7 +827,7 @@ impl RemoteTimelineClient {
            )
        };

-        receiver.changed().await.context("upload queue shut down")?;
+        receiver.changed().await?;

        // Do not delete index part yet, it is needed for possible retry. If we remove it first
        // and retry will arrive to different pageserver there wont be any traces of it on remote storage
@@ -855,23 +855,11 @@ impl RemoteTimelineClient {
            self.storage_impl.delete_objects(&remaining).await?;
        }

-        fail::fail_point!("timeline-delete-before-index-delete", |_| {
-            Err(anyhow::anyhow!(
-                "failpoint: timeline-delete-before-index-delete"
-            ))?
-        });
-
        let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME));

        debug!("deleting index part");
        self.storage_impl.delete(&index_file_path).await?;

-        fail::fail_point!("timeline-delete-after-index-delete", |_| {
-            Err(anyhow::anyhow!(
-                "failpoint: timeline-delete-after-index-delete"
-            ))?
-        });
-
        info!(prefix=%timeline_storage_path, referenced=deletions_queued, not_referenced=%remaining.len(), "done deleting in timeline prefix, including index_part.json");

        Ok(())
@@ -1117,7 +1105,7 @@ impl RemoteTimelineClient {
            debug!("remote task {} completed successfully", task.op);
        }

-        // The task has completed successfully. Remove it from the in-progress list.
+        // The task has completed successfully. Remove it from the in-progress list.
        {
            let mut upload_queue_guard = self.upload_queue.lock().unwrap();
            let upload_queue = match upload_queue_guard.deref_mut() {
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -223,45 +223,6 @@ mod tests {
        assert_eq!(part, expected);
    }

-    #[test]
-    fn v2_indexpart_is_parsed_with_deleted_at() {
-        let example = r#"{
-            "version":2,
-            "timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
-            "missing_layers":["This shouldn't fail deserialization"],
-            "layer_metadata":{
-                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
-                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
-            },
-            "disk_consistent_lsn":"0/16960E8",
-            "metadata_bytes":[112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
-            "deleted_at": "2023-07-31T09:00:00.123"
-        }"#;
-
-        let expected = IndexPart {
-            // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
-            version: 2,
-            timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
-            layer_metadata: HashMap::from([
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
-                    file_size: 25600000,
-                }),
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
-                    // serde_json should always parse this but this might be a double with jq for
-                    // example.
-                    file_size: 9007199254741001,
-                })
-            ]),
-            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
-            metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
-            deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
-                "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap())
-        };
-
-        let part = serde_json::from_str::<IndexPart>(example).unwrap();
-        assert_eq!(part, expected);
-    }
-
    #[test]
    fn empty_layers_are_parsed() {
        let empty_layers_json = r#"{
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -338,8 +338,7 @@ impl LayerAccessStats {
 /// All layers should implement a minimal `std::fmt::Debug` without tenant or
 /// timeline names, because those are known in the context of which the layers
 /// are used in (timeline).
-#[async_trait::async_trait]
-pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync + 'static {
+pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync {
    /// Range of keys that this layer covers
    fn get_key_range(&self) -> Range<Key>;

@@ -369,7 +368,7 @@ pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync + 'static {
    /// is available. If this returns ValueReconstructResult::Continue, look up
    /// the predecessor layer and call again with the same 'reconstruct_data' to
    /// collect more data.
-    async fn get_value_reconstruct_data(
+    fn get_value_reconstruct_data(
        &self,
        key: Key,
        lsn_range: Range<Lsn>,
@@ -378,7 +377,7 @@ pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync + 'static {
    ) -> Result<ValueReconstructResult>;

    /// Dump summary of the contents of the layer to stdout
-    async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()>;
+    fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()>;
 }

 /// Returned by [`PersistentLayer::iter`]
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -31,7 +31,7 @@ use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::page_cache::{PageReadGuard, PAGE_SZ};
 use crate::repository::{Key, Value, KEY_SIZE};
-use crate::tenant::blob_io::{BlobWriter, WriteBlobWriter};
+use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter};
 use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockReader, FileBlockReader};
 use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
 use crate::tenant::storage_layer::{
@@ -223,10 +223,9 @@ impl std::fmt::Debug for DeltaLayerInner {
    }
 }

-#[async_trait::async_trait]
 impl Layer for DeltaLayer {
    /// debugging function to print out the contents of the layer
-    async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
+    fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
        println!(
            "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} size {} ----",
            self.desc.tenant_id,
@@ -256,7 +255,7 @@ impl Layer for DeltaLayer {
            file,
        );

-        tree_reader.dump().await?;
+        tree_reader.dump()?;

        let mut cursor = file.block_cursor();

@@ -301,7 +300,7 @@ impl Layer for DeltaLayer {
        Ok(())
    }

-    async fn get_value_reconstruct_data(
+    fn get_value_reconstruct_data(
        &self,
        key: Key,
        lsn_range: Range<Lsn>,
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -27,7 +27,7 @@ use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::page_cache::PAGE_SZ;
 use crate::repository::{Key, KEY_SIZE};
-use crate::tenant::blob_io::{BlobWriter, WriteBlobWriter};
+use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter};
 use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
 use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
 use crate::tenant::storage_layer::{
@@ -38,7 +38,6 @@ use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
 use anyhow::{bail, ensure, Context, Result};
 use bytes::Bytes;
 use hex;
-use once_cell::sync::OnceCell;
 use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
@@ -48,6 +47,7 @@ use std::io::{Seek, SeekFrom};
 use std::ops::Range;
 use std::os::unix::prelude::FileExt;
 use std::path::{Path, PathBuf};
+use std::sync::{RwLock, RwLockReadGuard};
 use tracing::*;

 use utils::{
@@ -117,7 +117,7 @@ pub struct ImageLayer {

    access_stats: LayerAccessStats,

-    inner: OnceCell<ImageLayerInner>,
+    inner: RwLock<ImageLayerInner>,
 }

 impl std::fmt::Debug for ImageLayer {
@@ -134,27 +134,30 @@ impl std::fmt::Debug for ImageLayer {
 }

 pub struct ImageLayerInner {
+    /// If false, the 'index' has not been loaded into memory yet.
+    loaded: bool,
+
    // values copied from summary
    index_start_blk: u32,
    index_root_blk: u32,

-    /// Reader object for reading blocks from the file.
-    file: FileBlockReader<VirtualFile>,
+    /// Reader object for reading blocks from the file. (None if not loaded yet)
+    file: Option<FileBlockReader<VirtualFile>>,
 }

 impl std::fmt::Debug for ImageLayerInner {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("ImageLayerInner")
+            .field("loaded", &self.loaded)
            .field("index_start_blk", &self.index_start_blk)
            .field("index_root_blk", &self.index_root_blk)
            .finish()
    }
 }

-#[async_trait::async_trait]
 impl Layer for ImageLayer {
    /// debugging function to print out the contents of the layer
-    async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
+    fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
        println!(
            "----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
            self.desc.tenant_id,
@@ -171,11 +174,11 @@ impl Layer for ImageLayer {
        }

        let inner = self.load(LayerAccessKind::Dump, ctx)?;
-        let file = &inner.file;
+        let file = inner.file.as_ref().unwrap();
        let tree_reader =
            DiskBtreeReader::<_, KEY_SIZE>::new(inner.index_start_blk, inner.index_root_blk, file);

-        tree_reader.dump().await?;
+        tree_reader.dump()?;

        tree_reader.visit(&[0u8; KEY_SIZE], VisitDirection::Forwards, |key, value| {
            println!("key: {} offset {}", hex::encode(key), value);
@@ -186,7 +189,7 @@ impl Layer for ImageLayer {
    }

    /// Look up given page in the file
-    async fn get_value_reconstruct_data(
+    fn get_value_reconstruct_data(
        &self,
        key: Key,
        lsn_range: Range<Lsn>,
@@ -199,7 +202,7 @@ impl Layer for ImageLayer {

        let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?;

-        let file = &inner.file;
+        let file = inner.file.as_ref().unwrap();
        let tree_reader = DiskBtreeReader::new(inner.index_start_blk, inner.index_root_blk, file);

        let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
@@ -318,26 +321,52 @@ impl ImageLayer {
    /// Open the underlying file and read the metadata into memory, if it's
    /// not loaded already.
    ///
-    fn load(&self, access_kind: LayerAccessKind, ctx: &RequestContext) -> Result<&ImageLayerInner> {
+    fn load(
+        &self,
+        access_kind: LayerAccessKind,
+        ctx: &RequestContext,
+    ) -> Result<RwLockReadGuard<ImageLayerInner>> {
        self.access_stats
            .record_access(access_kind, ctx.task_kind());
        loop {
-            if let Some(inner) = self.inner.get() {
+            // Quick exit if already loaded
+            let inner = self.inner.read().unwrap();
+            if inner.loaded {
                return Ok(inner);
            }
-            self.inner
-                .get_or_try_init(|| self.load_inner())
-                .with_context(|| format!("Failed to load image layer {}", self.path().display()))?;
+
+            // Need to open the file and load the metadata. Upgrade our lock to
+            // a write lock. (Or rather, release and re-lock in write mode.)
+            drop(inner);
+            let mut inner = self.inner.write().unwrap();
+            if !inner.loaded {
+                self.load_inner(&mut inner).with_context(|| {
+                    format!("Failed to load image layer {}", self.path().display())
+                })?
+            } else {
+                // Another thread loaded it while we were not holding the lock.
+            }
+
+            // We now have the file open and loaded. The std library RwLock has
+            // no way to downgrade a write lock to a read lock, so we have to
+            // release the write lock and re-lock in read mode at the top of
+            // the loop. (`load_inner` only borrows the guard, so it is still
+            // held here and must be dropped explicitly.) While the lock is not
+            // held, another thread could unload again, so we re-check and retry.
+            drop(inner);
        }
    }

-    fn load_inner(&self) -> Result<ImageLayerInner> {
+    fn load_inner(&self, inner: &mut ImageLayerInner) -> Result<()> {
        let path = self.path();

        // Open the file if it's not open already.
-        let file = VirtualFile::open(&path)
-            .with_context(|| format!("Failed to open file '{}'", path.display()))?;
-        let file = FileBlockReader::new(file);
+        if inner.file.is_none() {
+            let file = VirtualFile::open(&path)
+                .with_context(|| format!("Failed to open file '{}'", path.display()))?;
+            inner.file = Some(FileBlockReader::new(file));
+        }
+        let file = inner.file.as_mut().unwrap();
        let summary_blk = file.read_blk(0)?;
        let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;

@@ -365,11 +394,10 @@ impl ImageLayer {
            }
        }

-        Ok(ImageLayerInner {
-            index_start_blk: actual_summary.index_start_blk,
-            index_root_blk: actual_summary.index_root_blk,
-            file,
-        })
+        inner.index_start_blk = actual_summary.index_start_blk;
+        inner.index_root_blk = actual_summary.index_root_blk;
+        inner.loaded = true;
+        Ok(())
    }

    /// Create an ImageLayer struct representing an existing file on disk
@@ -393,7 +421,12 @@ impl ImageLayer {
            ), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
            lsn: filename.lsn,
            access_stats,
-            inner: OnceCell::new(),
+            inner: RwLock::new(ImageLayerInner {
+                loaded: false,
+                file: None,
+                index_start_blk: 0,
+                index_root_blk: 0,
+            }),
        }
    }

@@ -420,7 +453,12 @@ impl ImageLayer {
            ), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
            lsn: summary.lsn,
            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
-            inner: OnceCell::new(),
+            inner: RwLock::new(ImageLayerInner {
+                file: None,
+                loaded: false,
+                index_start_blk: 0,
+                index_root_blk: 0,
+            }),
        })
    }

@@ -581,7 +619,12 @@ impl ImageLayerWriterInner {
            desc,
            lsn: self.lsn,
            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
-            inner: OnceCell::new(),
+            inner: RwLock::new(ImageLayerInner {
+                loaded: false,
+                file: None,
+                index_start_blk,
+                index_root_blk,
+            }),
        };

        // fsync the file
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -7,7 +7,7 @@
 use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::repository::{Key, Value};
-use crate::tenant::blob_io::BlobWriter;
+use crate::tenant::blob_io::{BlobCursor, BlobWriter};
 use crate::tenant::block_io::BlockReader;
 use crate::tenant::ephemeral_file::EphemeralFile;
 use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState};
@@ -110,7 +110,6 @@ impl InMemoryLayer {
    }
 }

-#[async_trait::async_trait]
 impl Layer for InMemoryLayer {
    fn get_key_range(&self) -> Range<Key> {
        Key::MIN..Key::MAX
@@ -133,7 +132,7 @@ impl Layer for InMemoryLayer {
    }

    /// debugging function to print out the contents of the layer
-    async fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> {
+    fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> {
        let inner = self.inner.read().unwrap();

        let end_str = inner
@@ -184,7 +183,7 @@ impl Layer for InMemoryLayer {
    }

    /// Look up given value in the layer.
-    async fn get_value_reconstruct_data(
+    fn get_value_reconstruct_data(
        &self,
        key: Key,
        lsn_range: Range<Lsn>,
--- a/pageserver/src/tenant/storage_layer/remote_layer.rs
+++ b/pageserver/src/tenant/storage_layer/remote_layer.rs
@@ -65,9 +65,8 @@ impl std::fmt::Debug for RemoteLayer {
    }
 }

-#[async_trait::async_trait]
 impl Layer for RemoteLayer {
-    async fn get_value_reconstruct_data(
+    fn get_value_reconstruct_data(
        &self,
        _key: Key,
        _lsn_range: Range<Lsn>,
@@ -78,7 +77,7 @@ impl Layer for RemoteLayer {
    }

    /// debugging function to print out the contents of the layer
-    async fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
+    fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
        println!(
            "----- remote layer for ten {} tli {} keys {}-{} lsn {}-{} is_delta {} is_incremental {} size {} ----",
            self.desc.tenant_id,
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -111,7 +111,7 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                Duration::from_secs(10)
            } else {
                // Run compaction
-                if let Err(e) = tenant.compaction_iteration(&cancel, &ctx).await {
+                if let Err(e) = tenant.compaction_iteration(&ctx).await {
                    error!("Compaction failed, retrying in {:?}: {e:?}", wait_duration);
                    wait_duration
                } else {
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1,4 +1,3 @@
-pub mod delete;
 mod eviction_task;
 pub mod layer_manager;
 mod logical_size;
@@ -80,7 +79,6 @@ use crate::METADATA_FILE_NAME;
 use crate::ZERO_PAGE;
 use crate::{is_temporary, task_mgr};

-use self::delete::DeleteTimelineFlow;
 pub(super) use self::eviction_task::EvictionTaskTenantState;
 use self::eviction_task::EvictionTaskTimelineState;
 use self::layer_manager::LayerManager;
@@ -239,10 +237,11 @@ pub struct Timeline {

    /// Layer removal lock.
    /// A lock to ensure that no layer of the timeline is removed concurrently by other tasks.
-    /// This lock is acquired in [`Timeline::gc`] and [`Timeline::compact`].
-    /// This is an `Arc<Mutex>` lock because we need an owned
+    /// This lock is acquired in [`Timeline::gc`], [`Timeline::compact`],
+    /// and [`Tenant::delete_timeline`]. This is an `Arc<Mutex>` lock because we need an owned
    /// lock guard in functions that will be spawned to tokio I/O pool (which requires `'static`).
-    /// Note that [`DeleteTimelineFlow`] uses `delete_progress` field.
+    ///
+    /// [`Tenant::delete_timeline`]: super::Tenant::delete_timeline
    pub(super) layer_removal_cs: Arc<tokio::sync::Mutex<()>>,

    // Needed to ensure that we can't create a branch at a point that was already garbage collected
@@ -284,7 +283,7 @@ pub struct Timeline {

    /// Prevent two tasks from deleting the timeline at the same time. If held, the
    /// timeline is being deleted. If 'true', the timeline has already been deleted.
-    pub delete_progress: Arc<tokio::sync::Mutex<DeleteTimelineFlow>>,
+    pub delete_lock: Arc<tokio::sync::Mutex<bool>>,

    eviction_task_timeline_state: tokio::sync::Mutex<EvictionTaskTimelineState>,

@@ -294,10 +293,6 @@ pub struct Timeline {
    /// Completion shared between all timelines loaded during startup; used to delay heavier
    /// background tasks until some logical sizes have been calculated.
    initial_logical_size_attempt: Mutex<Option<completion::Completion>>,
-
-    /// Load or creation time information about the disk_consistent_lsn and when the loading
-    /// happened. Used for consumption metrics.
-    pub(crate) loaded_at: (Lsn, SystemTime),
 }

 pub struct WalReceiverInfo {
@@ -339,7 +334,7 @@ pub struct GcInfo {
 #[derive(thiserror::Error)]
 pub enum PageReconstructError {
    #[error(transparent)]
-    Other(#[from] anyhow::Error),
+    Other(#[from] anyhow::Error), // source and Display delegate to anyhow::Error

    /// The operation would require downloading a layer that is missing locally.
    NeedsDownload(TenantTimelineId, LayerFileName),
@@ -480,7 +475,7 @@ impl Timeline {
            img: cached_page_img,
        };

-        let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME.start_timer();
+        let timer = self.metrics.get_reconstruct_data_time_histo.start_timer();
        self.get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx)
            .await?;
        timer.stop_and_record();
@@ -560,7 +555,7 @@ impl Timeline {
            "wait_lsn cannot be called in WAL receiver"
        );

-        let _timer = crate::metrics::WAIT_LSN_TIME.start_timer();
+        let _timer = self.metrics.wait_lsn_time_histo.start_timer();

        match self
            .last_record_lsn
@@ -616,46 +611,9 @@ impl Timeline {
    }

    /// Outermost timeline compaction operation; downloads needed layers.
-    pub async fn compact(
-        self: &Arc<Self>,
-        cancel: &CancellationToken,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    pub async fn compact(self: &Arc<Self>, ctx: &RequestContext) -> anyhow::Result<()> {
        const ROUNDS: usize = 2;

-        static CONCURRENT_COMPACTIONS: once_cell::sync::Lazy<tokio::sync::Semaphore> =
-            once_cell::sync::Lazy::new(|| {
-                let total_threads = *task_mgr::BACKGROUND_RUNTIME_WORKER_THREADS;
-                let permits = usize::max(
-                    1,
-                    // while a lot of the work is done on spawn_blocking, we still do
-                    // repartitioning in the async context. this should give leave us some workers
-                    // unblocked to be blocked on other work, hopefully easing any outside visible
-                    // effects of restarts.
-                    //
-                    // 6/8 is a guess; previously we ran with unlimited 8 and more from
-                    // spawn_blocking.
-                    (total_threads * 3).checked_div(4).unwrap_or(0),
-                );
-                assert_ne!(permits, 0, "we will not be adding in permits later");
-                assert!(
-                    permits < total_threads,
-                    "need threads avail for shorter work"
-                );
-                tokio::sync::Semaphore::new(permits)
-            });
-
-        // this wait probably never needs any "long time spent" logging, because we already nag if
-        // compaction task goes over it's period (20s) which is quite often in production.
-        let _permit = tokio::select! {
-            permit = CONCURRENT_COMPACTIONS.acquire() => {
-                permit
-            },
-            _ = cancel.cancelled() => {
-                return Ok(());
-            }
-        };
-
        let last_record_lsn = self.get_last_record_lsn();

        // Last record Lsn could be zero in case the timeline was just created
@@ -713,9 +671,11 @@ impl Timeline {

            let mut failed = 0;

+            let mut cancelled = pin!(task_mgr::shutdown_watcher());
+
            loop {
                tokio::select! {
-                    _ = cancel.cancelled() => anyhow::bail!("Cancelled while downloading remote layers"),
+                    _ = &mut cancelled => anyhow::bail!("Cancelled while downloading remote layers"),
                    res = downloads.next() => {
                        match res {
                            Some(Ok(())) => {},
@@ -930,7 +890,7 @@ impl Timeline {
                    new_state,
                    TimelineState::Stopping | TimelineState::Broken { .. }
                ) {
-                    // drop the completion guard, if any; it might be holding off the completion
+                    // drop the completion guard, if any; it might be holding off the completion
                    // forever needlessly
                    self.initial_logical_size_attempt
                        .lock()
@@ -1365,10 +1325,9 @@ impl Timeline {
        pg_version: u32,
        initial_logical_size_can_start: Option<completion::Barrier>,
        initial_logical_size_attempt: Option<completion::Completion>,
-        state: TimelineState,
    ) -> Arc<Self> {
        let disk_consistent_lsn = metadata.disk_consistent_lsn();
-        let (state, _) = watch::channel(state);
+        let (state, _) = watch::channel(TimelineState::Loading);

        let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0);
        let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(())));
@@ -1408,8 +1367,6 @@ impl Timeline {
                last_freeze_at: AtomicLsn::new(disk_consistent_lsn.0),
                last_freeze_ts: RwLock::new(Instant::now()),

-                loaded_at: (disk_consistent_lsn, SystemTime::now()),
-
                ancestor_timeline: ancestor,
                ancestor_lsn: metadata.ancestor_lsn(),

@@ -1461,7 +1418,7 @@ impl Timeline {
                eviction_task_timeline_state: tokio::sync::Mutex::new(
                    EvictionTaskTimelineState::default(),
                ),
-                delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTimelineFlow::default())),
+                delete_lock: Arc::new(tokio::sync::Mutex::new(false)),

                initial_logical_size_can_start,
                initial_logical_size_attempt: Mutex::new(initial_logical_size_attempt),
@@ -1606,7 +1563,7 @@ impl Timeline {
            if let Some(imgfilename) = ImageFileName::parse_str(&fname) {
                // create an ImageLayer struct for each image file.
                if imgfilename.lsn > disk_consistent_lsn {
-                    info!(
+                    warn!(
                        "found future image layer {} on timeline {} disk_consistent_lsn is {}",
                        imgfilename, self.timeline_id, disk_consistent_lsn
                    );
@@ -1638,7 +1595,7 @@ impl Timeline {
                // is 102, then it might not have been fully flushed to disk
                // before crash.
                if deltafilename.lsn_range.end > disk_consistent_lsn + 1 {
-                    info!(
+                    warn!(
                        "found future delta layer {} on timeline {} disk_consistent_lsn is {}",
                        deltafilename, self.timeline_id, disk_consistent_lsn
                    );
@@ -1780,7 +1737,7 @@ impl Timeline {
            match remote_layer_name {
                LayerFileName::Image(imgfilename) => {
                    if imgfilename.lsn > up_to_date_disk_consistent_lsn {
-                        info!(
+                        warn!(
                        "found future image layer {} on timeline {} remote_consistent_lsn is {}",
                        imgfilename, self.timeline_id, up_to_date_disk_consistent_lsn
                    );
@@ -1805,7 +1762,7 @@ impl Timeline {
                    // is 102, then it might not have been fully flushed to disk
                    // before crash.
                    if deltafilename.lsn_range.end > up_to_date_disk_consistent_lsn + 1 {
-                        info!(
+                        warn!(
                            "found future delta layer {} on timeline {} remote_consistent_lsn is {}",
                            deltafilename, self.timeline_id, up_to_date_disk_consistent_lsn
                        );
@@ -1926,15 +1883,6 @@ impl Timeline {
    }

    fn try_spawn_size_init_task(self: &Arc<Self>, lsn: Lsn, ctx: &RequestContext) {
-        let state = self.current_state();
-        if matches!(
-            state,
-            TimelineState::Broken { .. } | TimelineState::Stopping
-        ) {
-            // Can happen when timeline detail endpoint is used when deletion is ongoing (or its broken).
-            return;
-        }
-
        let permit = match Arc::clone(&self.current_logical_size.initial_size_computation)
            .try_acquire_owned()
        {
@@ -2304,9 +2252,8 @@ impl Timeline {
        let mut timeline_owned;
        let mut timeline = self;

-        let mut read_count = scopeguard::guard(0, |cnt| {
-            crate::metrics::READ_NUM_FS_LAYERS.observe(cnt as f64)
-        });
+        let mut read_count =
+            scopeguard::guard(0, |cnt| self.metrics.read_num_fs_layers.observe(cnt as f64));

        // For debugging purposes, collect the path of layers that we traversed
        // through. It's included in the error message if we fail to find the key.
@@ -2440,15 +2387,12 @@ impl Timeline {
                            // Get all the data needed to reconstruct the page version from this layer.
                            // But if we have an older cached page image, no need to go past that.
                            let lsn_floor = max(cached_lsn + 1, start_lsn);
-                            result = match open_layer
-                                .get_value_reconstruct_data(
-                                    key,
-                                    lsn_floor..cont_lsn,
-                                    reconstruct_state,
-                                    ctx,
-                                )
-                                .await
-                            {
+                            result = match open_layer.get_value_reconstruct_data(
+                                key,
+                                lsn_floor..cont_lsn,
+                                reconstruct_state,
+                                ctx,
+                            ) {
                                Ok(result) => result,
                                Err(e) => return Err(PageReconstructError::from(e)),
                            };
@@ -2470,15 +2414,12 @@ impl Timeline {
                        if cont_lsn > start_lsn {
                            //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display());
                            let lsn_floor = max(cached_lsn + 1, start_lsn);
-                            result = match frozen_layer
-                                .get_value_reconstruct_data(
-                                    key,
-                                    lsn_floor..cont_lsn,
-                                    reconstruct_state,
-                                    ctx,
-                                )
-                                .await
-                            {
+                            result = match frozen_layer.get_value_reconstruct_data(
+                                key,
+                                lsn_floor..cont_lsn,
+                                reconstruct_state,
+                                ctx,
+                            ) {
                                Ok(result) => result,
                                Err(e) => return Err(PageReconstructError::from(e)),
                            };
@@ -2509,15 +2450,12 @@ impl Timeline {
                            // Get all the data needed to reconstruct the page version from this layer.
                            // But if we have an older cached page image, no need to go past that.
                            let lsn_floor = max(cached_lsn + 1, lsn_floor);
-                            result = match layer
-                                .get_value_reconstruct_data(
-                                    key,
-                                    lsn_floor..cont_lsn,
-                                    reconstruct_state,
-                                    ctx,
-                                )
-                                .await
-                            {
+                            result = match layer.get_value_reconstruct_data(
+                                key,
+                                lsn_floor..cont_lsn,
+                                reconstruct_state,
+                                ctx,
+                            ) {
                                Ok(result) => result,
                                Err(e) => return Err(PageReconstructError::from(e)),
                            };
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -1,576 +0,0 @@
-use std::{
-    ops::{Deref, DerefMut},
-    sync::Arc,
-};
-
-use anyhow::Context;
-use pageserver_api::models::TimelineState;
-use tokio::sync::OwnedMutexGuard;
-use tracing::{debug, error, info, instrument, warn, Instrument, Span};
-use utils::{
-    crashsafe, fs_ext,
-    id::{TenantId, TimelineId},
-};
-
-use crate::{
-    config::PageServerConf,
-    task_mgr::{self, TaskKind},
-    tenant::{
-        metadata::TimelineMetadata,
-        remote_timeline_client::{
-            self, PersistIndexPartWithDeletedFlagError, RemoteTimelineClient,
-        },
-        CreateTimelineCause, DeleteTimelineError, Tenant,
-    },
-    InitializationOrder,
-};
-
-use super::Timeline;
-
-/// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
-async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
-    // Stop the walreceiver first.
-    debug!("waiting for wal receiver to shutdown");
-    let maybe_started_walreceiver = { timeline.walreceiver.lock().unwrap().take() };
-    if let Some(walreceiver) = maybe_started_walreceiver {
-        walreceiver.stop().await;
-    }
-    debug!("wal receiver shutdown confirmed");
-
-    // Prevent new uploads from starting.
-    if let Some(remote_client) = timeline.remote_client.as_ref() {
-        let res = remote_client.stop();
-        match res {
-            Ok(()) => {}
-            Err(e) => match e {
-                remote_timeline_client::StopError::QueueUninitialized => {
-                    // This case shouldn't happen currently because the
-                    // load and attach code bails out if _any_ of the timeline fails to fetch its IndexPart.
-                    // That is, before we declare the Tenant as Active.
-                    // But we only allow calls to delete_timeline on Active tenants.
-                    return Err(DeleteTimelineError::Other(anyhow::anyhow!("upload queue is uninitialized, likely the timeline was in Broken state prior to this call because it failed to fetch IndexPart during load or attach, check the logs")));
-                }
-            },
-        }
-    }
-
-    // Stop & wait for the remaining timeline tasks, including upload tasks.
-    // NB: This and other delete_timeline calls do not run as a task_mgr task,
-    //     so, they are not affected by this shutdown_tasks() call.
-    info!("waiting for timeline tasks to shutdown");
-    task_mgr::shutdown_tasks(None, Some(timeline.tenant_id), Some(timeline.timeline_id)).await;
-
-    fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
-        Err(anyhow::anyhow!(
-            "failpoint: timeline-delete-before-index-deleted-at"
-        ))?
-    });
-    Ok(())
-}
-
-/// Mark timeline as deleted in S3 so we won't pick it up next time
-/// during attach or pageserver restart.
-/// See comment in persist_index_part_with_deleted_flag.
-async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
-    if let Some(remote_client) = timeline.remote_client.as_ref() {
-        match remote_client.persist_index_part_with_deleted_flag().await {
-            // If we (now, or already) marked it successfully as deleted, we can proceed
-            Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (),
-            // Bail out otherwise
-            //
-            // AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents
-            // two tasks from performing the deletion at the same time. The first task
-            // that starts deletion should run it to completion.
-            Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_))
-            | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => {
-                return Err(DeleteTimelineError::Other(anyhow::anyhow!(e)));
-            }
-        }
-    }
-    Ok(())
-}
-
-// We delete local files first, so if pageserver restarts after local files deletion then remote deletion is not continued.
-// This can be solved with inversion of these steps. But even if these steps are inverted then, when index_part.json
-// gets deleted there is no way to distinguish between "this timeline is good, we just didnt upload it to remote"
-// and "this timeline is deleted we should continue with removal of local state". So to avoid the ambiguity we use a mark file.
-// After index part is deleted presence of this mark file indentifies that it was a deletion intention.
-// So we can just remove the mark file.
-async fn create_delete_mark(
-    conf: &PageServerConf,
-    tenant_id: TenantId,
-    timeline_id: TimelineId,
-) -> Result<(), DeleteTimelineError> {
-    fail::fail_point!("timeline-delete-before-delete-mark", |_| {
-        Err(anyhow::anyhow!(
-            "failpoint: timeline-delete-before-delete-mark"
-        ))?
-    });
-    let marker_path = conf.timeline_delete_mark_file_path(tenant_id, timeline_id);
-
-    // Note: we're ok to replace existing file.
-    let _ = std::fs::OpenOptions::new()
-        .write(true)
-        .create(true)
-        .open(&marker_path)
-        .with_context(|| format!("could not create delete marker file {marker_path:?}"))?;
-
-    crashsafe::fsync_file_and_parent(&marker_path).context("sync_mark")?;
-    Ok(())
-}
-
-/// Grab the layer_removal_cs lock, and actually perform the deletion.
-///
-/// This lock prevents prevents GC or compaction from running at the same time.
-/// The GC task doesn't register itself with the timeline it's operating on,
-/// so it might still be running even though we called `shutdown_tasks`.
-///
-/// Note that there are still other race conditions between
-/// GC, compaction and timeline deletion. See
-/// <https://github.com/neondatabase/neon/issues/2671>
-///
-/// No timeout here, GC & Compaction should be responsive to the
-/// `TimelineState::Stopping` change.
-async fn delete_local_layer_files(
-    conf: &PageServerConf,
-    tenant_id: TenantId,
-    timeline: &Timeline,
-) -> anyhow::Result<()> {
-    info!("waiting for layer_removal_cs.lock()");
-    let layer_removal_guard = timeline.layer_removal_cs.lock().await;
-    info!("got layer_removal_cs.lock(), deleting layer files");
-
-    // NB: storage_sync upload tasks that reference these layers have been cancelled
-    //     by the caller.
-
-    let local_timeline_directory = conf.timeline_path(&tenant_id, &timeline.timeline_id);
-
-    fail::fail_point!("timeline-delete-before-rm", |_| {
-        Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
-    });
-
-    // NB: This need not be atomic because the deleted flag in the IndexPart
-    // will be observed during tenant/timeline load. The deletion will be resumed there.
-    //
-    // For configurations without remote storage, we guarantee crash-safety by persising delete mark file.
-    //
-    // Note that here we do not bail out on std::io::ErrorKind::NotFound.
-    // This can happen if we're called a second time, e.g.,
-    // because of a previous failure/cancellation at/after
-    // failpoint timeline-delete-after-rm.
-    //
-    // It can also happen if we race with tenant detach, because,
-    // it doesn't grab the layer_removal_cs lock.
-    //
-    // For now, log and continue.
-    // warn! level is technically not appropriate for the
-    // first case because we should expect retries to happen.
-    // But the error is so rare, it seems better to get attention if it happens.
-    //
-    // Note that metadata removal is skipped, this is not technically needed,
-    // but allows to reuse timeline loading code during resumed deletion.
-    // (we always expect that metadata is in place when timeline is being loaded)
-
-    #[cfg(feature = "testing")]
-    let mut counter = 0;
-
-    // Timeline directory may not exist if we failed to delete mark file and request was retried.
-    if !local_timeline_directory.exists() {
-        return Ok(());
-    }
-
-    let metadata_path = conf.metadata_path(&tenant_id, &timeline.timeline_id);
-
-    for entry in walkdir::WalkDir::new(&local_timeline_directory).contents_first(true) {
-        #[cfg(feature = "testing")]
-        {
-            counter += 1;
-            if counter == 2 {
-                fail::fail_point!("timeline-delete-during-rm", |_| {
-                    Err(anyhow::anyhow!("failpoint: timeline-delete-during-rm"))?
-                });
-            }
-        }
-
-        let entry = entry?;
-        if entry.path() == metadata_path {
-            debug!("found metadata, skipping");
-            continue;
-        }
-
-        if entry.path() == local_timeline_directory {
-            // Keeping directory because metedata file is still there
-            debug!("found timeline dir itself, skipping");
-            continue;
-        }
-
-        let metadata = match entry.metadata() {
-            Ok(metadata) => metadata,
-            Err(e) => {
-                if crate::is_walkdir_io_not_found(&e) {
-                    warn!(
-                        timeline_dir=?local_timeline_directory,
-                        path=?entry.path().display(),
-                        "got not found err while removing timeline dir, proceeding anyway"
-                    );
-                    continue;
-                }
-                anyhow::bail!(e);
-            }
-        };
-
-        let r = if metadata.is_dir() {
-            // There shouldnt be any directories inside timeline dir as of current layout.
-            tokio::fs::remove_dir(entry.path()).await
-        } else {
-            tokio::fs::remove_file(entry.path()).await
-        };
-
-        if let Err(e) = r {
-            if e.kind() == std::io::ErrorKind::NotFound {
-                warn!(
-                    timeline_dir=?local_timeline_directory,
-                    path=?entry.path().display(),
-                    "got not found err while removing timeline dir, proceeding anyway"
-                );
-                continue;
-            }
-            anyhow::bail!(anyhow::anyhow!(
-                "Failed to remove: {}. Error: {e}",
-                entry.path().display()
-            ));
-        }
-    }
-
-    info!("finished deleting layer files, releasing layer_removal_cs.lock()");
-    drop(layer_removal_guard);
-
-    fail::fail_point!("timeline-delete-after-rm", |_| {
-        Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
-    });
-
-    Ok(())
-}
-
-/// Removes remote layers and an index file after them.
-async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<()> {
-    if let Some(remote_client) = &timeline.remote_client {
-        remote_client.delete_all().await.context("delete_all")?
-    };
-
-    Ok(())
-}
-
-// This function removs remaining traces of a timeline on disk.
-// Namely: metadata file, timeline directory, delete mark.
-// Note: io::ErrorKind::NotFound are ignored for metadata and timeline dir.
-// delete mark should be present because it is the last step during deletion.
-// (nothing can fail after its deletion)
-async fn cleanup_remaining_timeline_fs_traces(
-    conf: &PageServerConf,
-    tenant_id: TenantId,
-    timeline_id: TimelineId,
-) -> anyhow::Result<()> {
-    // Remove local metadata
-    tokio::fs::remove_file(conf.metadata_path(&tenant_id, &timeline_id))
-        .await
-        .or_else(fs_ext::ignore_not_found)
-        .context("remove metadata")?;
-
-    fail::fail_point!("timeline-delete-after-rm-metadata", |_| {
-        Err(anyhow::anyhow!(
-            "failpoint: timeline-delete-after-rm-metadata"
-        ))?
-    });
-
-    // Remove timeline dir
-    tokio::fs::remove_dir(conf.timeline_path(&tenant_id, &timeline_id))
-        .await
-        .or_else(fs_ext::ignore_not_found)
-        .context("timeline dir")?;
-
-    fail::fail_point!("timeline-delete-after-rm-dir", |_| {
-        Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm-dir"))?
-    });
-
-    // Remove delete mark
-    tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_id, timeline_id))
-        .await
-        .context("remove delete mark")
-}
-
-/// It is important that this gets called when DeletionGuard is being held.
-/// For more context see comments in [`DeleteTimelineFlow::prepare`]
-async fn remove_timeline_from_tenant(
-    tenant: &Tenant,
-    timeline_id: TimelineId,
-    _: &DeletionGuard, // using it as a witness
-) -> anyhow::Result<()> {
-    // Remove the timeline from the map.
-    let mut timelines = tenant.timelines.lock().unwrap();
-    let children_exist = timelines
-        .iter()
-        .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
-    // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`.
-    // We already deleted the layer files, so it's probably best to panic.
-    // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart)
-    if children_exist {
-        panic!("Timeline grew children while we removed layer files");
-    }
-
-    timelines
-        .remove(&timeline_id)
-        .expect("timeline that we were deleting was concurrently removed from 'timelines' map");
-
-    drop(timelines);
-
-    Ok(())
-}
-
-/// Orchestrates timeline shut down of all timeline tasks, removes its in-memory structures,
-/// and deletes its data from both disk and s3.
-/// The sequence of steps:
-/// 1. Set deleted_at in remote index part.
-/// 2. Create local mark file.
-/// 3. Delete local files except metadata (it is simpler this way, to be able to reuse timeline initialization code that expects metadata)
-/// 4. Delete remote layers
-/// 5. Delete index part
-/// 6. Delete meta, timeline directory
-/// 7. Delete mark file
-/// It is resumable from any step in case a crash/restart occurs.
-/// There are three entrypoints to the process:
-/// 1. [`DeleteTimelineFlow::run`] this is the main one called by a management api handler.
-/// 2. [`DeleteTimelineFlow::resume_deletion`] is called during restarts when local metadata is still present
-/// and we possibly neeed to continue deletion of remote files.
-/// 3. [`DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`] is used when we deleted remote
-/// index but still have local metadata, timeline directory and delete mark.
-/// Note the only other place that messes around timeline delete mark is the logic that scans directory with timelines during tenant load.
-#[derive(Default)]
-pub enum DeleteTimelineFlow {
-    #[default]
-    NotStarted,
-    InProgress,
-    Finished,
-}
-
-impl DeleteTimelineFlow {
-    // These steps are run in the context of management api request handler.
-    // Long running steps are continued to run in the background.
-    // NB: If this fails half-way through, and is retried, the retry will go through
-    // all the same steps again. Make sure the code here is idempotent, and don't
-    // error out if some of the shutdown tasks have already been completed!
-    #[instrument(skip_all, fields(tenant_id=%tenant.tenant_id, %timeline_id))]
-    pub async fn run(
-        tenant: &Arc<Tenant>,
-        timeline_id: TimelineId,
-    ) -> Result<(), DeleteTimelineError> {
-        let (timeline, mut guard) = Self::prepare(tenant, timeline_id)?;
-
-        guard.mark_in_progress()?;
-
-        stop_tasks(&timeline).await?;
-
-        set_deleted_in_remote_index(&timeline).await?;
-
-        create_delete_mark(tenant.conf, timeline.tenant_id, timeline.timeline_id).await?;
-
-        fail::fail_point!("timeline-delete-before-schedule", |_| {
-            Err(anyhow::anyhow!(
-                "failpoint: timeline-delete-before-schedule"
-            ))?
-        });
-
-        Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
-
-        Ok(())
-    }
-
-    fn mark_in_progress(&mut self) -> anyhow::Result<()> {
-        match self {
-            Self::Finished => anyhow::bail!("Bug. Is in finished state"),
-            Self::InProgress { .. } => { /* We're in a retry */ }
-            Self::NotStarted => { /* Fresh start */ }
-        }
-
-        *self = Self::InProgress;
-
-        Ok(())
-    }
-
-    /// Shortcut to create Timeline in stopping state and spawn deletion task.
-    pub async fn resume_deletion(
-        tenant: Arc<Tenant>,
-        timeline_id: TimelineId,
-        local_metadata: &TimelineMetadata,
-        remote_client: Option<RemoteTimelineClient>,
-        init_order: Option<&InitializationOrder>,
-    ) -> anyhow::Result<()> {
-        // Note: here we even skip populating layer map. Timeline is essentially uninitialized.
-        // RemoteTimelineClient is the only functioning part.
-        let timeline = tenant
-            .create_timeline_struct(
-                timeline_id,
-                local_metadata,
-                None, // Ancestor is not needed for deletion.
-                remote_client,
-                init_order,
-                // Important. We dont pass ancestor above because it can be missing.
-                // Thus we need to skip the validation here.
-                CreateTimelineCause::Delete,
-            )
-            .context("create_timeline_struct")?;
-
-        let mut guard = DeletionGuard(
-            Arc::clone(&timeline.delete_progress)
-                .try_lock_owned()
-                .expect("cannot happen because we're the only owner"),
-        );
-
-        // We meed to do this because when console retries delete request we shouldnt answer with 404
-        // because 404 means successful deletion.
-        {
-            let mut locked = tenant.timelines.lock().unwrap();
-            locked.insert(timeline_id, Arc::clone(&timeline));
-        }
-
-        guard.mark_in_progress()?;
-
-        // Note that delete mark can be missing on resume
-        // because we create delete mark after we set deleted_at in the index part.
-        create_delete_mark(tenant.conf, tenant.tenant_id, timeline_id).await?;
-
-        Self::schedule_background(guard, tenant.conf, tenant, timeline);
-
-        Ok(())
-    }
-
-    pub async fn cleanup_remaining_timeline_fs_traces(
-        tenant: &Tenant,
-        timeline_id: TimelineId,
-    ) -> anyhow::Result<()> {
-        cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_id, timeline_id).await
-    }
-
-    fn prepare(
-        tenant: &Tenant,
-        timeline_id: TimelineId,
-    ) -> Result<(Arc<Timeline>, DeletionGuard), DeleteTimelineError> {
-        // Note the interaction between this guard and deletion guard.
-        // Here we attempt to lock deletion guard when we're holding a lock on timelines.
-        // This is important because when you take into account `remove_timeline_from_tenant`
-        // we remove timeline from memory when we still hold the deletion guard.
-        // So here when timeline deletion is finished timeline wont be present in timelines map at all
-        // which makes the following sequence impossible:
-        // T1: get preempted right before the try_lock on `Timeline::delete_progress`
-        // T2: do a full deletion, acquire and drop `Timeline::delete_progress`
-        // T1: acquire deletion lock, do another `DeleteTimelineFlow::run`
-        // For more context see this discussion: `https://github.com/neondatabase/neon/pull/4552#discussion_r1253437346`
-        let timelines = tenant.timelines.lock().unwrap();
-
-        let timeline = match timelines.get(&timeline_id) {
-            Some(t) => t,
-            None => return Err(DeleteTimelineError::NotFound),
-        };
-
-        // Ensure that there are no child timelines **attached to that pageserver**,
-        // because detach removes files, which will break child branches
-        let children: Vec<TimelineId> = timelines
-            .iter()
-            .filter_map(|(id, entry)| {
-                if entry.get_ancestor_timeline_id() == Some(timeline_id) {
-                    Some(*id)
-                } else {
-                    None
-                }
-            })
-            .collect();
-
-        if !children.is_empty() {
-            return Err(DeleteTimelineError::HasChildren(children));
-        }
-
-        // Note that using try_lock here is important to avoid a deadlock.
-        // Here we take lock on timelines and then the deletion guard.
-        // At the end of the operation we're holding the guard and need to lock timelines map
-        // to remove the timeline from it.
-        // Always if you have two locks that are taken in different order this can result in a deadlock.
-        let delete_lock_guard = DeletionGuard(
-            Arc::clone(&timeline.delete_progress)
-                .try_lock_owned()
-                .map_err(|_| DeleteTimelineError::AlreadyInProgress)?,
-        );
-
-        timeline.set_state(TimelineState::Stopping);
-
-        Ok((Arc::clone(timeline), delete_lock_guard))
-    }
-
-    fn schedule_background(
-        guard: DeletionGuard,
-        conf: &'static PageServerConf,
-        tenant: Arc<Tenant>,
-        timeline: Arc<Timeline>,
-    ) {
-        let tenant_id = timeline.tenant_id;
-        let timeline_id = timeline.timeline_id;
-
-        task_mgr::spawn(
-            task_mgr::BACKGROUND_RUNTIME.handle(),
-            TaskKind::TimelineDeletionWorker,
-            Some(tenant_id),
-            Some(timeline_id),
-            "timeline_delete",
-            false,
-            async move {
-                if let Err(err) = Self::background(guard, conf, &tenant, &timeline).await {
-                    error!("Error: {err:#}");
-                    timeline.set_broken(format!("{err:#}"))
-                };
-                Ok(())
-            }
-            .instrument({
-                let span =
-                    tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_id, timeline_id=%timeline_id);
-                span.follows_from(Span::current());
-                span
-            }),
-        );
-    }
-
-    async fn background(
-        mut guard: DeletionGuard,
-        conf: &PageServerConf,
-        tenant: &Tenant,
-        timeline: &Timeline,
-    ) -> Result<(), DeleteTimelineError> {
-        delete_local_layer_files(conf, tenant.tenant_id, timeline).await?;
-
-        delete_remote_layers_and_index(timeline).await?;
-
-        pausable_failpoint!("in_progress_delete");
-
-        cleanup_remaining_timeline_fs_traces(conf, tenant.tenant_id, timeline.timeline_id).await?;
-
-        remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?;
-
-        *guard.0 = Self::Finished;
-
-        Ok(())
-    }
-}
-
-struct DeletionGuard(OwnedMutexGuard<DeleteTimelineFlow>);
-
-impl Deref for DeletionGuard {
-    type Target = DeleteTimelineFlow;
-
-    fn deref(&self) -> &Self::Target {
-        &self.0
-    }
-}
-
-impl DerefMut for DeletionGuard {
-    fn deref_mut(&mut self) -> &mut Self::Target {
-        &mut self.0
-    }
-}
--- a/pageserver/src/tenant/timeline/uninit.rs
+++ b/pageserver/src/tenant/timeline/uninit.rs
@@ -2,9 +2,13 @@ use std::{collections::hash_map::Entry, fs, path::PathBuf, sync::Arc};

 use anyhow::Context;
 use tracing::{error, info, info_span, warn};
-use utils::{crashsafe, fs_ext, id::TimelineId, lsn::Lsn};
+use utils::{crashsafe, id::TimelineId, lsn::Lsn};

-use crate::{context::RequestContext, import_datadir, tenant::Tenant};
+use crate::{
+    context::RequestContext,
+    import_datadir,
+    tenant::{ignore_absent_files, Tenant},
+};

 use super::Timeline;

@@ -137,7 +141,7 @@ impl Drop for UninitializedTimeline<'_> {

 pub(crate) fn cleanup_timeline_directory(uninit_mark: TimelineUninitMark) {
    let timeline_path = &uninit_mark.timeline_path;
-    match fs_ext::ignore_absent_files(|| fs::remove_dir_all(timeline_path)) {
+    match ignore_absent_files(|| fs::remove_dir_all(timeline_path)) {
        Ok(()) => {
            info!("Timeline dir {timeline_path:?} removed successfully, removing the uninit mark")
        }
@@ -181,7 +185,7 @@ impl TimelineUninitMark {
        let uninit_mark_parent = uninit_mark_file
            .parent()
            .with_context(|| format!("Uninit mark file {uninit_mark_file:?} has no parent"))?;
-        fs_ext::ignore_absent_files(|| fs::remove_file(uninit_mark_file)).with_context(|| {
+        ignore_absent_files(|| fs::remove_file(uninit_mark_file)).with_context(|| {
            format!("Failed to remove uninit mark file at path {uninit_mark_file:?}")
        })?;
        crashsafe::fsync(uninit_mark_parent).context("Failed to fsync uninit mark parent")?;
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -1123,7 +1123,7 @@ mod tests {
    }

    #[tokio::test]
-    async fn lsn_wal_over_threshold_current_candidate() -> anyhow::Result<()> {
+    async fn lsn_wal_over_threshhold_current_candidate() -> anyhow::Result<()> {
        let harness = TenantHarness::create("lsn_wal_over_threshcurrent_candidate")?;
        let mut state = dummy_state(&harness).await;
        let current_lsn = Lsn(100_000).align();
@@ -1189,8 +1189,8 @@ mod tests {
    }

    #[tokio::test]
-    async fn timeout_connection_threshold_current_candidate() -> anyhow::Result<()> {
-        let harness = TenantHarness::create("timeout_connection_threshold_current_candidate")?;
+    async fn timeout_connection_threshhold_current_candidate() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("timeout_connection_threshhold_current_candidate")?;
        let mut state = dummy_state(&harness).await;
        let current_lsn = Lsn(100_000).align();
        let now = Utc::now().naive_utc();
@@ -1252,8 +1252,8 @@ mod tests {
    }

    #[tokio::test]
-    async fn timeout_wal_over_threshold_current_candidate() -> anyhow::Result<()> {
-        let harness = TenantHarness::create("timeout_wal_over_threshold_current_candidate")?;
+    async fn timeout_wal_over_threshhold_current_candidate() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("timeout_wal_over_threshhold_current_candidate")?;
        let mut state = dummy_state(&harness).await;
        let current_lsn = Lsn(100_000).align();
        let new_lsn = Lsn(100_100).align();
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -149,10 +149,12 @@ impl OpenFiles {
        // old file.
        //
        if let Some(old_file) = slot_guard.file.take() {
-            // the normal path of dropping VirtualFile uses "close", use "close-by-replace" here to
-            // distinguish the two.
+            // We do not have information about tenant_id/timeline_id of evicted file.
+            // It is possible to store path together with file or use filepath crate,
+            // but as far as close() is not expected to be fast, it is not so critical to gather
+            // precise per-tenant statistic here.
            STORAGE_IO_TIME
-                .with_label_values(&["close-by-replace"])
+                .with_label_values(&["close", "-", "-"])
                .observe_closure_duration(|| drop(old_file));
        }

@@ -206,7 +208,7 @@ impl VirtualFile {
        }
        let (handle, mut slot_guard) = get_open_files().find_victim_slot();
        let file = STORAGE_IO_TIME
-            .with_label_values(&["open"])
+            .with_label_values(&["open", &tenant_id, &timeline_id])
            .observe_closure_duration(|| open_options.open(path))?;

        // Strip all options other than read and write.
@@ -269,7 +271,7 @@ impl VirtualFile {
                            // Found a cached file descriptor.
                            slot.recently_used.store(true, Ordering::Relaxed);
                            return Ok(STORAGE_IO_TIME
-                                .with_label_values(&[op])
+                                .with_label_values(&[op, &self.tenant_id, &self.timeline_id])
                                .observe_closure_duration(|| func(file)));
                        }
                    }
@@ -296,12 +298,12 @@ impl VirtualFile {

        // Open the physical file
        let file = STORAGE_IO_TIME
-            .with_label_values(&["open"])
+            .with_label_values(&["open", &self.tenant_id, &self.timeline_id])
            .observe_closure_duration(|| self.open_options.open(&self.path))?;

        // Perform the requested operation on it
        let result = STORAGE_IO_TIME
-            .with_label_values(&[op])
+            .with_label_values(&[op, &self.tenant_id, &self.timeline_id])
            .observe_closure_duration(|| func(&file));

        // Store the File in the slot and update the handle in the VirtualFile
@@ -331,11 +333,13 @@ impl Drop for VirtualFile {
        let mut slot_guard = slot.inner.write().unwrap();
        if slot_guard.tag == handle.tag {
            slot.recently_used.store(false, Ordering::Relaxed);
-            // there is also operation "close-by-replace" for closes done on eviction for
-            // comparison.
+            // Unlike files evicted by the replacement algorithm, here
+            // we group close time by tenant_id/timeline_id.
+            // This allows comparing the number/time of "normal" file closes
+            // with file evictions.
            STORAGE_IO_TIME
-                .with_label_values(&["close"])
-                .observe_closure_duration(|| drop(slot_guard.file.take()));
+                .with_label_values(&["close", &self.tenant_id, &self.timeline_id])
+                .observe_closure_duration(|| slot_guard.file.take());
        }
    }
 }
--- a/pgxn/neon/libpqwalproposer.c
+++ b/pgxn/neon/libpqwalproposer.c
@@ -292,7 +292,7 @@ walprop_async_read(WalProposerConn *conn, char **buf, int *amount)
 	/*
 	 * The docs for PQgetCopyData list the return values as: 0 if the copy is
 	 * still in progress, but no "complete row" is available -1 if the copy is
-	 * done -2 if an error occurred (> 0) if it was successful; that value is
+	 * done -2 if an error occurred (> 0) if it was successful; that value is
 	 * the amount transferred.
 	 *
 	 * The protocol we use between walproposer and safekeeper means that we
@@ -353,7 +353,7 @@ walprop_async_write(WalProposerConn *conn, void const *buf, size_t size)
 	/*
 	 * The docs for PQputcopyData list the return values as: 1 if the data was
 	 * queued, 0 if it was not queued because of full buffers, or -1 if an
-	 * error occurred
+	 * error occurred
 	 */
 	result = PQputCopyData(conn->pg_conn, buf, size);

--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -788,7 +788,7 @@ ReconnectSafekeepers(void)

 /*
 * Performs the logic for advancing the state machine of the specified safekeeper,
- * given that a certain set of events has occurred.
+ * given that a certain set of events has occurred.
 */
 static void
 AdvancePollState(Safekeeper *sk, uint32 events)
--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -23,7 +23,7 @@
 									 * message header */

 /*
- * In the spirit of WL_SOCKET_READABLE and others, this corresponds to no events having occurred,
+ * In the spirit of WL_SOCKET_READABLE and others, this corresponds to no events having occurred,
 * because all WL_* events are given flags equal to some (1 << i), starting from i = 0
 */
 #define WL_NO_EVENTS 0
@@ -317,7 +317,7 @@ typedef struct AppendResponse
 	/* this is a criterion for walproposer --sync mode exit */
 	XLogRecPtr	commitLsn;
 	HotStandbyFeedback hs;
-	/* Feedback received from pageserver includes standby_status_update fields */
+	/* Feedback received from pageserver includes standby_status_update fields */
 	/* and custom neon feedback. */
 	/* This part of the message is extensible. */
 	PageserverFeedback rf;
--- a/poetry.lock
+++ b/poetry.lock
@@ -740,13 +740,13 @@ typing-extensions = ">=4.1.0"

 [[package]]
 name = "certifi"
-version = "2023.7.22"
+version = "2022.12.7"
 description = "Python package for providing Mozilla's CA Bundle."
 optional = false
 python-versions = ">=3.6"
 files = [
-    {file = "certifi-2023.7.22-py3-none-any.whl", hash = "sha256:92d6037539857d8206b8f6ae472e8b77db8058fec5937a1ef3f54304089edbb9"},
-    {file = "certifi-2023.7.22.tar.gz", hash = "sha256:539cc1d13202e33ca466e88b2807e29f4c13049d6d87031a3c110744495cb082"},
+    {file = "certifi-2022.12.7-py3-none-any.whl", hash = "sha256:4ad3232f5e926d6718ec31cfc1fcadfde020920e278684144551c91769c7bc18"},
+    {file = "certifi-2022.12.7.tar.gz", hash = "sha256:35824b4c3a97115964b408844d64aa14db1cc518f6562e8d7261699d1350a9e3"},
 ]

 [[package]]
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -29,9 +29,9 @@ metrics.workspace = true
 once_cell.workspace = true
 opentelemetry.workspace = true
 parking_lot.workspace = true
+pbkdf2.workspace = true
 pin-project-lite.workspace = true
 postgres_backend.workspace = true
-postgres-protocol.workspace = true
 pq_proto.workspace = true
 prometheus.workspace = true
 rand.workspace = true
@@ -65,13 +65,10 @@ webpki-roots.workspace = true
 x509-parser.workspace = true
 native-tls.workspace = true
 postgres-native-tls.workspace = true
-tokio-native-tls = "0.3.1"

 workspace_hack.workspace = true
 tokio-util.workspace = true

-fallible-iterator = "0.2.0"
-
 [dev-dependencies]
 rcgen.workspace = true
 rstest.workspace = true
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -53,12 +53,6 @@ pub enum BackendType<'a, T> {
    Postgres(Cow<'a, console::provider::mock::Api>, T),
    /// Authentication via a web browser.
    Link(Cow<'a, url::ApiUrl>),
-    /// Test backend.
-    Test(&'a dyn TestBackend),
-}
-
-pub trait TestBackend: Send + Sync + 'static {
-    fn wake_compute(&self) -> Result<CachedNodeInfo, console::errors::WakeComputeError>;
 }

 impl std::fmt::Display for BackendType<'_, ()> {
@@ -68,7 +62,6 @@ impl std::fmt::Display for BackendType<'_, ()> {
            Console(endpoint, _) => fmt.debug_tuple("Console").field(&endpoint.url()).finish(),
            Postgres(endpoint, _) => fmt.debug_tuple("Postgres").field(&endpoint.url()).finish(),
            Link(url) => fmt.debug_tuple("Link").field(&url.as_str()).finish(),
-            Test(_) => fmt.debug_tuple("Test").finish(),
        }
    }
 }
@@ -82,7 +75,6 @@ impl<T> BackendType<'_, T> {
            Console(c, x) => Console(Cow::Borrowed(c), x),
            Postgres(c, x) => Postgres(Cow::Borrowed(c), x),
            Link(c) => Link(Cow::Borrowed(c)),
-            Test(x) => Test(*x),
        }
    }
 }
@@ -97,7 +89,6 @@ impl<'a, T> BackendType<'a, T> {
            Console(c, x) => Console(c, f(x)),
            Postgres(c, x) => Postgres(c, f(x)),
            Link(c) => Link(c),
-            Test(x) => Test(x),
        }
    }
 }
@@ -111,7 +102,6 @@ impl<'a, T, E> BackendType<'a, Result<T, E>> {
            Console(c, x) => x.map(|x| Console(c, x)),
            Postgres(c, x) => x.map(|x| Postgres(c, x)),
            Link(c) => Ok(Link(c)),
-            Test(x) => Ok(Test(x)),
        }
    }
 }
@@ -157,7 +147,6 @@ impl BackendType<'_, ClientCredentials<'_>> {
            Console(_, creds) => creds.project.clone(),
            Postgres(_, creds) => creds.project.clone(),
            Link(_) => Some("link".to_owned()),
-            Test(_) => Some("test".to_owned()),
        }
    }
    /// Authenticate the client via the requested backend, possibly using credentials.
@@ -199,9 +188,6 @@ impl BackendType<'_, ClientCredentials<'_>> {
                    .await?
                    .map(CachedNodeInfo::new_uncached)
            }
-            Test(_) => {
-                unreachable!("this function should never be called in the test backend")
-            }
        };

        info!("user successfully authenticated");
@@ -220,7 +206,6 @@ impl BackendType<'_, ClientCredentials<'_>> {
            Console(api, creds) => api.wake_compute(extra, creds).map_ok(Some).await,
            Postgres(api, creds) => api.wake_compute(extra, creds).map_ok(Some).await,
            Link(_) => Ok(None),
-            Test(x) => x.wake_compute().map(Some),
        }
    }
 }
--- a/proxy/src/auth/backend/classic.rs
+++ b/proxy/src/auth/backend/classic.rs
@@ -1,11 +1,8 @@
-use std::ops::ControlFlow;
-
 use super::AuthSuccess;
 use crate::{
    auth::{self, AuthFlow, ClientCredentials},
    compute,
    console::{self, AuthInfo, CachedNodeInfo, ConsoleReqExtra},
-    proxy::handle_try_wake,
    sasl, scram,
    stream::PqStream,
 };
@@ -51,16 +48,7 @@ pub(super) async fn authenticate(
        }
    };

-    info!("compute node's state has likely changed; requesting a wake-up");
-    let mut num_retries = 0;
-    let mut node = loop {
-        let wake_res = api.wake_compute(extra, creds).await;
-        match handle_try_wake(wake_res, num_retries)? {
-            ControlFlow::Continue(_) => num_retries += 1,
-            ControlFlow::Break(n) => break n,
-        }
-        info!(num_retries, "retrying wake compute");
-    };
+    let mut node = api.wake_compute(extra, creds).await?;
    if let Some(keys) = scram_keys {
        use tokio_postgres::config::AuthKeys;
        node.config.auth_keys(AuthKeys::ScramSha256(keys));
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -48,14 +48,6 @@ impl ClientCredentials<'_> {
 }

 impl<'a> ClientCredentials<'a> {
-    #[cfg(test)]
-    pub fn new_noop() -> Self {
-        ClientCredentials {
-            user: "",
-            project: None,
-        }
-    }
-
    pub fn parse(
        params: &'a StartupMessageParams,
        sni: Option<&str>,
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -14,7 +14,6 @@ pub mod errors {
    use crate::{
        error::{io_error, UserFacingError},
        http,
-        proxy::ShouldRetry,
    };
    use thiserror::Error;

@@ -73,24 +72,6 @@ pub mod errors {
        }
    }

-    impl ShouldRetry for ApiError {
-        fn could_retry(&self) -> bool {
-            match self {
-                // retry some transport errors
-                Self::Transport(io) => io.could_retry(),
-                // retry some temporary failures because the compute was in a bad state
-                // (bad request can be returned when the endpoint was in transition)
-                Self::Console {
-                    status: http::StatusCode::BAD_REQUEST | http::StatusCode::LOCKED,
-                    ..
-                } => true,
-                // retry server errors
-                Self::Console { status, .. } if status.is_server_error() => true,
-                _ => false,
-            }
-        }
-    }
-
    impl From<reqwest::Error> for ApiError {
        fn from(e: reqwest::Error) -> Self {
            io_error(e).into()
--- a/proxy/src/http/conn_pool.rs
+++ b/proxy/src/http/conn_pool.rs
@@ -6,7 +6,7 @@ use std::fmt;
 use std::{collections::HashMap, sync::Arc};
 use tokio::time;

-use crate::{auth, console, pg_client};
+use crate::{auth, console};
 use crate::{compute, config};

 use super::sql_over_http::MAX_RESPONSE_SIZE;
@@ -41,10 +41,8 @@ impl fmt::Display for ConnInfo {
    }
 }

-type PgConn =
-    pg_client::connection::Connection<tokio_postgres::Socket, tokio_postgres::tls::NoTlsStream>;
 struct ConnPoolEntry {
-    conn: PgConn,
+    conn: tokio_postgres::Client,
    _last_access: std::time::Instant,
 }

@@ -80,8 +78,12 @@ impl GlobalConnPool {
        })
    }

-    pub async fn get(&self, conn_info: &ConnInfo, force_new: bool) -> anyhow::Result<PgConn> {
-        let mut client: Option<PgConn> = None;
+    pub async fn get(
+        &self,
+        conn_info: &ConnInfo,
+        force_new: bool,
+    ) -> anyhow::Result<tokio_postgres::Client> {
+        let mut client: Option<tokio_postgres::Client> = None;

        if !force_new {
            let pool = self.get_endpoint_pool(&conn_info.hostname).await;
@@ -112,7 +114,11 @@ impl GlobalConnPool {
        }
    }

-    pub async fn put(&self, conn_info: &ConnInfo, client: PgConn) -> anyhow::Result<()> {
+    pub async fn put(
+        &self,
+        conn_info: &ConnInfo,
+        client: tokio_postgres::Client,
+    ) -> anyhow::Result<()> {
        let pool = self.get_endpoint_pool(&conn_info.hostname).await;

        // return connection to the pool
@@ -185,7 +191,7 @@ struct TokioMechanism<'a> {

 #[async_trait]
 impl ConnectMechanism for TokioMechanism<'_> {
-    type Connection = PgConn;
+    type Connection = tokio_postgres::Client;
    type ConnectError = tokio_postgres::Error;
    type Error = anyhow::Error;

@@ -207,7 +213,7 @@ impl ConnectMechanism for TokioMechanism<'_> {
 async fn connect_to_compute(
    config: &config::ProxyConfig,
    conn_info: &ConnInfo,
-) -> anyhow::Result<PgConn> {
+) -> anyhow::Result<tokio_postgres::Client> {
    let tls = config.tls_config.as_ref();
    let common_names = tls.and_then(|tls| tls.common_names.clone());

@@ -245,7 +251,7 @@ async fn connect_to_compute_once(
    node_info: &console::CachedNodeInfo,
    conn_info: &ConnInfo,
    timeout: time::Duration,
-) -> Result<PgConn, tokio_postgres::Error> {
+) -> Result<tokio_postgres::Client, tokio_postgres::Error> {
    let mut config = (*node_info.config).clone();

    let (client, connection) = config
@@ -257,13 +263,11 @@ async fn connect_to_compute_once(
        .connect(tokio_postgres::NoTls)
        .await?;

-    let stream = connection.stream.into_inner();
+    tokio::spawn(async move {
+        if let Err(e) = connection.await {
+            error!("connection error: {}", e);
+        }
+    });

-    // tokio::spawn(async move {
-    //     if let Err(e) = connection.await {
-    //         error!("connection error: {}", e);
-    //     }
-    // });
-
-    Ok(PgConn::new(stream))
+    Ok(client)
 }
--- a/proxy/src/http/sql_over_http.rs
+++ b/proxy/src/http/sql_over_http.rs
@@ -1,38 +1,19 @@
-use std::io::ErrorKind;
 use std::sync::Arc;

-use anyhow::bail;
-use bytes::BufMut;
-use fallible_iterator::FallibleIterator;
 use futures::pin_mut;
 use futures::StreamExt;
-use hashbrown::HashMap;
 use hyper::body::HttpBody;
 use hyper::http::HeaderName;
 use hyper::http::HeaderValue;
 use hyper::{Body, HeaderMap, Request};
-use postgres_protocol::message::backend::DataRowBody;
-use postgres_protocol::message::backend::ReadyForQueryBody;
 use serde_json::json;
 use serde_json::Map;
 use serde_json::Value;
-use tokio::io::AsyncRead;
-use tokio::io::AsyncWrite;
 use tokio_postgres::types::Kind;
 use tokio_postgres::types::Type;
-use tokio_postgres::GenericClient;
-use tokio_postgres::IsolationLevel;
 use tokio_postgres::Row;
-use tokio_postgres::RowStream;
-use tokio_postgres::Statement;
 use url::Url;

-use crate::pg_client;
-use crate::pg_client::codec::FrontendMessage;
-use crate::pg_client::connection;
-use crate::pg_client::connection::RequestMessages;
-use crate::pg_client::prepare::TypeinfoPreparedQueries;
-
 use super::conn_pool::ConnInfo;
 use super::conn_pool::GlobalConnPool;

@@ -42,21 +23,12 @@ struct QueryData {
    params: Vec<serde_json::Value>,
 }

-#[derive(serde::Deserialize)]
-#[serde(untagged)]
-enum Payload {
-    Single(QueryData),
-    Batch(Vec<QueryData>),
-}
-
 pub const MAX_RESPONSE_SIZE: usize = 1024 * 1024; // 1 MB
 const MAX_REQUEST_SIZE: u64 = 1024 * 1024; // 1 MB

 static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output");
 static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode");
 static ALLOW_POOL: HeaderName = HeaderName::from_static("neon-pool-opt-in");
-static TXN_ISOLATION_LEVEL: HeaderName = HeaderName::from_static("neon-batch-isolation-level");
-static TXN_READ_ONLY: HeaderName = HeaderName::from_static("neon-batch-read-only");

 static HEADER_VALUE_TRUE: HeaderValue = HeaderValue::from_static("true");

@@ -190,7 +162,7 @@ pub async fn handle(
    request: Request<Body>,
    sni_hostname: Option<String>,
    conn_pool: Arc<GlobalConnPool>,
-) -> anyhow::Result<(Value, HashMap<HeaderName, HeaderValue>)> {
+) -> anyhow::Result<Value> {
    //
    // Determine the destination and connection params
    //
@@ -205,23 +177,6 @@ pub async fn handle(
    // Allow connection pooling only if explicitly requested
    let allow_pool = headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE);

-    // isolation level and read only
-
-    let txn_isolation_level_raw = headers.get(&TXN_ISOLATION_LEVEL).cloned();
-    let txn_isolation_level = match txn_isolation_level_raw {
-        Some(ref x) => Some(match x.as_bytes() {
-            b"Serializable" => IsolationLevel::Serializable,
-            b"ReadUncommitted" => IsolationLevel::ReadUncommitted,
-            b"ReadCommitted" => IsolationLevel::ReadCommitted,
-            b"RepeatableRead" => IsolationLevel::RepeatableRead,
-            _ => bail!("invalid isolation level"),
-        }),
-        None => None,
-    };
-
-    let txn_read_only_raw = headers.get(&TXN_READ_ONLY).cloned();
-    let txn_read_only = txn_read_only_raw.as_ref() == Some(&HEADER_VALUE_TRUE);
-
    let request_content_length = match request.body().size_hint().upper() {
        Some(v) => v,
        None => MAX_REQUEST_SIZE + 1,
@@ -237,75 +192,15 @@ pub async fn handle(
    // Read the query and query params from the request body
    //
    let body = hyper::body::to_bytes(request.into_body()).await?;
-    let payload: Payload = serde_json::from_slice(&body)?;
-
-    let mut client = conn_pool.get(&conn_info, !allow_pool).await?;
+    let QueryData { query, params } = serde_json::from_slice(&body)?;
+    let query_params = json_to_pg_text(params)?;

    //
    // Now execute the query and return the result
    //
-    let result = match payload {
-        Payload::Single(query) => query_raw_txt_as_json(&mut client, query, raw_output, array_mode)
-            .await
-            .map(|x| (x, HashMap::default())),
-        Payload::Batch(queries) => {
-            let mut results = Vec::new();
+    let client = conn_pool.get(&conn_info, !allow_pool).await?;

-            client
-                .start_tx(txn_isolation_level, Some(txn_read_only))
-                .await?;
-
-            for query in queries {
-                let result =
-                    query_raw_txt_as_json(&mut client, query, raw_output, array_mode).await;
-                match result {
-                    // TODO: check this tag to see if the client has executed a commit during the non-interactive transactions...
-                    Ok((r, _ready_tag)) => results.push(r),
-                    Err(e) => {
-                        let tag = client.rollback().await?;
-                        if allow_pool && tag.status() == b'I' {
-                            // return connection to the pool
-                            tokio::task::spawn(async move {
-                                let _ = conn_pool.put(&conn_info, client).await;
-                            });
-                        }
-                        return Err(e);
-                    }
-                }
-            }
-            let ready_tag = client.commit().await?;
-            let mut headers = HashMap::default();
-            headers.insert(
-                TXN_READ_ONLY.clone(),
-                HeaderValue::try_from(txn_read_only.to_string())?,
-            );
-            if let Some(txn_isolation_level_raw) = txn_isolation_level_raw {
-                headers.insert(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level_raw);
-            }
-            Ok(((json!({ "results": results }), ready_tag), headers))
-        }
-    };
-
-    if allow_pool && ready_tag.status() == b'I' {
-        // return connection to the pool
-        tokio::task::spawn(async move {
-            let _ = conn_pool.put(&conn_info, client).await;
-        });
-    }
-
-    result
-}
-
-async fn query_to_json<T: GenericClient>(
-    client: &T,
-    data: QueryData,
-    raw_output: bool,
-    array_mode: bool,
-) -> anyhow::Result<Value> {
-    let query_params = json_to_pg_text(data.params)?;
-    let row_stream = client
-        .query_raw_txt::<String, _>(data.query, query_params)
-        .await?;
+    let row_stream = client.query_raw_txt(query, query_params).await?;

    // Manually drain the stream into a vector to leave row_stream hanging
    // around to get a command tag. Also check that the response is not too
@@ -361,6 +256,13 @@ async fn query_to_json<T: GenericClient>(
        .map(|row| pg_text_row_to_json(row, raw_output, array_mode))
        .collect::<Result<Vec<_>, _>>()?;

+    if allow_pool {
+        // return connection to the pool
+        tokio::task::spawn(async move {
+            let _ = conn_pool.put(&conn_info, client).await;
+        });
+    }
+
    // resulting JSON format is based on the format of node-postgres result
    Ok(json!({
        "command": command_tag_name,
@@ -371,99 +273,6 @@ async fn query_to_json<T: GenericClient>(
    }))
 }

-async fn query_raw_txt_as_json<'a, St, T>(
-    conn: &mut connection::Connection<St, T>,
-    data: QueryData,
-    raw_output: bool,
-    array_mode: bool,
-) -> anyhow::Result<(Value, ReadyForQueryBody)>
-where
-    St: AsyncRead + AsyncWrite + Unpin + Send,
-    T: AsyncRead + AsyncWrite + Unpin + Send,
-{
-    let params = json_to_pg_text(data.params)?;
-    let params = params.into_iter();
-
-    let stmt_name = conn.statement_name();
-    let row_description = conn.prepare(&stmt_name, &data.query).await?;
-
-    let mut fields = vec![];
-    let mut columns = vec![];
-    let mut it = row_description.fields();
-    while let Some(field) = it.next().map_err(pg_client::error::Error::parse)? {
-        fields.push(json!({
-            "name": Value::String(field.name().to_owned()),
-            "dataTypeID": Value::Number(field.type_oid().into()),
-            "tableID": field.table_oid(),
-            "columnID": field.column_id(),
-            "dataTypeSize": field.type_size(),
-            "dataTypeModifier": field.type_modifier(),
-            "format": "text",
-        }));
-
-        let type_ = match Type::from_oid(field.type_oid()) {
-            Some(t) => t,
-            None => TypeinfoPreparedQueries::get_type(conn, field.type_oid()).await?,
-        };
-
-        columns.push(Column {
-            name: field.name().to_string(),
-            type_,
-        });
-    }
-
-    conn.execute("", &stmt_name, params)?;
-    conn.sync().await?;
-
-    let mut rows = vec![];
-
-    let mut row_stream = conn.stream_query_results().await?;
-
-    let mut curret_size = 0;
-    while let Some(row) = row_stream.next().await.transpose()? {
-        // let row = row.map_err(Error::db)?;
-
-        curret_size += row.buffer().len();
-        if curret_size > MAX_RESPONSE_SIZE {
-            return Err(anyhow::anyhow!("response too large"));
-        }
-
-        rows.push(pg_text_row_to_json2(&row, &columns, raw_output, array_mode).unwrap());
-    }
-
-    let command_tag = row_stream.tag();
-    let command_tag = command_tag.tag()?;
-    let mut command_tag_split = command_tag.split(' ');
-    let command_tag_name = command_tag_split.next().unwrap_or_default();
-    let command_tag_count = if command_tag_name == "INSERT" {
-        // INSERT returns OID first and then number of rows
-        command_tag_split.nth(1)
-    } else {
-        // other commands return number of rows (if any)
-        command_tag_split.next()
-    }
-    .and_then(|s| s.parse::<i64>().ok());
-
-    let ready_tag = conn.wait_for_ready().await?;
-
-    // resulting JSON format is based on the format of node-postgres result
-    Ok((
-        json!({
-            "command": command_tag_name,
-            "rowCount": command_tag_count,
-            "rows": rows,
-            "fields": fields,
-            "rowAsArray": array_mode,
-        }),
-        ready_tag,
-    ))
-}
-
-struct Column {
-    name: String,
-    type_: Type,
-}
-
 //
 // Convert postgres row with text-encoded values to JSON object
 //
@@ -483,7 +292,7 @@ pub fn pg_text_row_to_json(
        } else {
            pg_text_to_json(pg_value, column.type_())?
        };
-        Ok((name, json_value))
+        Ok((name.to_string(), json_value))
    });

    if array_mode {
@@ -493,55 +302,7 @@ pub fn pg_text_row_to_json(
            .collect::<Result<Vec<Value>, anyhow::Error>>()?;
        Ok(Value::Array(arr))
    } else {
-        let obj = iter
-            .map(|r| r.map(|(key, val)| (key.to_owned(), val)))
-            .collect::<Result<Map<String, Value>, anyhow::Error>>()?;
-        Ok(Value::Object(obj))
-    }
-}
-
-//
-// Convert postgres row with text-encoded values to JSON object
-//
-fn pg_text_row_to_json2(
-    row: &DataRowBody,
-    columns: &[Column],
-    raw_output: bool,
-    array_mode: bool,
-) -> Result<Value, anyhow::Error> {
-    let ranges: Vec<Option<std::ops::Range<usize>>> = row.ranges().collect()?;
-    let iter = std::iter::zip(ranges, columns)
-        .enumerate()
-        .map(|(i, (range, column))| {
-            let name = &column.name;
-            let pg_value = range
-                .map(|r| {
-                    std::str::from_utf8(&row.buffer()[r])
-                        .map_err(|e| pg_client::error::Error::from_sql(e.into(), i))
-                })
-                .transpose()?;
-            // let pg_value = row.as_text(i)?;
-            let json_value = if raw_output {
-                match pg_value {
-                    Some(v) => Value::String(v.to_string()),
-                    None => Value::Null,
-                }
-            } else {
-                pg_text_to_json(pg_value, &column.type_)?
-            };
-            Ok((name, json_value))
-        });
-
-    if array_mode {
-        // drop keys and aggregate into array
-        let arr = iter
-            .map(|r| r.map(|(_key, val)| val))
-            .collect::<Result<Vec<Value>, anyhow::Error>>()?;
-        Ok(Value::Array(arr))
-    } else {
-        let obj = iter
-            .map(|r| r.map(|(key, val)| (key.to_owned(), val)))
-            .collect::<Result<Map<String, Value>, anyhow::Error>>()?;
+        let obj = iter.collect::<Result<Map<String, Value>, anyhow::Error>>()?;
        Ok(Value::Object(obj))
    }
 }
@@ -552,16 +313,16 @@ fn pg_text_row_to_json2(
 pub fn pg_text_to_json(pg_value: Option<&str>, pg_type: &Type) -> Result<Value, anyhow::Error> {
    if let Some(val) = pg_value {
        if let Kind::Array(elem_type) = pg_type.kind() {
-            return pg_array_parse(val, &elem_type);
+            return pg_array_parse(val, elem_type);
        }

-        match pg_type {
-            &Type::BOOL => Ok(Value::Bool(val == "t")),
-            &Type::INT2 | &Type::INT4 => {
+        match *pg_type {
+            Type::BOOL => Ok(Value::Bool(val == "t")),
+            Type::INT2 | Type::INT4 => {
                let val = val.parse::<i32>()?;
                Ok(Value::Number(serde_json::Number::from(val)))
            }
-            &Type::FLOAT4 | &Type::FLOAT8 => {
+            Type::FLOAT4 | Type::FLOAT8 => {
                let fval = val.parse::<f64>()?;
                let num = serde_json::Number::from_f64(fval);
                if let Some(num) = num {
@@ -573,7 +334,7 @@ pub fn pg_text_to_json(pg_value: Option<&str>, pg_type: &Type) -> Result<Value,
                    Ok(Value::String(val.to_string()))
                }
            }
-            &Type::JSON | &Type::JSONB => Ok(serde_json::from_str(val)?),
+            Type::JSON | Type::JSONB => Ok(serde_json::from_str(val)?),
            _ => Ok(Value::String(val.to_string())),
        }
    } else {
--- a/proxy/src/http/websocket.rs
+++ b/proxy/src/http/websocket.rs
@@ -6,7 +6,6 @@ use crate::{
 };
 use bytes::{Buf, Bytes};
 use futures::{Sink, Stream, StreamExt};
-use hashbrown::HashMap;
 use hyper::{
    server::{
        accept,
@@ -182,15 +181,13 @@ async fn ws_handler(

    // Check if the request is a websocket upgrade request.
    if hyper_tungstenite::is_upgrade_request(&request) {
-        info!(session_id = ?session_id, "performing websocket upgrade");
-
        let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None)
            .map_err(|e| ApiError::BadRequest(e.into()))?;

        tokio::spawn(async move {
            if let Err(e) = serve_websocket(websocket, config, &cancel_map, session_id, host).await
            {
-                error!(session_id = ?session_id, "error in websocket connection: {e:?}");
+                error!("error in websocket connection: {e:?}");
            }
        });

@@ -206,7 +203,7 @@ async fn ws_handler(
            Ok(_) => StatusCode::OK,
            Err(_) => StatusCode::BAD_REQUEST,
        };
-        let (json, headers) = match result {
+        let json = match result {
            Ok(r) => r,
            Err(e) => {
                let message = format!("{:?}", e);
@@ -217,10 +214,7 @@ async fn ws_handler(
                    },
                    None => Value::Null,
                };
-                (
-                    json!({ "message": message, "code": code }),
-                    HashMap::default(),
-                )
+                json!({ "message": message, "code": code })
            }
        };
        json_response(status_code, json).map(|mut r| {
@@ -228,9 +222,6 @@ async fn ws_handler(
                "Access-Control-Allow-Origin",
                hyper::http::HeaderValue::from_static("*"),
            );
-            for (k, v) in headers {
-                r.headers_mut().insert(k, v);
-            }
            r
        })
    } else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS {
--- a/proxy/src/lib.rs
+++ b/proxy/src/lib.rs
@@ -22,7 +22,6 @@ pub mod scram;
 pub mod stream;
 pub mod url;
 pub mod waiters;
-pub mod pg_client;

 /// Handle unix signals appropriately.
 pub async fn handle_signals(token: CancellationToken) -> anyhow::Result<Infallible> {
--- a/proxy/src/pg_client/codec.rs
+++ b/proxy/src/pg_client/codec.rs
@@ -1,43 +0,0 @@
-use bytes::{Bytes, BytesMut};
-use fallible_iterator::FallibleIterator;
-use postgres_protocol::message::backend::{self, Message};
-use std::io;
-use tokio_util::codec::{Decoder, Encoder};
-
-pub struct FrontendMessage(pub Bytes);
-pub struct BackendMessages(pub BytesMut);
-
-impl BackendMessages {
-    pub fn empty() -> BackendMessages {
-        BackendMessages(BytesMut::new())
-    }
-}
-
-impl FallibleIterator for BackendMessages {
-    type Item = backend::Message;
-    type Error = io::Error;
-
-    fn next(&mut self) -> io::Result<Option<backend::Message>> {
-        backend::Message::parse(&mut self.0)
-    }
-}
-
-pub struct PostgresCodec;
-
-impl Encoder<FrontendMessage> for PostgresCodec {
-    type Error = io::Error;
-
-    fn encode(&mut self, item: FrontendMessage, dst: &mut BytesMut) -> io::Result<()> {
-        dst.extend_from_slice(&item.0);
-        Ok(())
-    }
-}
-
-impl Decoder for PostgresCodec {
-    type Item = Message;
-    type Error = io::Error;
-
-    fn decode(&mut self, src: &mut BytesMut) -> Result<Option<Message>, io::Error> {
-        Message::parse(src)
-    }
-}
--- a/proxy/src/pg_client/connection.rs
+++ b/proxy/src/pg_client/connection.rs
@@ -1,369 +0,0 @@
-use super::codec::{BackendMessages, FrontendMessage, PostgresCodec};
-use super::error::Error;
-use super::prepare::TypeinfoPreparedQueries;
-use bytes::{BufMut, BytesMut};
-use futures::channel::mpsc;
-use futures::{Sink, StreamExt};
-use futures::{SinkExt, Stream};
-use hashbrown::HashMap;
-use postgres_protocol::message::backend::{
-    BackendKeyDataBody, CommandCompleteBody, DataRowBody, ErrorResponseBody, Message,
-    ReadyForQueryBody, RowDescriptionBody,
-};
-use postgres_protocol::message::frontend;
-use postgres_protocol::Oid;
-use std::collections::VecDeque;
-use std::future::poll_fn;
-use std::pin::Pin;
-use std::task::{ready, Context, Poll};
-use tokio::io::{AsyncRead, AsyncWrite};
-use tokio_postgres::maybe_tls_stream::MaybeTlsStream;
-use tokio_postgres::types::Type;
-use tokio_postgres::IsolationLevel;
-use tokio_util::codec::Framed;
-
-pub enum RequestMessages {
-    Single(FrontendMessage),
-}
-
-pub struct Request {
-    pub messages: RequestMessages,
-    pub sender: mpsc::Sender<BackendMessages>,
-}
-
-pub struct Response {
-    sender: mpsc::Sender<BackendMessages>,
-}
-
-/// A connection to a PostgreSQL database.
-pub struct RawConnection<S, T> {
-    stream: Framed<MaybeTlsStream<S, T>, PostgresCodec>,
-    pending_responses: VecDeque<Message>,
-    pub buf: BytesMut,
-}
-
-impl<S: AsyncRead + AsyncWrite + Unpin, T: AsyncRead + AsyncWrite + Unpin> RawConnection<S, T> {
-    pub fn new(
-        stream: Framed<MaybeTlsStream<S, T>, PostgresCodec>,
-        buf: BytesMut,
-    ) -> RawConnection<S, T> {
-        RawConnection {
-            stream,
-            pending_responses: VecDeque::new(),
-            buf,
-        }
-    }
-
-    pub async fn send(&mut self) -> Result<(), Error> {
-        poll_fn(|cx| self.poll_send(cx)).await?;
-        let request = FrontendMessage(self.buf.split().freeze());
-        self.stream.start_send_unpin(request).map_err(Error::io)?;
-        poll_fn(|cx| self.poll_flush(cx)).await
-    }
-
-    pub async fn next_message(&mut self) -> Result<Message, Error> {
-        match self.pending_responses.pop_front() {
-            Some(message) => Ok(message),
-            None => poll_fn(|cx| self.poll_read(cx)).await,
-        }
-    }
-
-    fn poll_read(&mut self, cx: &mut Context<'_>) -> Poll<Result<Message, Error>> {
-        let message = match ready!(self.stream.poll_next_unpin(cx)?) {
-            Some(message) => message,
-            None => return Poll::Ready(Err(Error::closed())),
-        };
-        Poll::Ready(Ok(message))
-    }
-
-    fn poll_shutdown(&mut self, cx: &mut Context<'_>) -> Poll<Result<(), Error>> {
-        Pin::new(&mut self.stream).poll_close(cx).map_err(Error::io)
-    }
-
-    fn poll_send(&mut self, cx: &mut Context<'_>) -> Poll<Result<(), Error>> {
-        if let Poll::Ready(msg) = self.poll_read(cx)? {
-            self.pending_responses.push_back(msg);
-        };
-        self.stream.poll_ready_unpin(cx).map_err(Error::io)
-    }
-
-    fn poll_flush(&mut self, cx: &mut Context<'_>) -> Poll<Result<(), Error>> {
-        if let Poll::Ready(msg) = self.poll_read(cx)? {
-            self.pending_responses.push_back(msg);
-        };
-        self.stream.poll_flush_unpin(cx).map_err(Error::io)
-    }
-}
-
-pub struct Connection<S, T> {
-    stmt_counter: usize,
-    pub typeinfo: Option<TypeinfoPreparedQueries>,
-    pub typecache: HashMap<Oid, Type>,
-    pub raw: RawConnection<S, T>,
-    // key: BackendKeyDataBody,
-}
-
-impl<S: AsyncRead + AsyncWrite + Unpin, T: AsyncRead + AsyncWrite + Unpin> Connection<S, T> {
-    pub fn new(stream: MaybeTlsStream<S, T>) -> Connection<S, T> {
-        Connection {
-            stmt_counter: 0,
-            typeinfo: None,
-            typecache: HashMap::new(),
-            raw: RawConnection::new(Framed::new(stream, PostgresCodec), BytesMut::new()),
-        }
-    }
-
-    pub async fn start_tx(
-        &mut self,
-        isolation_level: Option<IsolationLevel>,
-        read_only: Option<bool>,
-    ) -> Result<ReadyForQueryBody, Error> {
-        let mut query = "START TRANSACTION".to_string();
-        let mut first = true;
-
-        if let Some(level) = isolation_level {
-            first = false;
-
-            query.push_str(" ISOLATION LEVEL ");
-            let level = match level {
-                IsolationLevel::ReadUncommitted => "READ UNCOMMITTED",
-                IsolationLevel::ReadCommitted => "READ COMMITTED",
-                IsolationLevel::RepeatableRead => "REPEATABLE READ",
-                IsolationLevel::Serializable => "SERIALIZABLE",
-                _ => return Err(Error::unexpected_message()),
-            };
-            query.push_str(level);
-        }
-
-        if let Some(read_only) = read_only {
-            if !first {
-                query.push(',');
-            }
-            first = false;
-
-            let s = if read_only {
-                " READ ONLY"
-            } else {
-                " READ WRITE"
-            };
-            query.push_str(s);
-        }
-
-        self.execute_simple(&query).await
-    }
-
-    pub async fn rollback(&mut self) -> Result<ReadyForQueryBody, Error> {
-        self.execute_simple("ROLLBACK").await
-    }
-
-    pub async fn commit(&mut self) -> Result<ReadyForQueryBody, Error> {
-        self.execute_simple("COMMIT").await
-    }
-
-    // pub async fn auth_sasl_scram<'a, I>(
-    //     mut raw: RawConnection<S, T>,
-    //     params: I,
-    //     password: &[u8],
-    // ) -> Result<Self, Error>
-    // where
-    //     I: IntoIterator<Item = (&'a str, &'a str)>,
-    // {
-    //     // send a startup message
-    //     frontend::startup_message(params, &mut raw.buf).unwrap();
-    //     raw.send().await?;
-
-    //     // expect sasl authentication message
-    //     let Message::AuthenticationSasl(body) = raw.next_message().await? else { return Err(Error::expecting("sasl authentication")) };
-    //     // expect support for SCRAM_SHA_256
-    //     if body
-    //         .mechanisms()
-    //         .find(|&x| Ok(x == authentication::sasl::SCRAM_SHA_256))?
-    //         .is_none()
-    //     {
-    //         return Err(Error::expecting("SCRAM-SHA-256 auth"));
-    //     }
-
-    //     // initiate SCRAM_SHA_256 authentication without channel binding
-    //     let auth = authentication::sasl::ChannelBinding::unrequested();
-    //     let mut scram = authentication::sasl::ScramSha256::new(password, auth);
-
-    //     frontend::sasl_initial_response(
-    //         authentication::sasl::SCRAM_SHA_256,
-    //         scram.message(),
-    //         &mut raw.buf,
-    //     )
-    //     .unwrap();
-    //     raw.send().await?;
-
-    //     // expect sasl continue
-    //     let Message::AuthenticationSaslContinue(b) = raw.next_message().await? else { return Err(Error::expecting("auth continue")) };
-    //     scram.update(b.data()).unwrap();
-
-    //     // continue sasl
-    //     frontend::sasl_response(scram.message(), &mut raw.buf).unwrap();
-    //     raw.send().await?;
-
-    //     // expect sasl final
-    //     let Message::AuthenticationSaslFinal(b) = raw.next_message().await? else { return Err(Error::expecting("auth final")) };
-    //     scram.finish(b.data()).unwrap();
-
-    //     // expect auth ok
-    //     let Message::AuthenticationOk = raw.next_message().await? else { return Err(Error::expecting("auth ok")) };
-
-    //     // expect connection accepted
-    //     let key = loop {
-    //         match raw.next_message().await? {
-    //             Message::BackendKeyData(key) => break key,
-    //             Message::ParameterStatus(_) => {}
-    //             _ => return Err(Error::expecting("backend ready")),
-    //         }
-    //     };
-
-    //     let Message::ReadyForQuery(b) = raw.next_message().await? else { return Err(Error::expecting("ready for query")) };
-    //     // assert_eq!(b.status(), b'I');
-
-    //     Ok(Self { raw, key })
-    // }
-
-    // pub fn prepare_and_execute(
-    //     &mut self,
-    //     portal: &str,
-    //     name: &str,
-    //     query: &str,
-    //     params: impl IntoIterator<Item = Option<impl AsRef<str>>>,
-    // ) -> std::io::Result<()> {
-    //     self.prepare(name, query)?;
-    //     self.execute(portal, name, params)
-    // }
-
-    pub fn statement_name(&mut self) -> String {
-        self.stmt_counter += 1;
-        format!("s{}", self.stmt_counter)
-    }
-
-    async fn execute_simple(&mut self, query: &str) -> Result<ReadyForQueryBody, Error> {
-        frontend::query(query, &mut self.raw.buf)?;
-        self.raw.send().await?;
-
-        loop {
-            match self.raw.next_message().await? {
-                Message::ReadyForQuery(q) => return Ok(q),
-                Message::CommandComplete(_)
-                | Message::EmptyQueryResponse
-                | Message::RowDescription(_)
-                | Message::DataRow(_) => {}
-                _ => return Err(Error::unexpected_message()),
-            }
-        }
-    }
-
-    pub async fn prepare(&mut self, name: &str, query: &str) -> Result<RowDescriptionBody, Error> {
-        frontend::parse(name, query, std::iter::empty(), &mut self.raw.buf)?;
-        frontend::describe(b'S', name, &mut self.raw.buf)?;
-        self.sync().await?;
-        self.wait_for_prepare().await
-    }
-
-    pub fn execute(
-        &mut self,
-        portal: &str,
-        name: &str,
-        params: impl IntoIterator<Item = Option<impl AsRef<str>>>,
-    ) -> std::io::Result<()> {
-        frontend::bind(
-            portal,
-            name,
-            std::iter::empty(), // all parameters use the default format (text)
-            params,
-            |param, buf| match param {
-                Some(param) => {
-                    buf.put_slice(param.as_ref().as_bytes());
-                    Ok(postgres_protocol::IsNull::No)
-                }
-                None => Ok(postgres_protocol::IsNull::Yes),
-            },
-            Some(0), // all text
-            &mut self.raw.buf,
-        )
-        .map_err(|e| match e {
-            frontend::BindError::Conversion(e) => std::io::Error::new(std::io::ErrorKind::Other, e),
-            frontend::BindError::Serialization(io) => io,
-        })?;
-        frontend::execute(portal, 0, &mut self.raw.buf)
-    }
-
-    pub async fn sync(&mut self) -> Result<(), Error> {
-        frontend::sync(&mut self.raw.buf);
-        self.raw.send().await
-    }
-
-    pub async fn wait_for_prepare(&mut self) -> Result<RowDescriptionBody, Error> {
-        let Message::ParseComplete = self.raw.next_message().await? else { return Err(Error::expecting("parse")) };
-        let Message::ParameterDescription(_) = self.raw.next_message().await? else { return Err(Error::expecting("param description")) };
-        let Message::RowDescription(desc) = self.raw.next_message().await? else { return Err(Error::expecting("row description")) };
-
-        self.wait_for_ready().await?;
-
-        Ok(desc)
-    }
-
-    pub async fn stream_query_results(&mut self) -> Result<RowStream<'_, S, T>, Error> {
-        // let Message::ParseComplete = self.raw.next_message().await? else { return Err(Error::expecting("parse")) };
-        let Message::BindComplete = self.raw.next_message().await? else { return Err(Error::expecting("bind")) };
-        Ok(RowStream::Stream(&mut self.raw))
-    }
-
-    pub async fn wait_for_ready(&mut self) -> Result<ReadyForQueryBody, Error> {
-        loop {
-            match self.raw.next_message().await.unwrap() {
-                Message::ReadyForQuery(b) => break Ok(b),
-                _ => continue,
-            }
-        }
-    }
-}
-
-pub enum RowStream<'a, S, T> {
-    Stream(&'a mut RawConnection<S, T>),
-    Complete(Option<CommandCompleteBody>),
-}
-impl<S, T> Unpin for RowStream<'_, S, T> {}
-
-impl<S: AsyncRead + AsyncWrite + Unpin, T: AsyncRead + AsyncWrite + Unpin> Stream
-    for RowStream<'_, S, T>
-{
-    // this is horrible - first result is for transport/protocol errors errors
-    // second result is for sql errors.
-    type Item = Result<Result<DataRowBody, ErrorResponseBody>, Error>;
-
-    fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
-        match &mut *self {
-            RowStream::Stream(raw) => match ready!(raw.poll_read(cx)?) {
-                Message::DataRow(row) => Poll::Ready(Some(Ok(Ok(row)))),
-                Message::CommandComplete(tag) => {
-                    *self = Self::Complete(Some(tag));
-                    Poll::Ready(None)
-                }
-                Message::EmptyQueryResponse | Message::PortalSuspended => {
-                    *self = Self::Complete(None);
-                    Poll::Ready(None)
-                }
-                Message::ErrorResponse(error) => {
-                    *self = Self::Complete(None);
-                    Poll::Ready(Some(Ok(Err(error))))
-                }
-                _ => Poll::Ready(Some(Err(Error::expecting("command completion")))),
-            },
-            RowStream::Complete(_) => Poll::Ready(None),
-        }
-    }
-}
-
-impl<S, T> RowStream<'_, S, T> {
-    pub fn tag(self) -> Option<CommandCompleteBody> {
-        match self {
-            RowStream::Stream(_) => panic!("should not get tag unless row stream is exhausted"),
-            RowStream::Complete(tag) => tag,
-        }
-    }
-}
--- a/proxy/src/pg_client/error.rs
+++ b/proxy/src/pg_client/error.rs
@@ -1,447 +0,0 @@
-use std::{error, fmt, io};
-
-use fallible_iterator::FallibleIterator;
-use postgres_protocol::message::backend::{ErrorFields, ErrorResponseBody};
-use tokio_native_tls::native_tls;
-use tokio_postgres::error::{ErrorPosition, SqlState};
-
-#[derive(Debug, PartialEq)]
-enum Kind {
-    Io,
-    Tls,
-    UnexpectedMessage,
-    FromSql(usize),
-    Closed,
-    Db,
-    Parse,
-    Encode,
-}
-
-struct ErrorInner {
-    kind: Kind,
-    cause: Option<Box<dyn error::Error + Sync + Send>>,
-}
-
-/// An error communicating with the Postgres server.
-pub struct Error(ErrorInner);
-
-impl fmt::Debug for Error {
-    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
-        fmt.debug_struct("Error")
-            .field("kind", &self.0.kind)
-            .field("cause", &self.0.cause)
-            .finish()
-    }
-}
-
-impl fmt::Display for Error {
-    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
-        match &self.0.kind {
-            Kind::Io => fmt.write_str("error communicating with the server")?,
-            Kind::Tls => fmt.write_str("error establishing tls")?,
-            Kind::UnexpectedMessage => fmt.write_str("unexpected message from server")?,
-            Kind::FromSql(idx) => write!(fmt, "error deserializing column {}", idx)?,
-            Kind::Closed => fmt.write_str("connection closed")?,
-            Kind::Db => fmt.write_str("db error")?,
-            Kind::Parse => fmt.write_str("error parsing response from server")?,
-            Kind::Encode => fmt.write_str("error encoding message to server")?,
-        };
-        if let Some(ref cause) = self.0.cause {
-            write!(fmt, ": {}", cause)?;
-        }
-        Ok(())
-    }
-}
-
-impl error::Error for Error {
-    fn source(&self) -> Option<&(dyn error::Error + 'static)> {
-        self.0.cause.as_ref().map(|e| &**e as _)
-    }
-}
-
-impl From<io::Error> for Error {
-    fn from(value: io::Error) -> Self {
-        Self::io(value)
-    }
-}
-
-impl Error {
-    /// Consumes the error, returning its cause.
-    pub fn into_source(self) -> Option<Box<dyn error::Error + Sync + Send>> {
-        self.0.cause
-    }
-
-    /// Returns the source of this error if it was a `DbError`.
-    ///
-    /// This is a simple convenience method.
-    pub fn as_db_error(&self) -> Option<&DbError> {
-        error::Error::source(self).and_then(|e| e.downcast_ref::<DbError>())
-    }
-
-    /// Determines if the error was associated with closed connection.
-    pub fn is_closed(&self) -> bool {
-        self.0.kind == Kind::Closed
-    }
-
-    /// Returns the SQLSTATE error code associated with the error.
-    ///
-    /// This is a convenience method that downcasts the cause to a `DbError` and returns its code.
-    pub fn code(&self) -> Option<&SqlState> {
-        self.as_db_error().map(DbError::code)
-    }
-
-    fn new(kind: Kind, cause: Option<Box<dyn error::Error + Sync + Send>>) -> Error {
-        Error(ErrorInner { kind, cause })
-    }
-
-    #[allow(clippy::needless_pass_by_value)]
-    pub(crate) fn db(error: ErrorResponseBody) -> Error {
-        match DbError::parse(&mut error.fields()) {
-            Ok(e) => Error::new(Kind::Db, Some(Box::new(e))),
-            Err(e) => Error::new(Kind::Parse, Some(Box::new(e))),
-        }
-    }
-
-    pub(crate) fn from_sql(e: Box<dyn error::Error + Sync + Send>, idx: usize) -> Error {
-        Error::new(Kind::FromSql(idx), Some(e))
-    }
-
-    pub(crate) fn closed() -> Error {
-        Error::new(Kind::Closed, None)
-    }
-
-    pub(crate) fn unexpected_message() -> Error {
-        Error::new(Kind::UnexpectedMessage, None)
-    }
-
-    pub(crate) fn expecting(expected: &str) -> Error {
-        Error::new(Kind::UnexpectedMessage, Some(expected.into()))
-    }
-
-    pub(crate) fn parse(e: io::Error) -> Error {
-        Error::new(Kind::Parse, Some(Box::new(e)))
-    }
-
-    pub(crate) fn encode(e: io::Error) -> Error {
-        Error::new(Kind::Encode, Some(Box::new(e)))
-    }
-
-    pub(crate) fn io(e: io::Error) -> Error {
-        Error::new(Kind::Io, Some(Box::new(e)))
-    }
-
-    pub(crate) fn tls(e: native_tls::Error) -> Error {
-        Error::new(Kind::Tls, Some(Box::new(e)))
-    }
-}
-
-/// The severity of a Postgres error or notice.
-#[derive(Debug, Copy, Clone, PartialEq, Eq)]
-pub enum Severity {
-    /// PANIC
-    Panic,
-    /// FATAL
-    Fatal,
-    /// ERROR
-    Error,
-    /// WARNING
-    Warning,
-    /// NOTICE
-    Notice,
-    /// DEBUG
-    Debug,
-    /// INFO
-    Info,
-    /// LOG
-    Log,
-}
-
-impl fmt::Display for Severity {
-    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
-        let s = match *self {
-            Severity::Panic => "PANIC",
-            Severity::Fatal => "FATAL",
-            Severity::Error => "ERROR",
-            Severity::Warning => "WARNING",
-            Severity::Notice => "NOTICE",
-            Severity::Debug => "DEBUG",
-            Severity::Info => "INFO",
-            Severity::Log => "LOG",
-        };
-        fmt.write_str(s)
-    }
-}
-
-impl Severity {
-    fn from_str(s: &str) -> Option<Severity> {
-        match s {
-            "PANIC" => Some(Severity::Panic),
-            "FATAL" => Some(Severity::Fatal),
-            "ERROR" => Some(Severity::Error),
-            "WARNING" => Some(Severity::Warning),
-            "NOTICE" => Some(Severity::Notice),
-            "DEBUG" => Some(Severity::Debug),
-            "INFO" => Some(Severity::Info),
-            "LOG" => Some(Severity::Log),
-            _ => None,
-        }
-    }
-}
-
-/// A Postgres error or notice.
-#[derive(Debug, Clone, PartialEq, Eq)]
-pub struct DbError {
-    severity: String,
-    parsed_severity: Option<Severity>,
-    code: SqlState,
-    message: String,
-    detail: Option<String>,
-    hint: Option<String>,
-    position: Option<ErrorPosition>,
-    where_: Option<String>,
-    schema: Option<String>,
-    table: Option<String>,
-    column: Option<String>,
-    datatype: Option<String>,
-    constraint: Option<String>,
-    file: Option<String>,
-    line: Option<u32>,
-    routine: Option<String>,
-}
-
-impl DbError {
-    pub(crate) fn parse(fields: &mut ErrorFields<'_>) -> io::Result<DbError> {
-        let mut severity = None;
-        let mut parsed_severity = None;
-        let mut code = None;
-        let mut message = None;
-        let mut detail = None;
-        let mut hint = None;
-        let mut normal_position = None;
-        let mut internal_position = None;
-        let mut internal_query = None;
-        let mut where_ = None;
-        let mut schema = None;
-        let mut table = None;
-        let mut column = None;
-        let mut datatype = None;
-        let mut constraint = None;
-        let mut file = None;
-        let mut line = None;
-        let mut routine = None;
-
-        while let Some(field) = fields.next()? {
-            match field.type_() {
-                b'S' => severity = Some(field.value().to_owned()),
-                b'C' => code = Some(SqlState::from_code(field.value())),
-                b'M' => message = Some(field.value().to_owned()),
-                b'D' => detail = Some(field.value().to_owned()),
-                b'H' => hint = Some(field.value().to_owned()),
-                b'P' => {
-                    normal_position = Some(field.value().parse::<u32>().map_err(|_| {
-                        io::Error::new(
-                            io::ErrorKind::InvalidInput,
-                            "`P` field did not contain an integer",
-                        )
-                    })?);
-                }
-                b'p' => {
-                    internal_position = Some(field.value().parse::<u32>().map_err(|_| {
-                        io::Error::new(
-                            io::ErrorKind::InvalidInput,
-                            "`p` field did not contain an integer",
-                        )
-                    })?);
-                }
-                b'q' => internal_query = Some(field.value().to_owned()),
-                b'W' => where_ = Some(field.value().to_owned()),
-                b's' => schema = Some(field.value().to_owned()),
-                b't' => table = Some(field.value().to_owned()),
-                b'c' => column = Some(field.value().to_owned()),
-                b'd' => datatype = Some(field.value().to_owned()),
-                b'n' => constraint = Some(field.value().to_owned()),
-                b'F' => file = Some(field.value().to_owned()),
-                b'L' => {
-                    line = Some(field.value().parse::<u32>().map_err(|_| {
-                        io::Error::new(
-                            io::ErrorKind::InvalidInput,
-                            "`L` field did not contain an integer",
-                        )
-                    })?);
-                }
-                b'R' => routine = Some(field.value().to_owned()),
-                b'V' => {
-                    parsed_severity = Some(Severity::from_str(field.value()).ok_or_else(|| {
-                        io::Error::new(
-                            io::ErrorKind::InvalidInput,
-                            "`V` field contained an invalid value",
-                        )
-                    })?);
-                }
-                _ => {}
-            }
-        }
-
-        Ok(DbError {
-            severity: severity
-                .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, "`S` field missing"))?,
-            parsed_severity,
-            code: code
-                .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, "`C` field missing"))?,
-            message: message
-                .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, "`M` field missing"))?,
-            detail,
-            hint,
-            position: match normal_position {
-                Some(position) => Some(ErrorPosition::Original(position)),
-                None => match internal_position {
-                    Some(position) => Some(ErrorPosition::Internal {
-                        position,
-                        query: internal_query.ok_or_else(|| {
-                            io::Error::new(
-                                io::ErrorKind::InvalidInput,
-                                "`q` field missing but `p` field present",
-                            )
-                        })?,
-                    }),
-                    None => None,
-                },
-            },
-            where_,
-            schema,
-            table,
-            column,
-            datatype,
-            constraint,
-            file,
-            line,
-            routine,
-        })
-    }
-
-    /// The field contents are ERROR, FATAL, or PANIC (in an error message),
-    /// or WARNING, NOTICE, DEBUG, INFO, or LOG (in a notice message), or a
-    /// localized translation of one of these.
-    pub fn severity(&self) -> &str {
-        &self.severity
-    }
-
-    /// A parsed, nonlocalized version of `severity`. (PostgreSQL 9.6+)
-    pub fn parsed_severity(&self) -> Option<Severity> {
-        self.parsed_severity
-    }
-
-    /// The SQLSTATE code for the error.
-    pub fn code(&self) -> &SqlState {
-        &self.code
-    }
-
-    /// The primary human-readable error message.
-    ///
-    /// This should be accurate but terse (typically one line).
-    pub fn message(&self) -> &str {
-        &self.message
-    }
-
-    /// An optional secondary error message carrying more detail about the
-    /// problem.
-    ///
-    /// Might run to multiple lines.
-    pub fn detail(&self) -> Option<&str> {
-        self.detail.as_deref()
-    }
-
-    /// An optional suggestion what to do about the problem.
-    ///
-    /// This is intended to differ from `detail` in that it offers advice
-    /// (potentially inappropriate) rather than hard facts. Might run to
-    /// multiple lines.
-    pub fn hint(&self) -> Option<&str> {
-        self.hint.as_deref()
-    }
-
-    /// An optional error cursor position into either the original query string
-    /// or an internally generated query.
-    pub fn position(&self) -> Option<&ErrorPosition> {
-        self.position.as_ref()
-    }
-
-    /// An indication of the context in which the error occurred.
-    ///
-    /// Presently this includes a call stack traceback of active procedural
-    /// language functions and internally-generated queries. The trace is one
-    /// entry per line, most recent first.
-    pub fn where_(&self) -> Option<&str> {
-        self.where_.as_deref()
-    }
-
-    /// If the error was associated with a specific database object, the name
-    /// of the schema containing that object, if any. (PostgreSQL 9.3+)
-    pub fn schema(&self) -> Option<&str> {
-        self.schema.as_deref()
-    }
-
-    /// If the error was associated with a specific table, the name of the
-    /// table. (Refer to the schema name field for the name of the table's
-    /// schema.) (PostgreSQL 9.3+)
-    pub fn table(&self) -> Option<&str> {
-        self.table.as_deref()
-    }
-
-    /// If the error was associated with a specific table column, the name of
-    /// the column.
-    ///
-    /// (Refer to the schema and table name fields to identify the table.)
-    /// (PostgreSQL 9.3+)
-    pub fn column(&self) -> Option<&str> {
-        self.column.as_deref()
-    }
-
-    /// If the error was associated with a specific data type, the name of the
-    /// data type. (Refer to the schema name field for the name of the data
-    /// type's schema.) (PostgreSQL 9.3+)
-    pub fn datatype(&self) -> Option<&str> {
-        self.datatype.as_deref()
-    }
-
-    /// If the error was associated with a specific constraint, the name of the
-    /// constraint.
-    ///
-    /// Refer to fields listed above for the associated table or domain.
-    /// (For this purpose, indexes are treated as constraints, even if they
-    /// weren't created with constraint syntax.) (PostgreSQL 9.3+)
-    pub fn constraint(&self) -> Option<&str> {
-        self.constraint.as_deref()
-    }
-
-    /// The file name of the source-code location where the error was reported.
-    pub fn file(&self) -> Option<&str> {
-        self.file.as_deref()
-    }
-
-    /// The line number of the source-code location where the error was
-    /// reported.
-    pub fn line(&self) -> Option<u32> {
-        self.line
-    }
-
-    /// The name of the source-code routine reporting the error.
-    pub fn routine(&self) -> Option<&str> {
-        self.routine.as_deref()
-    }
-}
-
-impl fmt::Display for DbError {
-    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
-        write!(fmt, "{}: {}", self.severity, self.message)?;
-        if let Some(detail) = &self.detail {
-            write!(fmt, "\nDETAIL: {}", detail)?;
-        }
-        if let Some(hint) = &self.hint {
-            write!(fmt, "\nHINT: {}", hint)?;
-        }
-        Ok(())
-    }
-}
-
-impl error::Error for DbError {}
--- a/proxy/src/pg_client/mod.rs
+++ b/proxy/src/pg_client/mod.rs
@@ -1,5 +0,0 @@
-
-pub mod codec;
-pub mod connection;
-pub mod error;
-pub mod prepare;
--- a/proxy/src/pg_client/prepare.rs
+++ b/proxy/src/pg_client/prepare.rs
@@ -1,293 +0,0 @@
-use fallible_iterator::FallibleIterator;
-use futures::StreamExt;
-use postgres_protocol::message::backend::{DataRowRanges, Message};
-use postgres_protocol::message::frontend;
-use std::future::Future;
-use std::pin::Pin;
-use tokio::io::{AsyncRead, AsyncWrite};
-use tokio_postgres::types::{Field, Kind, Oid, ToSql, Type};
-
-use super::connection::Connection;
-use super::error::Error;
-
-const TYPEINFO_QUERY: &str = "\
-SELECT t.typname, t.typtype, t.typelem, r.rngsubtype, t.typbasetype, n.nspname, t.typrelid
-FROM pg_catalog.pg_type t
-LEFT OUTER JOIN pg_catalog.pg_range r ON r.rngtypid = t.oid
-INNER JOIN pg_catalog.pg_namespace n ON t.typnamespace = n.oid
-WHERE t.oid = $1
-";
-
-const TYPEINFO_ENUM_QUERY: &str = "\
-SELECT enumlabel
-FROM pg_catalog.pg_enum
-WHERE enumtypid = $1
-ORDER BY enumsortorder
-";
-
-const TYPEINFO_COMPOSITE_QUERY: &str = "\
-SELECT attname, atttypid
-FROM pg_catalog.pg_attribute
-WHERE attrelid = $1
-AND NOT attisdropped
-AND attnum > 0
-ORDER BY attnum
-";
-
-#[derive(Clone)]
-pub struct TypeinfoPreparedQueries {
-    query: String,
-    enum_query: String,
-    composite_query: String,
-}
-
-fn map_is_null(x: tokio_postgres::types::IsNull) -> postgres_protocol::IsNull {
-    match x {
-        tokio_postgres::types::IsNull::Yes => postgres_protocol::IsNull::Yes,
-        tokio_postgres::types::IsNull::No => postgres_protocol::IsNull::No,
-    }
-}
-
-fn read_column<'a, T: tokio_postgres::types::FromSql<'a>>(
-    buffer: &'a [u8],
-    type_: &Type,
-    ranges: &mut DataRowRanges<'a>,
-) -> Result<T, Error> {
-    let range = ranges.next()?;
-    match range {
-        Some(range) => T::from_sql_nullable(type_, range.map(|r| &buffer[r])),
-        None => T::from_sql_null(type_),
-    }
-    .map_err(|e| Error::from_sql(e, 0))
-}
-
-impl TypeinfoPreparedQueries {
-    pub async fn new<
-        S: AsyncRead + AsyncWrite + Unpin + Send,
-        T: AsyncRead + AsyncWrite + Unpin + Send,
-    >(
-        c: &mut Connection<S, T>,
-    ) -> Result<Self, Error> {
-        if let Some(ti) = &c.typeinfo {
-            return Ok(ti.clone());
-        }
-
-        let query = c.statement_name();
-        let enum_query = c.statement_name();
-        let composite_query = c.statement_name();
-
-        frontend::parse(&query, TYPEINFO_QUERY, [Type::OID.oid()], &mut c.raw.buf)?;
-        frontend::parse(
-            &enum_query,
-            TYPEINFO_ENUM_QUERY,
-            [Type::OID.oid()],
-            &mut c.raw.buf,
-        )?;
-        c.sync().await?;
-        frontend::parse(
-            &composite_query,
-            TYPEINFO_COMPOSITE_QUERY,
-            [Type::OID.oid()],
-            &mut c.raw.buf,
-        )?;
-        c.sync().await?;
-
-        let Message::ParseComplete = c.raw.next_message().await? else { return Err(Error::expecting("parse")) };
-        let Message::ParseComplete = c.raw.next_message().await? else { return Err(Error::expecting("parse")) };
-        let Message::ParseComplete = c.raw.next_message().await? else { return Err(Error::expecting("parse")) };
-        c.wait_for_ready().await?;
-
-        Ok(c.typeinfo
-            .insert(TypeinfoPreparedQueries {
-                query,
-                enum_query,
-                composite_query,
-            })
-            .clone())
-    }
-
-    fn get_type_rec<
-        S: AsyncRead + AsyncWrite + Unpin + Send,
-        T: AsyncRead + AsyncWrite + Unpin + Send,
-    >(
-        c: &mut Connection<S, T>,
-        oid: Oid,
-    ) -> Pin<Box<dyn Future<Output = Result<Type, Error>> + Send + '_>> {
-        Box::pin(Self::get_type(c, oid))
-    }
-
-    pub async fn get_type<
-        S: AsyncRead + AsyncWrite + Unpin + Send,
-        T: AsyncRead + AsyncWrite + Unpin + Send,
-    >(
-        c: &mut Connection<S, T>,
-        oid: Oid,
-    ) -> Result<Type, Error> {
-        if let Some(type_) = Type::from_oid(oid) {
-            return Ok(type_);
-        }
-
-        if let Some(type_) = c.typecache.get(&oid) {
-            return Ok(type_.clone());
-        }
-
-        let queries = Self::new(c).await?;
-
-        frontend::bind(
-            "",
-            &queries.query,
-            [1], // the only parameter is in binary format
-            [oid],
-            |param, buf| param.to_sql(&Type::OID, buf).map(map_is_null),
-            Some(1), // binary return type
-            &mut c.raw.buf,
-        )
-        .map_err(|e| match e {
-            frontend::BindError::Conversion(e) => std::io::Error::new(std::io::ErrorKind::Other, e),
-            frontend::BindError::Serialization(io) => io,
-        })?;
-        frontend::execute("", 0, &mut c.raw.buf)?;
-
-        c.sync().await?;
-
-        let mut stream = c.stream_query_results().await?;
-
-        let Some(row) = stream.next().await.transpose()? else {
-            todo!()
-        };
-
-        let row = row.map_err(Error::db)?;
-        let b = row.buffer();
-        let mut ranges = row.ranges();
-
-        let name: String = read_column(b, &Type::NAME, &mut ranges)?;
-        let type_: i8 = read_column(b, &Type::CHAR, &mut ranges)?;
-        let elem_oid: Oid = read_column(b, &Type::OID, &mut ranges)?;
-        let rngsubtype: Option<Oid> = read_column(b, &Type::OID, &mut ranges)?;
-        let basetype: Oid = read_column(b, &Type::OID, &mut ranges)?;
-        let schema: String = read_column(b, &Type::NAME, &mut ranges)?;
-        let relid: Oid = read_column(b, &Type::OID, &mut ranges)?;
-
-        {
-            // should be none
-            let None = stream.next().await.transpose()? else {
-                todo!()
-            };
-            drop(stream);
-        }
-
-        let kind = if type_ == b'e' as i8 {
-            let variants = Self::get_enum_variants(c, oid).await?;
-            Kind::Enum(variants)
-        } else if type_ == b'p' as i8 {
-            Kind::Pseudo
-        } else if basetype != 0 {
-            let type_ = Self::get_type_rec(c, basetype).await?;
-            Kind::Domain(type_)
-        } else if elem_oid != 0 {
-            let type_ = Self::get_type_rec(c, elem_oid).await?;
-            Kind::Array(type_)
-        } else if relid != 0 {
-            let fields = Self::get_composite_fields(c, relid).await?;
-            Kind::Composite(fields)
-        } else if let Some(rngsubtype) = rngsubtype {
-            let type_ = Self::get_type_rec(c, rngsubtype).await?;
-            Kind::Range(type_)
-        } else {
-            Kind::Simple
-        };
-
-        let type_ = Type::new(name, oid, kind, schema);
-        c.typecache.insert(oid, type_.clone());
-
-        Ok(type_)
-    }
-
-    async fn get_enum_variants<
-        S: AsyncRead + AsyncWrite + Unpin + Send,
-        T: AsyncRead + AsyncWrite + Unpin + Send,
-    >(
-        c: &mut Connection<S, T>,
-        oid: Oid,
-    ) -> Result<Vec<String>, Error> {
-        let queries = Self::new(c).await?;
-
-        frontend::bind(
-            "",
-            &queries.enum_query,
-            [1], // the only parameter is in binary format
-            [oid],
-            |param, buf| param.to_sql(&Type::OID, buf).map(map_is_null),
-            Some(1), // binary return type
-            &mut c.raw.buf,
-        )
-        .map_err(|e| match e {
-            frontend::BindError::Conversion(e) => std::io::Error::new(std::io::ErrorKind::Other, e),
-            frontend::BindError::Serialization(io) => io,
-        })?;
-        frontend::execute("", 0, &mut c.raw.buf)?;
-
-        c.sync().await?;
-
-        let mut stream = c.stream_query_results().await?;
-        let mut variants = Vec::new();
-        while let Some(row) = stream.next().await.transpose()? {
-            let row = row.map_err(Error::db)?;
-
-            let variant: String = read_column(row.buffer(), &Type::NAME, &mut row.ranges())?;
-            variants.push(variant);
-        }
-
-        c.wait_for_ready().await?;
-
-        Ok(variants)
-    }
-
-    async fn get_composite_fields<
-        S: AsyncRead + AsyncWrite + Unpin + Send,
-        T: AsyncRead + AsyncWrite + Unpin + Send,
-    >(
-        c: &mut Connection<S, T>,
-        oid: Oid,
-    ) -> Result<Vec<Field>, Error> {
-        let queries = Self::new(c).await?;
-
-        frontend::bind(
-            "",
-            &queries.composite_query,
-            [1], // the only parameter is in binary format
-            [oid],
-            |param, buf| param.to_sql(&Type::OID, buf).map(map_is_null),
-            Some(1), // binary return type
-            &mut c.raw.buf,
-        )
-        .map_err(|e| match e {
-            frontend::BindError::Conversion(e) => std::io::Error::new(std::io::ErrorKind::Other, e),
-            frontend::BindError::Serialization(io) => io,
-        })?;
-        frontend::execute("", 0, &mut c.raw.buf)?;
-
-        c.sync().await?;
-
-        let mut stream = c.stream_query_results().await?;
-        let mut fields = Vec::new();
-        while let Some(row) = stream.next().await.transpose()? {
-            let row = row.map_err(Error::db)?;
-
-            let mut ranges = row.ranges();
-            let name: String = read_column(row.buffer(), &Type::NAME, &mut ranges)?;
-            let oid: Oid = read_column(row.buffer(), &Type::OID, &mut ranges)?;
-            fields.push((name, oid));
-        }
-
-        c.wait_for_ready().await?;
-
-        let mut output_fields = Vec::with_capacity(fields.len());
-        for (name, oid) in fields {
-            let type_ = Self::get_type_rec(c, oid).await?;
-            output_fields.push(Field::new(name, type_))
-        }
-
-        Ok(output_fields)
-    }
-}
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -6,15 +6,18 @@ use crate::{
    cancellation::{self, CancelMap},
    compute::{self, PostgresConnection},
    config::{ProxyConfig, TlsConfig},
-    console::{self, errors::WakeComputeError, messages::MetricsAuxInfo, Api},
+    console::{
+        self,
+        errors::{ApiError, WakeComputeError},
+        messages::MetricsAuxInfo,
+    },
    stream::{PqStream, Stream},
 };
 use anyhow::{bail, Context};
 use async_trait::async_trait;
 use futures::TryFutureExt;
-use metrics::{
-    exponential_buckets, register_histogram, register_int_counter_vec, Histogram, IntCounterVec,
-};
+use hyper::StatusCode;
+use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
 use once_cell::sync::Lazy;
 use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
 use std::{error::Error, io, ops::ControlFlow, sync::Arc};
@@ -28,37 +31,25 @@ use utils::measured_stream::MeasuredStream;

 /// Number of times we should retry the `/proxy_wake_compute` http request.
 /// Retry duration is BASE_RETRY_WAIT_DURATION * 1.5^n
-pub const NUM_RETRIES_CONNECT: u32 = 10;
+const NUM_RETRIES_CONNECT: u32 = 10;
 const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2);
 const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(100);

 const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
 const ERR_PROTO_VIOLATION: &str = "protocol violation";

-static NUM_CONNECTIONS_ACCEPTED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
+static NUM_CONNECTIONS_ACCEPTED_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
        "proxy_accepted_connections_total",
-        "Number of TCP client connections accepted.",
-        &["protocol"],
+        "Number of TCP client connections accepted."
    )
    .unwrap()
 });

-static NUM_CONNECTIONS_CLOSED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
+static NUM_CONNECTIONS_CLOSED_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
        "proxy_closed_connections_total",
-        "Number of TCP client connections closed.",
-        &["protocol"],
-    )
-    .unwrap()
-});
-
-static COMPUTE_CONNECTION_LATENCY: Lazy<Histogram> = Lazy::new(|| {
-    register_histogram!(
-        "proxy_compute_connection_latency_seconds",
-        "Time it took for proxy to establish a connection to the compute endpoint",
-        // largest bucket = 2^16 * 0.5ms = 32s
-        exponential_buckets(0.0005, 2.0, 16).unwrap(),
+        "Number of TCP client connections closed."
    )
    .unwrap()
 });
@@ -146,13 +137,6 @@ pub enum ClientMode {

 /// Abstracts the logic of handling TCP vs WS clients
 impl ClientMode {
-    fn protocol_label(&self) -> &'static str {
-        match self {
-            ClientMode::Tcp => "tcp",
-            ClientMode::Websockets { .. } => "ws",
-        }
-    }
-
    fn allow_cleartext(&self) -> bool {
        match self {
            ClientMode::Tcp => false,
@@ -191,17 +175,10 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
    stream: S,
    mode: ClientMode,
 ) -> anyhow::Result<()> {
-    info!(
-        protocol = mode.protocol_label(),
-        "handling interactive connection from client"
-    );
-
    // The `closed` counter will increase when this future is destroyed.
-    NUM_CONNECTIONS_ACCEPTED_COUNTER
-        .with_label_values(&[mode.protocol_label()])
-        .inc();
+    NUM_CONNECTIONS_ACCEPTED_COUNTER.inc();
    scopeguard::defer! {
-        NUM_CONNECTIONS_CLOSED_COUNTER.with_label_values(&[mode.protocol_label()]).inc();
+        NUM_CONNECTIONS_CLOSED_COUNTER.inc();
    }

    let tls = config.tls_config.as_ref();
@@ -347,6 +324,11 @@ async fn connect_to_compute_once(
        .await
 }

+enum ConnectionState<E> {
+    Cached(console::CachedNodeInfo),
+    Invalid(compute::ConnCfg, E),
+}
+
 #[async_trait]
 pub trait ConnectMechanism {
    type Connection;
@@ -398,91 +380,88 @@ where
    M::ConnectError: ShouldRetry + std::fmt::Debug,
    M::Error: From<WakeComputeError>,
 {
-    let _timer = COMPUTE_CONNECTION_LATENCY.start_timer();
-
    mechanism.update_connect_config(&mut node_info.config);

-    // try once
-    let (config, err) = match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
-        Ok(res) => return Ok(res),
-        Err(e) => {
-            error!(error = ?e, "could not connect to compute node");
-            (invalidate_cache(node_info), e)
-        }
-    };
+    let mut num_retries = 0;
+    let mut state = ConnectionState::<M::ConnectError>::Cached(node_info);

-    let mut num_retries = 1;
-
-    // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node
-    info!("compute node's state has likely changed; requesting a wake-up");
-    let node_info = loop {
-        let wake_res = match creds {
-            auth::BackendType::Console(api, creds) => api.wake_compute(extra, creds).await,
-            auth::BackendType::Postgres(api, creds) => api.wake_compute(extra, creds).await,
-            // nothing to do?
-            auth::BackendType::Link(_) => return Err(err.into()),
-            // test backend
-            auth::BackendType::Test(x) => x.wake_compute(),
-        };
-
-        match handle_try_wake(wake_res, num_retries)? {
-            // failed to wake up but we can continue to retry
-            ControlFlow::Continue(_) => {}
-            // successfully woke up a compute node and can break the wakeup loop
-            ControlFlow::Break(mut node_info) => {
-                node_info.config.reuse_password(&config);
-                mechanism.update_connect_config(&mut node_info.config);
-                break node_info;
-            }
-        }
-
-        let wait_duration = retry_after(num_retries);
-        num_retries += 1;
-
-        time::sleep(wait_duration).await;
-        info!(num_retries, "retrying wake compute");
-    };
-
-    // now that we have a new node, try connect to it repeatedly.
-    // this can error for a few reasons, for instance:
-    // * DNS connection settings haven't quite propagated yet
-    info!("wake_compute success. attempting to connect");
    loop {
-        match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
-            Ok(res) => return Ok(res),
-            Err(e) => {
-                error!(error = ?e, "could not connect to compute node");
-                if !e.should_retry(num_retries) {
-                    return Err(e.into());
+        match state {
+            ConnectionState::Invalid(config, err) => {
+                match try_wake(&config, extra, creds).await {
+                    // we can't wake up the compute node
+                    Ok(None) => return Err(err.into()),
+                    // there was an error communicating with the control plane
+                    Err(e) => return Err(e.into()),
+                    // failed to wake up but we can continue to retry
+                    Ok(Some(ControlFlow::Continue(()))) => {
+                        state = ConnectionState::Invalid(config, err);
+                        let wait_duration = retry_after(num_retries);
+                        num_retries += 1;
+
+                        info!(num_retries, "retrying wake compute");
+                        time::sleep(wait_duration).await;
+                        continue;
+                    }
+                    // successfully woke up a compute node and can break the wakeup loop
+                    Ok(Some(ControlFlow::Break(mut node_info))) => {
+                        mechanism.update_connect_config(&mut node_info.config);
+                        state = ConnectionState::Cached(node_info)
+                    }
+                }
+            }
+            ConnectionState::Cached(node_info) => {
+                match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
+                    Ok(res) => return Ok(res),
+                    Err(e) => {
+                        error!(error = ?e, "could not connect to compute node");
+                        if !e.should_retry(num_retries) {
+                            return Err(e.into());
+                        }
+
+                        // after the first connect failure,
+                        // we should invalidate the cache and wake up a new compute node
+                        if num_retries == 0 {
+                            state = ConnectionState::Invalid(invalidate_cache(node_info), e);
+                        } else {
+                            state = ConnectionState::Cached(node_info);
+                        }
+
+                        let wait_duration = retry_after(num_retries);
+                        num_retries += 1;
+
+                        info!(num_retries, "retrying wake compute");
+                        time::sleep(wait_duration).await;
+                    }
                }
            }
        }
-
-        let wait_duration = retry_after(num_retries);
-        num_retries += 1;
-
-        time::sleep(wait_duration).await;
-        info!(num_retries, "retrying connect_once");
    }
 }

 /// Attempts to wake up the compute node.
-/// * Returns Ok(Continue(e)) if there was an error waking but retries are acceptable
-/// * Returns Ok(Break(node)) if the wakeup succeeded
-/// * Returns Err(e) if there was an error
-pub fn handle_try_wake(
-    result: Result<console::CachedNodeInfo, WakeComputeError>,
-    num_retries: u32,
-) -> Result<ControlFlow<console::CachedNodeInfo, WakeComputeError>, WakeComputeError> {
-    match result {
-        Err(err) => match &err {
-            WakeComputeError::ApiError(api) if api.should_retry(num_retries) => {
-                Ok(ControlFlow::Continue(err))
-            }
-            _ => Err(err),
-        },
-        // Ready to try again.
-        Ok(new) => Ok(ControlFlow::Break(new)),
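+/// * Returns Ok(Some(ControlFlow::Continue(()))) if the wake-up failed with a `BAD_REQUEST` console error and should be retried
+/// * Returns Ok(Some(ControlFlow::Break(node_info))) if the wakeup succeeded
+/// * Returns Ok(None) if `wake_compute` yielded no node, or Err(e) for any other wake-up error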
+async fn try_wake(
+    config: &compute::ConnCfg,
+    extra: &console::ConsoleReqExtra<'_>,
+    creds: &auth::BackendType<'_, auth::ClientCredentials<'_>>,
+) -> Result<Option<ControlFlow<console::CachedNodeInfo>>, WakeComputeError> {
+    info!("compute node's state has likely changed; requesting a wake-up");
+    match creds.wake_compute(extra).await {
+        // retry wake if the compute was in an invalid state
+        Err(WakeComputeError::ApiError(ApiError::Console {
+            status: StatusCode::BAD_REQUEST,
+            ..
+        })) => Ok(Some(ControlFlow::Continue(()))),
+        // Update `node_info` and try again.
+        Ok(Some(mut new)) => {
+            new.config.reuse_password(config);
+            Ok(Some(ControlFlow::Break(new)))
+        }
+        Err(e) => Err(e),
+        Ok(None) => Ok(None),
    }
 }

@@ -490,6 +469,8 @@ pub trait ShouldRetry {
    fn could_retry(&self) -> bool;
    fn should_retry(&self, num_retries: u32) -> bool {
        match self {
+            // retry all errors at least once
+            _ if num_retries == 0 => true,
            _ if num_retries >= NUM_RETRIES_CONNECT => false,
            err => err.could_retry(),
        }
@@ -541,9 +522,14 @@ impl ShouldRetry for compute::ConnectionError {
    }
 }

-fn retry_after(num_retries: u32) -> time::Duration {
-    // 1.5 seems to be an ok growth factor heuristic
-    BASE_RETRY_WAIT_DURATION.mul_f64(1.5_f64.powi(num_retries as i32))
+pub fn retry_after(num_retries: u32) -> time::Duration {
+    match num_retries {
+        0 => time::Duration::ZERO,
+        _ => {
+            // 3/2 = 1.5 which seems to be an ok growth factor heuristic
+            BASE_RETRY_WAIT_DURATION * 3_u32.pow(num_retries) / 2_u32.pow(num_retries)
+        }
+    }
 }

 /// Finish client connection initialization: confirm auth success, send params, etc.
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -1,10 +1,6 @@
 //! A group of high-level tests for connection establishing logic and auth.
-//!
 use super::*;
-use crate::auth::backend::TestBackend;
-use crate::auth::ClientCredentials;
-use crate::console::{CachedNodeInfo, NodeInfo};
-use crate::{auth, http, sasl, scram};
+use crate::{auth, sasl, scram};
 use async_trait::async_trait;
 use rstest::rstest;
 use tokio_postgres::config::SslMode;
@@ -99,8 +95,9 @@ struct Scram(scram::ServerSecret);

 impl Scram {
    fn new(password: &str) -> anyhow::Result<Self> {
-        let secret =
-            scram::ServerSecret::build(password).context("failed to generate scram secret")?;
+        let salt = rand::random::<[u8; 16]>();
+        let secret = scram::ServerSecret::build(password, &salt, 256)
+            .context("failed to generate scram secret")?;
        Ok(Scram(secret))
    }

@@ -301,230 +298,9 @@ async fn scram_auth_mock() -> anyhow::Result<()> {
 #[test]
 fn connect_compute_total_wait() {
    let mut total_wait = tokio::time::Duration::ZERO;
-    for num_retries in 1..10 {
+    for num_retries in 0..10 {
        total_wait += retry_after(num_retries);
    }
    assert!(total_wait < tokio::time::Duration::from_secs(12));
    assert!(total_wait > tokio::time::Duration::from_secs(10));
 }
-
-#[derive(Clone, Copy, Debug)]
-enum ConnectAction {
-    Wake,
-    WakeFail,
-    WakeRetry,
-    Connect,
-    Retry,
-    Fail,
-}
-
-struct TestConnectMechanism {
-    counter: Arc<std::sync::Mutex<usize>>,
-    sequence: Vec<ConnectAction>,
-}
-
-impl TestConnectMechanism {
-    fn verify(&self) {
-        let counter = self.counter.lock().unwrap();
-        assert_eq!(
-            *counter,
-            self.sequence.len(),
-            "sequence does not proceed to the end"
-        );
-    }
-}
-
-impl TestConnectMechanism {
-    fn new(sequence: Vec<ConnectAction>) -> Self {
-        Self {
-            counter: Arc::new(std::sync::Mutex::new(0)),
-            sequence,
-        }
-    }
-}
-
-#[derive(Debug)]
-struct TestConnection;
-
-#[derive(Debug)]
-struct TestConnectError {
-    retryable: bool,
-}
-
-impl std::fmt::Display for TestConnectError {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{:?}", self)
-    }
-}
-
-impl std::error::Error for TestConnectError {}
-
-impl ShouldRetry for TestConnectError {
-    fn could_retry(&self) -> bool {
-        self.retryable
-    }
-}
-
-#[async_trait]
-impl ConnectMechanism for TestConnectMechanism {
-    type Connection = TestConnection;
-    type ConnectError = TestConnectError;
-    type Error = anyhow::Error;
-
-    async fn connect_once(
-        &self,
-        _node_info: &console::CachedNodeInfo,
-        _timeout: time::Duration,
-    ) -> Result<Self::Connection, Self::ConnectError> {
-        let mut counter = self.counter.lock().unwrap();
-        let action = self.sequence[*counter];
-        *counter += 1;
-        match action {
-            ConnectAction::Connect => Ok(TestConnection),
-            ConnectAction::Retry => Err(TestConnectError { retryable: true }),
-            ConnectAction::Fail => Err(TestConnectError { retryable: false }),
-            x => panic!("expecting action {:?}, connect is called instead", x),
-        }
-    }
-
-    fn update_connect_config(&self, _conf: &mut compute::ConnCfg) {}
-}
-
-impl TestBackend for TestConnectMechanism {
-    fn wake_compute(&self) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
-        let mut counter = self.counter.lock().unwrap();
-        let action = self.sequence[*counter];
-        *counter += 1;
-        match action {
-            ConnectAction::Wake => Ok(helper_create_cached_node_info()),
-            ConnectAction::WakeFail => {
-                let err = console::errors::ApiError::Console {
-                    status: http::StatusCode::FORBIDDEN,
-                    text: "TEST".into(),
-                };
-                assert!(!err.could_retry());
-                Err(console::errors::WakeComputeError::ApiError(err))
-            }
-            ConnectAction::WakeRetry => {
-                let err = console::errors::ApiError::Console {
-                    status: http::StatusCode::INTERNAL_SERVER_ERROR,
-                    text: "TEST".into(),
-                };
-                assert!(err.could_retry());
-                Err(console::errors::WakeComputeError::ApiError(err))
-            }
-            x => panic!("expecting action {:?}, wake_compute is called instead", x),
-        }
-    }
-}
-
-fn helper_create_cached_node_info() -> CachedNodeInfo {
-    let node = NodeInfo {
-        config: compute::ConnCfg::new(),
-        aux: Default::default(),
-        allow_self_signed_compute: false,
-    };
-    CachedNodeInfo::new_uncached(node)
-}
-
-fn helper_create_connect_info(
-    mechanism: &TestConnectMechanism,
-) -> (
-    CachedNodeInfo,
-    console::ConsoleReqExtra<'static>,
-    auth::BackendType<'_, ClientCredentials<'static>>,
-) {
-    let cache = helper_create_cached_node_info();
-    let extra = console::ConsoleReqExtra {
-        session_id: uuid::Uuid::new_v4(),
-        application_name: Some("TEST"),
-    };
-    let creds = auth::BackendType::Test(mechanism);
-    (cache, extra, creds)
-}
-
-#[tokio::test]
-async fn connect_to_compute_success() {
-    use ConnectAction::*;
-    let mechanism = TestConnectMechanism::new(vec![Connect]);
-    let (cache, extra, creds) = helper_create_connect_info(&mechanism);
-    connect_to_compute(&mechanism, cache, &extra, &creds)
-        .await
-        .unwrap();
-    mechanism.verify();
-}
-
-#[tokio::test]
-async fn connect_to_compute_retry() {
-    use ConnectAction::*;
-    let mechanism = TestConnectMechanism::new(vec![Retry, Wake, Retry, Connect]);
-    let (cache, extra, creds) = helper_create_connect_info(&mechanism);
-    connect_to_compute(&mechanism, cache, &extra, &creds)
-        .await
-        .unwrap();
-    mechanism.verify();
-}
-
-/// Test that we don't retry if the error is not retryable.
-#[tokio::test]
-async fn connect_to_compute_non_retry_1() {
-    use ConnectAction::*;
-    let mechanism = TestConnectMechanism::new(vec![Retry, Wake, Retry, Fail]);
-    let (cache, extra, creds) = helper_create_connect_info(&mechanism);
-    connect_to_compute(&mechanism, cache, &extra, &creds)
-        .await
-        .unwrap_err();
-    mechanism.verify();
-}
-
-/// Even for non-retryable errors, we should retry at least once.
-#[tokio::test]
-async fn connect_to_compute_non_retry_2() {
-    use ConnectAction::*;
-    let mechanism = TestConnectMechanism::new(vec![Fail, Wake, Retry, Connect]);
-    let (cache, extra, creds) = helper_create_connect_info(&mechanism);
-    connect_to_compute(&mechanism, cache, &extra, &creds)
-        .await
-        .unwrap();
-    mechanism.verify();
-}
-
-/// Retry for at most `NUM_RETRIES_CONNECT` times.
-#[tokio::test]
-async fn connect_to_compute_non_retry_3() {
-    assert_eq!(NUM_RETRIES_CONNECT, 10);
-    use ConnectAction::*;
-    let mechanism = TestConnectMechanism::new(vec![
-        Retry, Wake, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry,
-        /* the 11th time */ Retry,
-    ]);
-    let (cache, extra, creds) = helper_create_connect_info(&mechanism);
-    connect_to_compute(&mechanism, cache, &extra, &creds)
-        .await
-        .unwrap_err();
-    mechanism.verify();
-}
-
-/// Should retry wake compute.
-#[tokio::test]
-async fn wake_retry() {
-    use ConnectAction::*;
-    let mechanism = TestConnectMechanism::new(vec![Retry, WakeRetry, Wake, Connect]);
-    let (cache, extra, creds) = helper_create_connect_info(&mechanism);
-    connect_to_compute(&mechanism, cache, &extra, &creds)
-        .await
-        .unwrap();
-    mechanism.verify();
-}
-
-/// Wake failed with a non-retryable error.
-#[tokio::test]
-async fn wake_non_retry() {
-    use ConnectAction::*;
-    let mechanism = TestConnectMechanism::new(vec![Retry, WakeFail]);
-    let (cache, extra, creds) = helper_create_connect_info(&mechanism);
-    connect_to_compute(&mechanism, cache, &extra, &creds)
-        .await
-        .unwrap_err();
-    mechanism.verify();
-}
--- a/proxy/src/scram.rs
+++ b/proxy/src/scram.rs
@@ -12,6 +12,9 @@ mod messages;
 mod secret;
 mod signature;

+#[cfg(any(test, doc))]
+mod password;
+
 pub use exchange::Exchange;
 pub use key::ScramKey;
 pub use secret::ServerSecret;
@@ -54,21 +57,27 @@ fn sha256<'a>(parts: impl IntoIterator<Item = &'a [u8]>) -> [u8; 32] {

 #[cfg(test)]
 mod tests {
-    use postgres_protocol::authentication::sasl::{ChannelBinding, ScramSha256};
-
    use crate::sasl::{Mechanism, Step};

-    use super::{Exchange, ServerSecret};
+    use super::{password::SaltedPassword, Exchange, ServerSecret};

    #[test]
-    fn snapshot() {
+    fn happy_path() {
        let iterations = 4096;
-        let salt = "QSXCR+Q6sek8bf92";
-        let stored_key = "FO+9jBb3MUukt6jJnzjPZOWc5ow/Pu6JtPyju0aqaE8=";
-        let server_key = "qxJ1SbmSAi5EcS0J5Ck/cKAm/+Ixa+Kwp63f4OHDgzo=";
-        let secret = format!("SCRAM-SHA-256${iterations}:{salt}${stored_key}:{server_key}",);
-        let secret = ServerSecret::parse(&secret).unwrap();
+        let salt_base64 = "QSXCR+Q6sek8bf92";
+        let pw = SaltedPassword::new(
+            b"pencil",
+            base64::decode(salt_base64).unwrap().as_slice(),
+            iterations,
+        );

+        let secret = ServerSecret {
+            iterations,
+            salt_base64: salt_base64.to_owned(),
+            stored_key: pw.client_key().sha256(),
+            server_key: pw.server_key(),
+            doomed: false,
+        };
        const NONCE: [u8; 18] = [
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
        ];
@@ -106,40 +115,4 @@ mod tests {
            ]
        );
    }
-
-    fn run_round_trip_test(client_password: &str) {
-        let secret = ServerSecret::build("pencil").unwrap();
-        let mut exchange = Exchange::new(&secret, rand::random, None);
-
-        let mut client =
-            ScramSha256::new(client_password.as_bytes(), ChannelBinding::unsupported());
-
-        let client_first = std::str::from_utf8(client.message()).unwrap();
-        exchange = match exchange.exchange(client_first).unwrap() {
-            Step::Continue(exchange, message) => {
-                client.update(message.as_bytes()).unwrap();
-                exchange
-            }
-            Step::Success(_, _) => panic!("expected continue, got success"),
-            Step::Failure(f) => panic!("{f}"),
-        };
-
-        let client_final = std::str::from_utf8(client.message()).unwrap();
-        match exchange.exchange(client_final).unwrap() {
-            Step::Success(_, message) => client.finish(message.as_bytes()).unwrap(),
-            Step::Continue(_, _) => panic!("expected success, got continue"),
-            Step::Failure(f) => panic!("{f}"),
-        };
-    }
-
-    #[test]
-    fn round_trip() {
-        run_round_trip_test("pencil")
-    }
-
-    #[test]
-    #[should_panic(expected = "password doesn't match")]
-    fn failure() {
-        run_round_trip_test("eraser")
-    }
 }
--- a/proxy/src/scram/key.rs
+++ b/proxy/src/scram/key.rs
@@ -3,7 +3,7 @@
 /// Faithfully taken from PostgreSQL.
 pub const SCRAM_KEY_LEN: usize = 32;

-/// One of the keys derived from the user's password.
+/// One of the keys derived from the [password](super::password::SaltedPassword).
 /// We use the same structure for all keys, i.e.
 /// `ClientKey`, `StoredKey`, and `ServerKey`.
 #[derive(Default, PartialEq, Eq)]
--- a/proxy/src/scram/password.rs
+++ b/proxy/src/scram/password.rs
@@ -0,0 +1,74 @@
+//! Password hashing routines.
+
+use super::key::ScramKey;
+
+pub const SALTED_PASSWORD_LEN: usize = 32;
+
+/// Salted, iterated hash of a password; the starting point for [key](super::key) derivation.
+#[repr(transparent)]
+pub struct SaltedPassword {
+    bytes: [u8; SALTED_PASSWORD_LEN], // raw hash bytes, as produced by PBKDF2 in `new`
+}
+
+impl SaltedPassword {
+    /// Hash `password` with `salt` using PBKDF2-HMAC-SHA256 for `iterations` rounds (cf. `scram-common.c : scram_SaltedPassword`).
+    /// Further reading: <https://datatracker.ietf.org/doc/html/rfc2898> (see `PBKDF2`).
+    pub fn new(password: &[u8], salt: &[u8], iterations: u32) -> SaltedPassword {
+        pbkdf2::pbkdf2_hmac_array::<sha2::Sha256, 32>(password, salt, iterations).into()
+    }
+
+    /// Derive `ClientKey` by applying `hmac_sha256` to the salted password bytes and the label `"Client Key"`.
+    pub fn client_key(&self) -> ScramKey {
+        super::hmac_sha256(&self.bytes, [b"Client Key".as_ref()]).into()
+    }
+
+    /// Derive `ServerKey` by applying `hmac_sha256` to the salted password bytes and the label `"Server Key"`.
+    pub fn server_key(&self) -> ScramKey {
+        super::hmac_sha256(&self.bytes, [b"Server Key".as_ref()]).into()
+    }
+}
+
+impl From<[u8; SALTED_PASSWORD_LEN]> for SaltedPassword {
+    #[inline(always)]
+    fn from(bytes: [u8; SALTED_PASSWORD_LEN]) -> Self {
+        Self { bytes } // stored as-is; no hashing happens in this conversion
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::SaltedPassword;
+
+    fn legacy_pbkdf2_impl(password: &[u8], salt: &[u8], iterations: u32) -> SaltedPassword {
+        let one = 1_u32.to_be_bytes(); // big-endian block index 1, fed after the salt in the first HMAC round
+
+        let mut current = super::super::hmac_sha256(password, [salt, &one]);
+        let mut result = current;
+        for _ in 1..iterations {
+            current = super::super::hmac_sha256(password, [current.as_ref()]);
+            // TODO: result = current.zip(result).map(|(x, y)| x ^ y), issue #80094
+            for (i, x) in current.iter().enumerate() {
+                result[i] ^= x; // XOR-accumulate each round's HMAC output into the result
+            }
+        }
+
+        result.into()
+    }
+
+    #[test]
+    fn pbkdf2() {
+        let password = "a-very-secure-password";
+        let salt = "such-a-random-salt";
+        let iterations = 4096;
+        let output = [
+            203, 18, 206, 81, 4, 154, 193, 100, 147, 41, 211, 217, 177, 203, 69, 210, 194, 211,
+            101, 1, 248, 156, 96, 0, 8, 223, 30, 87, 158, 41, 20, 42,
+        ];
+
+        let actual = SaltedPassword::new(password.as_bytes(), salt.as_bytes(), iterations);
+        let expected = legacy_pbkdf2_impl(password.as_bytes(), salt.as_bytes(), iterations);
+
+        assert_eq!(actual.bytes, output); // matches the pinned known-answer vector
+        assert_eq!(actual.bytes, expected.bytes); // and agrees with the hand-rolled reference above
+    }
+}
--- a/proxy/src/scram/secret.rs
+++ b/proxy/src/scram/secret.rs
@@ -3,7 +3,7 @@
 use super::base64_decode_array;
 use super::key::ScramKey;

-/// Server secret is produced from user's password,
+/// Server secret is produced from [password](super::password::SaltedPassword)
 /// and is used throughout the authentication process.
 pub struct ServerSecret {
    /// Number of iterations for `PBKDF2` function.
@@ -58,10 +58,21 @@ impl ServerSecret {
    /// Build a new server secret from the prerequisites.
    /// XXX: We only use this function in tests.
    #[cfg(test)]
-    pub fn build(password: &str) -> Option<Self> {
-        Self::parse(&postgres_protocol::password::scram_sha_256(
-            password.as_bytes(),
-        ))
+    pub fn build(password: &str, salt: &[u8], iterations: u32) -> Option<Self> {
+        // TODO: implement the password normalization (SASLprep) required by the SCRAM RFC; until then only ASCII is accepted
+        if !password.is_ascii() {
+            return None;
+        }
+
+        let password = super::password::SaltedPassword::new(password.as_bytes(), salt, iterations);
+
+        Some(Self {
+            iterations,
+            salt_base64: base64::encode(salt),
+            stored_key: password.client_key().sha256(),
+            server_key: password.server_key(),
+            doomed: false,
+        })
    }
 }

@@ -91,4 +102,20 @@ mod tests {
        assert_eq!(base64::encode(parsed.stored_key), stored_key);
        assert_eq!(base64::encode(parsed.server_key), server_key);
    }
+
+    #[test]
+    fn build_scram_secret() {
+        let salt = b"salt";
+        let secret = ServerSecret::build("password", salt, 4096).unwrap(); // ASCII password, so `build` returns `Some`
+        assert_eq!(secret.iterations, 4096);
+        assert_eq!(secret.salt_base64, base64::encode(salt)); // salt is stored base64-encoded
+        assert_eq!(
+            base64::encode(secret.stored_key.as_ref()),
+            "lF4cRm/Jky763CN4HtxdHnjV4Q8AWTNlKvGmEFFU8IQ="
+        );
+        assert_eq!(
+            base64::encode(secret.server_key.as_ref()),
+            "ub8OgRsftnk2ccDMOt7ffHXNcikRkQkq1lh4xaAqrSw="
+        );
+    }
 }
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -234,10 +234,7 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
                listen_pg_addr_tenant_only
            );
            let listener = tcp_listener::bind(listen_pg_addr_tenant_only.clone()).map_err(|e| {
-                error!(
-                    "failed to bind to address {}: {}",
-                    listen_pg_addr_tenant_only, e
-                );
+                error!("failed to bind to address {}: {}", listen_pg_addr_tenant_only, e);
                e
            })?;
            Some(listener)
--- a/safekeeper/src/handler.rs
+++ b/safekeeper/src/handler.rs
@@ -11,7 +11,6 @@ use crate::auth::check_permission;
 use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage};

 use crate::metrics::{TrafficMetrics, PG_QUERIES_FINISHED, PG_QUERIES_RECEIVED};
-use crate::timeline::TimelineError;
 use crate::wal_service::ConnectionId;
 use crate::{GlobalTimelines, SafeKeeperConf};
 use postgres_backend::QueryError;
@@ -46,7 +45,6 @@ enum SafekeeperPostgresCommand {
    StartWalPush,
    StartReplication { start_lsn: Lsn },
    IdentifySystem,
-    TimelineStatus,
    JSONCtrl { cmd: AppendLogicalMessage },
 }

@@ -66,8 +64,6 @@ fn parse_cmd(cmd: &str) -> anyhow::Result<SafekeeperPostgresCommand> {
        Ok(SafekeeperPostgresCommand::StartReplication { start_lsn })
    } else if cmd.starts_with("IDENTIFY_SYSTEM") {
        Ok(SafekeeperPostgresCommand::IdentifySystem)
-    } else if cmd.starts_with("TIMELINE_STATUS") {
-        Ok(SafekeeperPostgresCommand::TimelineStatus)
    } else if cmd.starts_with("JSON_CTRL") {
        let cmd = cmd.strip_prefix("JSON_CTRL").context("invalid prefix")?;
        Ok(SafekeeperPostgresCommand::JSONCtrl {
@@ -82,7 +78,6 @@ fn cmd_to_string(cmd: &SafekeeperPostgresCommand) -> &str {
    match cmd {
        SafekeeperPostgresCommand::StartWalPush => "START_WAL_PUSH",
        SafekeeperPostgresCommand::StartReplication { .. } => "START_REPLICATION",
-        SafekeeperPostgresCommand::TimelineStatus => "TIMELINE_STATUS",
        SafekeeperPostgresCommand::IdentifySystem => "IDENTIFY_SYSTEM",
        SafekeeperPostgresCommand::JSONCtrl { .. } => "JSON_CTRL",
    }
@@ -224,7 +219,6 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
                    .await
            }
            SafekeeperPostgresCommand::IdentifySystem => self.handle_identify_system(pgb).await,
-            SafekeeperPostgresCommand::TimelineStatus => self.handle_timeline_status(pgb).await,
            SafekeeperPostgresCommand::JSONCtrl { ref cmd } => {
                handle_json_ctrl(self, pgb, cmd).await
            }
@@ -269,38 +263,6 @@ impl SafekeeperPostgresHandler {
        check_permission(claims, tenant_id)
    }

-    async fn handle_timeline_status<IO: AsyncRead + AsyncWrite + Unpin>(
-        &mut self,
-        pgb: &mut PostgresBackend<IO>,
-    ) -> Result<(), QueryError> {
-        // Get timeline, handling "not found" error
-        let tli = match GlobalTimelines::get(self.ttid) {
-            Ok(tli) => Ok(Some(tli)),
-            Err(TimelineError::NotFound(_)) => Ok(None),
-            Err(e) => Err(QueryError::Other(e.into())),
-        }?;
-
-        // Write row description
-        pgb.write_message_noflush(&BeMessage::RowDescription(&[
-            RowDescriptor::text_col(b"flush_lsn"),
-            RowDescriptor::text_col(b"commit_lsn"),
-        ]))?;
-
-        // Write row if timeline exists
-        if let Some(tli) = tli {
-            let (inmem, _state) = tli.get_state().await;
-            let flush_lsn = tli.get_flush_lsn().await;
-            let commit_lsn = inmem.commit_lsn;
-            pgb.write_message_noflush(&BeMessage::DataRow(&[
-                Some(flush_lsn.to_string().as_bytes()),
-                Some(commit_lsn.to_string().as_bytes()),
-            ]))?;
-        }
-
-        pgb.write_message_noflush(&BeMessage::CommandComplete(b"TIMELINE_STATUS"))?;
-        Ok(())
-    }
-
    ///
    /// Handle IDENTIFY_SYSTEM replication command
    ///
--- a/scripts/combine_control_files.py
+++ b/scripts/combine_control_files.py
@@ -1,80 +0,0 @@
-#! /usr/bin/env python3
-# Script to generate ext_index.json metadata file
-# that stores content of the control files and location of extension archives
-# for all extensions in extensions subdir.
-import argparse
-import json
-import subprocess
-from pathlib import Path
-
-"""
-# ext_index.json example:
-{
-    "public_extensions": [
-        "anon"
-    ],
-    "library_index": {
-        "anon": "anon",
-        "kq_imcx": "kq_imcx"
-        // would be more complicated for something like postgis where multiple library names all map to postgis
-    },
-    "extension_data": {
-        "kq_imcx": {
-            "control_data": {
-                "kq_imcx.control": "# This file is generated content from add_postgresql_extension.\n# No point in modifying it, it will be overwritten anyway.\n\n# Default version, always set\ndefault_version = '0.1'\n\n# Module pathname generated from target shared library name. Use\n# MODULE_PATHNAME in script file.\nmodule_pathname = '$libdir/kq_imcx.so'\n\n# Comment for extension. Set using COMMENT option. Can be set in\n# script file as well.\ncomment = 'ketteQ In-Memory Calendar Extension (IMCX)'\n\n# Encoding for script file. Set using ENCODING option.\n#encoding = ''\n\n# Required extensions. Set using REQUIRES option (multi-valued).\n#requires = ''\ntrusted = true\n"
-            },
-            "archive_path": "5648391853/v15/extensions/kq_imcx.tar.zst"
-        },
-        "anon": {
-            "control_data": {
-                "anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
-            },
-            "archive_path": "5648391853/v15/extensions/anon.tar.zst"
-        }
-    }
-}
-"""
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="generate ext_index.json")
-    parser.add_argument("pg_version", type=str, choices=["v14", "v15"], help="pg_version")
-    parser.add_argument("BUILD_TAG", type=str, help="BUILD_TAG for this compute image")
-    parser.add_argument("--public_extensions", type=str, help="list of public extensions")
-    args = parser.parse_args()
-    pg_version = args.pg_version
-    BUILD_TAG = args.BUILD_TAG
-    public_ext_list = args.public_extensions.split(",")
-
-    ext_index = {}
-    library_index = {}
-    EXT_PATH = Path("extensions")
-    for extension in EXT_PATH.iterdir():
-        if extension.is_dir():
-            control_data = {}
-            for control_file in extension.glob("*.control"):
-                if control_file.suffix != ".control":
-                    continue
-                with open(control_file, "r") as f:
-                    control_data[control_file.name] = f.read()
-            ext_index[extension.name] = {
-                "control_data": control_data,
-                "archive_path": f"{BUILD_TAG}/{pg_version}/extensions/{extension.name}.tar.zst",
-            }
-        elif extension.suffix == ".zst":
-            file_list = (
-                str(subprocess.check_output(["tar", "tf", str(extension)]), "utf-8")
-                .strip()
-                .split("\n")
-            )
-            for file in file_list:
-                if file.endswith(".so") and file.startswith("lib/"):
-                    lib_name = file[4:-3]
-                    library_index[lib_name] = extension.name.replace(".tar.zst", "")
-
-    all_data = {
-        "public_extensions": public_ext_list,
-        "library_index": library_index,
-        "extension_data": ext_index,
-    }
-    with open("ext_index.json", "w") as f:
-        json.dump(all_data, f)
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -40,13 +40,10 @@ def parse_metrics(text: str, name: str = "") -> Metrics:
    return metrics


-def histogram(prefix_without_trailing_underscore: str) -> List[str]:
-    assert not prefix_without_trailing_underscore.endswith("_")
-    return [f"{prefix_without_trailing_underscore}_{x}" for x in ["bucket", "count", "sum"]]
-
-
 PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = (
    "pageserver_remote_timeline_client_calls_unfinished",
+    *[f"pageserver_remote_timeline_client_calls_started_{x}" for x in ["bucket", "count", "sum"]],
+    *[f"pageserver_remote_operation_seconds_{x}" for x in ["bucket", "count", "sum"]],
    "pageserver_remote_physical_size",
    "pageserver_remote_timeline_client_bytes_started_total",
    "pageserver_remote_timeline_client_bytes_finished_total",
@@ -70,29 +67,34 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
    "pageserver_getpage_reconstruct_seconds_count",
    "pageserver_getpage_reconstruct_seconds_sum",
    *[f"pageserver_basebackup_query_seconds_{x}" for x in ["bucket", "count", "sum"]],
-    *histogram("pageserver_read_num_fs_layers"),
-    *histogram("pageserver_getpage_get_reconstruct_data_seconds"),
-    *histogram("pageserver_wait_lsn_seconds"),
-    *histogram("pageserver_remote_operation_seconds"),
-    *histogram("pageserver_remote_timeline_client_calls_started"),
-    *histogram("pageserver_io_operations_seconds"),
-    "pageserver_tenant_states_count",
 )

 PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
    "pageserver_current_logical_size",
    "pageserver_resident_physical_size",
+    "pageserver_getpage_get_reconstruct_data_seconds_bucket",
+    "pageserver_getpage_get_reconstruct_data_seconds_count",
+    "pageserver_getpage_get_reconstruct_data_seconds_sum",
    "pageserver_io_operations_bytes_total",
+    "pageserver_io_operations_seconds_bucket",
+    "pageserver_io_operations_seconds_count",
+    "pageserver_io_operations_seconds_sum",
    "pageserver_last_record_lsn",
+    "pageserver_read_num_fs_layers_bucket",
+    "pageserver_read_num_fs_layers_count",
+    "pageserver_read_num_fs_layers_sum",
    "pageserver_smgr_query_seconds_bucket",
    "pageserver_smgr_query_seconds_count",
    "pageserver_smgr_query_seconds_sum",
    "pageserver_storage_operations_seconds_count_total",
    "pageserver_storage_operations_seconds_sum_total",
+    "pageserver_wait_lsn_seconds_bucket",
+    "pageserver_wait_lsn_seconds_count",
+    "pageserver_wait_lsn_seconds_sum",
    "pageserver_created_persistent_files_total",
    "pageserver_written_persistent_bytes_total",
+    "pageserver_tenant_states_count",
    "pageserver_evictions_total",
    "pageserver_evictions_with_low_residence_duration_total",
    *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
-    # pageserver_broken_tenants_count is a leaked "metric" which is "cleared" on restart or reload
 )
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -542,7 +542,7 @@ class S3Storage:
    access_key: str
    secret_key: str
    endpoint: Optional[str] = None
-    prefix_in_bucket: Optional[str] = ""
+    prefix_in_bucket: Optional[str] = None

    def access_env_vars(self) -> Dict[str, str]:
        return {
@@ -1504,7 +1504,6 @@ class NeonCli(AbstractNeonCli):
        safekeepers: Optional[List[int]] = None,
        tenant_id: Optional[TenantId] = None,
        lsn: Optional[Lsn] = None,
-        branch_name: Optional[str] = None,
    ) -> "subprocess.CompletedProcess[str]":
        args = [
            "endpoint",
@@ -1518,11 +1517,8 @@ class NeonCli(AbstractNeonCli):
            args.append(f"--lsn={lsn}")
        args.extend(["--pg-port", str(pg_port)])
        args.extend(["--http-port", str(http_port)])
-
        if safekeepers is not None:
            args.extend(["--safekeepers", (",".join(map(str, safekeepers)))])
-        if branch_name is not None:
-            args.extend(["--branch-name", branch_name])
        if endpoint_id is not None:
            args.append(endpoint_id)

--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -194,18 +194,14 @@ def wait_for_upload_queue_empty(


 def wait_timeline_detail_404(
-    pageserver_http: PageserverHttpClient,
-    tenant_id: TenantId,
-    timeline_id: TimelineId,
-    wait_longer: bool = False,
+    pageserver_http: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId
 ):
    last_exc = None
-    iterations = 10 if wait_longer else 2
-    for _ in range(iterations):
+    for _ in range(2):
        time.sleep(0.250)
        try:
            data = pageserver_http.timeline_detail(tenant_id, timeline_id)
-            log.info(f"detail {data}")
+            log.error(f"detail {data}")
        except PageserverApiException as e:
            log.debug(e)
            if e.status_code == 404:
@@ -220,8 +216,7 @@ def timeline_delete_wait_completed(
    pageserver_http: PageserverHttpClient,
    tenant_id: TenantId,
    timeline_id: TimelineId,
-    wait_longer: bool = False,  # Use when running with RemoteStorageKind.REAL_S3
    **delete_args,
 ):
    pageserver_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id, **delete_args)
-    wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, wait_longer)
+    wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id)
--- a/test_runner/performance/test_startup.py
+++ b/test_runner/performance/test_startup.py
@@ -61,7 +61,6 @@ def test_startup_simple(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenc
        durations = {
            "wait_for_spec_ms": f"{i}_wait_for_spec",
            "sync_safekeepers_ms": f"{i}_sync_safekeepers",
-            "sync_sk_check_ms": f"{i}_sync_sk_check",
            "basebackup_ms": f"{i}_basebackup",
            "start_postgres_ms": f"{i}_start_postgres",
            "config_ms": f"{i}_config",
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -123,7 +123,7 @@ def test_config_with_unknown_keys_is_bad_request(negative_env: NegativeTests):
@pytest.mark.parametrize("content_type", [None, "application/json"])
 def test_empty_body(positive_env: NeonEnv, content_type: Optional[str]):
    """
-    For backwards-compatibility: if we send an empty body,
+    For backwards-compatibility: if we send an empty body,
    the request should be accepted and the config should be the default config.
    """
    env = positive_env
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -4,7 +4,7 @@ import shutil
 import subprocess
 import tempfile
 from pathlib import Path
-from typing import Any, List, Optional
+from typing import Any, Optional

 import pytest
 import toml  # TODO: replace with tomllib for Python >= 3.11
@@ -14,6 +14,7 @@ from fixtures.neon_fixtures import (
    NeonEnvBuilder,
    PgBin,
    PortDistributor,
+    parse_project_git_version_output,
 )
 from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.utils import (
@@ -62,6 +63,7 @@ def test_create_snapshot(
    neon_env_builder.pg_version = pg_version
    neon_env_builder.num_safekeepers = 3
    neon_env_builder.enable_local_fs_remote_storage()
+    neon_env_builder.preserve_database_files = True

    env = neon_env_builder.init_start()
    endpoint = env.endpoints.create_start("main")
@@ -257,15 +259,36 @@ def prepare_snapshot(
        shutil.rmtree(repo_dir / "pgdatadirs")
    os.mkdir(repo_dir / "endpoints")

+    # Remove wal-redo temp directory if it exists. Newer pageserver versions don't create
+    # them anymore, but old versions did.
+    for tenant in (repo_dir / "tenants").glob("*"):
+        wal_redo_dir = tenant / "wal-redo-datadir.___temp"
+        if wal_redo_dir.exists() and wal_redo_dir.is_dir():
+            shutil.rmtree(wal_redo_dir)
+
    # Update paths and ports in config files
    pageserver_toml = repo_dir / "pageserver.toml"
    pageserver_config = toml.load(pageserver_toml)
    pageserver_config["remote_storage"]["local_path"] = str(repo_dir / "local_fs_remote_storage")
-    for param in ("listen_http_addr", "listen_pg_addr", "broker_endpoint"):
-        pageserver_config[param] = port_distributor.replace_with_new_port(pageserver_config[param])
+    pageserver_config["listen_http_addr"] = port_distributor.replace_with_new_port(
+        pageserver_config["listen_http_addr"]
+    )
+    pageserver_config["listen_pg_addr"] = port_distributor.replace_with_new_port(
+        pageserver_config["listen_pg_addr"]
+    )
+    # Since the move to storage_broker, neon_local overrides these when it starts
+    # the pageserver; remove both to avoid unknown-option errors during the etcd ->
+    # storage_broker migration. TODO: remove once broker is released
+    pageserver_config.pop("broker_endpoint", None)
+    pageserver_config.pop("broker_endpoints", None)
+    etcd_broker_endpoints = [f"http://localhost:{port_distributor.get_port()}/"]
+    if get_neon_version(neon_binpath) == "49da498f651b9f3a53b56c7c0697636d880ddfe0":
+        pageserver_config["broker_endpoints"] = etcd_broker_endpoints  # old etcd version

-    # We don't use authentication in compatibility tests
-    # so just remove authentication related settings.
+    # Older pageserver versions had just one `auth_type` setting. Now there
+    # are separate settings for pg and http ports. We don't use authentication
+    # in compatibility tests so just remove authentication related settings.
+    pageserver_config.pop("auth_type", None)
    pageserver_config.pop("pg_auth_type", None)
    pageserver_config.pop("http_auth_type", None)

@@ -277,16 +300,31 @@ def prepare_snapshot(

    snapshot_config_toml = repo_dir / "config"
    snapshot_config = toml.load(snapshot_config_toml)
-    for param in ("listen_http_addr", "listen_pg_addr"):
-        snapshot_config["pageserver"][param] = port_distributor.replace_with_new_port(
-            snapshot_config["pageserver"][param]
-        )
-    snapshot_config["broker"]["listen_addr"] = port_distributor.replace_with_new_port(
-        snapshot_config["broker"]["listen_addr"]
+
+    # Provide up/downgrade etcd <-> storage_broker to make forward/backward
+    # compatibility test happy. TODO: leave only the new part once broker is released.
+    if get_neon_version(neon_binpath) == "49da498f651b9f3a53b56c7c0697636d880ddfe0":
+        # old etcd version
+        snapshot_config["etcd_broker"] = {
+            "etcd_binary_path": shutil.which("etcd"),
+            "broker_endpoints": etcd_broker_endpoints,
+        }
+        snapshot_config.pop("broker", None)
+    else:
+        # new storage_broker version
+        broker_listen_addr = f"127.0.0.1:{port_distributor.get_port()}"
+        snapshot_config["broker"] = {"listen_addr": broker_listen_addr}
+        snapshot_config.pop("etcd_broker", None)
+
+    snapshot_config["pageserver"]["listen_http_addr"] = port_distributor.replace_with_new_port(
+        snapshot_config["pageserver"]["listen_http_addr"]
+    )
+    snapshot_config["pageserver"]["listen_pg_addr"] = port_distributor.replace_with_new_port(
+        snapshot_config["pageserver"]["listen_pg_addr"]
    )
    for sk in snapshot_config["safekeepers"]:
-        for param in ("http_port", "pg_port", "pg_tenant_only_port"):
-            sk[param] = port_distributor.replace_with_new_port(sk[param])
+        sk["http_port"] = port_distributor.replace_with_new_port(sk["http_port"])
+        sk["pg_port"] = port_distributor.replace_with_new_port(sk["pg_port"])

    if pg_distrib_dir:
        snapshot_config["pg_distrib_dir"] = str(pg_distrib_dir)
@@ -312,6 +350,12 @@ def prepare_snapshot(
    ), f"there're files referencing `test_create_snapshot/repo`, this path should be replaced with {repo_dir}:\n{rv.stdout}"


+# Return the git SHA that `neon_local --version` reports for the binaries in `neon_binpath`.
+def get_neon_version(neon_binpath: Path):
+    out = subprocess.check_output([neon_binpath / "neon_local", "--version"]).decode("utf-8")
+    return parse_project_git_version_output(out)
+
+
 def check_neon_works(
    repo_dir: Path,
    neon_target_binpath: Path,
@@ -337,6 +381,7 @@ def check_neon_works(
    config.pg_version = pg_version
    config.initial_tenant = snapshot_config["default_tenant_id"]
    config.pg_distrib_dir = pg_distrib_dir
+    config.preserve_database_files = True

    # Use the "target" binaries to launch the storage nodes
    config_target = config
@@ -393,14 +438,6 @@ def check_neon_works(
        test_output_dir / "dump-from-wal.filediff",
    )

-    # TODO: Run pg_amcheck unconditionally after the next release
-    try:
-        pg_bin.run(["psql", connstr, "--command", "CREATE EXTENSION IF NOT EXISTS amcheck"])
-    except subprocess.CalledProcessError:
-        log.info("Extension amcheck is not available, skipping pg_amcheck")
-    else:
-        pg_bin.run_capture(["pg_amcheck", connstr, "--install-missing", "--verbose"])
-
    # Check that we can interract with the data
    pg_bin.run_capture(["pgbench", "--time=10", "--progress=2", connstr])

@@ -408,15 +445,10 @@ def check_neon_works(
    assert not initial_dump_differs, "initial dump differs"


-def dump_differs(
-    first: Path, second: Path, output: Path, allowed_diffs: Optional[List[str]] = None
-) -> bool:
+def dump_differs(first: Path, second: Path, output: Path) -> bool:
    """
    Runs diff(1) command on two SQL dumps and write the output to the given output file.
-    The function supports allowed diffs, if the diff is in the allowed_diffs list, it's not considered as a difference.
-    See the example of it in https://github.com/neondatabase/neon/pull/4425/files#diff-15c5bfdd1d5cc1411b9221091511a60dd13a9edf672bdfbb57dd2ef8bb7815d6
-
-    Returns True if the dumps differ and produced diff is not allowed, False otherwise (in most cases we want it to return False).
+    Returns True if the dumps differ, False otherwise.
    """

    with output.open("w") as stdout:
@@ -434,30 +466,51 @@ def dump_differs(

    differs = res.returncode != 0

-    allowed_diffs = allowed_diffs or []
-    if differs and len(allowed_diffs) > 0:
-        for allowed_diff in allowed_diffs:
-            with tempfile.NamedTemporaryFile(mode="w") as tmp:
-                tmp.write(allowed_diff)
-                tmp.flush()
+    # TODO: Remove after https://github.com/neondatabase/neon/pull/4425 is merged, and a couple of releases are made
+    if differs:
+        with tempfile.NamedTemporaryFile(mode="w") as tmp:
+            tmp.write(PR4425_ALLOWED_DIFF)
+            tmp.flush()

-                allowed = subprocess.run(
-                    [
-                        "diff",
-                        "--unified",  # Make diff output more readable
-                        r"--ignore-matching-lines=^---",  # Ignore diff headers
-                        r"--ignore-matching-lines=^\+\+\+",  # Ignore diff headers
-                        "--ignore-matching-lines=^@@",  # Ignore diff blocks location
-                        "--ignore-matching-lines=^ *$",  # Ignore lines with only spaces
-                        "--ignore-matching-lines=^ --.*",  # Ignore SQL comments in diff
-                        "--ignore-blank-lines",
-                        str(output),
-                        str(tmp.name),
-                    ],
-                )
+            allowed = subprocess.run(
+                [
+                    "diff",
+                    "--unified",  # Make diff output more readable
+                    r"--ignore-matching-lines=^---",  # Ignore diff headers
+                    r"--ignore-matching-lines=^\+\+\+",  # Ignore diff headers
+                    "--ignore-matching-lines=^@@",  # Ignore diff blocks location
+                    "--ignore-matching-lines=^ *$",  # Ignore lines with only spaces
+                    "--ignore-matching-lines=^ --.*",  # Ignore the " --" lines for compatibility with PG14
+                    "--ignore-blank-lines",
+                    str(output),
+                    str(tmp.name),
+                ],
+            )

-                differs = allowed.returncode != 0
-                if not differs:
-                    break
+            differs = allowed.returncode != 0

    return differs
+
+
+PR4425_ALLOWED_DIFF = """
+--- /tmp/test_output/test_backward_compatibility[release-pg15]/compatibility_snapshot/dump.sql 2023-06-08 18:12:45.000000000 +0000
+++ /tmp/test_output/test_backward_compatibility[release-pg15]/dump.sql        2023-06-13 07:25:35.211733653 +0000
+@@ -13,12 +13,20 @@
+
+ CREATE ROLE cloud_admin;
+ ALTER ROLE cloud_admin WITH SUPERUSER INHERIT CREATEROLE CREATEDB LOGIN REPLICATION BYPASSRLS;
+CREATE ROLE neon_superuser;
+ALTER ROLE neon_superuser WITH NOSUPERUSER INHERIT CREATEROLE CREATEDB NOLOGIN NOREPLICATION NOBYPASSRLS;
+
+ --
+ -- User Configurations
+ --
+
+
+--
+-- Role memberships
+--
+
+GRANT pg_read_all_data TO neon_superuser GRANTED BY cloud_admin;
+GRANT pg_write_all_data TO neon_superuser GRANTED BY cloud_admin;
+"""
--- a/test_runner/regress/test_gc_aggressive.py
+++ b/test_runner/regress/test_gc_aggressive.py
@@ -136,6 +136,8 @@ def test_gc_index_upload(neon_env_builder: NeonEnvBuilder, remote_storage_kind:
        for sample in ps_metrics.query_all(
            name="pageserver_remote_operation_seconds_count",
            filter={
+                "tenant_id": str(tenant_id),
+                "timeline_id": str(timeline_id),
                "file_kind": str(file_kind),
                "op_kind": str(op_kind),
            },
--- a/test_runner/regress/test_gc_cutoff.py
+++ b/test_runner/regress/test_gc_cutoff.py
@@ -14,6 +14,10 @@ from fixtures.neon_fixtures import NeonEnvBuilder, PgBin
 def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    env = neon_env_builder.init_start()

+    # These warnings are expected, when the pageserver is restarted abruptly
+    env.pageserver.allowed_errors.append(".*found future image layer.*")
+    env.pageserver.allowed_errors.append(".*found future delta layer.*")
+
    pageserver_http = env.pageserver.http_client()

    # Use aggressive GC and checkpoint settings, so that we also exercise GC during the test
--- a/test_runner/regress/test_metric_collection.py
+++ b/test_runner/regress/test_metric_collection.py
@@ -140,6 +140,8 @@ def test_metric_collection(
        for sample in ps_metrics.query_all(
            name="pageserver_remote_operation_seconds_count",
            filter={
+                "tenant_id": str(tenant_id),
+                "timeline_id": str(timeline_id),
                "file_kind": str(file_kind),
                "op_kind": str(op_kind),
            },
--- a/test_runner/regress/test_multixact_conc.py
+++ b/test_runner/regress/test_multixact_conc.py
@@ -0,0 +1,103 @@
+import random
+import threading
+from threading import Thread
+
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content
+from fixtures.utils import query_scalar
+
+
+#
+# Stress multixact creation with concurrent transactions across endpoint restarts.
+#
+# Several threads repeatedly update a random row and then take a FOR KEY SHARE
+# lock on every row of the table. After each round the endpoint is restarted and
+# the row count is checked, so a duplicated or lost row is detected. Finally a
+# branch is created at the current insert LSN and next_multixact_id from its
+# pg_control is compared with the value on the parent, since we don't have
+# functions to check multixact internals.
+#
+def test_multixact_conc(neon_simple_env: NeonEnv, test_output_dir):
+    env = neon_simple_env
+    env.neon_cli.create_branch("test_multixact", "empty")
+    endpoint = env.endpoints.create_start("test_multixact")
+
+    log.info("postgres is running on 'test_multixact' branch")
+
+    n_records = 100
+    n_threads = 5
+    n_iters = 1000
+    n_restarts = 10
+
+    conn = endpoint.connect()
+    cur = conn.cursor()
+    cur.execute(
+        f"""
+        CREATE TABLE t1(pk int primary key, val integer);
+        INSERT INTO t1 values (generate_series(1, {n_records}), 0);
+    """
+    )
+
+    next_multixact_id_old = query_scalar(
+        cur, "SELECT next_multixact_id FROM pg_control_checkpoint()"
+    )
+
+    # Each worker uses its own connection: update one randomly chosen row, then
+    # key-share lock all rows, committing after every iteration.
+    def do_updates():
+        worker_conn = endpoint.connect(autocommit=False)
+        try:
+            worker_cur = worker_conn.cursor()
+            for _ in range(n_iters):
+                # randint is inclusive on both ends, so every pk in 1..n_records can be picked
+                pk = random.randint(1, n_records)
+                worker_cur.execute(f"update t1 set val=val+1 where pk={pk}")
+                worker_cur.execute("select * from t1 for key share")
+                worker_conn.commit()
+        finally:
+            worker_conn.close()
+
+    for _ in range(n_restarts):
+        workers = [
+            threading.Thread(target=do_updates, args=(), daemon=False) for _ in range(n_threads)
+        ]
+        for worker in workers:
+            worker.start()
+        for worker in workers:
+            worker.join()
+
+        # Restart endpoint; drop our own connection first so it is not leaked
+        conn.close()
+        endpoint.stop()
+        endpoint.start()
+
+        conn = endpoint.connect()
+        cur = conn.cursor()
+        cur.execute("select count(*) from t1")
+        assert cur.fetchone() == (n_records,)
+
+    # force wal flush
+    cur.execute("checkpoint")
+
+    cur.execute(
+        "SELECT next_multixact_id, pg_current_wal_insert_lsn() FROM pg_control_checkpoint()"
+    )
+    res = cur.fetchone()
+    assert res is not None
+    next_multixact_id = res[0]
+    lsn = res[1]
+
+    # Ensure that we did lock some tuples
+    assert int(next_multixact_id) > int(next_multixact_id_old)
+
+    # Branch at this point
+    env.neon_cli.create_branch("test_multixact_new", "test_multixact", ancestor_start_lsn=lsn)
+    endpoint_new = env.endpoints.create_start("test_multixact_new")
+
+    log.info("postgres is running on 'test_multixact_new' branch")
+    next_multixact_id_new = endpoint_new.safe_psql(
+        "SELECT next_multixact_id FROM pg_control_checkpoint()"
+    )[0][0]
+
+    # Check that we restored pg_controlfile correctly
+    assert next_multixact_id_new == next_multixact_id
--- a/test_runner/regress/test_neon_local_cli.py
+++ b/test_runner/regress/test_neon_local_cli.py
@@ -16,13 +16,11 @@ def test_neon_cli_basics(neon_env_builder: NeonEnvBuilder, port_distributor: Por
            endpoint_id="ep-basic-main", pg_port=pg_port, http_port=http_port
        )

-        branch_name = "migration-check"
-
-        env.neon_cli.create_branch(new_branch_name=branch_name)
+        env.neon_cli.create_branch(new_branch_name="migration_check")
        pg_port = port_distributor.get_port()
        http_port = port_distributor.get_port()
        env.neon_cli.endpoint_start(
-            f"ep-{branch_name}", pg_port, http_port, branch_name=branch_name
+            endpoint_id="ep-migration_check", pg_port=pg_port, http_port=http_port
        )
    finally:
        env.neon_cli.stop()
--- a/test_runner/regress/test_ondemand_download.py
+++ b/test_runner/regress/test_ondemand_download.py
@@ -27,16 +27,15 @@ from fixtures.types import Lsn
 from fixtures.utils import query_scalar, wait_until


-def get_num_downloaded_layers(client: PageserverHttpClient):
-    """
-    This assumes that the pageserver only has a single tenant.
-    """
+def get_num_downloaded_layers(client: PageserverHttpClient, tenant_id, timeline_id):
    value = client.get_metric_value(
        "pageserver_remote_operation_seconds_count",
        {
            "file_kind": "layer",
            "op_kind": "download",
            "status": "success",
+            "tenant_id": tenant_id,
+            "timeline_id": timeline_id,
        },
    )
    if value is None:
@@ -58,8 +57,7 @@ def test_ondemand_download_large_rel(
        test_name="test_ondemand_download_large_rel",
    )

-    # thinking about using a shared environment? the test assumes that global
-    # metrics are for single tenant.
+    ##### First start, insert secret data and upload it to the remote storage
    env = neon_env_builder.init_start(
        initial_tenant_conf={
            # disable background GC
@@ -131,7 +129,7 @@ def test_ondemand_download_large_rel(
    # safekeepers, that have now been shut down.
    endpoint = env.endpoints.create_start("main", lsn=current_lsn)

-    before_downloads = get_num_downloaded_layers(client)
+    before_downloads = get_num_downloaded_layers(client, tenant_id, timeline_id)
    assert before_downloads != 0, "basebackup should on-demand non-zero layers"

    # Probe in the middle of the table. There's a high chance that the beginning
@@ -142,7 +140,7 @@ def test_ondemand_download_large_rel(
    with endpoint.cursor() as cur:
        assert query_scalar(cur, "select count(*) from tbl where id = 500000") == 1

-    after_downloads = get_num_downloaded_layers(client)
+    after_downloads = get_num_downloaded_layers(client, tenant_id, timeline_id)
    log.info(f"layers downloaded before {before_downloads} and after {after_downloads}")
    assert after_downloads > before_downloads

@@ -161,11 +159,13 @@ def test_ondemand_download_timetravel(
        test_name="test_ondemand_download_timetravel",
    )

-    # thinking about using a shared environment? the test assumes that global
-    # metrics are for single tenant.
+    ##### First start, insert data and upload it to the remote storage
+    env = neon_env_builder.init_start()
+    pageserver_http = env.pageserver.http_client()

-    env = neon_env_builder.init_start(
-        initial_tenant_conf={
+    # Override defaults, to create more layers
+    tenant, _ = env.neon_cli.create_tenant(
+        conf={
            # Disable background GC & compaction
            # We don't want GC, that would break the assertion about num downloads.
            # We don't want background compaction, we force a compaction every time we do explicit checkpoint.
@@ -178,7 +178,7 @@ def test_ondemand_download_timetravel(
            "compaction_target_size": f"{1 * 1024 ** 2}",  # 1 MB
        }
    )
-    pageserver_http = env.pageserver.http_client()
+    env.initial_tenant = tenant

    endpoint = env.endpoints.create_start("main")

@@ -283,7 +283,7 @@ def test_ondemand_download_timetravel(
                == table_len
            )

-        after_downloads = get_num_downloaded_layers(client)
+        after_downloads = get_num_downloaded_layers(client, tenant_id, timeline_id)
        num_layers_downloaded.append(after_downloads)
        log.info(f"num_layers_downloaded[-1]={num_layers_downloaded[-1]}")

@@ -324,8 +324,11 @@ def test_download_remote_layers_api(
    )

    ##### First start, insert data and upload it to the remote storage
-    env = neon_env_builder.init_start(
-        initial_tenant_conf={
+    env = neon_env_builder.init_start()
+
+    # Override defaults, to create more layers
+    tenant, _ = env.neon_cli.create_tenant(
+        conf={
            # Disable background GC & compaction
            # We don't want GC, that would break the assertion about num downloads.
            # We don't want background compaction, we force a compaction every time we do explicit checkpoint.
@@ -338,6 +341,7 @@ def test_download_remote_layers_api(
            "compaction_target_size": f"{1 * 1024 ** 2}",  # 1 MB
        }
    )
+    env.initial_tenant = tenant

    endpoint = env.endpoints.create_start("main")

@@ -485,6 +489,8 @@ def test_compaction_downloads_on_demand_without_image_creation(
        test_name="test_compaction_downloads_on_demand_without_image_creation",
    )

+    env = neon_env_builder.init_start()
+
    conf = {
        # Disable background GC & compaction
        "gc_period": "0s",
@@ -500,8 +506,6 @@ def test_compaction_downloads_on_demand_without_image_creation(
        # pitr_interval and gc_horizon are not interesting because we dont run gc
    }

-    env = neon_env_builder.init_start(initial_tenant_conf=stringify(conf))
-
    def downloaded_bytes_and_count(pageserver_http: PageserverHttpClient) -> Tuple[int, int]:
        m = pageserver_http.get_metrics()
        # these are global counters
@@ -513,12 +517,11 @@ def test_compaction_downloads_on_demand_without_image_creation(
        assert count < 2**53 and count.is_integer(), "count should still be safe integer-in-f64"
        return (int(total_bytes), int(count))

+    # Override defaults, to create more layers
+    tenant_id, timeline_id = env.neon_cli.create_tenant(conf=stringify(conf))
+    env.initial_tenant = tenant_id
    pageserver_http = env.pageserver.http_client()

-    tenant_id = env.initial_tenant
-    timeline_id = env.initial_timeline
-    assert timeline_id is not None
-
    with env.endpoints.create_start("main") as endpoint:
        # no particular reason to create the layers like this, but we are sure
        # not to hit the image_creation_threshold here.
@@ -574,6 +577,8 @@ def test_compaction_downloads_on_demand_with_image_creation(
        test_name="test_compaction_downloads_on_demand",
    )

+    env = neon_env_builder.init_start()
+
    conf = {
        # Disable background GC & compaction
        "gc_period": "0s",
@@ -588,11 +593,9 @@ def test_compaction_downloads_on_demand_with_image_creation(
        # pitr_interval and gc_horizon are not interesting because we dont run gc
    }

-    env = neon_env_builder.init_start(initial_tenant_conf=stringify(conf))
-    tenant_id = env.initial_tenant
-    timeline_id = env.initial_timeline
-    assert timeline_id is not None
-
+    # Override defaults, to create more layers
+    tenant_id, timeline_id = env.neon_cli.create_tenant(conf=stringify(conf))
+    env.initial_tenant = tenant_id
    pageserver_http = env.pageserver.http_client()

    endpoint = env.endpoints.create_start("main")
@@ -661,6 +664,10 @@ def test_compaction_downloads_on_demand_with_image_creation(
    assert dict(kinds_after) == {"Delta": 4, "Image": 1}


+def stringify(conf: Dict[str, Any]) -> Dict[str, str]:
+    return {key: str(value) for key, value in conf.items()}  # keys kept, values passed through str()
+
+
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
 def test_ondemand_download_failure_to_replace(
    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
@@ -684,12 +691,15 @@ def test_ondemand_download_failure_to_replace(

    env = neon_env_builder.init_start()

-    tenant_id = env.initial_tenant
-    timeline_id = env.initial_timeline
-    assert timeline_id is not None
+    tenant_id, timeline_id = env.neon_cli.create_tenant()

+    env.initial_tenant = tenant_id
    pageserver_http = env.pageserver.http_client()

+    lsn = Lsn(pageserver_http.timeline_detail(tenant_id, timeline_id)["last_record_lsn"])
+
+    wait_for_upload(pageserver_http, tenant_id, timeline_id, lsn)
+
    # remove layers so that they will be redownloaded
    pageserver_http.tenant_detach(tenant_id)
    pageserver_http.tenant_attach(tenant_id)
@@ -700,10 +710,8 @@ def test_ondemand_download_failure_to_replace(
    # requesting details with non-incremental size should trigger a download of the only layer
    # this will need to be adjusted if an index for logical sizes is ever implemented
    with pytest.raises(PageserverApiException):
-        # PageserverApiException is expected because of the failpoint (timeline_detail building does something)
-        # ReadTimeout can happen on our busy CI, but it should not, because there is no more busylooping
-        # but should it be added back, we would wait for 15s here.
-        pageserver_http.timeline_detail(tenant_id, timeline_id, True, timeout=15)
+        # error message is not useful
+        pageserver_http.timeline_detail(tenant_id, timeline_id, True, timeout=2)

    actual_message = ".* ERROR .*layermap-replace-notfound"
    assert env.pageserver.log_contains(actual_message) is not None
@@ -716,7 +724,3 @@ def test_ondemand_download_failure_to_replace(
    env.pageserver.allowed_errors.append(".* ERROR .*Task 'initial size calculation'")

    # if the above returned, then we didn't have a livelock, and all is well
-
-
-def stringify(conf: Dict[str, Any]) -> Dict[str, str]:
-    return dict(map(lambda x: (x[0], str(x[1])), conf.items()))
--- a/test_runner/regress/test_pageserver_restart.py
+++ b/test_runner/regress/test_pageserver_restart.py
@@ -72,6 +72,10 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
 def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()

+    # These warnings are expected, when the pageserver is restarted abruptly
+    env.pageserver.allowed_errors.append(".*found future image layer.*")
+    env.pageserver.allowed_errors.append(".*found future delta layer.*")
+
    # Use a tiny checkpoint distance, to create a lot of layers quickly.
    # That allows us to stress the compaction and layer flushing logic more.
    tenant, _ = env.neon_cli.create_tenant(
--- a/test_runner/regress/test_proxy.py
+++ b/test_runner/regress/test_proxy.py
@@ -1,6 +1,6 @@
 import json
 import subprocess
-from typing import Any, List, Optional, Tuple
+from typing import Any, List, Optional

 import psycopg2
 import pytest
@@ -260,73 +260,3 @@ def test_sql_over_http_output_options(static_proxy: NeonProxy):

    rows = q("select 1 as n, 'a' as s, '{1,2,3}'::int4[] as arr", True, True)["rows"]
    assert rows == [["1", "a", "{1,2,3}"]]
-
-
-def test_sql_over_http_batch(static_proxy: NeonProxy):
-    static_proxy.safe_psql("create role http with login password 'http' superuser")
-
-    def qq(queries: List[Tuple[str, Optional[List[Any]]]], read_only: bool = False) -> Any:
-        connstr = f"postgresql://http:http@{static_proxy.domain}:{static_proxy.proxy_port}/postgres"
-        response = requests.post(
-            f"https://{static_proxy.domain}:{static_proxy.external_http_port}/sql",
-            data=json.dumps(list(map(lambda x: {"query": x[0], "params": x[1] or []}, queries))),
-            headers={
-                "Content-Type": "application/sql",
-                "Neon-Connection-String": connstr,
-                "Neon-Batch-Isolation-Level": "Serializable",
-                "Neon-Batch-Read-Only": "true" if read_only else "false",
-            },
-            verify=str(static_proxy.test_output_dir / "proxy.crt"),
-        )
-        assert response.status_code == 200
-        return response.json()["results"], response.headers
-
-    result, headers = qq(
-        [
-            ("select 42 as answer", None),
-            ("select $1 as answer", [42]),
-            ("select $1 * 1 as answer", [42]),
-            ("select $1::int[] as answer", [[1, 2, 3]]),
-            ("select $1::json->'a' as answer", [{"a": {"b": 42}}]),
-            ("select * from pg_class limit 1", None),
-            ("create table t(id serial primary key, val int)", None),
-            ("insert into t(val) values (10), (20), (30) returning id", None),
-            ("select * from t", None),
-            ("drop table t", None),
-        ]
-    )
-
-    assert headers["Neon-Batch-Isolation-Level"] == "Serializable"
-    assert headers["Neon-Batch-Read-Only"] == "false"
-
-    assert result[0]["rows"] == [{"answer": 42}]
-    assert result[1]["rows"] == [{"answer": "42"}]
-    assert result[2]["rows"] == [{"answer": 42}]
-    assert result[3]["rows"] == [{"answer": [1, 2, 3]}]
-    assert result[4]["rows"] == [{"answer": {"b": 42}}]
-    assert len(result[5]["rows"]) == 1
-    res = result[6]
-    assert res["command"] == "CREATE"
-    assert res["rowCount"] is None
-    res = result[7]
-    assert res["command"] == "INSERT"
-    assert res["rowCount"] == 3
-    assert res["rows"] == [{"id": 1}, {"id": 2}, {"id": 3}]
-    res = result[8]
-    assert res["command"] == "SELECT"
-    assert res["rowCount"] == 3
-    res = result[9]
-    assert res["command"] == "DROP"
-    assert res["rowCount"] is None
-    assert len(result) == 10
-
-    result, headers = qq(
-        [
-            ("select 42 as answer", None),
-        ],
-        True,
-    )
-    assert headers["Neon-Batch-Isolation-Level"] == "Serializable"
-    assert headers["Neon-Batch-Read-Only"] == "true"
-
-    assert result[0]["rows"] == [{"answer": 42}]
--- a/test_runner/regress/test_recovery.py
+++ b/test_runner/regress/test_recovery.py
@@ -15,6 +15,10 @@ def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()
    env.pageserver.is_testing_enabled_or_skip()

+    # These warnings are expected, when the pageserver is restarted abruptly
+    env.pageserver.allowed_errors.append(".*found future delta layer.*")
+    env.pageserver.allowed_errors.append(".*found future image layer.*")
+
    # Create a branch for us
    env.neon_cli.create_branch("test_pageserver_recovery", "main")

--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -378,10 +378,12 @@ def test_remote_timeline_client_calls_started_metric(
        test_name="test_remote_timeline_client_metrics",
    )

-    # thinking about using a shared environment? the test assumes that global
-    # metrics are for single tenant.
-    env = neon_env_builder.init_start(
-        initial_tenant_conf={
+    env = neon_env_builder.init_start()
+
+    # create tenant with config that will deterministically allow
+    # compaction and gc
+    tenant_id, timeline_id = env.neon_cli.create_tenant(
+        conf={
            # small checkpointing and compaction targets to ensure we generate many upload operations
            "checkpoint_distance": f"{128 * 1024}",
            "compaction_threshold": "1",
@@ -396,10 +398,6 @@ def test_remote_timeline_client_calls_started_metric(
        }
    )

-    tenant_id = env.initial_tenant
-    assert env.initial_timeline is not None
-    timeline_id: TimelineId = env.initial_timeline
-
    client = env.pageserver.http_client()

    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
@@ -421,7 +419,6 @@ def test_remote_timeline_client_calls_started_metric(
                "VACUUM foo",
            ]
        )
-        assert timeline_id is not None
        wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)

    calls_started: Dict[Tuple[str, str], List[int]] = {
@@ -431,14 +428,13 @@ def test_remote_timeline_client_calls_started_metric(
    }

    def fetch_calls_started():
-        assert timeline_id is not None
        for (file_kind, op_kind), observations in calls_started.items():
-            val = client.get_metric_value(
-                name="pageserver_remote_timeline_client_calls_started_count",
-                filter={
-                    "file_kind": str(file_kind),
-                    "op_kind": str(op_kind),
-                },
+            val = client.get_remote_timeline_client_metric(
+                "pageserver_remote_timeline_client_calls_started_count",
+                tenant_id,
+                timeline_id,
+                file_kind,
+                op_kind,
            )
            assert val is not None, f"expecting metric to be present: {file_kind} {op_kind}"
            val = int(val)
@@ -522,8 +518,12 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
        test_name="test_timeline_deletion_with_files_stuck_in_upload_queue",
    )

-    env = neon_env_builder.init_start(
-        initial_tenant_conf={
+    env = neon_env_builder.init_start()
+
+    # create tenant with config that will deterministically allow
+    # compaction and gc
+    tenant_id, timeline_id = env.neon_cli.create_tenant(
+        conf={
            # small checkpointing and compaction targets to ensure we generate many operations
            "checkpoint_distance": f"{64 * 1024}",
            "compaction_threshold": "1",
@@ -535,10 +535,6 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
            "pitr_interval": "0s",
        }
    )
-    tenant_id = env.initial_tenant
-    assert env.initial_timeline is not None
-    timeline_id: TimelineId = env.initial_timeline
-
    timeline_path = env.timeline_dir(tenant_id, timeline_id)

    client = env.pageserver.http_client()
@@ -791,8 +787,12 @@ def test_compaction_delete_before_upload(
        test_name="test_compaction_delete_before_upload",
    )

-    env = neon_env_builder.init_start(
-        initial_tenant_conf={
+    env = neon_env_builder.init_start()
+
+    # create tenant with config that will deterministically allow
+    # compaction and disables gc
+    tenant_id, timeline_id = env.neon_cli.create_tenant(
+        conf={
            # Set a small compaction threshold
            "compaction_threshold": "3",
            # Disable GC
@@ -802,10 +802,6 @@ def test_compaction_delete_before_upload(
        }
    )

-    tenant_id = env.initial_tenant
-    assert env.initial_timeline is not None
-    timeline_id: TimelineId = env.initial_timeline
-
    client = env.pageserver.http_client()

    with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -2,7 +2,6 @@ import asyncio
 import random
 import time
 from threading import Thread
-from typing import List, Optional

 import asyncpg
 import pytest
@@ -22,7 +21,6 @@ from fixtures.pageserver.utils import (
 )
 from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import query_scalar, wait_until
-from prometheus_client.samples import Sample


 def do_gc_target(
@@ -856,89 +854,3 @@ def ensure_test_data(data_id: int, data: str, endpoint: Endpoint):
        assert (
            query_scalar(cur, f"SELECT secret FROM test WHERE id = {data_id};") == data
        ), "Should have timeline data back"
-
-
-@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
-def test_metrics_while_ignoring_broken_tenant_and_reloading(
-    neon_env_builder: NeonEnvBuilder,
-    remote_storage_kind: RemoteStorageKind,
-):
-    neon_env_builder.enable_remote_storage(
-        remote_storage_kind=remote_storage_kind,
-        test_name="test_metrics_while_ignoring_broken_tenant_and_reloading",
-    )
-
-    env = neon_env_builder.init_start()
-
-    client = env.pageserver.http_client()
-    env.pageserver.allowed_errors.append(
-        r".* Changing Active tenant to Broken state, reason: broken from test"
-    )
-
-    def only_int(samples: List[Sample]) -> Optional[int]:
-        if len(samples) == 1:
-            return int(samples[0].value)
-        assert len(samples) == 0
-        return None
-
-    wait_until_tenant_state(client, env.initial_tenant, "Active", 10, 0.5)
-
-    client.tenant_break(env.initial_tenant)
-
-    found_broken = False
-    active, broken, broken_set = ([], [], [])
-    for _ in range(10):
-        m = client.get_metrics()
-        active = m.query_all("pageserver_tenant_states_count", {"state": "Active"})
-        broken = m.query_all("pageserver_tenant_states_count", {"state": "Broken"})
-        broken_set = m.query_all(
-            "pageserver_broken_tenants_count", {"tenant_id": str(env.initial_tenant)}
-        )
-        found_broken = only_int(active) == 0 and only_int(broken) == 1 and only_int(broken_set) == 1
-
-        if found_broken:
-            break
-        log.info(f"active: {active}, broken: {broken}, broken_set: {broken_set}")
-        time.sleep(0.5)
-    assert (
-        found_broken
-    ), f"tenant shows up as broken; active={active}, broken={broken}, broken_set={broken_set}"
-
-    client.tenant_ignore(env.initial_tenant)
-
-    found_broken = False
-    broken, broken_set = ([], [])
-    for _ in range(10):
-        m = client.get_metrics()
-        broken = m.query_all("pageserver_tenant_states_count", {"state": "Broken"})
-        broken_set = m.query_all(
-            "pageserver_broken_tenants_count", {"tenant_id": str(env.initial_tenant)}
-        )
-        found_broken = only_int(broken) == 0 and only_int(broken_set) == 1
-
-        if found_broken:
-            break
-        time.sleep(0.5)
-    assert (
-        found_broken
-    ), f"broken should still be in set, but it is not in the tenant state count: broken={broken}, broken_set={broken_set}"
-
-    client.tenant_load(env.initial_tenant)
-
-    found_active = False
-    active, broken_set = ([], [])
-    for _ in range(10):
-        m = client.get_metrics()
-        active = m.query_all("pageserver_tenant_states_count", {"state": "Active"})
-        broken_set = m.query_all(
-            "pageserver_broken_tenants_count", {"tenant_id": str(env.initial_tenant)}
-        )
-        found_active = only_int(active) == 1 and len(broken_set) == 0
-
-        if found_active:
-            break
-        time.sleep(0.5)
-
-    assert (
-        found_active
-    ), f"reloaded tenant should be active, and broken tenant set item removed: active={active}, broken_set={broken_set}"
--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -213,9 +213,6 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder):

    # Test (a subset of) pageserver global metrics
    for metric in PAGESERVER_GLOBAL_METRICS:
-        if metric.startswith("pageserver_remote"):
-            continue
-
        ps_samples = ps_metrics.query_all(metric, {})
        assert len(ps_samples) > 0, f"expected at least one sample for {metric}"
        for sample in ps_samples:
@@ -383,8 +380,10 @@ def test_pageserver_with_empty_tenants(
    ps_metrics = client.get_metrics()
    broken_tenants_metric_filter = {
        "tenant_id": str(tenant_without_timelines_dir),
+        "state": "Broken",
    }
    active_tenants_metric_filter = {
+        "tenant_id": str(tenant_with_empty_timelines),
        "state": "Active",
    }

@@ -400,7 +399,7 @@ def test_pageserver_with_empty_tenants(

    tenant_broken_count = int(
        ps_metrics.query_one(
-            "pageserver_broken_tenants_count", filter=broken_tenants_metric_filter
+            "pageserver_tenant_states_count", filter=broken_tenants_metric_filter
        ).value
    )

--- a/test_runner/regress/test_threshold_based_eviction.py
+++ b/test_runner/regress/test_threshold_based_eviction.py
@@ -38,12 +38,6 @@ def test_threshold_based_eviction(
    env = neon_env_builder.init_start()
    env.pageserver.allowed_errors.append(metrics_refused_log_line)

-    # these can happen whenever we run consumption metrics collection
-    env.pageserver.allowed_errors.append(r".*failed to calculate logical size at \S+: cancelled")
-    env.pageserver.allowed_errors.append(
-        r".*failed to calculate synthetic size for tenant \S+: failed to calculate some logical_sizes"
-    )
-
    tenant_id, timeline_id = env.initial_tenant, env.initial_timeline
    assert isinstance(timeline_id, TimelineId)

--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -1,4 +1,3 @@
-import enum
 import os
 import queue
 import shutil
@@ -12,12 +11,9 @@ from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
    NeonEnv,
    NeonEnvBuilder,
-    PgBin,
    RemoteStorageKind,
    S3Storage,
    available_remote_storages,
-    last_flush_lsn_upload,
-    wait_for_last_flush_lsn,
 )
 from fixtures.pageserver.http import PageserverApiException
 from fixtures.pageserver.utils import (
@@ -121,183 +117,59 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
        ps_http.timeline_detail(env.initial_tenant, leaf_timeline_id)


-class Check(enum.Enum):
-    RETRY_WITHOUT_RESTART = enum.auto()
-    RETRY_WITH_RESTART = enum.auto()
-
-
-DELETE_FAILPOINTS = [
-    "timeline-delete-before-index-deleted-at",
-    "timeline-delete-before-schedule",
-    "timeline-delete-before-rm",
-    "timeline-delete-during-rm",
-    "timeline-delete-after-rm",
-    "timeline-delete-before-index-delete",
-    "timeline-delete-after-index-delete",
-    "timeline-delete-after-rm-metadata",
-    "timeline-delete-after-rm-dir",
-]
-
-
-def combinations():
-    result = []
-
-    remotes = [RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3]
-    if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE"):
-        remotes.append(RemoteStorageKind.REAL_S3)
-
-    for remote_storage_kind in remotes:
-        for delete_failpoint in DELETE_FAILPOINTS:
-            if remote_storage_kind == RemoteStorageKind.NOOP and delete_failpoint in (
-                "timeline-delete-before-index-delete",
-                "timeline-delete-after-index-delete",
-            ):
-                # the above failpoints are not relevant for config without remote storage
-                continue
-
-            result.append((remote_storage_kind, delete_failpoint))
-    return result
-
-
 # cover the two cases: remote storage configured vs not configured
-@pytest.mark.parametrize("remote_storage_kind, failpoint", combinations())
-@pytest.mark.parametrize("check", list(Check))
-def test_delete_timeline_exercise_crash_safety_failpoints(
-    neon_env_builder: NeonEnvBuilder,
-    remote_storage_kind: RemoteStorageKind,
-    failpoint: str,
-    check: Check,
-    pg_bin: PgBin,
+@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
+def test_delete_timeline_post_rm_failure(
+    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
 ):
    """
-    If there is a failure during deletion in one of the associated failpoints (or crash restart happens at this point) the delete operation
-    should be retryable and should be successfully resumed.
-
-    We iterate over failpoints list, changing failpoint to the next one.
-
-    1. Set settings to generate many layers
-    2. Create branch.
-    3. Insert something
-    4. Go with the test.
-    5. Iterate over failpoints
-    6. Execute delete for each failpoint
-    7. Ensure failpoint is hit
-    8. Retry or restart without the failpoint and check the result.
+    If there is a failure after removing the timeline directory, the delete operation
+    should be retryable.
    """

    if remote_storage_kind is not None:
        neon_env_builder.enable_remote_storage(
-            remote_storage_kind, "test_delete_timeline_exercise_crash_safety_failpoints"
+            remote_storage_kind, "test_delete_timeline_post_rm_failure"
        )

-    env = neon_env_builder.init_start(
-        initial_tenant_conf={
-            "gc_period": "0s",
-            "compaction_period": "0s",
-            "checkpoint_distance": f"{1024 ** 2}",
-            "image_creation_threshold": "100",
-        }
-    )
+    env = neon_env_builder.init_start()
+    assert env.initial_timeline
+
+    env.pageserver.allowed_errors.append(".*Error: failpoint: timeline-delete-after-rm")
+    env.pageserver.allowed_errors.append(".*Ignoring state update Stopping for broken timeline")

    ps_http = env.pageserver.http_client()

-    timeline_id = env.neon_cli.create_timeline("delete")
-    with env.endpoints.create_start("delete") as endpoint:
-        # generate enough layers
-        pg_bin.run(["pgbench", "-i", "-I dtGvp", "-s1", endpoint.connstr()])
-        if remote_storage_kind is RemoteStorageKind.NOOP:
-            wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, timeline_id)
-        else:
-            last_flush_lsn_upload(env, endpoint, env.initial_tenant, timeline_id)
+    failpoint_name = "timeline-delete-after-rm"
+    ps_http.configure_failpoints((failpoint_name, "return"))

-    env.pageserver.allowed_errors.append(f".*{timeline_id}.*failpoint: {failpoint}")
-    # It appears when we stopped flush loop during deletion and then pageserver is stopped
-    env.pageserver.allowed_errors.append(
-        ".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited"
+    ps_http.timeline_delete(env.initial_tenant, env.initial_timeline)
+    wait_until_timeline_state(
+        pageserver_http=ps_http,
+        tenant_id=env.initial_tenant,
+        timeline_id=env.initial_timeline,
+        expected_state="Broken",
+        iterations=2,  # effectively try immediately and retry once in one second
    )
-    # This happens when we fail before scheduling background operation.
-    # Timeline is left in stopping state and retry tries to stop it again.
+
+    # FIXME: #4719
+    # timeline_info["state"]["Broken"]["reason"] == "failpoint: timeline-delete-after-rm"
+
+    at_failpoint_log_message = f".*{env.initial_timeline}.*at failpoint {failpoint_name}.*"
+    env.pageserver.allowed_errors.append(at_failpoint_log_message)
    env.pageserver.allowed_errors.append(
-        ".*Ignoring new state, equal to the existing one: Stopping"
+        f".*DELETE.*{env.initial_timeline}.*InternalServerError.*{failpoint_name}"
    )
-    # This happens when we retry delete requests for broken timelines
-    env.pageserver.allowed_errors.append(".*Ignoring state update Stopping for broken timeline")
-    # This happens when timeline remains are cleaned up during loading
-    env.pageserver.allowed_errors.append(".*Timeline dir entry become invalid.*")
-    # In one of the branches we poll for tenant to become active. Polls can generate this log message:
-    env.pageserver.allowed_errors.append(f".*Tenant {env.initial_tenant} is not active*")

-    ps_http.configure_failpoints((failpoint, "return"))
+    # retry without failpoint, it should succeed
+    ps_http.configure_failpoints((failpoint_name, "off"))

-    # These failpoints are earlier than background task is spawned.
-    # so they result in api request failure.
-    if failpoint in (
-        "timeline-delete-before-index-deleted-at",
-        "timeline-delete-before-schedule",
-    ):
-        with pytest.raises(PageserverApiException, match=failpoint):
-            ps_http.timeline_delete(env.initial_tenant, timeline_id)
-
-    else:
-        ps_http.timeline_delete(env.initial_tenant, timeline_id)
-        timeline_info = wait_until_timeline_state(
-            pageserver_http=ps_http,
-            tenant_id=env.initial_tenant,
-            timeline_id=timeline_id,
-            expected_state="Broken",
-            iterations=2,  # effectively try immediately and retry once in one second
-        )
-
-        reason = timeline_info["state"]["Broken"]["reason"]
-        log.info(f"timeline broken: {reason}")
-
-        # failpoint may not be the only error in the stack
-        assert reason.endswith(f"failpoint: {failpoint}"), reason
-
-    wait_longer = remote_storage_kind is RemoteStorageKind.REAL_S3
-    if check is Check.RETRY_WITH_RESTART:
-        env.pageserver.stop()
-        env.pageserver.start()
-        if failpoint == "timeline-delete-before-index-deleted-at":
-            # We crashed before persisting this to remote storage, need to retry delete request
-
-            # Wait till tenant is loaded. Shouldnt take longer than 2 seconds (we shouldnt block tenant loading)
-            wait_until_tenant_active(ps_http, env.initial_tenant, iterations=2)
-
-            timeline_delete_wait_completed(ps_http, env.initial_tenant, timeline_id)
-        else:
-            # Pageserver should've resumed deletion after restart.
-            wait_timeline_detail_404(
-                ps_http, env.initial_tenant, timeline_id, wait_longer=wait_longer
-            )
-    elif check is Check.RETRY_WITHOUT_RESTART:
-        # this should succeed
-        # this also checks that delete can be retried even when timeline is in Broken state
-        ps_http.configure_failpoints((failpoint, "off"))
-
-        timeline_delete_wait_completed(
-            ps_http, env.initial_tenant, timeline_id, wait_longer=wait_longer
-        )
-
-    # Check remote is impty
-    if remote_storage_kind is RemoteStorageKind.MOCK_S3:
-        assert_prefix_empty(
-            neon_env_builder,
-            prefix="/".join(
-                (
-                    "tenants",
-                    str(env.initial_tenant),
-                    "timelines",
-                    str(timeline_id),
-                )
-            ),
-        )
-
-    timeline_dir = env.timeline_dir(env.initial_tenant, timeline_id)
-    # Check local is empty
-    assert not timeline_dir.exists()
-    # Check no delete mark present
-    assert not (timeline_dir.parent / f"{timeline_id}.___deleted").exists()
+    # this should succeed
+    # this also checks that delete can be retried even when timeline is in Broken state
+    timeline_delete_wait_completed(ps_http, env.initial_tenant, env.initial_timeline)
+    env.pageserver.allowed_errors.append(
+        f".*{env.initial_timeline}.*timeline directory not found, proceeding anyway.*"
+    )


@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
@@ -455,7 +327,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
    )

    ps_http.timeline_delete(env.initial_tenant, leaf_timeline_id)
-    timeline_info = wait_until_timeline_state(
+    wait_until_timeline_state(
        pageserver_http=ps_http,
        tenant_id=env.initial_tenant,
        timeline_id=leaf_timeline_id,
@@ -463,7 +335,8 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
        iterations=2,  # effectively try immediately and retry once in one second
    )

-    assert timeline_info["state"]["Broken"]["reason"] == "failpoint: timeline-delete-before-rm"
+    # FIXME: #4719
+    # timeline_info["state"]["Broken"]["reason"] == "failpoint: timeline-delete-before-rm"

    assert leaf_timeline_path.exists(), "the failpoint didn't work"

@@ -715,7 +588,6 @@ def test_timeline_delete_works_for_remote_smoke(
    assert tenant_id == env.initial_tenant
    assert main_timeline_id == env.initial_timeline

-    assert env.initial_timeline is not None
    timeline_ids = [env.initial_timeline]
    for i in range(2):
        branch_timeline_id = env.neon_cli.create_branch(f"new{i}", "main")
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
--- a/Show More
+++ b/Show More