diff --git a/Cargo.lock b/Cargo.lock
index 44edbabaf6..dbbf330cf9 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2915,6 +2915,12 @@ version = "0.4.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c"
 
+[[package]]
+name = "linux-raw-sys"
+version = "0.6.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f0b5399f6804fbab912acbd8878ed3532d506b7c951b8f9f164ef90fef39e3f4"
+
 [[package]]
 name = "lock_api"
 version = "0.4.10"
@@ -3564,6 +3570,7 @@ dependencies = [
  "serde",
  "serde_json",
  "svg_fmt",
+ "thiserror",
  "tokio",
  "tokio-util",
  "toml_edit",
@@ -5122,6 +5129,7 @@ dependencies = [
  "futures-util",
  "hex",
  "histogram",
+ "humantime",
  "itertools",
  "once_cell",
  "pageserver",
@@ -5812,6 +5820,7 @@ dependencies = [
  "anyhow",
  "clap",
  "comfy-table",
+ "humantime",
  "hyper 0.14.26",
  "pageserver_api",
  "pageserver_client",
@@ -6157,7 +6166,7 @@ dependencies = [
 [[package]]
 name = "tokio-epoll-uring"
 version = "0.1.0"
-source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939"
+source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#08ccfa94ff5507727bf4d8d006666b5b192e04c6"
 dependencies = [
  "futures",
  "nix 0.26.4",
@@ -6669,11 +6678,12 @@ dependencies = [
 [[package]]
 name = "uring-common"
 version = "0.1.0"
-source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939"
+source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#08ccfa94ff5507727bf4d8d006666b5b192e04c6"
 dependencies = [
  "bytes",
  "io-uring",
  "libc",
+ "linux-raw-sys 0.6.4",
 ]
 
 [[package]]
diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index 87fb218245..90b8868b43 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -89,7 +89,7 @@ RUN apt update && \
 # SFCGAL > 1.3 requires CGAL > 5.2, Bullseye's libcgal-dev is 5.2
 RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \
     echo "4e39b3b2adada6254a7bdba6d297bb28e1a9835a9f879b74f37e2dab70203232 SFCGAL.tar.gz" | sha256sum --check && \
-    mkdir sfcgal-src && cd sfcgal-src && tar xvzf ../SFCGAL.tar.gz --strip-components=1 -C . && \
+    mkdir sfcgal-src && cd sfcgal-src && tar xzf ../SFCGAL.tar.gz --strip-components=1 -C . && \
     cmake -DCMAKE_BUILD_TYPE=Release . && make -j $(getconf _NPROCESSORS_ONLN) && \
     DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \
     make clean && cp -R /sfcgal/* /
@@ -98,7 +98,7 @@ ENV PATH "/usr/local/pgsql/bin:$PATH"
 
 RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \
     echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \
-    mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . && \
+    mkdir postgis-src && cd postgis-src && tar xzf ../postgis.tar.gz --strip-components=1 -C . && \
     find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
     ./autogen.sh && \
     ./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \
@@ -124,7 +124,7 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postg
 
 RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \
     echo "cac297c07d34460887c4f3b522b35c470138760fe358e351ad1db4edb6ee306e pgrouting.tar.gz" | sha256sum --check && \
-    mkdir pgrouting-src && cd pgrouting-src && tar xvzf ../pgrouting.tar.gz --strip-components=1 -C . && \
+    mkdir pgrouting-src && cd pgrouting-src && tar xzf ../pgrouting.tar.gz --strip-components=1 -C . && \
     mkdir build && cd build && \
     cmake -DCMAKE_BUILD_TYPE=Release .. && \
     make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -149,7 +149,7 @@ RUN apt update && \
 
 RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \
     echo "7096c3290928561f0d4901b7a52794295dc47f6303102fae3f8e42dd575ad97d plv8.tar.gz" | sha256sum --check && \
-    mkdir plv8-src && cd plv8-src && tar xvzf ../plv8.tar.gz --strip-components=1 -C . && \
+    mkdir plv8-src && cd plv8-src && tar xzf ../plv8.tar.gz --strip-components=1 -C . && \
     # generate and copy upgrade scripts
     mkdir -p upgrade && ./generate_upgrade.sh 3.1.10 && \
     cp upgrade/* /usr/local/pgsql/share/extension/ && \
@@ -194,7 +194,7 @@ RUN case "$(uname -m)" in \
 
 RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \
     echo "ec99f1f5974846bde64f4513cf8d2ea1b8d172d2218ab41803bf6a63532272bc h3.tar.gz" | sha256sum --check && \
-    mkdir h3-src && cd h3-src && tar xvzf ../h3.tar.gz --strip-components=1 -C . && \
+    mkdir h3-src && cd h3-src && tar xzf ../h3.tar.gz --strip-components=1 -C . && \
     mkdir build && cd build && \
     cmake .. -DCMAKE_BUILD_TYPE=Release && \
     make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -204,7 +204,7 @@ RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz
 
 RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \
     echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \
-    mkdir h3-pg-src && cd h3-pg-src && tar xvzf ../h3-pg.tar.gz --strip-components=1 -C . && \
+    mkdir h3-pg-src && cd h3-pg-src && tar xzf ../h3-pg.tar.gz --strip-components=1 -C . && \
     export PATH="/usr/local/pgsql/bin:$PATH" && \
     make -j $(getconf _NPROCESSORS_ONLN) && \
     make -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -222,7 +222,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -O postgresql-unit.tar.gz && \
     echo "411d05beeb97e5a4abf17572bfcfbb5a68d98d1018918feff995f6ee3bb03e79 postgresql-unit.tar.gz" | sha256sum --check && \
-    mkdir postgresql-unit-src && cd postgresql-unit-src && tar xvzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \
+    mkdir postgresql-unit-src && cd postgresql-unit-src && tar xzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
     make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
     # unit extension's "create extension" script relies on absolute install path to fill some reference tables.
@@ -243,12 +243,12 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 COPY patches/pgvector.patch /pgvector.patch
 
-# By default, pgvector Makefile uses `-march=native`. We don't want that, 
+# By default, pgvector Makefile uses `-march=native`. We don't want that,
 # because we build the images on different machines than where we run them.
 # Pass OPTFLAGS="" to remove it.
-RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.0.tar.gz -O pgvector.tar.gz && \
-    echo "1b5503a35c265408b6eb282621c5e1e75f7801afc04eecb950796cfee2e3d1d8 pgvector.tar.gz" | sha256sum --check && \
-    mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
+RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.1.tar.gz -O pgvector.tar.gz && \
+    echo "fe6c8cb4e0cd1a8cb60f5badf9e1701e0fcabcfc260931c26d01e155c4dd21d1 pgvector.tar.gz" | sha256sum --check && \
+    mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \
     patch -p1 < /pgvector.patch && \
     make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
     make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -266,7 +266,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 # 9742dab1b2f297ad3811120db7b21451bca2d3c9 made on 13/11/2021
 RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b21451bca2d3c9.tar.gz -O pgjwt.tar.gz && \
     echo "cfdefb15007286f67d3d45510f04a6a7a495004be5b3aecb12cda667e774203f pgjwt.tar.gz" | sha256sum --check && \
-    mkdir pgjwt-src && cd pgjwt-src && tar xvzf ../pgjwt.tar.gz --strip-components=1 -C . && \
+    mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgjwt.control
 
@@ -281,7 +281,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypopg.tar.gz && \
     echo "0821011743083226fc9b813c1f2ef5897a91901b57b6bea85a78e466187c6819 hypopg.tar.gz" | sha256sum --check && \
-    mkdir hypopg-src && cd hypopg-src && tar xvzf ../hypopg.tar.gz --strip-components=1 -C . && \
+    mkdir hypopg-src && cd hypopg-src && tar xzf ../hypopg.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
     make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/hypopg.control
@@ -297,7 +297,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \
     echo "74576b992d9277c92196dd8d816baa2cc2d8046fe102f3dcd7f3c3febed6822a pg_hashids.tar.gz" | sha256sum --check && \
-    mkdir pg_hashids-src && cd pg_hashids-src && tar xvzf ../pg_hashids.tar.gz --strip-components=1 -C . && \
+    mkdir pg_hashids-src && cd pg_hashids-src && tar xzf ../pg_hashids.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
     make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_hashids.control
@@ -313,7 +313,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \
     echo "6ab370532c965568df6210bd844ac6ba649f53055e48243525b0b7e5c4d69a7d rum.tar.gz" | sha256sum --check && \
-    mkdir rum-src && cd rum-src && tar xvzf ../rum.tar.gz --strip-components=1 -C . && \
+    mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
     make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control
@@ -329,7 +329,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgtap.tar.gz && \
     echo "9c7c3de67ea41638e14f06da5da57bac6f5bd03fea05c165a0ec862205a5c052 pgtap.tar.gz" | sha256sum --check && \
-    mkdir pgtap-src && cd pgtap-src && tar xvzf ../pgtap.tar.gz --strip-components=1 -C . && \
+    mkdir pgtap-src && cd pgtap-src && tar xzf ../pgtap.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
     make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgtap.control
@@ -345,7 +345,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \
     echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \
-    mkdir ip4r-src && cd ip4r-src && tar xvzf ../ip4r.tar.gz --strip-components=1 -C . && \
+    mkdir ip4r-src && cd ip4r-src && tar xzf ../ip4r.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
     make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/ip4r.control
@@ -361,7 +361,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \
     echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \
-    mkdir prefix-src && cd prefix-src && tar xvzf ../prefix.tar.gz --strip-components=1 -C . && \
+    mkdir prefix-src && cd prefix-src && tar xzf ../prefix.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
     make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/prefix.control
@@ -377,7 +377,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \
     echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \
-    mkdir hll-src && cd hll-src && tar xvzf ../hll.tar.gz --strip-components=1 -C . && \
+    mkdir hll-src && cd hll-src && tar xzf ../hll.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
     make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/hll.control
@@ -393,7 +393,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.5.3.tar.gz -O plpgsql_check.tar.gz && \
     echo "6631ec3e7fb3769eaaf56e3dfedb829aa761abf163d13dba354b4c218508e1c0 plpgsql_check.tar.gz" | sha256sum --check && \
-    mkdir plpgsql_check-src && cd plpgsql_check-src && tar xvzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \
+    mkdir plpgsql_check-src && cd plpgsql_check-src && tar xzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
     make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/plpgsql_check.control
@@ -424,7 +424,7 @@ RUN case "${PG_VERSION}" in \
     apt-get install -y cmake && \
     wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \
     echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \
-    mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \
+    mkdir timescaledb-src && cd timescaledb-src && tar xzf ../timescaledb.tar.gz --strip-components=1 -C . && \
     ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \
     cd build && \
     make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -462,7 +462,7 @@ RUN case "${PG_VERSION}" in \
     esac && \
     wget https://github.com/ossc-db/pg_hint_plan/archive/refs/tags/REL${PG_HINT_PLAN_VERSION}.tar.gz -O pg_hint_plan.tar.gz && \
     echo "${PG_HINT_PLAN_CHECKSUM} pg_hint_plan.tar.gz" | sha256sum --check && \
-    mkdir pg_hint_plan-src && cd pg_hint_plan-src && tar xvzf ../pg_hint_plan.tar.gz --strip-components=1 -C . && \
+    mkdir pg_hint_plan-src && cd pg_hint_plan-src && tar xzf ../pg_hint_plan.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) && \
     make install -j $(getconf _NPROCESSORS_ONLN) && \
     echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_hint_plan.control
@@ -481,7 +481,7 @@ RUN apt-get update && \
     apt-get install -y git libgtk2.0-dev libpq-dev libpam-dev libxslt-dev libkrb5-dev cmake && \
     wget https://github.com/ketteq-neon/postgres-exts/archive/e0bd1a9d9313d7120c1b9c7bb15c48c0dede4c4e.tar.gz -O kq_imcx.tar.gz && \
     echo "dc93a97ff32d152d32737ba7e196d9687041cda15e58ab31344c2f2de8855336 kq_imcx.tar.gz" | sha256sum --check && \
-    mkdir kq_imcx-src && cd kq_imcx-src && tar xvzf ../kq_imcx.tar.gz --strip-components=1 -C . && \
+    mkdir kq_imcx-src && cd kq_imcx-src && tar xzf ../kq_imcx.tar.gz --strip-components=1 -C . && \
     find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
     mkdir build && cd build && \
     cmake -DCMAKE_BUILD_TYPE=Release .. && \
@@ -505,7 +505,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \
     echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \
-    mkdir pg_cron-src && cd pg_cron-src && tar xvzf ../pg_cron.tar.gz --strip-components=1 -C . && \
+    mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) && \
     make -j $(getconf _NPROCESSORS_ONLN) install && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_cron.control
@@ -531,7 +531,7 @@ RUN apt-get update && \
 ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
 RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \
     echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \
-    mkdir rdkit-src && cd rdkit-src && tar xvzf ../rdkit.tar.gz --strip-components=1 -C . && \
+    mkdir rdkit-src && cd rdkit-src && tar xzf ../rdkit.tar.gz --strip-components=1 -C . && \
     cmake \
         -D RDK_BUILD_CAIRO_SUPPORT=OFF \
         -D RDK_BUILD_INCHI_SUPPORT=ON \
@@ -571,7 +571,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \
     echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \
-    mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xvzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \
+    mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) && \
     make -j $(getconf _NPROCESSORS_ONLN) install && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_uuidv7.control
@@ -588,7 +588,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \
     echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \
-    mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xvzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \
+    mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) && \
     make -j $(getconf _NPROCESSORS_ONLN) install && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/roaringbitmap.control
@@ -605,7 +605,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \
     echo "fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 pg_semver.tar.gz" | sha256sum --check && \
-    mkdir pg_semver-src && cd pg_semver-src && tar xvzf ../pg_semver.tar.gz --strip-components=1 -C . && \
+    mkdir pg_semver-src && cd pg_semver-src && tar xzf ../pg_semver.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) && \
     make -j $(getconf _NPROCESSORS_ONLN) install && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/semver.control
@@ -631,7 +631,7 @@ RUN case "${PG_VERSION}" in \
     esac && \
     wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/${PG_EMBEDDING_VERSION}.tar.gz -O pg_embedding.tar.gz && \
     echo "${PG_EMBEDDING_CHECKSUM} pg_embedding.tar.gz" | sha256sum --check && \
-    mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
+    mkdir pg_embedding-src && cd pg_embedding-src && tar xzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) && \
     make -j $(getconf _NPROCESSORS_ONLN) install
 
@@ -647,7 +647,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget  https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \
     echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9  pg_anon.tar.gz" | sha256sum --check && \
-    mkdir pg_anon-src && cd pg_anon-src && tar xvzf ../pg_anon.tar.gz --strip-components=1 -C . && \
+    mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \
     find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
     make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \
@@ -696,7 +696,7 @@ ARG PG_VERSION
 
 RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.2.0.tar.gz -O pg_jsonschema.tar.gz && \
     echo "9118fc508a6e231e7a39acaa6f066fcd79af17a5db757b47d2eefbe14f7794f0 pg_jsonschema.tar.gz" | sha256sum --check && \
-    mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xvzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \
+    mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \
     sed -i 's/pgrx = "0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
     cargo pgrx install --release && \
     echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control
@@ -713,7 +713,7 @@ ARG PG_VERSION
 
 RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.4.0.tar.gz -O pg_graphql.tar.gz && \
     echo "bd8dc7230282b3efa9ae5baf053a54151ed0e66881c7c53750e2d0c765776edc pg_graphql.tar.gz" | sha256sum --check && \
-    mkdir pg_graphql-src && cd pg_graphql-src && tar xvzf ../pg_graphql.tar.gz --strip-components=1 -C . && \
+    mkdir pg_graphql-src && cd pg_graphql-src && tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . && \
     sed -i 's/pgrx = "=0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
     cargo pgrx install --release && \
     # it's needed to enable extension because it uses untrusted C language
@@ -733,7 +733,7 @@ ARG PG_VERSION
 # 26806147b17b60763039c6a6878884c41a262318 made on 26/09/2023
 RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \
     echo "e64e55aaa38c259512d3e27c572da22c4637418cf124caba904cd50944e5004e pg_tiktoken.tar.gz" | sha256sum --check && \
-    mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xvzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \
+    mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \
     cargo pgrx install --release && \
     echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control
 
@@ -749,7 +749,7 @@ ARG PG_VERSION
 
 RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.3.tar.gz -O pgx_ulid.tar.gz && \
     echo "ee5db82945d2d9f2d15597a80cf32de9dca67b897f605beb830561705f12683c pgx_ulid.tar.gz" | sha256sum --check && \
-    mkdir pgx_ulid-src && cd pgx_ulid-src && tar xvzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
+    mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
     echo "******************* Apply a patch for Postgres 16 support; delete in the next release ******************" && \
     wget https://github.com/pksunkara/pgx_ulid/commit/f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \
     patch -p1 < f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \
@@ -771,7 +771,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \
     echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \
-    mkdir wal2json-src && cd wal2json-src && tar xvzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \
+    mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) && \
     make -j $(getconf _NPROCESSORS_ONLN) install
 
@@ -787,7 +787,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \
     echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \
-    mkdir pg_ivm-src && cd pg_ivm-src && tar xvzf ../pg_ivm.tar.gz --strip-components=1 -C . && \
+    mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) && \
     make -j $(getconf _NPROCESSORS_ONLN) install && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_ivm.control
@@ -804,7 +804,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \
     echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \
-    mkdir pg_partman-src && cd pg_partman-src && tar xvzf ../pg_partman.tar.gz --strip-components=1 -C . && \
+    mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) && \
     make -j $(getconf _NPROCESSORS_ONLN) install && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_partman.control
diff --git a/control_plane/storcon_cli/Cargo.toml b/control_plane/storcon_cli/Cargo.toml
index 61eb7fa4e4..ed3462961f 100644
--- a/control_plane/storcon_cli/Cargo.toml
+++ b/control_plane/storcon_cli/Cargo.toml
@@ -9,6 +9,7 @@ license.workspace = true
 anyhow.workspace = true
 clap.workspace = true
 comfy-table.workspace = true
+humantime.workspace = true
 hyper.workspace = true
 pageserver_api.workspace = true
 pageserver_client.workspace = true
diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs
index c19bc96cdb..05c4acdf90 100644
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -7,8 +7,9 @@ use pageserver_api::{
         TenantDescribeResponse, TenantPolicyRequest,
     },
     models::{
-        LocationConfigSecondary, ShardParameters, TenantConfig, TenantConfigRequest,
-        TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
+        EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
+        ShardParameters, TenantConfig, TenantConfigRequest, TenantCreateRequest,
+        TenantShardSplitRequest, TenantShardSplitResponse,
     },
     shard::{ShardStripeSize, TenantShardId},
 };
@@ -125,6 +126,36 @@ enum Command {
         #[arg(long)]
         tenant_id: TenantId,
     },
+    /// Uncleanly drop a tenant from the storage controller: this doesn't delete anything from pageservers. Appropriate
+    /// if you e.g. used `tenant-warmup` by mistake on a tenant ID that doesn't really exist, or is in some other region.
+    TenantDrop {
+        #[arg(long)]
+        tenant_id: TenantId,
+        /// Must be set to proceed: acknowledges that all controller state for the tenant is dropped uncleanly.
+        #[arg(long)]
+        unclean: bool,
+    },
+    /// Uncleanly drop a node from the storage controller: this is not a clean decommission, and does not check
+    /// whether any tenants still refer to the node.
+    NodeDrop {
+        #[arg(long)]
+        node_id: NodeId,
+        /// Must be set to proceed: acknowledges that all controller state for the node is dropped uncleanly.
+        #[arg(long)]
+        unclean: bool,
+    },
+    /// Set a tenant's eviction policy to `LayerAccessThreshold` with the given period and threshold. The request
+    /// populates only the eviction policy; every other tenant config field is sent as its default.
+    TenantSetTimeBasedEviction {
+        #[arg(long)]
+        tenant_id: TenantId,
+        /// Eviction policy `period`, parsed by `humantime` (e.g. `10m`).
+        #[arg(long)]
+        period: humantime::Duration,
+        /// Eviction policy `threshold`, parsed by `humantime` (e.g. `24h`).
+        #[arg(long)]
+        threshold: humantime::Duration,
+    },
 }
 
 #[derive(Parser)]
@@ -674,6 +705,47 @@ async fn main() -> anyhow::Result<()> {
                 }
             }
         }
+        Command::TenantDrop { tenant_id, unclean } => {
+            if !unclean {
+                anyhow::bail!("This command is not a tenant deletion, and uncleanly drops all controller state for the tenant.  If you know what you're doing, add `--unclean` to proceed.")
+            }
+            storcon_client
+                .dispatch::<(), ()>(
+                    Method::POST,
+                    format!("debug/v1/tenant/{tenant_id}/drop"),
+                    None,
+                )
+                .await?;
+        }
+        Command::NodeDrop { node_id, unclean } => {
+            if !unclean {
+                anyhow::bail!("This command is not a clean node decommission, and uncleanly drops all controller state for the node, without checking if any tenants still refer to it.  If you know what you're doing, add `--unclean` to proceed.")
+            }
+            storcon_client
+                .dispatch::<(), ()>(Method::POST, format!("debug/v1/node/{node_id}/drop"), None)
+                .await?;
+        }
+        Command::TenantSetTimeBasedEviction {
+            tenant_id,
+            period,
+            threshold,
+        } => {
+            vps_client
+                .tenant_config(&TenantConfigRequest {
+                    tenant_id,
+                    config: TenantConfig {
+                        eviction_policy: Some(EvictionPolicy::LayerAccessThreshold(
+                            EvictionPolicyLayerAccessThreshold {
+                                period: period.into(),
+                                threshold: threshold.into(),
+                            },
+                        )),
+                        // Only the eviction policy is set here; every other field takes its default value.
+                        ..Default::default()
+                    },
+                })
+                .await?;
+        }
     }
 
     Ok(())
diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs
index 2511de00d5..997c1cc43a 100644
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -1,7 +1,7 @@
 use anyhow::{bail, Result};
 use byteorder::{ByteOrder, BE};
-use bytes::BufMut;
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
+use postgres_ffi::RepOriginId;
 use postgres_ffi::{Oid, TransactionId};
 use serde::{Deserialize, Serialize};
 use std::{fmt, ops::Range};
@@ -39,6 +39,9 @@ pub const RELATION_SIZE_PREFIX: u8 = 0x61;
 /// The key prefix of AUX file keys.
 pub const AUX_KEY_PREFIX: u8 = 0x62;
 
+/// The key prefix of ReplOrigin keys.
+pub const REPL_ORIGIN_KEY_PREFIX: u8 = 0x63;
+
 /// Check if the key falls in the range of metadata keys.
 pub const fn is_metadata_key_slice(key: &[u8]) -> bool {
     key[0] >= METADATA_KEY_BEGIN_PREFIX && key[0] < METADATA_KEY_END_PREFIX
@@ -53,14 +56,8 @@ impl Key {
     /// Encode a metadata key to a storage key.
     pub fn from_metadata_key_fixed_size(key: &[u8; METADATA_KEY_SIZE]) -> Self {
         assert!(is_metadata_key_slice(key), "key not in metadata key range");
-        Key {
-            field1: key[0],
-            field2: u16::from_be_bytes(key[1..3].try_into().unwrap()) as u32,
-            field3: u32::from_be_bytes(key[3..7].try_into().unwrap()),
-            field4: u32::from_be_bytes(key[7..11].try_into().unwrap()),
-            field5: key[11],
-            field6: u32::from_be_bytes(key[12..16].try_into().unwrap()),
-        }
+        // Metadata key space ends at 0x7F so it's fine to directly convert it to i128.
+        Self::from_i128(i128::from_be_bytes(*key))
     }
 
     /// Encode a metadata key to a storage key.
@@ -68,17 +65,6 @@ impl Key {
         Self::from_metadata_key_fixed_size(key.try_into().expect("expect 16 byte metadata key"))
     }
 
-    /// Extract a metadata key to a writer. The result should always be 16 bytes.
-    pub fn extract_metadata_key_to_writer(&self, mut writer: impl BufMut) {
-        writer.put_u8(self.field1);
-        assert!(self.field2 <= 0xFFFF);
-        writer.put_u16(self.field2 as u16);
-        writer.put_u32(self.field3);
-        writer.put_u32(self.field4);
-        writer.put_u8(self.field5);
-        writer.put_u32(self.field6);
-    }
-
     /// Get the range of metadata keys.
     pub const fn metadata_key_range() -> Range<Self> {
         Key {
@@ -121,7 +107,7 @@ impl Key {
     /// As long as Neon does not support tablespace (because of lack of access to local file system),
     /// we can assume that only some predefined namespace OIDs are used which can fit in u16
     pub fn to_i128(&self) -> i128 {
-        assert!(self.field2 < 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222);
+        assert!(self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222);
         (((self.field1 & 0x7F) as i128) << 120)
             | (((self.field2 & 0xFFFF) as i128) << 104)
             | ((self.field3 as i128) << 72)
@@ -175,7 +161,7 @@ impl Key {
     }
 
     /// Convert a 18B slice to a key. This function should not be used for metadata keys because field2 is handled differently.
-    /// Use [`Key::from_metadata_key`] instead.
+    /// Use [`Key::from_i128`] instead if you want to handle 16B keys (i.e., metadata keys).
     pub fn from_slice(b: &[u8]) -> Self {
         Key {
             field1: b[0],
@@ -188,7 +174,7 @@ impl Key {
     }
 
     /// Convert a key to a 18B slice. This function should not be used for metadata keys because field2 is handled differently.
-    /// Use [`Key::extract_metadata_key_to_writer`] instead.
+    /// Use [`Key::to_i128`] instead if you want to get a 16B key (i.e., metadata keys).
     pub fn write_to_byte_slice(&self, buf: &mut [u8]) {
         buf[0] = self.field1;
         BE::write_u32(&mut buf[1..5], self.field2);
@@ -399,7 +385,14 @@ pub fn rel_size_to_key(rel: RelTag) -> Key {
         field3: rel.dbnode,
         field4: rel.relnode,
         field5: rel.forknum,
-        field6: 0xffffffff,
+        field6: 0xffff_ffff,
+    }
+}
+
+impl Key {
+    #[inline(always)]
+    pub fn is_rel_size_key(&self) -> bool {
+        self.field1 == 0 && self.field6 == u32::MAX // field6 sentinel as set by `rel_size_to_key`
     }
 }
 
@@ -440,6 +433,25 @@ pub fn slru_dir_to_key(kind: SlruKind) -> Key {
     }
 }
 
+#[inline(always)]
+pub fn slru_dir_kind(key: &Key) -> Option<Result<SlruKind, u32>> {
+    if key.field1 == 0x01
+        && key.field3 == 0
+        && key.field4 == 0
+        && key.field5 == 0
+        && key.field6 == 0
+    {
+        match key.field2 { // field2 selects the SLRU kind
+            0 => Some(Ok(SlruKind::Clog)),
+            1 => Some(Ok(SlruKind::MultiXactMembers)),
+            2 => Some(Ok(SlruKind::MultiXactOffsets)),
+            x => Some(Err(x)), // unknown kind: hand the raw field2 value back to the caller
+        }
+    } else {
+        None // field1 / field3..field6 do not match the pattern checked above
+    }
+}
+
 #[inline(always)]
 pub fn slru_block_to_key(kind: SlruKind, segno: u32, blknum: BlockNumber) -> Key {
     Key {
@@ -468,7 +480,17 @@ pub fn slru_segment_size_to_key(kind: SlruKind, segno: u32) -> Key {
         field3: 1,
         field4: segno,
         field5: 0,
-        field6: 0xffffffff,
+        field6: 0xffff_ffff,
+    }
+}
+
+impl Key {
+    pub fn is_slru_segment_size_key(&self) -> bool {
+        self.field1 == 0x01
+            && self.field2 < 0x03 // field2 values 0, 1, 2 map to the three SlruKind variants
+            && self.field3 == 0x01
+            && self.field5 == 0
+            && self.field6 == u32::MAX // size sentinel; field4 (segno) is left unconstrained
     }
 }
 
@@ -569,6 +591,37 @@ pub const AUX_FILES_KEY: Key = Key {
     field6: 2,
 };
 
+#[inline(always)]
+pub fn repl_origin_key(origin_id: RepOriginId) -> Key {
+    Key {
+        field1: REPL_ORIGIN_KEY_PREFIX,
+        field2: 0,
+        field3: 0,
+        field4: 0,
+        field5: 0,
+        field6: origin_id as u32, // the origin id is the only varying part of the key
+    }
+}
+
+/// Get the range of replorigin keys: half-open, spanning `field6` in `0..0x10000`.
+pub fn repl_origin_key_range() -> Range<Key> {
+    Key {
+        field1: REPL_ORIGIN_KEY_PREFIX,
+        field2: 0,
+        field3: 0,
+        field4: 0,
+        field5: 0,
+        field6: 0,
+    }..Key {
+        field1: REPL_ORIGIN_KEY_PREFIX,
+        field2: 0,
+        field3: 0,
+        field4: 0,
+        field5: 0,
+        field6: 0x10000, // exclusive end; presumably RepOriginId is 16-bit - TODO confirm
+    }
+}
+
 // Reverse mappings for a few Keys.
 // These are needed by WAL redo manager.
 
@@ -577,73 +630,78 @@ pub const NON_INHERITED_RANGE: Range<Key> = AUX_FILES_KEY..AUX_FILES_KEY.next();
 /// Sparse keyspace range for vectored get. Missing key error will be ignored for this range.
 pub const NON_INHERITED_SPARSE_RANGE: Range<Key> = Key::metadata_key_range();
 
-// AUX_FILES currently stores only data for logical replication (slots etc), and
-// we don't preserve these on a branch because safekeepers can't follow timeline
-// switch (and generally it likely should be optional), so ignore these.
-#[inline(always)]
-pub fn is_inherited_key(key: Key) -> bool {
-    !NON_INHERITED_RANGE.contains(&key) && !NON_INHERITED_SPARSE_RANGE.contains(&key)
-}
+impl Key {
+    // AUX_FILES currently stores only data for logical replication (slots etc), and
+    // we don't preserve these on a branch because safekeepers can't follow timeline
+    // switch (and generally it likely should be optional), so ignore these.
+    #[inline(always)]
+    pub fn is_inherited_key(self) -> bool {
+        !NON_INHERITED_RANGE.contains(&self) && !NON_INHERITED_SPARSE_RANGE.contains(&self)
+    }
 
-#[inline(always)]
-pub fn is_rel_fsm_block_key(key: Key) -> bool {
-    key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff
-}
+    #[inline(always)]
+    pub fn is_rel_fsm_block_key(self) -> bool {
+        self.field1 == 0x00
+            && self.field4 != 0
+            && self.field5 == FSM_FORKNUM
+            && self.field6 != 0xffffffff
+    }
 
-#[inline(always)]
-pub fn is_rel_vm_block_key(key: Key) -> bool {
-    key.field1 == 0x00
-        && key.field4 != 0
-        && key.field5 == VISIBILITYMAP_FORKNUM
-        && key.field6 != 0xffffffff
-}
+    #[inline(always)]
+    pub fn is_rel_vm_block_key(self) -> bool {
+        self.field1 == 0x00
+            && self.field4 != 0
+            && self.field5 == VISIBILITYMAP_FORKNUM
+            && self.field6 != 0xffffffff
+    }
 
-#[inline(always)]
-pub fn key_to_slru_block(key: Key) -> anyhow::Result<(SlruKind, u32, BlockNumber)> {
-    Ok(match key.field1 {
-        0x01 => {
-            let kind = match key.field2 {
-                0x00 => SlruKind::Clog,
-                0x01 => SlruKind::MultiXactMembers,
-                0x02 => SlruKind::MultiXactOffsets,
-                _ => anyhow::bail!("unrecognized slru kind 0x{:02x}", key.field2),
-            };
-            let segno = key.field4;
-            let blknum = key.field6;
+    #[inline(always)]
+    pub fn to_slru_block(self) -> anyhow::Result<(SlruKind, u32, BlockNumber)> {
+        Ok(match self.field1 {
+            0x01 => {
+                let kind = match self.field2 {
+                    0x00 => SlruKind::Clog,
+                    0x01 => SlruKind::MultiXactMembers,
+                    0x02 => SlruKind::MultiXactOffsets,
+                    _ => anyhow::bail!("unrecognized slru kind 0x{:02x}", self.field2),
+                };
+                let segno = self.field4;
+                let blknum = self.field6;
 
-            (kind, segno, blknum)
-        }
-        _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
-    })
-}
+                (kind, segno, blknum)
+            }
+            _ => anyhow::bail!("unexpected value kind 0x{:02x}", self.field1),
+        })
+    }
 
-#[inline(always)]
-pub fn is_slru_block_key(key: Key) -> bool {
-    key.field1 == 0x01                // SLRU-related
-        && key.field3 == 0x00000001   // but not SlruDir
-        && key.field6 != 0xffffffff // and not SlruSegSize
-}
+    #[inline(always)]
+    pub fn is_slru_block_key(self) -> bool {
+        self.field1 == 0x01                // SLRU-related
+        && self.field3 == 0x00000001   // but not SlruDir
+        && self.field6 != 0xffffffff // and not SlruSegSize
+    }
 
-#[inline(always)]
-pub fn is_rel_block_key(key: &Key) -> bool {
-    key.field1 == 0x00 && key.field4 != 0 && key.field6 != 0xffffffff
-}
+    #[inline(always)]
+    pub fn is_rel_block_key(&self) -> bool {
+        self.field1 == 0x00 && self.field4 != 0 && self.field6 != 0xffffffff
+    }
 
-/// Guaranteed to return `Ok()` if [[is_rel_block_key]] returns `true` for `key`.
-#[inline(always)]
-pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
-    Ok(match key.field1 {
-        0x00 => (
-            RelTag {
-                spcnode: key.field2,
-                dbnode: key.field3,
-                relnode: key.field4,
-                forknum: key.field5,
-            },
-            key.field6,
-        ),
-        _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
-    })
+    /// Guaranteed to return `Ok()` if [`Self::is_rel_block_key`] returns `true` for `self`.
+    #[inline(always)]
+    pub fn to_rel_block(self) -> anyhow::Result<(RelTag, BlockNumber)> {
+        Ok(match self.field1 {
+            0x00 => (
+                RelTag {
+                    spcnode: self.field2,
+                    dbnode: self.field3,
+                    relnode: self.field4,
+                    forknum: self.field5,
+                },
+                self.field6,
+            ),
+            _ => anyhow::bail!("unexpected value kind 0x{:02x}", self.field1),
+        })
+    }
 }
 
 impl std::str::FromStr for Key {
@@ -687,10 +745,15 @@ mod tests {
         let mut metadata_key = vec![AUX_KEY_PREFIX];
         metadata_key.extend_from_slice(&[0xFF; 15]);
         let encoded_key = Key::from_metadata_key(&metadata_key);
-        let mut output_key = Vec::new();
-        encoded_key.extract_metadata_key_to_writer(&mut output_key);
+        let output_key = encoded_key.to_i128().to_be_bytes();
         assert_eq!(metadata_key, output_key);
         assert!(encoded_key.is_metadata_key());
         assert!(is_metadata_key_slice(&metadata_key));
     }
+
+    #[test]
+    fn test_possible_largest_key() {
+        Key::from_i128(0x7FFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF); // i128::MAX; result is discarded
+        // TODO: put this key into the system and see if anything breaks.
+    }
 }
diff --git a/libs/pageserver_api/src/reltag.rs b/libs/pageserver_api/src/reltag.rs
index 38693ab847..010a9c2932 100644
--- a/libs/pageserver_api/src/reltag.rs
+++ b/libs/pageserver_api/src/reltag.rs
@@ -3,7 +3,7 @@ use std::cmp::Ordering;
 use std::fmt;
 
 use postgres_ffi::pg_constants::GLOBALTABLESPACE_OID;
-use postgres_ffi::relfile_utils::forknumber_to_name;
+use postgres_ffi::relfile_utils::{forkname_to_number, forknumber_to_name, MAIN_FORKNUM};
 use postgres_ffi::Oid;
 
 ///
@@ -68,6 +68,57 @@ impl fmt::Display for RelTag {
     }
 }
 
+#[derive(Debug, thiserror::Error)]
+pub enum ParseRelTagError {
+    #[error("invalid forknum")]
+    InvalidForknum(#[source] std::num::ParseIntError), // suffix was neither a fork name nor a u8
+    #[error("missing triplet member {}", .0)]
+    MissingTripletMember(usize), // 0-based index of the absent spcnode/dbnode/relnode part
+    #[error("invalid triplet member {}", .0)]
+    InvalidTripletMember(usize, #[source] std::num::ParseIntError), // index of the non-u32 part
+}
+
+impl std::str::FromStr for RelTag {
+    type Err = ParseRelTagError;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        use ParseRelTagError::*;
+
+        // FIXME: in postgres logs this separator is dot
+        // Example:
+        //     could not read block 2 in rel 1663/208101/2620.1 from page server at lsn 0/2431E6F0
+        // with a regex we could get this more painlessly
+        let (triplet, forknum) = match s.split_once('_').or_else(|| s.split_once('.')) {
+            Some((t, f)) => {
+                let forknum = forkname_to_number(Some(f)); // try the suffix as a fork name first
+                let forknum = if let Ok(f) = forknum {
+                    f
+                } else {
+                    f.parse::<u8>().map_err(InvalidForknum)? // otherwise it must be a numeric u8
+                };
+
+                (t, Some(forknum))
+            }
+            None => (s, None), // no '_' or '.' separator: the whole input is the triplet
+        };
+
+        let mut split = triplet
+            .splitn(3, '/') // at most 3 pieces; any extra '/' stays inside the last piece
+            .enumerate()
+            .map(|(i, s)| s.parse::<u32>().map_err(|e| InvalidTripletMember(i, e)));
+        let spcnode = split.next().ok_or(MissingTripletMember(0))??;
+        let dbnode = split.next().ok_or(MissingTripletMember(1))??;
+        let relnode = split.next().ok_or(MissingTripletMember(2))??;
+
+        Ok(RelTag {
+            spcnode,
+            forknum: forknum.unwrap_or(MAIN_FORKNUM), // no fork suffix given => MAIN_FORKNUM
+            dbnode,
+            relnode,
+        })
+    }
+}
+
 impl RelTag {
     pub fn to_segfile_name(&self, segno: u32) -> String {
         let mut name = if self.spcnode == GLOBALTABLESPACE_OID {
diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs
index 1c05a01926..8c5a4e6168 100644
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -1,9 +1,6 @@
 use std::{ops::RangeInclusive, str::FromStr};
 
-use crate::{
-    key::{is_rel_block_key, Key},
-    models::ShardParameters,
-};
+use crate::{key::Key, models::ShardParameters};
 use hex::FromHex;
 use postgres_ffi::relfile_utils::INIT_FORKNUM;
 use serde::{Deserialize, Serialize};
@@ -428,6 +425,12 @@ impl<'de> Deserialize<'de> for TenantShardId {
 #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
 pub struct ShardStripeSize(pub u32);
 
+impl Default for ShardStripeSize {
+    fn default() -> Self {
+        DEFAULT_STRIPE_SIZE
+    }
+}
+
 /// Layout version: for future upgrades where we might change how the key->shard mapping works
 #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
 pub struct ShardLayout(u8);
@@ -666,7 +669,7 @@ fn key_is_shard0(key: &Key) -> bool {
     // because they must be included in basebackups.
     let is_initfork = key.field5 == INIT_FORKNUM;
 
-    !is_rel_block_key(key) || is_initfork
+    !key.is_rel_block_key() || is_initfork
 }
 
 /// Provide the same result as the function in postgres `hashfn.h` with the same name
@@ -713,6 +716,25 @@ fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Ke
     ShardNumber((hash % count.0 as u32) as u8)
 }
 
+/// Debug-only view of how a key is sharded; handed out by `describe` as an opaque `impl Debug`.
+#[derive(Debug)]
+#[allow(unused)] // fields are only read through the derived `Debug` impl (used by pagectl)
+struct KeyShardingInfo {
+    shard0: bool, // result of `key_is_shard0`
+    shard_number: ShardNumber, // result of `key_to_shard_number`
+}
+
+pub fn describe(
+    key: &Key,
+    shard_count: ShardCount,
+    stripe_size: ShardStripeSize,
+) -> impl std::fmt::Debug { // opaque return: callers can only debug-format the result
+    KeyShardingInfo {
+        shard0: key_is_shard0(key),
+        shard_number: key_to_shard_number(shard_count, stripe_size, key),
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use utils::Hex;
diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs
index 8e6761d6d3..370d9e9a6f 100644
--- a/libs/postgres_ffi/build.rs
+++ b/libs/postgres_ffi/build.rs
@@ -126,6 +126,7 @@ fn main() -> anyhow::Result<()> {
             .allowlist_type("PageHeaderData")
             .allowlist_type("DBState")
             .allowlist_type("RelMapFile")
+            .allowlist_type("RepOriginId")
             // Because structs are used for serialization, tell bindgen to emit
             // explicit padding fields.
             .explicit_padding(true)
diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs
index 0d6986778a..729f57f829 100644
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -110,6 +110,7 @@ pub mod pg_constants;
 pub mod relfile_utils;
 
 // Export some widely used datatypes that are unlikely to change across Postgres versions
+pub use v14::bindings::RepOriginId;
 pub use v14::bindings::{uint32, uint64, Oid};
 pub use v14::bindings::{BlockNumber, OffsetNumber};
 pub use v14::bindings::{MultiXactId, TransactionId};
diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs
index 2701ddf5e0..54b032d138 100644
--- a/libs/postgres_ffi/src/pg_constants.rs
+++ b/libs/postgres_ffi/src/pg_constants.rs
@@ -102,7 +102,7 @@ pub const XACT_XINFO_HAS_SUBXACTS: u32 = 1u32 << 1;
 pub const XACT_XINFO_HAS_RELFILENODES: u32 = 1u32 << 2;
 pub const XACT_XINFO_HAS_INVALS: u32 = 1u32 << 3;
 pub const XACT_XINFO_HAS_TWOPHASE: u32 = 1u32 << 4;
-// pub const XACT_XINFO_HAS_ORIGIN: u32 = 1u32 << 5;
+pub const XACT_XINFO_HAS_ORIGIN: u32 = 1u32 << 5;
 // pub const XACT_XINFO_HAS_AE_LOCKS: u32 = 1u32 << 6;
 // pub const XACT_XINFO_HAS_GID: u32 = 1u32 << 7;
 
@@ -167,6 +167,7 @@ pub const RM_RELMAP_ID: u8 = 7;
 pub const RM_STANDBY_ID: u8 = 8;
 pub const RM_HEAP2_ID: u8 = 9;
 pub const RM_HEAP_ID: u8 = 10;
+pub const RM_REPLORIGIN_ID: u8 = 19;
 pub const RM_LOGICALMSG_ID: u8 = 21;
 
 // from neon_rmgr.h
@@ -223,6 +224,10 @@ pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10;
 pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001;
 pub const XLP_LONG_HEADER: u16 = 0x0002;
 
+/* From replication/origin.h */
+pub const XLOG_REPLORIGIN_SET: u8 = 0x00;
+pub const XLOG_REPLORIGIN_DROP: u8 = 0x10;
+
 /* From replication/slot.h */
 pub const REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN: usize = 4*4  /* offset of `slotdata` in ReplicationSlotOnDisk  */
    + 64 /* NameData */  + 4*4;
@@ -237,6 +242,9 @@ pub const SLOTS_PER_FSM_PAGE: u32 = FSM_LEAF_NODES_PER_PAGE as u32;
 pub const VM_HEAPBLOCKS_PER_PAGE: u32 =
     (BLCKSZ as usize - SIZEOF_PAGE_HEADER_DATA) as u32 * (8 / 2); // MAPSIZE * (BITS_PER_BYTE / BITS_PER_HEAPBLOCK)
 
+/* From origin.c */
+pub const REPLICATION_STATE_MAGIC: u32 = 0x1257DADE;
+
 // List of subdirectories inside pgdata.
 // Copied from src/bin/initdb/initdb.c
 pub const PGDATA_SUBDIRS: [&str; 22] = [
diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs
index 24c1248304..aca22c6b3e 100644
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -26,13 +26,14 @@ use futures::stream::Stream;
 use futures_util::StreamExt;
 use futures_util::TryStreamExt;
 use http_types::{StatusCode, Url};
+use scopeguard::ScopeGuard;
 use tokio_util::sync::CancellationToken;
 use tracing::debug;
 
+use crate::metrics::{start_measuring_requests, AttemptOutcome, RequestKind};
 use crate::{
-    error::Cancelled, s3_bucket::RequestKind, AzureConfig, ConcurrencyLimiter, Download,
-    DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata,
-    TimeTravelError, TimeoutOrCancel,
+    error::Cancelled, AzureConfig, ConcurrencyLimiter, Download, DownloadError, Listing,
+    ListingMode, RemotePath, RemoteStorage, StorageMetadata, TimeTravelError, TimeoutOrCancel,
 };
 
 pub struct AzureBlobStorage {
@@ -137,6 +138,8 @@ impl AzureBlobStorage {
         let mut last_modified = None;
         let mut metadata = HashMap::new();
 
+        let started_at = start_measuring_requests(kind);
+
         let download = async {
             let response = builder
                 // convert to concrete Pageable
@@ -200,13 +203,22 @@ impl AzureBlobStorage {
             })
         };
 
-        tokio::select! {
+        let download = tokio::select! {
             bufs = download => bufs,
             cancel_or_timeout = cancel_or_timeout => match cancel_or_timeout {
-                TimeoutOrCancel::Timeout => Err(DownloadError::Timeout),
-                TimeoutOrCancel::Cancel => Err(DownloadError::Cancelled),
+                TimeoutOrCancel::Timeout => return Err(DownloadError::Timeout),
+                TimeoutOrCancel::Cancel => return Err(DownloadError::Cancelled),
             },
-        }
+        };
+        let started_at = ScopeGuard::into_inner(started_at);
+        let outcome = match &download {
+            Ok(_) => AttemptOutcome::Ok,
+            Err(_) => AttemptOutcome::Err,
+        };
+        crate::metrics::BUCKET_METRICS
+            .req_seconds
+            .observe_elapsed(kind, outcome, started_at);
+        download
     }
 
     async fn permit(
@@ -340,7 +352,10 @@ impl RemoteStorage for AzureBlobStorage {
         metadata: Option<StorageMetadata>,
         cancel: &CancellationToken,
     ) -> anyhow::Result<()> {
-        let _permit = self.permit(RequestKind::Put, cancel).await?;
+        let kind = RequestKind::Put;
+        let _permit = self.permit(kind, cancel).await?;
+
+        let started_at = start_measuring_requests(kind);
 
         let op = async {
             let blob_client = self.client.blob_client(self.relative_path_to_name(to));
@@ -364,14 +379,25 @@ impl RemoteStorage for AzureBlobStorage {
             match fut.await {
                 Ok(Ok(_response)) => Ok(()),
                 Ok(Err(azure)) => Err(azure.into()),
-                Err(_timeout) => Err(TimeoutOrCancel::Cancel.into()),
+                Err(_timeout) => Err(TimeoutOrCancel::Timeout.into()),
             }
         };
 
-        tokio::select! {
+        let res = tokio::select! {
             res = op => res,
-            _ = cancel.cancelled() => Err(TimeoutOrCancel::Cancel.into()),
-        }
+            _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
+        };
+
+        let outcome = match res {
+            Ok(_) => AttemptOutcome::Ok,
+            Err(_) => AttemptOutcome::Err,
+        };
+        let started_at = ScopeGuard::into_inner(started_at);
+        crate::metrics::BUCKET_METRICS
+            .req_seconds
+            .observe_elapsed(kind, outcome, started_at);
+
+        res
     }
 
     async fn download(
@@ -417,12 +443,13 @@ impl RemoteStorage for AzureBlobStorage {
         paths: &'a [RemotePath],
         cancel: &CancellationToken,
     ) -> anyhow::Result<()> {
-        let _permit = self.permit(RequestKind::Delete, cancel).await?;
+        let kind = RequestKind::Delete;
+        let _permit = self.permit(kind, cancel).await?;
+        let started_at = start_measuring_requests(kind);
 
         let op = async {
-            // TODO batch requests are also not supported by the SDK
+            // TODO batch requests are not supported by the SDK
             // https://github.com/Azure/azure-sdk-for-rust/issues/1068
-            // https://github.com/Azure/azure-sdk-for-rust/issues/1249
             for path in paths {
                 let blob_client = self.client.blob_client(self.relative_path_to_name(path));
 
@@ -447,10 +474,16 @@ impl RemoteStorage for AzureBlobStorage {
             Ok(())
         };
 
-        tokio::select! {
+        let res = tokio::select! {
             res = op => res,
-            _ = cancel.cancelled() => Err(TimeoutOrCancel::Cancel.into()),
-        }
+            _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
+        };
+
+        let started_at = ScopeGuard::into_inner(started_at);
+        crate::metrics::BUCKET_METRICS
+            .req_seconds
+            .observe_elapsed(kind, &res, started_at);
+        res
     }
 
     async fn copy(
@@ -459,7 +492,9 @@ impl RemoteStorage for AzureBlobStorage {
         to: &RemotePath,
         cancel: &CancellationToken,
     ) -> anyhow::Result<()> {
-        let _permit = self.permit(RequestKind::Copy, cancel).await?;
+        let kind = RequestKind::Copy;
+        let _permit = self.permit(kind, cancel).await?;
+        let started_at = start_measuring_requests(kind);
 
         let timeout = tokio::time::sleep(self.timeout);
 
@@ -503,15 +538,21 @@ impl RemoteStorage for AzureBlobStorage {
             }
         };
 
-        tokio::select! {
+        let res = tokio::select! {
             res = op => res,
-            _ = cancel.cancelled() => Err(anyhow::Error::new(TimeoutOrCancel::Cancel)),
+            _ = cancel.cancelled() => return Err(anyhow::Error::new(TimeoutOrCancel::Cancel)),
             _ = timeout => {
                 let e = anyhow::Error::new(TimeoutOrCancel::Timeout);
                 let e = e.context(format!("Timeout, last status: {copy_status:?}"));
                 Err(e)
             },
-        }
+        };
+
+        let started_at = ScopeGuard::into_inner(started_at);
+        crate::metrics::BUCKET_METRICS
+            .req_seconds
+            .observe_elapsed(kind, &res, started_at);
+        res
     }
 
     async fn time_travel_recover(
diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs
index 708662f20f..8c984abed2 100644
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -12,6 +12,7 @@
 mod azure_blob;
 mod error;
 mod local_fs;
+mod metrics;
 mod s3_bucket;
 mod simulate_failures;
 mod support;
@@ -121,8 +122,8 @@ impl RemotePath {
         self.0.file_name()
     }
 
-    pub fn join(&self, segment: &Utf8Path) -> Self {
-        Self(self.0.join(segment))
+    pub fn join(&self, path: impl AsRef<Utf8Path>) -> Self {
+        Self(self.0.join(path))
     }
 
     pub fn get_path(&self) -> &Utf8PathBuf {
diff --git a/libs/remote_storage/src/s3_bucket/metrics.rs b/libs/remote_storage/src/metrics.rs
similarity index 76%
rename from libs/remote_storage/src/s3_bucket/metrics.rs
rename to libs/remote_storage/src/metrics.rs
index beca755920..bbb51590f3 100644
--- a/libs/remote_storage/src/s3_bucket/metrics.rs
+++ b/libs/remote_storage/src/metrics.rs
@@ -15,6 +15,7 @@ pub(crate) enum RequestKind {
     TimeTravel = 5,
 }
 
+use scopeguard::ScopeGuard;
 use RequestKind::*;
 
 impl RequestKind {
@@ -33,10 +34,10 @@ impl RequestKind {
     }
 }
 
-pub(super) struct RequestTyped<C>([C; 6]);
+pub(crate) struct RequestTyped<C>([C; 6]);
 
 impl<C> RequestTyped<C> {
-    pub(super) fn get(&self, kind: RequestKind) -> &C {
+    pub(crate) fn get(&self, kind: RequestKind) -> &C {
         &self.0[kind.as_index()]
     }
 
@@ -58,19 +59,19 @@ impl<C> RequestTyped<C> {
 }
 
 impl RequestTyped<Histogram> {
-    pub(super) fn observe_elapsed(&self, kind: RequestKind, started_at: std::time::Instant) {
+    pub(crate) fn observe_elapsed(&self, kind: RequestKind, started_at: std::time::Instant) {
         self.get(kind).observe(started_at.elapsed().as_secs_f64())
     }
 }
 
-pub(super) struct PassFailCancelledRequestTyped<C> {
+pub(crate) struct PassFailCancelledRequestTyped<C> {
     success: RequestTyped<C>,
     fail: RequestTyped<C>,
     cancelled: RequestTyped<C>,
 }
 
 #[derive(Debug, Clone, Copy)]
-pub(super) enum AttemptOutcome {
+pub(crate) enum AttemptOutcome {
     Ok,
     Err,
     Cancelled,
@@ -86,7 +87,7 @@ impl<T, E> From<&Result<T, E>> for AttemptOutcome {
 }
 
 impl AttemptOutcome {
-    pub(super) fn as_str(&self) -> &'static str {
+    pub(crate) fn as_str(&self) -> &'static str {
         match self {
             AttemptOutcome::Ok => "ok",
             AttemptOutcome::Err => "err",
@@ -96,7 +97,7 @@ impl AttemptOutcome {
 }
 
 impl<C> PassFailCancelledRequestTyped<C> {
-    pub(super) fn get(&self, kind: RequestKind, outcome: AttemptOutcome) -> &C {
+    pub(crate) fn get(&self, kind: RequestKind, outcome: AttemptOutcome) -> &C {
         let target = match outcome {
             AttemptOutcome::Ok => &self.success,
             AttemptOutcome::Err => &self.fail,
@@ -119,7 +120,7 @@ impl<C> PassFailCancelledRequestTyped<C> {
 }
 
 impl PassFailCancelledRequestTyped<Histogram> {
-    pub(super) fn observe_elapsed(
+    pub(crate) fn observe_elapsed(
         &self,
         kind: RequestKind,
         outcome: impl Into<AttemptOutcome>,
@@ -130,19 +131,44 @@ impl PassFailCancelledRequestTyped<Histogram> {
     }
 }
 
-pub(super) struct BucketMetrics {
+/// Guard that bumps [`BucketMetrics::cancelled_waits`] for `kind` when dropped (cancellation).
+pub(crate) fn start_counting_cancelled_wait(
+    kind: RequestKind,
+) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
+    scopeguard::guard_on_success(std::time::Instant::now(), move |_| { // start time is ignored here
+        crate::metrics::BUCKET_METRICS
+            .cancelled_waits
+            .get(kind)
+            .inc()
+    })
+}
+
+/// Guard that, when dropped (cancellation), records elapsed time in [`BucketMetrics::req_seconds`].
+pub(crate) fn start_measuring_requests(
+    kind: RequestKind,
+) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
+    scopeguard::guard_on_success(std::time::Instant::now(), move |started_at| {
+        crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
+            kind,
+            AttemptOutcome::Cancelled, // a guard that fires is always recorded as Cancelled
+            started_at,
+        )
+    })
+}
+
+pub(crate) struct BucketMetrics {
     /// Full request duration until successful completion, error or cancellation.
-    pub(super) req_seconds: PassFailCancelledRequestTyped<Histogram>,
+    pub(crate) req_seconds: PassFailCancelledRequestTyped<Histogram>,
     /// Total amount of seconds waited on queue.
-    pub(super) wait_seconds: RequestTyped<Histogram>,
+    pub(crate) wait_seconds: RequestTyped<Histogram>,
 
     /// Track how many semaphore awaits were cancelled per request type.
     ///
     /// This is in case cancellations are happening more than expected.
-    pub(super) cancelled_waits: RequestTyped<IntCounter>,
+    pub(crate) cancelled_waits: RequestTyped<IntCounter>,
 
     /// Total amount of deleted objects in batches or single requests.
-    pub(super) deleted_objects_total: IntCounter,
+    pub(crate) deleted_objects_total: IntCounter,
 }
 
 impl Default for BucketMetrics {
diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs
index c3d6c75e20..76cf3eac80 100644
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -46,15 +46,16 @@ use utils::backoff;
 
 use super::StorageMetadata;
 use crate::{
-    error::Cancelled, support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError,
-    Listing, ListingMode, RemotePath, RemoteStorage, S3Config, TimeTravelError, TimeoutOrCancel,
-    MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
+    error::Cancelled,
+    metrics::{start_counting_cancelled_wait, start_measuring_requests},
+    support::PermitCarrying,
+    ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage,
+    S3Config, TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE,
+    REMOTE_STORAGE_PREFIX_SEPARATOR,
 };
 
-pub(super) mod metrics;
-
-use self::metrics::AttemptOutcome;
-pub(super) use self::metrics::RequestKind;
+use crate::metrics::AttemptOutcome;
+pub(super) use crate::metrics::RequestKind;
 
 /// AWS S3 storage.
 pub struct S3Bucket {
@@ -227,7 +228,7 @@ impl S3Bucket {
         };
 
         let started_at = ScopeGuard::into_inner(started_at);
-        metrics::BUCKET_METRICS
+        crate::metrics::BUCKET_METRICS
             .wait_seconds
             .observe_elapsed(kind, started_at);
 
@@ -248,7 +249,7 @@ impl S3Bucket {
         };
 
         let started_at = ScopeGuard::into_inner(started_at);
-        metrics::BUCKET_METRICS
+        crate::metrics::BUCKET_METRICS
             .wait_seconds
             .observe_elapsed(kind, started_at);
         Ok(permit)
@@ -287,7 +288,7 @@ impl S3Bucket {
                 // Count this in the AttemptOutcome::Ok bucket, because 404 is not
                 // an error: we expect to sometimes fetch an object and find it missing,
                 // e.g. when probing for timeline indices.
-                metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
+                crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
                     kind,
                     AttemptOutcome::Ok,
                     started_at,
@@ -295,7 +296,7 @@ impl S3Bucket {
                 return Err(DownloadError::NotFound);
             }
             Err(e) => {
-                metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
+                crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
                     kind,
                     AttemptOutcome::Err,
                     started_at,
@@ -371,12 +372,12 @@ impl S3Bucket {
             };
 
             let started_at = ScopeGuard::into_inner(started_at);
-            metrics::BUCKET_METRICS
+            crate::metrics::BUCKET_METRICS
                 .req_seconds
                 .observe_elapsed(kind, &resp, started_at);
 
             let resp = resp.context("request deletion")?;
-            metrics::BUCKET_METRICS
+            crate::metrics::BUCKET_METRICS
                 .deleted_objects_total
                 .inc_by(chunk.len() as u64);
 
@@ -435,14 +436,14 @@ pin_project_lite::pin_project! {
     /// Times and tracks the outcome of the request.
     struct TimedDownload<S> {
         started_at: std::time::Instant,
-        outcome: metrics::AttemptOutcome,
+        outcome: AttemptOutcome,
         #[pin]
         inner: S
     }
 
     impl<S> PinnedDrop for TimedDownload<S> {
         fn drop(mut this: Pin<&mut Self>) {
-            metrics::BUCKET_METRICS.req_seconds.observe_elapsed(RequestKind::Get, this.outcome, this.started_at);
+            crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(RequestKind::Get, this.outcome, this.started_at);
         }
     }
 }
@@ -451,7 +452,7 @@ impl<S> TimedDownload<S> {
     fn new(started_at: std::time::Instant, inner: S) -> Self {
         TimedDownload {
             started_at,
-            outcome: metrics::AttemptOutcome::Cancelled,
+            outcome: AttemptOutcome::Cancelled,
             inner,
         }
     }
@@ -468,8 +469,8 @@ impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for TimedDownload<S> {
         let res = ready!(this.inner.poll_next(cx));
         match &res {
             Some(Ok(_)) => {}
-            Some(Err(_)) => *this.outcome = metrics::AttemptOutcome::Err,
-            None => *this.outcome = metrics::AttemptOutcome::Ok,
+            Some(Err(_)) => *this.outcome = AttemptOutcome::Err,
+            None => *this.outcome = AttemptOutcome::Ok,
         }
 
         Poll::Ready(res)
@@ -543,7 +544,7 @@ impl RemoteStorage for S3Bucket {
 
             let started_at = ScopeGuard::into_inner(started_at);
 
-            metrics::BUCKET_METRICS
+            crate::metrics::BUCKET_METRICS
                 .req_seconds
                 .observe_elapsed(kind, &response, started_at);
 
@@ -625,7 +626,7 @@ impl RemoteStorage for S3Bucket {
         if let Ok(inner) = &res {
             // do not incl. timeouts as errors in metrics but cancellations
             let started_at = ScopeGuard::into_inner(started_at);
-            metrics::BUCKET_METRICS
+            crate::metrics::BUCKET_METRICS
                 .req_seconds
                 .observe_elapsed(kind, inner, started_at);
         }
@@ -673,7 +674,7 @@ impl RemoteStorage for S3Bucket {
         };
 
         let started_at = ScopeGuard::into_inner(started_at);
-        metrics::BUCKET_METRICS
+        crate::metrics::BUCKET_METRICS
             .req_seconds
             .observe_elapsed(kind, &res, started_at);
 
@@ -977,28 +978,6 @@ impl RemoteStorage for S3Bucket {
     }
 }
 
-/// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`].
-fn start_counting_cancelled_wait(
-    kind: RequestKind,
-) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
-    scopeguard::guard_on_success(std::time::Instant::now(), move |_| {
-        metrics::BUCKET_METRICS.cancelled_waits.get(kind).inc()
-    })
-}
-
-/// On drop (cancellation) add time to [`metrics::BucketMetrics::req_seconds`].
-fn start_measuring_requests(
-    kind: RequestKind,
-) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
-    scopeguard::guard_on_success(std::time::Instant::now(), move |started_at| {
-        metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
-            kind,
-            AttemptOutcome::Cancelled,
-            started_at,
-        )
-    })
-}
-
 // Save RAM and only store the needed data instead of the entire ObjectVersion/DeleteMarkerEntry
 struct VerOrDelete {
     kind: VerOrDeleteKind,
diff --git a/libs/remote_storage/src/support.rs b/libs/remote_storage/src/support.rs
index d146b5445b..1ed9ed9305 100644
--- a/libs/remote_storage/src/support.rs
+++ b/libs/remote_storage/src/support.rs
@@ -78,6 +78,10 @@ where
                 let e = Err(std::io::Error::from(e));
                 return Poll::Ready(Some(e));
             }
+        } else {
+            // this would be perfectly valid behaviour for doing a graceful completion on the
+            // download for example, but not one we expect to do right now.
+            tracing::warn!("continuing polling after having cancelled or timeouted");
         }
 
         this.inner.poll_next(cx)
@@ -89,13 +93,22 @@ where
 }
 
 /// Fires only on the first cancel or timeout, not on both.
-pub(crate) async fn cancel_or_timeout(
+pub(crate) fn cancel_or_timeout(
     timeout: Duration,
     cancel: CancellationToken,
-) -> TimeoutOrCancel {
-    tokio::select! {
-        _ = tokio::time::sleep(timeout) => TimeoutOrCancel::Timeout,
-        _ = cancel.cancelled() => TimeoutOrCancel::Cancel,
+) -> impl std::future::Future<Output = TimeoutOrCancel> + 'static {
+    // futures are lazy, they don't do anything before being polled.
+    //
+    // "precalculate" the wanted deadline before returning the future, so that we can use pause
+    // failpoint to trigger a timeout in test.
+    let deadline = tokio::time::Instant::now() + timeout;
+    async move {
+        tokio::select! {
+            _ = tokio::time::sleep_until(deadline) => TimeoutOrCancel::Timeout,
+            _ = cancel.cancelled() => {
+                TimeoutOrCancel::Cancel
+            },
+        }
     }
 }
 
@@ -172,4 +185,31 @@ mod tests {
             _ = tokio::time::sleep(Duration::from_secs(121)) => {},
         }
     }
+
+    #[tokio::test]
+    async fn notified_but_pollable_after() {
+        let inner = futures::stream::once(futures::future::ready(Ok(bytes::Bytes::from_static(
+            b"hello world",
+        ))));
+        let timeout = Duration::from_secs(120);
+        let cancel = CancellationToken::new();
+
+        cancel.cancel();
+        let stream = DownloadStream::new(cancel_or_timeout(timeout, cancel.clone()), inner);
+        let mut stream = std::pin::pin!(stream);
+
+        let next = stream.next().await;
+        let ioe = next.unwrap().unwrap_err();
+        assert!(
+            matches!(
+                ioe.get_ref().unwrap().downcast_ref::<DownloadError>(),
+                Some(&DownloadError::Cancelled)
+            ),
+            "{ioe:?}"
+        );
+
+        let next = stream.next().await;
+        let bytes = next.unwrap().unwrap();
+        assert_eq!(&b"hello world"[..], bytes);
+    }
 }
diff --git a/libs/utils/src/fs_ext.rs b/libs/utils/src/fs_ext.rs
index 90ba348a02..8e53d2c79b 100644
--- a/libs/utils/src/fs_ext.rs
+++ b/libs/utils/src/fs_ext.rs
@@ -3,6 +3,9 @@ use std::{fs, io, path::Path};
 
 use anyhow::Context;
 
+mod rename_noreplace;
+pub use rename_noreplace::rename_noreplace;
+
 pub trait PathExt {
     /// Returns an error if `self` is not a directory.
     fn is_empty_dir(&self) -> io::Result<bool>;
diff --git a/libs/utils/src/fs_ext/rename_noreplace.rs b/libs/utils/src/fs_ext/rename_noreplace.rs
new file mode 100644
index 0000000000..897e30d7f1
--- /dev/null
+++ b/libs/utils/src/fs_ext/rename_noreplace.rs
@@ -0,0 +1,109 @@
+use nix::NixPath;
+
+/// Rename `src` to `dst`, returning an error instead of replacing `dst` if it already exists.
+///
+/// Wraps `renameat2` with `RENAME_NOREPLACE` on Linux and `renamex_np` with `RENAME_EXCL` on macOS; any other OS is a compile error.
+pub fn rename_noreplace<P1: ?Sized + NixPath, P2: ?Sized + NixPath>(
+    src: &P1,
+    dst: &P2,
+) -> nix::Result<()> {
+    {
+        #[cfg(target_os = "linux")]
+        {
+            nix::fcntl::renameat2(
+                None,
+                src,
+                None,
+                dst,
+                nix::fcntl::RenameFlags::RENAME_NOREPLACE,
+            )
+        }
+        #[cfg(target_os = "macos")]
+        {
+            let res = src.with_nix_path(|src| {
+                dst.with_nix_path(|dst|
+                    // SAFETY: `src` and `dst` are valid C strings as per the NixPath trait and they outlive the call to renamex_np.
+                    unsafe {
+                        nix::libc::renamex_np(src.as_ptr(), dst.as_ptr(), nix::libc::RENAME_EXCL)
+                })
+            })??;
+            nix::errno::Errno::result(res).map(drop)
+        }
+        #[cfg(not(any(target_os = "linux", target_os = "macos")))]
+        {
+            std::compile_error!("OS does not support no-replace renames");
+        }
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use std::{fs, path::PathBuf};
+
+    use super::*;
+
+    fn testdir() -> camino_tempfile::Utf8TempDir {
+        match crate::env::var("NEON_UTILS_RENAME_NOREPLACE_TESTDIR") {
+            Some(path) => {
+                let path: camino::Utf8PathBuf = path;
+                camino_tempfile::tempdir_in(path).unwrap()
+            }
+            None => camino_tempfile::tempdir().unwrap(),
+        }
+    }
+
+    #[test]
+    fn test_absolute_paths() {
+        let testdir = testdir();
+        println!("testdir: {}", testdir.path());
+
+        let src = testdir.path().join("src");
+        let dst = testdir.path().join("dst");
+
+        fs::write(&src, b"").unwrap();
+        fs::write(&dst, b"").unwrap();
+
+        let src = src.canonicalize().unwrap();
+        assert!(src.is_absolute());
+        let dst = dst.canonicalize().unwrap();
+        assert!(dst.is_absolute());
+
+        let result = rename_noreplace(&src, &dst);
+        assert_eq!(result.unwrap_err(), nix::Error::EEXIST);
+    }
+
+    #[test]
+    fn test_relative_paths() {
+        let testdir = testdir();
+        println!("testdir: {}", testdir.path());
+
+        // changing the process-wide current directory is fine because we run in nextest => process per test
+        std::env::set_current_dir(testdir.path()).unwrap();
+
+        let src = PathBuf::from("src");
+        let dst = PathBuf::from("dst");
+
+        fs::write(&src, b"").unwrap();
+        fs::write(&dst, b"").unwrap();
+
+        let result = rename_noreplace(&src, &dst);
+        assert_eq!(result.unwrap_err(), nix::Error::EEXIST);
+    }
+
+    #[test]
+    fn test_works_when_not_exists() {
+        let testdir = testdir();
+        println!("testdir: {}", testdir.path());
+
+        let src = testdir.path().join("src");
+        let dst = testdir.path().join("dst");
+
+        fs::write(&src, b"content").unwrap();
+
+        rename_noreplace(src.as_std_path(), dst.as_std_path()).unwrap();
+        assert_eq!(
+            "content",
+            String::from_utf8(std::fs::read(&dst).unwrap()).unwrap()
+        );
+    }
+}
diff --git a/libs/utils/src/hex.rs b/libs/utils/src/hex.rs
index fc0bb7e4a2..382f805a96 100644
--- a/libs/utils/src/hex.rs
+++ b/libs/utils/src/hex.rs
@@ -19,13 +19,13 @@
 /// // right: [0x68; 1]
 /// # fn serialize_something() -> Vec<u8> { "hello world".as_bytes().to_vec() }
 /// ```
-#[derive(PartialEq)]
-pub struct Hex<'a>(pub &'a [u8]);
+pub struct Hex<S>(pub S);
 
-impl std::fmt::Debug for Hex<'_> {
+impl<S: AsRef<[u8]>> std::fmt::Debug for Hex<S> {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         write!(f, "[")?;
-        for (i, c) in self.0.chunks(16).enumerate() {
+        let chunks = self.0.as_ref().chunks(16);
+        for (i, c) in chunks.enumerate() {
             if i > 0 && !c.is_empty() {
                 writeln!(f, ", ")?;
             }
@@ -36,6 +36,15 @@ impl std::fmt::Debug for Hex<'_> {
                 write!(f, "0x{b:02x}")?;
             }
         }
-        write!(f, "; {}]", self.0.len())
+        write!(f, "; {}]", self.0.as_ref().len())
+    }
+}
+
+impl<R: AsRef<[u8]>, L: AsRef<[u8]>> PartialEq<Hex<R>> for Hex<L> {
+    fn eq(&self, other: &Hex<R>) -> bool {
+        let left = self.0.as_ref();
+        let right = other.0.as_ref();
+
+        left == right
     }
 }
diff --git a/pageserver/ctl/Cargo.toml b/pageserver/ctl/Cargo.toml
index 843f5dd862..be5626040b 100644
--- a/pageserver/ctl/Cargo.toml
+++ b/pageserver/ctl/Cargo.toml
@@ -17,6 +17,7 @@ pageserver = { path = ".." }
 pageserver_api.workspace = true
 remote_storage = { path = "../../libs/remote_storage" }
 postgres_ffi.workspace = true
+thiserror.workspace = true
 tokio.workspace = true
 tokio-util.workspace = true
 toml_edit.workspace = true
diff --git a/pageserver/ctl/src/index_part.rs b/pageserver/ctl/src/index_part.rs
index 2998b5c732..a33cae6769 100644
--- a/pageserver/ctl/src/index_part.rs
+++ b/pageserver/ctl/src/index_part.rs
@@ -26,7 +26,7 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
 
             let output = Output {
                 layer_metadata: &des.layer_metadata,
-                disk_consistent_lsn: des.get_disk_consistent_lsn(),
+                disk_consistent_lsn: des.metadata.disk_consistent_lsn(),
                 timeline_metadata: &des.metadata,
             };
 
diff --git a/pageserver/ctl/src/key.rs b/pageserver/ctl/src/key.rs
new file mode 100644
index 0000000000..af4b5a21ab
--- /dev/null
+++ b/pageserver/ctl/src/key.rs
@@ -0,0 +1,475 @@
+use anyhow::Context;
+use clap::Parser;
+use pageserver_api::{
+    key::Key,
+    reltag::{BlockNumber, RelTag, SlruKind},
+    shard::{ShardCount, ShardStripeSize},
+};
+use std::str::FromStr;
+
+#[derive(Parser)]
+pub(super) struct DescribeKeyCommand {
+    /// Key material in one of the forms: hex, span attributes captured from log, reltag blocknum
+    input: Vec<String>,
+
+    /// The number of shards to calculate what Keys placement would be.
+    #[arg(long)]
+    shard_count: Option<CustomShardCount>,
+
+    /// The sharding stripe size.
+    ///
+    /// The default is hardcoded. It makes no sense to provide this without providing
+    /// `--shard-count`.
+    #[arg(long, requires = "shard_count")]
+    stripe_size: Option<u32>,
+}
+
+/// A shard count that is always sharded: unlike the actual ShardCount, it cannot hold the unsharded count.
+#[derive(Clone, Copy)]
+pub(super) struct CustomShardCount(std::num::NonZeroU8);
+
+#[derive(Debug, thiserror::Error)]
+pub(super) enum InvalidShardCount {
+    #[error(transparent)]
+    ParsingFailed(#[from] std::num::ParseIntError),
+    #[error("too few shards")]
+    TooFewShards,
+}
+
+impl FromStr for CustomShardCount {
+    type Err = InvalidShardCount;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        let inner: std::num::NonZeroU8 = s.parse()?;
+        if inner.get() < 2 {
+            Err(InvalidShardCount::TooFewShards)
+        } else {
+            Ok(CustomShardCount(inner))
+        }
+    }
+}
+
+impl From<CustomShardCount> for ShardCount {
+    fn from(value: CustomShardCount) -> Self {
+        ShardCount::new(value.0.get())
+    }
+}
+
+impl DescribeKeyCommand {
+    pub(super) fn execute(self) {
+        let DescribeKeyCommand {
+            input,
+            shard_count,
+            stripe_size,
+        } = self;
+
+        let material = KeyMaterial::try_from(input.as_slice()).unwrap();
+        let kind = material.kind();
+        let key = Key::from(material);
+
+        println!("parsed from {kind}: {key}:");
+        println!();
+        println!("{key:?}");
+
+        macro_rules! kind_query {
+            ([$($name:ident),*$(,)?]) => {{[$(kind_query!($name)),*]}};
+            ($name:ident) => {{
+                let s: &'static str = stringify!($name);
+                let s = s.strip_prefix("is_").unwrap_or(s);
+                let s = s.strip_suffix("_key").unwrap_or(s);
+
+                #[allow(clippy::needless_borrow)]
+                (s, key.$name())
+            }};
+        }
+
+        // the current characterization is a mess of these boolean queries and separate
+        // "recognition". I think it accurately represents how strictly we model the Key
+        // right now, but could of course be made less confusing.
+
+        let queries = kind_query!([
+            is_rel_block_key,
+            is_rel_vm_block_key,
+            is_rel_fsm_block_key,
+            is_slru_block_key,
+            is_inherited_key,
+            is_rel_size_key,
+            is_slru_segment_size_key,
+        ]);
+
+        let recognized_kind = "recognized kind";
+        let metadata_key = "metadata key";
+        let shard_placement = "shard placement";
+
+        let longest = queries
+            .iter()
+            .map(|t| t.0)
+            .chain([recognized_kind, metadata_key, shard_placement])
+            .map(|s| s.len())
+            .max()
+            .unwrap();
+
+        let colon = 1;
+        let padding = 1;
+
+        for (name, is) in queries {
+            let width = longest - name.len() + colon + padding;
+            println!("{}{:width$}{}", name, ":", is);
+        }
+
+        let width = longest - recognized_kind.len() + colon + padding;
+        println!(
+            "{}{:width$}{:?}",
+            recognized_kind,
+            ":",
+            RecognizedKeyKind::new(key),
+        );
+
+        if let Some(shard_count) = shard_count {
+            // seeing the sharding placement might be confusing, so leave it out unless shard
+            // count was given.
+
+            let stripe_size = stripe_size.map(ShardStripeSize).unwrap_or_default();
+            println!(
+                "# placement with shard_count: {} and stripe_size: {}:",
+                shard_count.0, stripe_size.0
+            );
+            let width = longest - shard_placement.len() + colon + padding;
+            println!(
+                "{}{:width$}{:?}",
+                shard_placement,
+                ":",
+                pageserver_api::shard::describe(&key, shard_count.into(), stripe_size)
+            );
+        }
+    }
+}
+
+/// Hand-wavy "inputs we accept" for a key.
+#[derive(Debug)]
+pub(super) enum KeyMaterial {
+    Hex(Key),
+    String(SpanAttributesFromLogs),
+    Split(RelTag, BlockNumber),
+}
+
+impl KeyMaterial {
+    fn kind(&self) -> &'static str {
+        match self {
+            KeyMaterial::Hex(_) => "hex",
+            KeyMaterial::String(_) | KeyMaterial::Split(_, _) => "split",
+        }
+    }
+}
+
+impl From<KeyMaterial> for Key {
+    fn from(value: KeyMaterial) -> Self {
+        match value {
+            KeyMaterial::Hex(key) => key,
+            KeyMaterial::String(SpanAttributesFromLogs(rt, blocknum))
+            | KeyMaterial::Split(rt, blocknum) => {
+                pageserver_api::key::rel_block_to_key(rt, blocknum)
+            }
+        }
+    }
+}
+
+impl<S: AsRef<str>> TryFrom<&[S]> for KeyMaterial {
+    type Error = anyhow::Error;
+
+    fn try_from(value: &[S]) -> Result<Self, Self::Error> {
+        match value {
+            [] => anyhow::bail!(
+                "need 1..N positional arguments describing the key, try hex or a log line"
+            ),
+            [one] => {
+                let one = one.as_ref();
+
+                let key = Key::from_hex(one).map(KeyMaterial::Hex);
+
+                let attrs = SpanAttributesFromLogs::from_str(one).map(KeyMaterial::String);
+
+                match (key, attrs) {
+                    (Ok(key), _) => Ok(key),
+                    (_, Ok(s)) => Ok(s),
+                    (Err(e1), Err(e2)) => anyhow::bail!(
+                        "failed to parse {one:?} as hex or span attributes:\n- {e1:#}\n- {e2:#}"
+                    ),
+                }
+            }
+            more => {
+                // assume that, going left to right, one of these is a reltag and then we find a
+                // blocknum. this works, because we don't have plain numbers at least right after
+                // reltag in logs. for some definition of "works".
+
+                let Some((reltag_at, reltag)) = more
+                    .iter()
+                    .map(AsRef::as_ref)
+                    .enumerate()
+                    .find_map(|(i, s)| {
+                        s.split_once("rel=")
+                            .map(|(_garbage, actual)| actual)
+                            .unwrap_or(s)
+                            .parse::<RelTag>()
+                            .ok()
+                            .map(|rt| (i, rt))
+                    })
+                else {
+                    anyhow::bail!("found no RelTag in arguments");
+                };
+
+                let Some(blocknum) = more
+                    .iter()
+                    .map(AsRef::as_ref)
+                    .skip(reltag_at)
+                    .find_map(|s| {
+                        s.split_once("blkno=")
+                            .map(|(_garbage, actual)| actual)
+                            .unwrap_or(s)
+                            .parse::<BlockNumber>()
+                            .ok()
+                    })
+                else {
+                    anyhow::bail!("found no blocknum in arguments");
+                };
+
+                Ok(KeyMaterial::Split(reltag, blocknum))
+            }
+        }
+    }
+}
+
+#[derive(Debug)]
+pub(super) struct SpanAttributesFromLogs(RelTag, BlockNumber);
+
+impl std::str::FromStr for SpanAttributesFromLogs {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        // accept the span separator but do not require or fail if either is missing
+        // "whatever{rel=1663/16389/24615 blkno=1052204 req_lsn=FFFFFFFF/FFFFFFFF}"
+        let (_, reltag) = s
+            .split_once("rel=")
+            .ok_or_else(|| anyhow::anyhow!("cannot find 'rel='"))?;
+        let reltag = reltag.split_whitespace().next().context("empty rel=")?;
+        // the value ends at the first whitespace; nothing after the `=` is an error, not a panic
+        let (_, blocknum) = s
+            .split_once("blkno=")
+            .ok_or_else(|| anyhow::anyhow!("cannot find 'blkno='"))?;
+        let blocknum = blocknum.split_whitespace().next().context("empty blkno=")?;
+
+        let reltag = reltag
+            .parse()
+            .with_context(|| format!("parse reltag from {reltag:?}"))?;
+        let blocknum = blocknum
+            .parse()
+            .with_context(|| format!("parse blocknum from {blocknum:?}"))?;
+
+        Ok(Self(reltag, blocknum))
+    }
+}
+
+#[derive(Debug)]
+#[allow(dead_code)] // debug print is used
+enum RecognizedKeyKind {
+    DbDir,
+    ControlFile,
+    Checkpoint,
+    AuxFilesV1,
+    SlruDir(Result<SlruKind, u32>),
+    RelMap(RelTagish<2>),
+    RelDir(RelTagish<2>),
+    AuxFileV2(Result<AuxFileV2, utils::Hex<[u8; 16]>>),
+}
+
+#[derive(Debug, PartialEq)]
+#[allow(unused)]
+enum AuxFileV2 {
+    Recognized(&'static str, utils::Hex<[u8; 13]>),
+    OtherWithPrefix(&'static str, utils::Hex<[u8; 13]>),
+    Other(utils::Hex<[u8; 13]>),
+}
+
+impl RecognizedKeyKind {
+    fn new(key: Key) -> Option<Self> {
+        use RecognizedKeyKind::{
+            AuxFilesV1, Checkpoint, ControlFile, DbDir, RelDir, RelMap, SlruDir,
+        };
+
+        let slru_dir_kind = pageserver_api::key::slru_dir_kind(&key);
+
+        Some(match key {
+            pageserver_api::key::DBDIR_KEY => DbDir,
+            pageserver_api::key::CONTROLFILE_KEY => ControlFile,
+            pageserver_api::key::CHECKPOINT_KEY => Checkpoint,
+            pageserver_api::key::AUX_FILES_KEY => AuxFilesV1,
+            _ if slru_dir_kind.is_some() => SlruDir(slru_dir_kind.unwrap()),
+            _ if key.field1 == 0 && key.field4 == 0 && key.field5 == 0 && key.field6 == 0 => {
+                RelMap([key.field2, key.field3].into())
+            }
+            _ if key.field1 == 0 && key.field4 == 0 && key.field5 == 0 && key.field6 == 1 => {
+                RelDir([key.field2, key.field3].into())
+            }
+            _ if key.is_metadata_key() => RecognizedKeyKind::AuxFileV2(
+                AuxFileV2::new(key).ok_or_else(|| utils::Hex(key.to_i128().to_be_bytes())),
+            ),
+            _ => return None,
+        })
+    }
+}
+
+impl AuxFileV2 {
+    fn new(key: Key) -> Option<AuxFileV2> {
+        const EMPTY_HASH: [u8; 13] = {
+            let mut out = [0u8; 13];
+            let hash = pageserver::aux_file::fnv_hash(b"").to_be_bytes();
+            let mut i = 3;
+            while i < 16 {
+                out[i - 3] = hash[i];
+                i += 1;
+            }
+            out
+        };
+
+        let bytes = key.to_i128().to_be_bytes();
+        let hash = utils::Hex(<[u8; 13]>::try_from(&bytes[3..]).unwrap());
+
+        assert_eq!(EMPTY_HASH.len(), hash.0.len());
+
+        // TODO: we could probably find the preimages for the hashes
+
+        Some(match (bytes[1], bytes[2]) {
+            (1, 1) => AuxFileV2::Recognized("pg_logical/mappings/", hash),
+            (1, 2) => AuxFileV2::Recognized("pg_logical/snapshots/", hash),
+            (1, 3) if hash.0 == EMPTY_HASH => {
+                AuxFileV2::Recognized("pg_logical/replorigin_checkpoint", hash)
+            }
+            (2, 1) => AuxFileV2::Recognized("pg_replslot/", hash),
+            (1, 0xff) => AuxFileV2::OtherWithPrefix("pg_logical/", hash),
+            (0xff, 0xff) => AuxFileV2::Other(hash),
+            _ => return None,
+        })
+    }
+}
+
+/// Prefix of a RelTag; currently the only known use cases are the two-item versions.
+///
+/// The `Debug` output is the items joined with `/`, like a reltag, and nothing else.
+struct RelTagish<const N: usize>([u32; N]);
+
+impl<const N: usize> From<[u32; N]> for RelTagish<N> {
+    fn from(val: [u32; N]) -> Self {
+        RelTagish(val)
+    }
+}
+
+impl<const N: usize> std::fmt::Debug for RelTagish<N> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        use std::fmt::Write as _;
+        let mut first = true;
+        self.0.iter().try_for_each(|x| {
+            if !first {
+                f.write_char('/')?;
+            }
+            first = false;
+            write!(f, "{}", x)
+        })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use pageserver::aux_file::encode_aux_file_key;
+
+    use super::*;
+
+    #[test]
+    fn hex_is_key_material() {
+        let m = KeyMaterial::try_from(&["000000067F0000400200DF927900FFFFFFFF"][..]).unwrap();
+        assert!(matches!(m, KeyMaterial::Hex(_)), "{m:?}");
+    }
+
+    #[test]
+    fn single_positional_spanalike_is_key_material() {
+        // why is this needed? if you are checking many, then copypaste starts to appeal
+        let strings = [
+            (line!(), "2024-05-15T15:33:49.873906Z ERROR page_service_conn_main{peer_addr=A:B}:process_query{tenant_id=C timeline_id=D}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1663/208101/2620_fsm blkno=2 req_lsn=0/238D98C8}: error reading relation or page version: Read error: could not find data for key 000000067F00032CE5000000000000000001 (shard ShardNumber(0)) at LSN 0/1D0A16C1, request LSN 0/238D98C8, ancestor 0/0"),
+            (line!(), "rel=1663/208101/2620_fsm blkno=2"),
+            (line!(), "rel=1663/208101/2620.1 blkno=2"),
+        ];
+
+        let mut first: Option<Key> = None;
+
+        for (line, example) in strings {
+            let m = KeyMaterial::try_from(&[example][..])
+                .unwrap_or_else(|e| panic!("failed to parse example from line {line}: {e:?}"));
+            let key = Key::from(m);
+            if let Some(first) = first {
+                assert_eq!(first, key);
+            } else {
+                first = Some(key);
+            }
+        }
+
+        // not supporting this is rather accidental, but I think the input parsing is lenient
+        // enough already
+        KeyMaterial::try_from(&["1663/208101/2620_fsm 2"][..]).unwrap_err();
+    }
+
+    #[test]
+    fn multiple_spanlike_args() {
+        let strings = [
+            (line!(), &["process_query{tenant_id=C", "timeline_id=D}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1663/208101/2620_fsm", "blkno=2", "req_lsn=0/238D98C8}"][..]),
+            (line!(), &["rel=1663/208101/2620_fsm", "blkno=2"][..]),
+            (line!(), &["1663/208101/2620_fsm", "2"][..]),
+        ];
+
+        let mut first: Option<Key> = None;
+
+        for (line, example) in strings {
+            let m = KeyMaterial::try_from(example)
+                .unwrap_or_else(|e| panic!("failed to parse example from line {line}: {e:?}"));
+            let key = Key::from(m);
+            if let Some(first) = first {
+                assert_eq!(first, key);
+            } else {
+                first = Some(key);
+            }
+        }
+    }
+    #[test]
+    fn recognized_auxfiles() {
+        use AuxFileV2::*;
+
+        let empty = [
+            0x2e, 0x07, 0xbb, 0x01, 0x42, 0x62, 0xb8, 0x21, 0x75, 0x62, 0x95, 0xc5, 0x8d,
+        ];
+        let foobar = [
+            0x62, 0x79, 0x3c, 0x64, 0xbf, 0x6f, 0x0d, 0x35, 0x97, 0xba, 0x44, 0x6f, 0x18,
+        ];
+
+        #[rustfmt::skip]
+        let examples = [
+            (line!(), "pg_logical/mappings/foobar", Recognized("pg_logical/mappings/", utils::Hex(foobar))),
+            (line!(), "pg_logical/snapshots/foobar", Recognized("pg_logical/snapshots/", utils::Hex(foobar))),
+            (line!(), "pg_logical/replorigin_checkpoint", Recognized("pg_logical/replorigin_checkpoint", utils::Hex(empty))),
+            (line!(), "pg_logical/foobar", OtherWithPrefix("pg_logical/", utils::Hex(foobar))),
+            (line!(), "pg_replslot/foobar", Recognized("pg_replslot/", utils::Hex(foobar))),
+            (line!(), "foobar", Other(utils::Hex(foobar))),
+        ];
+
+        for (line, path, expected) in examples {
+            let key = encode_aux_file_key(path);
+            let recognized =
+                AuxFileV2::new(key).unwrap_or_else(|| panic!("line {line} example failed"));
+
+            assert_eq!(recognized, expected);
+        }
+
+        assert_eq!(
+            AuxFileV2::new(Key::from_hex("600000102000000000000000000000000000").unwrap()),
+            None,
+            "example key has one too few 0 after 6 before 1"
+        );
+    }
+}
diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs
index e92c352dab..50c3ac4c61 100644
--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -6,6 +6,7 @@
 
 mod draw_timeline_dir;
 mod index_part;
+mod key;
 mod layer_map_analyzer;
 mod layers;
 
@@ -61,6 +62,8 @@ enum Commands {
     AnalyzeLayerMap(AnalyzeLayerMapCmd),
     #[command(subcommand)]
     Layer(LayerCmd),
+    /// Debug print a hex key found from logs
+    Key(key::DescribeKeyCommand),
 }
 
 /// Read and update pageserver metadata file
@@ -183,6 +186,7 @@ async fn main() -> anyhow::Result<()> {
                 .time_travel_recover(Some(&prefix), timestamp, done_if_after, &cancel)
                 .await?;
         }
+        Commands::Key(dkc) => dkc.execute(),
     };
     Ok(())
 }
diff --git a/pageserver/pagebench/src/cmd/aux_files.rs b/pageserver/pagebench/src/cmd/aux_files.rs
index eb5b242a5f..bce3285606 100644
--- a/pageserver/pagebench/src/cmd/aux_files.rs
+++ b/pageserver/pagebench/src/cmd/aux_files.rs
@@ -5,6 +5,7 @@ use utils::lsn::Lsn;
 
 use std::collections::HashMap;
 use std::sync::Arc;
+use std::time::Instant;
 
 /// Ingest aux files into the pageserver.
 #[derive(clap::Parser)]
@@ -88,11 +89,17 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
         println!("ingested {file_cnt} files");
     }
 
-    let files = mgmt_api_client
-        .list_aux_files(tenant_shard_id, timeline_id, Lsn(Lsn::MAX.0 - 1))
-        .await?;
-
-    println!("{} files found", files.len());
+    for _ in 0..100 {
+        let start = Instant::now();
+        let files = mgmt_api_client
+            .list_aux_files(tenant_shard_id, timeline_id, Lsn(Lsn::MAX.0 - 1))
+            .await?;
+        println!(
+            "{} files found in {}s",
+            files.len(),
+            start.elapsed().as_secs_f64()
+        );
+    }
 
     anyhow::Ok(())
 }
diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
index 5043a207fc..4992f37465 100644
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -1,6 +1,6 @@
 use anyhow::Context;
 use camino::Utf8PathBuf;
-use pageserver_api::key::{is_rel_block_key, key_to_rel_block, Key};
+use pageserver_api::key::Key;
 use pageserver_api::keyspace::KeySpaceAccum;
 use pageserver_api::models::PagestreamGetPageRequest;
 
@@ -187,7 +187,7 @@ async fn main_impl(
                     for r in partitioning.keys.ranges.iter() {
                         let mut i = r.start;
                         while i != r.end {
-                            if is_rel_block_key(&i) {
+                            if i.is_rel_block_key() {
                                 filtered.add_key(i);
                             }
                             i = i.next();
@@ -308,9 +308,10 @@ async fn main_impl(
                     let r = &ranges[weights.sample(&mut rng)];
                     let key: i128 = rng.gen_range(r.start..r.end);
                     let key = Key::from_i128(key);
-                    assert!(is_rel_block_key(&key));
-                    let (rel_tag, block_no) =
-                        key_to_rel_block(key).expect("we filter non-rel-block keys out above");
+                    assert!(key.is_rel_block_key());
+                    let (rel_tag, block_no) = key
+                        .to_rel_block()
+                        .expect("we filter non-rel-block keys out above");
                     PagestreamGetPageRequest {
                         request_lsn: if rng.gen_bool(args.req_latest_probability) {
                             Lsn::MAX
diff --git a/pageserver/src/aux_file.rs b/pageserver/src/aux_file.rs
index 38e1875db1..5e527b7d61 100644
--- a/pageserver/src/aux_file.rs
+++ b/pageserver/src/aux_file.rs
@@ -178,7 +178,8 @@ impl AuxFileSizeEstimator {
         }
     }
 
-    pub fn on_base_backup(&self, new_size: usize) {
+    /// When generating base backup or doing initial logical size calculation
+    pub fn on_initial(&self, new_size: usize) {
         let mut guard = self.size.lock().unwrap();
         *guard = Some(new_size as isize);
         self.report(new_size as isize);
diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs
index dca1510810..0f057a4368 100644
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -13,7 +13,7 @@
 use anyhow::{anyhow, Context};
 use bytes::{BufMut, Bytes, BytesMut};
 use fail::fail_point;
-use pageserver_api::key::{key_to_slru_block, Key};
+use pageserver_api::key::Key;
 use postgres_ffi::pg_constants;
 use std::fmt::Write as FmtWrite;
 use std::time::SystemTime;
@@ -170,7 +170,7 @@ where
     }
 
     async fn add_block(&mut self, key: &Key, block: Bytes) -> Result<(), BasebackupError> {
-        let (kind, segno, _) = key_to_slru_block(*key)?;
+        let (kind, segno, _) = key.to_slru_block()?;
 
         match kind {
             SlruKind::Clog => {
@@ -362,6 +362,13 @@ where
                     ));
                     info!("Replication slot {} restart LSN={}", path, restart_lsn);
                     min_restart_lsn = Lsn::min(min_restart_lsn, restart_lsn);
+                } else if path == "pg_logical/replorigin_checkpoint" {
+                    // replorigin_checkpoint is written only on compute shutdown, so it contains
+                    // stale values. Instead, we generate our own version of this file for the particular LSN
+                    // based on information about replorigins extracted from transaction commit records.
+                    // In the future we will not generate an AUX record for "pg_logical/replorigin_checkpoint"
+                    // at all, but for now we must handle (skip) it for backward compatibility.
+                    continue;
                 }
                 let header = new_tar_header(&path, content.len() as u64)?;
                 self.ar
@@ -390,6 +397,32 @@ where
         {
             self.add_twophase_file(xid).await?;
         }
+        let repl_origins = self
+            .timeline
+            .get_replorigins(self.lsn, self.ctx)
+            .await
+            .map_err(|e| BasebackupError::Server(e.into()))?;
+        let n_origins = repl_origins.len();
+        if n_origins != 0 {
+            //
+            // Construct "pg_logical/replorigin_checkpoint" file based on information about replication origins
+            // extracted from transaction commit record. We are using this file to pass information about replication
+            // origins to compute to allow logical replication to restart from proper point.
+            //
+            let mut content = Vec::with_capacity(n_origins * 16 + 8);
+            content.extend_from_slice(&pg_constants::REPLICATION_STATE_MAGIC.to_le_bytes());
+            for (origin_id, origin_lsn) in repl_origins {
+                content.extend_from_slice(&origin_id.to_le_bytes());
+                content.extend_from_slice(&[0u8; 6]); // align to 8 bytes
+                content.extend_from_slice(&origin_lsn.0.to_le_bytes());
+            }
+            let crc32 = crc32c::crc32c(&content);
+            content.extend_from_slice(&crc32.to_le_bytes());
+            let header = new_tar_header("pg_logical/replorigin_checkpoint", content.len() as u64)?;
+            self.ar.append(&header, &*content).await.context(
+                "could not add pg_logical/replorigin_checkpoint file to basebackup tarball",
+            )?;
+        }
 
         fail_point!("basebackup-before-control-file", |_| {
             Err(BasebackupError::Server(anyhow!(
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index b0afb6414b..b4a0d1ac02 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -99,8 +99,6 @@ pub mod defaults {
 
     pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
 
-    pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "async";
-
     ///
     /// Default built-in configuration file.
     ///
@@ -146,8 +144,6 @@ pub mod defaults {
 
 #validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}'
 
-#walredo_process_kind = '{DEFAULT_WALREDO_PROCESS_KIND}'
-
 [tenant_config]
 #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
 #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -300,8 +296,6 @@ pub struct PageServerConf {
     ///
     /// Setting this to zero disables limits on total ephemeral layer size.
     pub ephemeral_bytes_per_memory_kb: usize,
-
-    pub walredo_process_kind: crate::walredo::ProcessKind,
 }
 
 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -407,8 +401,6 @@ struct PageServerConfigBuilder {
     validate_vectored_get: BuilderValue<bool>,
 
     ephemeral_bytes_per_memory_kb: BuilderValue<usize>,
-
-    walredo_process_kind: BuilderValue<crate::walredo::ProcessKind>,
 }
 
 impl PageServerConfigBuilder {
@@ -497,8 +489,6 @@ impl PageServerConfigBuilder {
             )),
             validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
             ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
-
-            walredo_process_kind: Set(DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap()),
         }
     }
 }
@@ -686,10 +676,6 @@ impl PageServerConfigBuilder {
         self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value);
     }
 
-    pub fn get_walredo_process_kind(&mut self, value: crate::walredo::ProcessKind) {
-        self.walredo_process_kind = BuilderValue::Set(value);
-    }
-
     pub fn build(self) -> anyhow::Result<PageServerConf> {
         let default = Self::default_values();
 
@@ -747,7 +733,6 @@ impl PageServerConfigBuilder {
                 max_vectored_read_bytes,
                 validate_vectored_get,
                 ephemeral_bytes_per_memory_kb,
-                walredo_process_kind,
             }
             CUSTOM LOGIC
             {
@@ -1044,9 +1029,6 @@ impl PageServerConf {
                 "ephemeral_bytes_per_memory_kb" => {
                     builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize)
                 }
-                "walredo_process_kind" => {
-                    builder.get_walredo_process_kind(parse_toml_from_str("walredo_process_kind", item)?)
-                }
                 _ => bail!("unrecognized pageserver option '{key}'"),
             }
         }
@@ -1130,7 +1112,6 @@ impl PageServerConf {
             ),
             validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
             ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
-            walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
         }
     }
 }
@@ -1370,7 +1351,6 @@ background_task_maximum_delay = '334 s'
                 ),
                 validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
                 ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
-                walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
             },
             "Correct defaults should be used when no config values are provided"
         );
@@ -1444,7 +1424,6 @@ background_task_maximum_delay = '334 s'
                 ),
                 validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
                 ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
-                walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
             },
             "Should be able to parse all basic config values correctly"
         );
diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs
index 62bbde42f4..540d0d2e8c 100644
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -358,7 +358,7 @@ async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &Re
     // mean the synthetic size worker should terminate.
     let shutting_down = matches!(
         e.downcast_ref::<PageReconstructError>(),
-        Some(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_))
+        Some(PageReconstructError::Cancelled)
     );
 
     if !shutting_down {
diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs
index 8790a9b0a8..3960fc1b99 100644
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -311,7 +311,7 @@ impl DeletionList {
                 result.extend(
                     timeline_layers
                         .into_iter()
-                        .map(|l| timeline_remote_path.join(&Utf8PathBuf::from(l))),
+                        .map(|l| timeline_remote_path.join(Utf8PathBuf::from(l))),
                 );
             }
         }
diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index e5eafc51f4..71b486a4d3 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -81,8 +81,10 @@ paths:
         Attempts to delete specified tenant. 500, 503 and 409 errors should be retried until 404 is retrieved.
         404 means that deletion successfully finished"
       responses:
+        "200":
+          description: Tenant was successfully deleted, or was already not found.
         "404":
-          description: Tenant not found. This is the success path.
+          description: Tenant not found. This is a success result, equivalent to 200.
           content:
             application/json:
               schema:
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 8a061f3ae1..7fa6c35ad6 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -74,6 +74,7 @@ use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
 use crate::tenant::storage_layer::LayerName;
 use crate::tenant::timeline::CompactFlags;
+use crate::tenant::timeline::CompactionError;
 use crate::tenant::timeline::Timeline;
 use crate::tenant::GetTimelineError;
 use crate::tenant::SpawnMode;
@@ -183,9 +184,6 @@ impl From<PageReconstructError> for ApiError {
             PageReconstructError::Cancelled => {
                 ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
             }
-            PageReconstructError::AncestorStopping(_) => {
-                ApiError::ResourceUnavailable(format!("{pre}").into())
-            }
             PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()),
             PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre),
         }
@@ -1075,7 +1073,7 @@ async fn tenant_delete_handler(
 
     let state = get_state(&request);
 
-    state
+    let status = state
         .tenant_manager
         .delete_tenant(tenant_shard_id, ACTIVE_TENANT_TIMEOUT)
         .instrument(info_span!("tenant_delete_handler",
@@ -1084,7 +1082,14 @@ async fn tenant_delete_handler(
         ))
         .await?;
 
-    json_response(StatusCode::ACCEPTED, ())
+    // Callers use 404 as success for deletions, for historical reasons.
+    if status == StatusCode::NOT_FOUND {
+        return Err(ApiError::NotFound(
+            anyhow::anyhow!("Deletion complete").into(),
+        ));
+    }
+
+    json_response(status, ())
 }
 
 /// HTTP endpoint to query the current tenant_size of a tenant.
@@ -1813,11 +1818,22 @@ async fn timeline_checkpoint_handler(
         timeline
             .freeze_and_flush()
             .await
-            .map_err(ApiError::InternalServerError)?;
+            .map_err(|e| {
+                match e {
+                    tenant::timeline::FlushLayerError::Cancelled => ApiError::ShuttingDown,
+                    other => ApiError::InternalServerError(other.into()),
+
+                }
+            })?;
         timeline
             .compact(&cancel, flags, &ctx)
             .await
-            .map_err(|e| ApiError::InternalServerError(e.into()))?;
+            .map_err(|e|
+                match e {
+                    CompactionError::ShuttingDown => ApiError::ShuttingDown,
+                    CompactionError::Other(e) => ApiError::InternalServerError(e)
+                }
+            )?;
 
         if wait_until_uploaded {
             timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?;
@@ -2173,7 +2189,7 @@ async fn tenant_scan_remote_handler(
             {
                 Ok((index_part, index_generation)) => {
                     tracing::info!("Found timeline {tenant_shard_id}/{timeline_id} metadata (gen {index_generation:?}, {} layers, {} consistent LSN)",
-                        index_part.layer_metadata.len(), index_part.get_disk_consistent_lsn());
+                        index_part.layer_metadata.len(), index_part.metadata.disk_consistent_lsn());
                     generation = std::cmp::max(generation, index_generation);
                 }
                 Err(DownloadError::NotFound) => {
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index e9651165b1..ae389826d5 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -66,6 +66,7 @@ use crate::tenant::mgr::GetTenantError;
 use crate::tenant::mgr::ShardResolveResult;
 use crate::tenant::mgr::ShardSelector;
 use crate::tenant::mgr::TenantManager;
+use crate::tenant::timeline::FlushLayerError;
 use crate::tenant::timeline::WaitLsnError;
 use crate::tenant::GetTimelineError;
 use crate::tenant::PageReconstructError;
@@ -372,7 +373,7 @@ impl From<WaitLsnError> for PageStreamError {
         match value {
             e @ WaitLsnError::Timeout(_) => Self::LsnTimeout(e),
             WaitLsnError::Shutdown => Self::Shutdown,
-            WaitLsnError::BadState => Self::Reconnect("Timeline is not active".into()),
+            e @ WaitLsnError::BadState { .. } => Self::Reconnect(format!("{e}").into()),
         }
     }
 }
@@ -382,7 +383,7 @@ impl From<WaitLsnError> for QueryError {
         match value {
             e @ WaitLsnError::Timeout(_) => Self::Other(anyhow::Error::new(e)),
             WaitLsnError::Shutdown => Self::Shutdown,
-            WaitLsnError::BadState => Self::Reconnect,
+            WaitLsnError::BadState { .. } => Self::Reconnect,
         }
     }
 }
@@ -830,7 +831,10 @@ impl PageServerHandler {
         // We only want to persist the data, and it doesn't matter if it's in the
         // shape of deltas or images.
         info!("flushing layers");
-        timeline.freeze_and_flush().await?;
+        timeline.freeze_and_flush().await.map_err(|e| match e {
+            FlushLayerError::Cancelled => QueryError::Shutdown,
+            other => QueryError::Other(other.into()),
+        })?;
 
         info!("done");
         Ok(())
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index afba34c6d1..0bff4be150 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -17,8 +17,8 @@ use bytes::{Buf, Bytes, BytesMut};
 use enum_map::Enum;
 use itertools::Itertools;
 use pageserver_api::key::{
-    dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key,
-    rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key,
+    dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key,
+    relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
     slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
     AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
 };
@@ -27,7 +27,7 @@ use pageserver_api::models::AuxFilePolicy;
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::BLCKSZ;
-use postgres_ffi::{Oid, TimestampTz, TransactionId};
+use postgres_ffi::{Oid, RepOriginId, TimestampTz, TransactionId};
 use serde::{Deserialize, Serialize};
 use std::collections::{hash_map, HashMap, HashSet};
 use std::ops::ControlFlow;
@@ -78,11 +78,19 @@ pub enum LsnForTimestamp {
 }
 
 #[derive(Debug, thiserror::Error)]
-pub enum CalculateLogicalSizeError {
+pub(crate) enum CalculateLogicalSizeError {
     #[error("cancelled")]
     Cancelled,
+
+    /// Something went wrong while reading the metadata we use to calculate logical size.
+    /// Note that `PageReconstructError::Cancelled` never ends up here: the
+    /// `From<PageReconstructError>` implementation maps it to [`Self::Cancelled`] instead.
     #[error(transparent)]
-    Other(#[from] anyhow::Error),
+    PageRead(PageReconstructError),
+
+    /// Something went wrong deserializing metadata that we read to calculate logical size
+    #[error("decode error: {0}")]
+    Decode(#[from] DeserializeError),
 }
 
 #[derive(Debug, thiserror::Error)]
@@ -107,10 +115,8 @@ impl From<PageReconstructError> for CollectKeySpaceError {
 impl From<PageReconstructError> for CalculateLogicalSizeError {
     fn from(pre: PageReconstructError) -> Self {
         match pre {
-            PageReconstructError::AncestorStopping(_) | PageReconstructError::Cancelled => {
-                Self::Cancelled
-            }
-            _ => Self::Other(pre.into()),
+            PageReconstructError::Cancelled => Self::Cancelled,
+            _ => Self::PageRead(pre),
         }
     }
 }
@@ -712,10 +718,22 @@ impl Timeline {
                 result.insert(fname, content);
             }
         }
-        self.aux_file_size_estimator.on_base_backup(sz);
+        self.aux_file_size_estimator.on_initial(sz);
         Ok(result)
     }
 
+    pub(crate) async fn trigger_aux_file_size_computation(
+        &self,
+        lsn: Lsn,
+        ctx: &RequestContext,
+    ) -> Result<(), PageReconstructError> {
+        let current_policy = self.last_aux_file_policy.load();
+        if matches!(current_policy, Some(AuxFilePolicy::V2 | AuxFilePolicy::CrossValidation)) {
+            self.list_aux_files_v2(lsn, ctx).await?; // result discarded; called for its side effects
+        }
+        Ok(())
+    }
+
     pub(crate) async fn list_aux_files(
         &self,
         lsn: Lsn,
@@ -754,6 +772,27 @@ impl Timeline {
         }
     }
 
+    pub(crate) async fn get_replorigins(
+        &self,
+        lsn: Lsn,
+        ctx: &RequestContext,
+    ) -> Result<HashMap<RepOriginId, Lsn>, PageReconstructError> {
+        let kv = self
+            .scan(KeySpace::single(repl_origin_key_range()), lsn, ctx)
+            .await
+            .context("scan")?;
+        let mut result = HashMap::new();
+        for (k, v) in kv {
+            let v = v.context("get value")?;
+            let origin_id = k.field6 as RepOriginId; // origin id is read from the key's field6
+            let origin_lsn = Lsn::des(&v).context("deserialize replorigin lsn")?;
+            if origin_lsn != Lsn::INVALID {
+                result.insert(origin_id, origin_lsn); // INVALID entries (dropped origins) are skipped
+            }
+        }
+        Ok(result)
+    }
+
     /// Does the same as get_current_logical_size but counted on demand.
     /// Used to initialize the logical size tracking on startup.
     ///
@@ -763,7 +802,7 @@ impl Timeline {
     /// # Cancel-Safety
     ///
     /// This method is cancellation-safe.
-    pub async fn get_current_logical_size_non_incremental(
+    pub(crate) async fn get_current_logical_size_non_incremental(
         &self,
         lsn: Lsn,
         ctx: &RequestContext,
@@ -772,7 +811,7 @@ impl Timeline {
 
         // Fetch list of database dirs and iterate them
         let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
-        let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?;
+        let dbdir = DbDirectory::des(&buf)?;
 
         let mut total_size: u64 = 0;
         for (spcnode, dbnode) in dbdir.dbdirs.keys() {
@@ -879,7 +918,9 @@ impl Timeline {
         Ok((
             result.to_keyspace(),
             /* AUX sparse key space */
-            SparseKeySpace(KeySpace::single(Key::metadata_aux_key_range())),
+            SparseKeySpace(KeySpace {
+                ranges: vec![repl_origin_key_range(), Key::metadata_aux_key_range()],
+            }),
         ))
     }
 
@@ -1148,6 +1189,20 @@ impl<'a> DatadirModification<'a> {
         Ok(())
     }
 
+    pub async fn set_replorigin(
+        &mut self,
+        origin_id: RepOriginId,
+        origin_lsn: Lsn,
+    ) -> anyhow::Result<()> {
+        let key = repl_origin_key(origin_id);
+        self.put(key, Value::Image(origin_lsn.ser()?.into())); // propagate, don't panic
+        Ok(())
+    }
+
+    pub async fn drop_replorigin(&mut self, origin_id: RepOriginId) -> anyhow::Result<()> {
+        self.set_replorigin(origin_id, Lsn::INVALID).await // INVALID is filtered out by get_replorigins
+    }
+
     pub fn put_control_file(&mut self, img: Bytes) -> anyhow::Result<()> {
         self.put(CONTROLFILE_KEY, Value::Image(img));
         Ok(())
@@ -1552,7 +1607,7 @@ impl<'a> DatadirModification<'a> {
                     self.tline.aux_file_size_estimator.on_add(content.len());
                     new_files.push((path, content));
                 }
-                (None, true) => anyhow::bail!("removing non-existing aux file: {}", path),
+                (None, true) => warn!("removing non-existing aux file: {}", path),
             }
             let new_val = aux_file::encode_file_value(&new_files)?;
             self.put(key, Value::Image(new_val.into()));
@@ -1606,8 +1661,7 @@ impl<'a> DatadirModification<'a> {
                         aux_files.dir = Some(dir);
                     }
                     Err(
-                        e @ (PageReconstructError::AncestorStopping(_)
-                        | PageReconstructError::Cancelled
+                        e @ (PageReconstructError::Cancelled
                         | PageReconstructError::AncestorLsnTimeout(_)),
                     ) => {
                         // Important that we do not interpret a shutdown error as "not found" and thereby
@@ -1679,7 +1733,7 @@ impl<'a> DatadirModification<'a> {
         let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
         for (key, values) in self.pending_updates.drain() {
             for (lsn, value) in values {
-                if is_rel_block_key(&key) || is_slru_block_key(key) {
+                if key.is_rel_block_key() || key.is_slru_block_key() {
                     // This bails out on first error without modifying pending_updates.
                     // That's Ok, cf this function's doc comment.
                     writer.put(key, lsn, &value, ctx).await?;
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index e6bfd57a44..60cd5c9695 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -487,6 +487,33 @@ enum CreateTimelineCause {
     Delete,
 }
 
+#[derive(thiserror::Error, Debug)]
+pub(crate) enum GcError {
+    /// The tenant is shutting down
+    #[error("tenant shutting down")]
+    TenantCancelled,
+
+    /// The timeline is shutting down
+    #[error("timeline shutting down")]
+    TimelineCancelled,
+
+    /// The tenant is in a state ineligible to run GC
+    #[error("not active")]
+    NotActive,
+
+    /// A requested GC cutoff LSN was invalid, for example it tried to move backwards
+    #[error("bad lsn: {why}")]
+    BadLsn { why: String },
+
+    /// A remote storage error while scheduling updates after compaction
+    #[error(transparent)]
+    Remote(anyhow::Error),
+
+    /// If GC was invoked for a particular timeline, this error means it didn't exist
+    #[error("timeline not found")]
+    TimelineNotFound,
+}
+
 impl Tenant {
     /// Yet another helper for timeline initialization.
     ///
@@ -1393,6 +1420,36 @@ impl Tenant {
         Ok(tl)
     }
 
+    /// Test helper: create a test timeline pre-loaded with the given delta and image layers.
+    #[cfg(test)]
+    #[allow(clippy::too_many_arguments)]
+    pub async fn create_test_timeline_with_layers(
+        &self,
+        new_timeline_id: TimelineId,
+        initdb_lsn: Lsn,
+        pg_version: u32,
+        ctx: &RequestContext,
+        delta_layer_desc: Vec<Vec<(pageserver_api::key::Key, Lsn, crate::repository::Value)>>,
+        image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
+        end_lsn: Lsn,
+    ) -> anyhow::Result<Arc<Timeline>> {
+        let tline = self
+            .create_test_timeline(new_timeline_id, initdb_lsn, pg_version, ctx)
+            .await?;
+        tline.force_advance_lsn(end_lsn);
+        for deltas in delta_layer_desc {
+            tline
+                .force_create_delta_layer(deltas, Some(initdb_lsn), ctx)
+                .await?;
+        }
+        for (lsn, images) in image_layer_desc {
+            tline
+                .force_create_image_layer(lsn, images, Some(initdb_lsn), ctx)
+                .await?;
+        }
+        Ok(tline)
+    }
+
     /// Create a new timeline.
     ///
     /// Returns the new timeline ID and reference to its Timeline object.
@@ -1507,7 +1564,7 @@ impl Tenant {
                         .wait_lsn(*lsn, timeline::WaitLsnWaiter::Tenant, ctx)
                         .await
                         .map_err(|e| match e {
-                            e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState) => {
+                            e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState { .. }) => {
                                 CreateTimelineError::AncestorLsn(anyhow::anyhow!(e))
                             }
                             WaitLsnError::Shutdown => CreateTimelineError::ShuttingDown,
@@ -1575,24 +1632,23 @@ impl Tenant {
     /// GC cutoff point is determined conservatively by either `horizon` and `pitr`, whichever
     /// requires more history to be retained.
     //
-    pub async fn gc_iteration(
+    pub(crate) async fn gc_iteration(
         &self,
         target_timeline_id: Option<TimelineId>,
         horizon: u64,
         pitr: Duration,
         cancel: &CancellationToken,
         ctx: &RequestContext,
-    ) -> anyhow::Result<GcResult> {
+    ) -> Result<GcResult, GcError> {
         // Don't start doing work during shutdown
         if let TenantState::Stopping { .. } = self.current_state() {
             return Ok(GcResult::default());
         }
 
         // there is a global allowed_error for this
-        anyhow::ensure!(
-            self.is_active(),
-            "Cannot run GC iteration on inactive tenant"
-        );
+        if !self.is_active() {
+            return Err(GcError::NotActive);
+        }
 
         {
             let conf = self.tenant_conf.load();
@@ -2760,28 +2816,13 @@ impl Tenant {
         pitr: Duration,
         cancel: &CancellationToken,
         ctx: &RequestContext,
-    ) -> anyhow::Result<GcResult> {
+    ) -> Result<GcResult, GcError> {
         let mut totals: GcResult = Default::default();
         let now = Instant::now();
 
-        let gc_timelines = match self
+        let gc_timelines = self
             .refresh_gc_info_internal(target_timeline_id, horizon, pitr, cancel, ctx)
-            .await
-        {
-            Ok(result) => result,
-            Err(e) => {
-                if let Some(PageReconstructError::Cancelled) =
-                    e.downcast_ref::<PageReconstructError>()
-                {
-                    // Handle cancellation
-                    totals.elapsed = now.elapsed();
-                    return Ok(totals);
-                } else {
-                    // Propagate other errors
-                    return Err(e);
-                }
-            }
-        };
+            .await?;
 
         failpoint_support::sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines");
 
@@ -2806,7 +2847,19 @@ impl Tenant {
                 // made.
                 break;
             }
-            let result = timeline.gc().await?;
+            let result = match timeline.gc().await {
+                Err(GcError::TimelineCancelled) => {
+                    if target_timeline_id.is_some() {
+                        // If we were targeting this specific timeline, surface cancellation to caller
+                        return Err(GcError::TimelineCancelled);
+                    } else {
+                        // A timeline may be shutting down independently of the tenant's lifecycle: we should
+                        // skip past this and proceed to try GC on other timelines.
+                        continue;
+                    }
+                }
+                r => r?,
+            };
             totals += result;
         }
 
@@ -2819,11 +2872,11 @@ impl Tenant {
     /// [`Tenant::get_gc_horizon`].
     ///
     /// This is usually executed as part of periodic gc, but can now be triggered more often.
-    pub async fn refresh_gc_info(
+    pub(crate) async fn refresh_gc_info(
         &self,
         cancel: &CancellationToken,
         ctx: &RequestContext,
-    ) -> anyhow::Result<Vec<Arc<Timeline>>> {
+    ) -> Result<Vec<Arc<Timeline>>, GcError> {
         // since this method can now be called at different rates than the configured gc loop, it
         // might be that these configuration values get applied faster than what it was previously,
         // since these were only read from the gc task.
@@ -2844,7 +2897,7 @@ impl Tenant {
         pitr: Duration,
         cancel: &CancellationToken,
         ctx: &RequestContext,
-    ) -> anyhow::Result<Vec<Arc<Timeline>>> {
+    ) -> Result<Vec<Arc<Timeline>>, GcError> {
         // before taking the gc_cs lock, do the heavier weight finding of gc_cutoff points for
         // currently visible timelines.
         let timelines = self
@@ -2881,8 +2934,8 @@ impl Tenant {
             }
         }
 
-        if !self.is_active() {
-            anyhow::bail!("shutting down");
+        if !self.is_active() || self.cancel.is_cancelled() {
+            return Err(GcError::TenantCancelled);
         }
 
         // grab mutex to prevent new timelines from being created here; avoid doing long operations
@@ -2891,19 +2944,19 @@ impl Tenant {
 
         // Scan all timelines. For each timeline, remember the timeline ID and
         // the branch point where it was created.
-        let (all_branchpoints, timeline_ids): (BTreeSet<(TimelineId, Lsn)>, _) = {
+        let (all_branchpoints, timelines): (BTreeSet<(TimelineId, Lsn)>, _) = {
             let timelines = self.timelines.lock().unwrap();
             let mut all_branchpoints = BTreeSet::new();
-            let timeline_ids = {
+            let timelines = {
                 if let Some(target_timeline_id) = target_timeline_id.as_ref() {
                     if timelines.get(target_timeline_id).is_none() {
-                        bail!("gc target timeline does not exist")
+                        return Err(GcError::TimelineNotFound);
                     }
                 };
 
                 timelines
                     .iter()
-                    .map(|(timeline_id, timeline_entry)| {
+                    .map(|(_timeline_id, timeline_entry)| {
                         if let Some(ancestor_timeline_id) =
                             &timeline_entry.get_ancestor_timeline_id()
                         {
@@ -2925,33 +2978,28 @@ impl Tenant {
                             }
                         }
 
-                        *timeline_id
+                        timeline_entry.clone()
                     })
                     .collect::<Vec<_>>()
             };
-            (all_branchpoints, timeline_ids)
+            (all_branchpoints, timelines)
         };
 
         // Ok, we now know all the branch points.
         // Update the GC information for each timeline.
-        let mut gc_timelines = Vec::with_capacity(timeline_ids.len());
-        for timeline_id in timeline_ids {
-            // Timeline is known to be local and loaded.
-            let timeline = self
-                .get_timeline(timeline_id, false)
-                .with_context(|| format!("Timeline {timeline_id} was not found"))?;
-
+        let mut gc_timelines = Vec::with_capacity(timelines.len());
+        for timeline in timelines {
             // If target_timeline is specified, ignore all other timelines
             if let Some(target_timeline_id) = target_timeline_id {
-                if timeline_id != target_timeline_id {
+                if timeline.timeline_id != target_timeline_id {
                     continue;
                 }
             }
 
             let branchpoints: Vec<Lsn> = all_branchpoints
                 .range((
-                    Included((timeline_id, Lsn(0))),
-                    Included((timeline_id, Lsn(u64::MAX))),
+                    Included((timeline.timeline_id, Lsn(0))),
+                    Included((timeline.timeline_id, Lsn(u64::MAX))),
                 ))
                 .map(|&x| x.1)
                 .collect();
@@ -2959,7 +3007,7 @@ impl Tenant {
             {
                 let mut target = timeline.gc_info.write().unwrap();
 
-                match gc_cutoffs.remove(&timeline_id) {
+                match gc_cutoffs.remove(&timeline.timeline_id) {
                     Some(cutoffs) => {
                         *target = GcInfo {
                             retain_lsns: branchpoints,
@@ -2992,17 +3040,53 @@ impl Tenant {
         &self,
         src_timeline: &Arc<Timeline>,
         dst_id: TimelineId,
-        start_lsn: Option<Lsn>,
+        ancestor_lsn: Option<Lsn>,
         ctx: &RequestContext,
     ) -> Result<Arc<Timeline>, CreateTimelineError> {
         let create_guard = self.create_timeline_create_guard(dst_id).unwrap();
         let tl = self
-            .branch_timeline_impl(src_timeline, dst_id, start_lsn, create_guard, ctx)
+            .branch_timeline_impl(src_timeline, dst_id, ancestor_lsn, create_guard, ctx)
             .await?;
         tl.set_state(TimelineState::Active);
         Ok(tl)
     }
 
+    /// Test-only helper: branches `src_timeline` into `dst_id` and pre-populates the new branch.
+    ///
+    /// The branch is advanced to `end_lsn`, then one delta layer is force-created per entry of
+    /// `delta_layer_desc` and one image layer per entry of `image_layer_desc`.
+    #[cfg(test)]
+    #[allow(clippy::too_many_arguments)]
+    pub async fn branch_timeline_test_with_layers(
+        &self,
+        src_timeline: &Arc<Timeline>,
+        dst_id: TimelineId,
+        ancestor_lsn: Option<Lsn>,
+        ctx: &RequestContext,
+        delta_layer_desc: Vec<Vec<(pageserver_api::key::Key, Lsn, crate::repository::Value)>>,
+        image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
+        end_lsn: Lsn,
+    ) -> anyhow::Result<Arc<Timeline>> {
+        let child = self
+            .branch_timeline_test(src_timeline, dst_id, ancestor_lsn, ctx)
+            .await?;
+        // Without an explicit branch point, fall back to the new branch's last record LSN.
+        let base_lsn = ancestor_lsn.unwrap_or_else(|| child.get_last_record_lsn());
+        assert!(end_lsn >= base_lsn);
+        child.force_advance_lsn(end_lsn);
+        for deltas in delta_layer_desc {
+            child
+                .force_create_delta_layer(deltas, Some(base_lsn), ctx)
+                .await?;
+        }
+        for (lsn, images) in image_layer_desc {
+            child
+                .force_create_image_layer(lsn, images, Some(base_lsn), ctx)
+                .await?;
+        }
+        Ok(child)
+    }
+
     /// Branch an existing timeline.
     ///
     /// The caller is responsible for activating the returned timeline.
@@ -3781,6 +3865,9 @@ pub(crate) mod harness {
         pub fn create_custom(
             test_name: &'static str,
             tenant_conf: TenantConf,
+            tenant_id: TenantId,
+            shard_identity: ShardIdentity,
+            generation: Generation,
         ) -> anyhow::Result<Self> {
             setup_logging();
 
@@ -3793,8 +3880,12 @@ pub(crate) mod harness {
             // OK in a test.
             let conf: &'static PageServerConf = Box::leak(Box::new(conf));
 
-            let tenant_id = TenantId::generate();
-            let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+            let shard = shard_identity.shard_index();
+            let tenant_shard_id = TenantShardId {
+                tenant_id,
+                shard_number: shard.shard_number,
+                shard_count: shard.shard_count,
+            };
             fs::create_dir_all(conf.tenant_path(&tenant_shard_id))?;
             fs::create_dir_all(conf.timelines_path(&tenant_shard_id))?;
 
@@ -3812,8 +3903,8 @@ pub(crate) mod harness {
                 conf,
                 tenant_conf,
                 tenant_shard_id,
-                generation: Generation::new(0xdeadbeef),
-                shard: ShardIndex::unsharded(),
+                generation,
+                shard,
                 remote_storage,
                 remote_fs_dir,
                 deletion_queue,
@@ -3828,8 +3919,15 @@ pub(crate) mod harness {
                 compaction_period: Duration::ZERO,
                 ..TenantConf::default()
             };
-
-            Self::create_custom(test_name, tenant_conf)
+            let tenant_id = TenantId::generate();
+            let shard = ShardIdentity::unsharded();
+            Self::create_custom(
+                test_name,
+                tenant_conf,
+                tenant_id,
+                shard,
+                Generation::new(0xdeadbeef),
+            )
         }
 
         pub fn span(&self) -> tracing::Span {
@@ -3908,8 +4006,8 @@ pub(crate) mod harness {
                 let base_img = base_img.expect("Neon WAL redo requires base image").1;
                 let mut page = BytesMut::new();
                 page.extend_from_slice(&base_img);
-                for (_record_lsn, record) in records {
-                    apply_neon::apply_in_neon(&record, key, &mut page)?;
+                for (record_lsn, record) in records {
+                    apply_neon::apply_in_neon(&record, record_lsn, key, &mut page)?;
                 }
                 Ok(page.freeze())
             } else {
@@ -3953,6 +4051,7 @@ mod tests {
     use tests::storage_layer::ValuesReconstructState;
     use tests::timeline::{GetVectoredError, ShutdownMode};
     use utils::bin_ser::BeSer;
+    use utils::id::TenantId;
 
     static TEST_KEY: Lazy<Key> =
         Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));
@@ -4154,7 +4253,7 @@ mod tests {
                 .await?;
             writer.finish_write(lsn);
         }
-        tline.freeze_and_flush().await
+        tline.freeze_and_flush().await.map_err(|e| e.into())
     }
 
     #[tokio::test]
@@ -4308,9 +4407,10 @@ mod tests {
 
         // This needs to traverse to the parent, and fails.
         let err = newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await.unwrap_err();
-        assert!(err
-            .to_string()
-            .contains("will not become active. Current state: Broken"));
+        assert!(err.to_string().starts_with(&format!(
+            "Bad state on timeline {}: Broken",
+            tline.timeline_id
+        )));
 
         Ok(())
     }
@@ -4851,7 +4951,13 @@ mod tests {
             ..TenantConf::default()
         };
 
-        let harness = TenantHarness::create_custom("test_get_vectored_key_gap", tenant_conf)?;
+        let harness = TenantHarness::create_custom(
+            "test_get_vectored_key_gap",
+            tenant_conf,
+            TenantId::generate(),
+            ShardIdentity::unsharded(),
+            Generation::new(0xdeadbeef),
+        )?;
         let (tenant, ctx) = harness.load().await;
 
         let mut current_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
@@ -6205,75 +6311,36 @@ mod tests {
     async fn test_vectored_missing_data_key_reads() -> anyhow::Result<()> {
         let harness = TenantHarness::create("test_vectored_missing_data_key_reads")?;
         let (tenant, ctx) = harness.load().await;
-        let tline = tenant
-            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
-            .await?;
-
-        let cancel = CancellationToken::new();
 
         let base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
         let base_key_child = Key::from_hex("000000000033333333444444445500000001").unwrap();
         let base_key_nonexist = Key::from_hex("000000000033333333444444445500000002").unwrap();
 
-        let mut lsn = Lsn(0x20);
-
-        {
-            let mut writer = tline.writer().await;
-            writer
-                .put(base_key, lsn, &Value::Image(test_img("data key 1")), &ctx)
-                .await?;
-            writer.finish_write(lsn);
-            drop(writer);
-
-            tline.freeze_and_flush().await?; // this will create a image layer
-        }
+        let tline = tenant
+            .create_test_timeline_with_layers(
+                TIMELINE_ID,
+                Lsn(0x10),
+                DEFAULT_PG_VERSION,
+                &ctx,
+                Vec::new(), // delta layers
+                vec![(Lsn(0x20), vec![(base_key, test_img("data key 1"))])], // image layers
+                Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN
+            )
+            .await?;
 
         let child = tenant
-            .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx)
+            .branch_timeline_test_with_layers(
+                &tline,
+                NEW_TIMELINE_ID,
+                Some(Lsn(0x20)),
+                &ctx,
+                Vec::new(), // delta layers
+                vec![(Lsn(0x30), vec![(base_key_child, test_img("data key 2"))])], // image layers
+                Lsn(0x30),
+            )
             .await
             .unwrap();
 
-        lsn.0 += 0x10;
-
-        {
-            let mut writer = child.writer().await;
-            writer
-                .put(
-                    base_key_child,
-                    lsn,
-                    &Value::Image(test_img("data key 2")),
-                    &ctx,
-                )
-                .await?;
-            writer.finish_write(lsn);
-            drop(writer);
-
-            child.freeze_and_flush().await?; // this will create a delta
-
-            {
-                // update the partitioning to include the test key space, otherwise they
-                // will be dropped by image layer creation
-                let mut guard = child.partitioning.lock().await;
-                let ((partitioning, _), partition_lsn) = &mut *guard;
-                partitioning
-                    .parts
-                    .push(KeySpace::single(base_key..base_key_nonexist)); // exclude the nonexist key
-                *partition_lsn = lsn;
-            }
-
-            child
-                .compact(
-                    &cancel,
-                    {
-                        let mut set = EnumSet::empty();
-                        set.insert(CompactFlags::ForceImageLayerCreation);
-                        set
-                    },
-                    &ctx,
-                )
-                .await?; // force create an image layer for the keys, TODO: check if the image layer is created
-        }
-
         async fn get_vectored_impl_wrapper(
             tline: &Arc<Timeline>,
             key: Key,
@@ -6295,6 +6362,8 @@ mod tests {
             }))
         }
 
+        let lsn = Lsn(0x30);
+
         // test vectored get on parent timeline
         assert_eq!(
             get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?,
@@ -6332,94 +6401,42 @@ mod tests {
 
     #[tokio::test]
     async fn test_vectored_missing_metadata_key_reads() -> anyhow::Result<()> {
-        let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads")?;
+        let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads")?;
         let (tenant, ctx) = harness.load().await;
+
+        let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap();
+        let base_key_child = Key::from_hex("620000000033333333444444445500000001").unwrap();
+        let base_key_nonexist = Key::from_hex("620000000033333333444444445500000002").unwrap();
+        assert_eq!(base_key.field1, AUX_KEY_PREFIX); // in case someone accidentally changed the prefix...
+
         let tline = tenant
-            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
+            .create_test_timeline_with_layers(
+                TIMELINE_ID,
+                Lsn(0x10),
+                DEFAULT_PG_VERSION,
+                &ctx,
+                Vec::new(), // delta layers
+                vec![(Lsn(0x20), vec![(base_key, test_img("metadata key 1"))])], // image layers
+                Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN
+            )
             .await?;
 
-        let cancel = CancellationToken::new();
-
-        let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
-        let mut base_key_child = Key::from_hex("000000000033333333444444445500000001").unwrap();
-        let mut base_key_nonexist = Key::from_hex("000000000033333333444444445500000002").unwrap();
-        base_key.field1 = AUX_KEY_PREFIX;
-        base_key_child.field1 = AUX_KEY_PREFIX;
-        base_key_nonexist.field1 = AUX_KEY_PREFIX;
-
-        let mut lsn = Lsn(0x20);
-
-        {
-            let mut writer = tline.writer().await;
-            writer
-                .put(
-                    base_key,
-                    lsn,
-                    &Value::Image(test_img("metadata key 1")),
-                    &ctx,
-                )
-                .await?;
-            writer.finish_write(lsn);
-            drop(writer);
-
-            tline.freeze_and_flush().await?; // this will create an image layer
-
-            tline
-                .compact(
-                    &cancel,
-                    {
-                        let mut set = EnumSet::empty();
-                        set.insert(CompactFlags::ForceImageLayerCreation);
-                        set.insert(CompactFlags::ForceRepartition);
-                        set
-                    },
-                    &ctx,
-                )
-                .await?; // force create an image layer for metadata keys
-            tenant
-                .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
-                .await?;
-        }
-
         let child = tenant
-            .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx)
+            .branch_timeline_test_with_layers(
+                &tline,
+                NEW_TIMELINE_ID,
+                Some(Lsn(0x20)),
+                &ctx,
+                Vec::new(), // delta layers
+                vec![(
+                    Lsn(0x30),
+                    vec![(base_key_child, test_img("metadata key 2"))],
+                )], // image layers
+                Lsn(0x30),
+            )
             .await
             .unwrap();
 
-        lsn.0 += 0x10;
-
-        {
-            let mut writer = child.writer().await;
-            writer
-                .put(
-                    base_key_child,
-                    lsn,
-                    &Value::Image(test_img("metadata key 2")),
-                    &ctx,
-                )
-                .await?;
-            writer.finish_write(lsn);
-            drop(writer);
-
-            child.freeze_and_flush().await?;
-
-            child
-                .compact(
-                    &cancel,
-                    {
-                        let mut set = EnumSet::empty();
-                        set.insert(CompactFlags::ForceImageLayerCreation);
-                        set.insert(CompactFlags::ForceRepartition);
-                        set
-                    },
-                    &ctx,
-                )
-                .await?; // force create an image layer for metadata keys
-            tenant
-                .gc_iteration(Some(child.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
-                .await?;
-        }
-
         async fn get_vectored_impl_wrapper(
             tline: &Arc<Timeline>,
             key: Key,
@@ -6441,6 +6458,8 @@ mod tests {
             }))
         }
 
+        let lsn = Lsn(0x30);
+
         // test vectored get on parent timeline
         assert_eq!(
             get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?,
@@ -6471,4 +6490,208 @@ mod tests {
 
         Ok(())
     }
+
+    /// Test helper: reads the single key `key` at `lsn` via `Timeline::get_vectored_impl`.
+    /// Returns `None` when the read yields no entry; the entry's value is unwrapped (may panic).
+    async fn get_vectored_impl_wrapper(
+        tline: &Arc<Timeline>,
+        key: Key,
+        lsn: Lsn,
+        ctx: &RequestContext,
+    ) -> Result<Option<Bytes>, GetVectoredError> {
+        let keyspace = KeySpace::single(key..key.next());
+        let mut state = ValuesReconstructState::new();
+        let mut found = tline
+            .get_vectored_impl(keyspace, lsn, &mut state, ctx)
+            .await?;
+        // Only one key was requested, so any returned entry must be for that key.
+        let Some((found_key, value)) = found.pop_last() else {
+            return Ok(None);
+        };
+        assert_eq!(found_key, key);
+        Ok(Some(value.unwrap()))
+    }
+
+    #[tokio::test]
+    async fn test_metadata_tombstone_reads() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_metadata_tombstone_reads")?;
+        let (tenant, ctx) = harness.load().await;
+        let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap();
+        let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap();
+        let key2 = Key::from_hex("620000000033333333444444445500000002").unwrap();
+        let key3 = Key::from_hex("620000000033333333444444445500000003").unwrap();
+
+        // We emulate the situation that the compaction algorithm creates an image layer that removes the tombstones
+        // Lsn 0x30 key0, key3, no key1+key2
+        // Lsn 0x20 key1+key2 tombstones
+        // Lsn 0x10 key1 in image, key2 in delta
+        let tline = tenant
+            .create_test_timeline_with_layers(
+                TIMELINE_ID,
+                Lsn(0x10),
+                DEFAULT_PG_VERSION,
+                &ctx,
+                // delta layers
+                vec![
+                    vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
+                    vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
+                    vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
+                ],
+                // image layers
+                vec![
+                    (Lsn(0x10), vec![(key1, test_img("metadata key 1"))]),
+                    (
+                        Lsn(0x30),
+                        vec![
+                            (key0, test_img("metadata key 0")),
+                            (key3, test_img("metadata key 3")),
+                        ],
+                    ),
+                ],
+                Lsn(0x30),
+            )
+            .await?;
+
+        let lsn = Lsn(0x30);
+        let old_lsn = Lsn(0x20);
+
+        assert_eq!(
+            get_vectored_impl_wrapper(&tline, key0, lsn, &ctx).await?,
+            Some(test_img("metadata key 0"))
+        );
+        assert_eq!(
+            get_vectored_impl_wrapper(&tline, key1, lsn, &ctx).await?,
+            None,
+        );
+        assert_eq!(
+            get_vectored_impl_wrapper(&tline, key2, lsn, &ctx).await?,
+            None,
+        );
+        assert_eq!(
+            get_vectored_impl_wrapper(&tline, key1, old_lsn, &ctx).await?,
+            Some(Bytes::new()),
+        );
+        assert_eq!(
+            get_vectored_impl_wrapper(&tline, key2, old_lsn, &ctx).await?,
+            Some(Bytes::new()),
+        );
+        assert_eq!(
+            get_vectored_impl_wrapper(&tline, key3, lsn, &ctx).await?,
+            Some(test_img("metadata key 3"))
+        );
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_metadata_tombstone_image_creation() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_metadata_tombstone_image_creation")?;
+        let (tenant, ctx) = harness.load().await;
+
+        let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap();
+        let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap();
+        let key2 = Key::from_hex("620000000033333333444444445500000002").unwrap();
+        let key3 = Key::from_hex("620000000033333333444444445500000003").unwrap();
+
+        let tline = tenant
+            .create_test_timeline_with_layers(
+                TIMELINE_ID,
+                Lsn(0x10),
+                DEFAULT_PG_VERSION,
+                &ctx,
+                // delta layers
+                vec![
+                    vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
+                    vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
+                    vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
+                    vec![
+                        (key0, Lsn(0x30), Value::Image(test_img("metadata key 0"))),
+                        (key3, Lsn(0x30), Value::Image(test_img("metadata key 3"))),
+                    ],
+                ],
+                // image layers
+                vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])],
+                Lsn(0x30),
+            )
+            .await?;
+
+        let cancel = CancellationToken::new();
+
+        tline
+            .compact(
+                &cancel,
+                {
+                    let mut flags = EnumSet::new();
+                    flags.insert(CompactFlags::ForceImageLayerCreation);
+                    flags.insert(CompactFlags::ForceRepartition);
+                    flags
+                },
+                &ctx,
+            )
+            .await?;
+
+        // Image layers are created at last_record_lsn
+        let images = tline
+            .inspect_image_layers(Lsn(0x30), &ctx)
+            .await?
+            .into_iter()
+            .filter(|(k, _)| k.is_metadata_key())
+            .collect::<Vec<_>>();
+        assert_eq!(images.len(), 2); // the image layer should only contain two existing keys, tombstones should be removed.
+
+        Ok(())
+    }
+
+    /// With every metadata key tombstoned, forced image-layer creation during compaction must
+    /// leave no metadata-key images at LSN 0x30: either the tombstones are dropped from the image
+    /// layer or no image layer is created at all.
+    #[tokio::test]
+    async fn test_metadata_tombstone_empty_image_creation() -> anyhow::Result<()> {
+        // Each test passes its own name to the harness. NOTE(review): presumably the name selects
+        // per-test on-disk state, so sharing one between tests could make them interfere - confirm.
+        let harness = TenantHarness::create("test_metadata_tombstone_empty_image_creation")?;
+        let (tenant, ctx) = harness.load().await;
+
+        let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap();
+        let key2 = Key::from_hex("620000000033333333444444445500000002").unwrap();
+
+        // key1 (image @0x10) and key2 (delta @0x10) both get an empty-image tombstone at 0x20.
+        let tline = tenant
+            .create_test_timeline_with_layers(
+                TIMELINE_ID,
+                Lsn(0x10),
+                DEFAULT_PG_VERSION,
+                &ctx,
+                // delta layers
+                vec![
+                    vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
+                    vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
+                    vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
+                ],
+                // image layers
+                vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])],
+                Lsn(0x30),
+            )
+            .await?;
+
+        let cancel = CancellationToken::new();
+
+        // Request both forced image-layer creation and forced repartitioning.
+        let mut flags = EnumSet::new();
+        flags.insert(CompactFlags::ForceImageLayerCreation);
+        flags.insert(CompactFlags::ForceRepartition);
+        tline.compact(&cancel, flags, &ctx).await?;
+
+        // Image layers are created at last_record_lsn
+        let images = tline
+            .inspect_image_layers(Lsn(0x30), &ctx)
+            .await?
+            .into_iter()
+            .filter(|(k, _)| k.is_metadata_key())
+            .collect::<Vec<_>>();
+        // the image layer should not contain tombstones, or it is not created
+        assert_eq!(images.len(), 0);
+
+        Ok(())
+    }
 }
diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs
index 7c6640eaac..8b36aa15e5 100644
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -16,6 +16,7 @@ use crate::{
     task_mgr::{self, TaskKind},
     tenant::{
         mgr::{TenantSlot, TenantsMapRemoveResult},
+        remote_timeline_client::remote_heatmap_path,
         timeline::ShutdownMode,
     },
 };
@@ -531,6 +532,25 @@ impl DeleteTenantFlow {
             }
         }
 
+        // Remove top-level tenant objects that don't belong to a timeline, such as heatmap
+        let heatmap_path = remote_heatmap_path(&tenant.tenant_shard_id());
+        if let Some(Err(e)) = backoff::retry(
+            || async {
+                remote_storage
+                    .delete(&heatmap_path, &task_mgr::shutdown_token())
+                    .await
+            },
+            TimeoutOrCancel::caused_by_cancel,
+            FAILED_UPLOAD_WARN_THRESHOLD,
+            FAILED_REMOTE_OP_RETRIES,
+            "remove_remote_tenant_heatmap",
+            &task_mgr::shutdown_token(),
+        )
+        .await
+        {
+            tracing::warn!("Failed to delete heatmap at {heatmap_path}: {e}");
+        }
+
         let timelines_path = conf.timelines_path(&tenant.tenant_shard_id);
         // May not exist if we fail in cleanup_remaining_fs_traces after removing it
         if timelines_path.exists() {
diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs
index fc71ea7642..c00672895a 100644
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -267,7 +267,7 @@ impl<'de> Deserialize<'de> for TimelineMetadata {
         D: serde::Deserializer<'de>,
     {
         let bytes = Vec::<u8>::deserialize(deserializer)?;
-        Self::from_bytes(bytes.as_slice()).map_err(|e| D::Error::custom(format!("{e}")))
+        Self::from_bytes(bytes.as_slice()).map_err(D::Error::custom)
     }
 }
 
@@ -276,13 +276,163 @@ impl Serialize for TimelineMetadata {
     where
         S: Serializer,
     {
-        let bytes = self
-            .to_bytes()
-            .map_err(|e| serde::ser::Error::custom(format!("{e}")))?;
+        let bytes = self.to_bytes().map_err(serde::ser::Error::custom)?;
         bytes.serialize(serializer)
     }
 }
 
+pub(crate) mod modern_serde {
+    use crate::tenant::metadata::METADATA_FORMAT_VERSION;
+
+    use super::{
+        TimelineMetadata, TimelineMetadataBodyV2, TimelineMetadataHeader, METADATA_HDR_SIZE,
+    };
+    use serde::{Deserialize, Serialize};
+
+    pub(crate) fn deserialize<'de, D>(deserializer: D) -> Result<TimelineMetadata, D::Error>
+    where
+        D: serde::de::Deserializer<'de>,
+    {
+        // for legacy reasons versions 1-5 had TimelineMetadata serialized as a Vec<u8> field with
+        // BeSer.
+        struct Visitor;
+
+        impl<'d> serde::de::Visitor<'d> for Visitor {
+            type Value = TimelineMetadata;
+
+            fn expecting(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+                f.write_str("BeSer bytes or json structure")
+            }
+
+            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
+            where
+                A: serde::de::SeqAccess<'d>,
+            {
+                use serde::de::Error;
+                let byte_seq = serde::de::value::SeqAccessDeserializer::new(seq);
+                let raw = Vec::<u8>::deserialize(byte_seq)?;
+                TimelineMetadata::from_bytes(&raw).map_err(A::Error::custom)
+            }
+
+            fn visit_map<A>(self, map: A) -> Result<Self::Value, A::Error>
+            where
+                A: serde::de::MapAccess<'d>,
+            {
+                use serde::de::Error;
+
+                let de = serde::de::value::MapAccessDeserializer::new(map);
+                let body = TimelineMetadataBodyV2::deserialize(de)?;
+
+                // jump through hoops to calculate the crc32 so that TimelineMetadata::ne works
+                // across serialization versions
+                let mut sink = Crc32Sink::default();
+                <TimelineMetadataBodyV2 as utils::bin_ser::BeSer>::ser_into(&body, &mut sink)
+                    .map_err(|e| A::Error::custom(Crc32CalculationFailed(e)))?;
+
+                let size = METADATA_HDR_SIZE + sink.count;
+
+                Ok(TimelineMetadata {
+                    hdr: TimelineMetadataHeader {
+                        checksum: sink.crc,
+                        size: size as u16,
+                        format_version: METADATA_FORMAT_VERSION,
+                    },
+                    body,
+                })
+            }
+        }
+
+        deserializer.deserialize_any(Visitor)
+    }
+
+    #[derive(Default)]
+    struct Crc32Sink {
+        crc: u32,
+        count: usize,
+    }
+
+    impl std::io::Write for Crc32Sink {
+        fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
+            self.crc = crc32c::crc32c_append(self.crc, buf);
+            self.count += buf.len();
+            Ok(buf.len())
+        }
+
+        fn flush(&mut self) -> std::io::Result<()> {
+            Ok(())
+        }
+    }
+
+    #[derive(thiserror::Error)]
+    #[error("re-serializing for crc32 failed")]
+    struct Crc32CalculationFailed<E>(#[source] E);
+
+    // this should be true for one release, after that we can change it to false
+    // remember to check the IndexPart::metadata field TODO comment as well
+    const LEGACY_BINCODED_BYTES: bool = true;
+
+    #[derive(serde::Serialize)]
+    #[serde(transparent)]
+    struct LegacyPaddedBytes<'a>(&'a TimelineMetadata);
+
+    struct JustTheBodyV2<'a>(&'a TimelineMetadata);
+
+    impl serde::Serialize for JustTheBodyV2<'_> {
+        fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+        where
+            S: serde::Serializer,
+        {
+            // header is not needed, upon reading we've upgraded all v1 to v2
+            self.0.body.serialize(serializer)
+        }
+    }
+
+    pub(crate) fn serialize<S>(
+        metadata: &TimelineMetadata,
+        serializer: S,
+    ) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        // we cannot use TimelineMetadata::serialize for now because it'll do
+        // TimelineMetadata::to_bytes
+        if LEGACY_BINCODED_BYTES {
+            LegacyPaddedBytes(metadata).serialize(serializer)
+        } else {
+            JustTheBodyV2(metadata).serialize(serializer)
+        }
+    }
+
+    #[test]
+    fn deserializes_bytes_as_well_as_equivalent_body_v2() {
+        #[derive(serde::Deserialize, serde::Serialize)]
+        struct Wrapper(#[serde(deserialize_with = "deserialize")] TimelineMetadata);
+
+        let too_many_bytes = "[216,111,252,208,0,54,0,4,0,0,0,0,1,73,253,144,1,0,0,0,0,1,73,253,24,0,0,0,0,0,0,0,0,0,0,0,0,0,1,73,253,24,0,0,0,0,1,73,253,24,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]";
+
+        let wrapper_from_bytes = serde_json::from_str::<Wrapper>(too_many_bytes).unwrap();
+
+        let serialized = serde_json::to_value(JustTheBodyV2(&wrapper_from_bytes.0)).unwrap();
+
+        assert_eq!(
+            serialized,
+            serde_json::json! {{
+                "disk_consistent_lsn": "0/149FD90",
+                "prev_record_lsn": "0/149FD18",
+                "ancestor_timeline": null,
+                "ancestor_lsn": "0/0",
+                "latest_gc_cutoff_lsn": "0/149FD18",
+                "initdb_lsn": "0/149FD18",
+                "pg_version": 15
+            }}
+        );
+
+        let wrapper_from_json = serde_json::value::from_value::<Wrapper>(serialized).unwrap();
+
+        assert_eq!(wrapper_from_bytes.0, wrapper_from_json.0);
+    }
+}
+
 /// Parts of the metadata which are regularly modified.
 pub(crate) struct MetadataUpdate {
     disk_consistent_lsn: Lsn,
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 89fdf31849..4520bb9295 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -3,6 +3,7 @@
 
 use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
 use futures::StreamExt;
+use hyper::StatusCode;
 use itertools::Itertools;
 use pageserver_api::key::Key;
 use pageserver_api::models::LocationConfigMode;
@@ -45,7 +46,7 @@ use crate::tenant::delete::DeleteTenantFlow;
 use crate::tenant::span::debug_assert_current_span_has_tenant_id;
 use crate::tenant::storage_layer::inmemory_layer;
 use crate::tenant::timeline::ShutdownMode;
-use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState};
+use crate::tenant::{AttachedTenantConf, GcError, SpawnMode, Tenant, TenantState};
 use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};
 
 use utils::crashsafe::path_with_suffix_extension;
@@ -54,6 +55,7 @@ use utils::generation::Generation;
 use utils::id::{TenantId, TimelineId};
 
 use super::delete::DeleteTenantError;
+use super::remote_timeline_client::remote_tenant_path;
 use super::secondary::SecondaryTenant;
 use super::timeline::detach_ancestor::PreparedTimelineDetach;
 use super::TenantSharedResources;
@@ -1369,7 +1371,7 @@ impl TenantManager {
         &self,
         tenant_shard_id: TenantShardId,
         activation_timeout: Duration,
-    ) -> Result<(), DeleteTenantError> {
+    ) -> Result<StatusCode, DeleteTenantError> {
         super::span::debug_assert_current_span_has_tenant_id();
         // We acquire a SlotGuard during this function to protect against concurrent
         // changes while the ::prepare phase of DeleteTenantFlow executes, but then
@@ -1382,18 +1384,79 @@ impl TenantManager {
         //
         // See https://github.com/neondatabase/neon/issues/5080
 
-        let slot_guard =
-            tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?;
+        // Tenant deletion can happen two ways:
+        // - Legacy: called on an attached location. The attached Tenant object stays alive in Stopping
+        //   state until deletion is complete.
+        // - New: called on a pageserver without an attached location.  We proceed with deletion from
+        //   remote storage.
+        //
+        // See https://github.com/neondatabase/neon/issues/5080 for more context on this transition.
 
-        // unwrap is safe because we used MustExist mode when acquiring
-        let tenant = match slot_guard.get_old_value().as_ref().unwrap() {
-            TenantSlot::Attached(tenant) => tenant.clone(),
-            _ => {
-                // Express "not attached" as equivalent to "not found"
-                return Err(DeleteTenantError::NotAttached);
+        let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
+        match &slot_guard.old_value {
+            Some(TenantSlot::Attached(tenant)) => {
+                // Legacy deletion flow: the tenant remains attached, goes to Stopping state, and
+                // deletion will be resumed across restarts.
+                let tenant = tenant.clone();
+                return self
+                    .delete_tenant_attached(slot_guard, tenant, activation_timeout)
+                    .await;
             }
+            Some(TenantSlot::Secondary(secondary_tenant)) => {
+                secondary_tenant.shutdown().await;
+                let local_tenant_directory = self.conf.tenant_path(&tenant_shard_id);
+                let tmp_dir = safe_rename_tenant_dir(&local_tenant_directory)
+                    .await
+                    .with_context(|| {
+                        format!("local tenant directory {local_tenant_directory:?} rename")
+                    })?;
+                spawn_background_purge(tmp_dir);
+            }
+            Some(TenantSlot::InProgress(_)) => unreachable!(),
+            None => {}
         };
 
+        // Fall through: local state for this tenant is no longer present, proceed with remote delete
+        let remote_path = remote_tenant_path(&tenant_shard_id);
+        let keys = match self
+            .resources
+            .remote_storage
+            .list(
+                Some(&remote_path),
+                remote_storage::ListingMode::NoDelimiter,
+                None,
+                &self.cancel,
+            )
+            .await
+        {
+            Ok(listing) => listing.keys,
+            Err(remote_storage::DownloadError::Cancelled) => {
+                return Err(DeleteTenantError::Cancelled)
+            }
+            Err(remote_storage::DownloadError::NotFound) => return Ok(StatusCode::NOT_FOUND),
+            Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))),
+        };
+
+        if keys.is_empty() {
+            tracing::info!("Remote storage already deleted");
+        } else {
+            tracing::info!("Deleting {} keys from remote storage", keys.len());
+            self.resources
+                .remote_storage
+                .delete_objects(&keys, &self.cancel)
+                .await?;
+        }
+
+        // Callers use 404 as success for deletions, for historical reasons.
+        Ok(StatusCode::NOT_FOUND)
+    }
+
+    async fn delete_tenant_attached(
+        &self,
+        slot_guard: SlotGuard,
+        tenant: Arc<Tenant>,
+        activation_timeout: Duration,
+    ) -> Result<StatusCode, DeleteTenantError> {
         match tenant.current_state() {
             TenantState::Broken { .. } | TenantState::Stopping { .. } => {
                 // If deletion is already in progress, return success (the semantics of this
@@ -1403,7 +1466,7 @@ impl TenantManager {
                     // The `delete_progress` lock is held: deletion is already happening
                     // in the bacckground
                     slot_guard.revert();
-                    return Ok(());
+                    return Ok(StatusCode::ACCEPTED);
                 }
             }
             _ => {
@@ -1436,7 +1499,8 @@ impl TenantManager {
 
         // The Tenant goes back into the map in Stopping state, it will eventually be removed by DeleteTenantFLow
         slot_guard.revert();
-        result
+        let () = result?;
+        Ok(StatusCode::ACCEPTED)
     }
 
     #[instrument(skip_all, fields(tenant_id=%tenant.get_tenant_shard_id().tenant_id, shard_id=%tenant.get_tenant_shard_id().shard_slug(), new_shard_count=%new_shard_count.literal()))]
@@ -2833,7 +2897,13 @@ pub(crate) async fn immediate_gc(
         }
     }
 
-    result.map_err(ApiError::InternalServerError)
+    result.map_err(|e| match e {
+        GcError::TenantCancelled | GcError::TimelineCancelled => ApiError::ShuttingDown,
+        GcError::TimelineNotFound => {
+            ApiError::NotFound(anyhow::anyhow!("Timeline not found").into())
+        }
+        other => ApiError::InternalServerError(anyhow::anyhow!(other)),
+    })
 }
 
 #[cfg(test)]
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 73438a790f..e33e4b84aa 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -91,8 +91,7 @@
 //!
 //! The *actual* remote state lags behind the *desired* remote state while
 //! there are in-flight operations.
-//! We keep track of the desired remote state in
-//! [`UploadQueueInitialized::latest_files`] and [`UploadQueueInitialized::latest_metadata`].
+//! We keep track of the desired remote state in [`UploadQueueInitialized::dirty`].
 //! It is initialized based on the [`IndexPart`] that was passed during init
 //! and updated with every `schedule_*` function call.
 //! All this is necessary necessary to compute the future [`IndexPart`]s
@@ -115,8 +114,7 @@
 //!
 //! # Completion
 //!
-//! Once an operation has completed, we update
-//! [`UploadQueueInitialized::projected_remote_consistent_lsn`] immediately,
+//! Once an operation has completed, we update [`UploadQueueInitialized::clean`] immediately,
 //! and submit a request through the DeletionQueue to update
 //! [`UploadQueueInitialized::visible_remote_consistent_lsn`] after it has
 //! validated that our generation is not stale.  It is this visible value
@@ -416,6 +414,7 @@ impl RemoteTimelineClient {
         Ok(())
     }
 
+    /// Returns `None` if nothing is yet uploaded, `Some(disk_consistent_lsn)` otherwise.
     pub fn remote_consistent_lsn_projected(&self) -> Option<Lsn> {
         match &mut *self.upload_queue.lock().unwrap() {
             UploadQueue::Uninitialized => None,
@@ -442,13 +441,11 @@ impl RemoteTimelineClient {
     /// Returns true if this timeline was previously detached at this Lsn and the remote timeline
     /// client is currently initialized.
     pub(crate) fn is_previous_ancestor_lsn(&self, lsn: Lsn) -> bool {
-        // technically this is a dirty read, but given how timeline detach ancestor is implemented
-        // via tenant restart, the lineage has always been uploaded.
         self.upload_queue
             .lock()
             .unwrap()
             .initialized_mut()
-            .map(|uq| uq.latest_lineage.is_previous_ancestor_lsn(lsn))
+            .map(|uq| uq.clean.0.lineage.is_previous_ancestor_lsn(lsn))
             .unwrap_or(false)
     }
 
@@ -457,7 +454,6 @@ impl RemoteTimelineClient {
             current_remote_index_part
                 .layer_metadata
                 .values()
-                // If we don't have the file size for the layer, don't account for it in the metric.
                 .map(|ilmd| ilmd.file_size)
                 .sum()
         } else {
@@ -585,9 +581,9 @@ impl RemoteTimelineClient {
 
         // As documented in the struct definition, it's ok for latest_metadata to be
         // ahead of what's _actually_ on the remote during index upload.
-        upload_queue.latest_metadata = metadata.clone();
+        upload_queue.dirty.metadata = metadata.clone();
 
-        self.schedule_index_upload(upload_queue);
+        self.schedule_index_upload(upload_queue)?;
 
         Ok(())
     }
@@ -606,9 +602,9 @@ impl RemoteTimelineClient {
         let mut guard = self.upload_queue.lock().unwrap();
         let upload_queue = guard.initialized_mut()?;
 
-        upload_queue.latest_metadata.apply(update);
+        upload_queue.dirty.metadata.apply(update);
 
-        self.schedule_index_upload(upload_queue);
+        self.schedule_index_upload(upload_queue)?;
 
         Ok(())
     }
@@ -620,8 +616,8 @@ impl RemoteTimelineClient {
     ) -> anyhow::Result<()> {
         let mut guard = self.upload_queue.lock().unwrap();
         let upload_queue = guard.initialized_mut()?;
-        upload_queue.last_aux_file_policy = last_aux_file_policy;
-        self.schedule_index_upload(upload_queue);
+        upload_queue.dirty.last_aux_file_policy = last_aux_file_policy;
+        self.schedule_index_upload(upload_queue)?;
         Ok(())
     }
     ///
@@ -639,30 +635,44 @@ impl RemoteTimelineClient {
         let upload_queue = guard.initialized_mut()?;
 
         if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
-            self.schedule_index_upload(upload_queue);
+            self.schedule_index_upload(upload_queue)?;
         }
 
         Ok(())
     }
 
     /// Launch an index-file upload operation in the background (internal function)
-    fn schedule_index_upload(self: &Arc<Self>, upload_queue: &mut UploadQueueInitialized) {
-        let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
+    fn schedule_index_upload(
+        self: &Arc<Self>,
+        upload_queue: &mut UploadQueueInitialized,
+    ) -> anyhow::Result<()> {
+        let disk_consistent_lsn = upload_queue.dirty.metadata.disk_consistent_lsn();
+        // keep the duplicated top-level `disk_consistent_lsn` in sync with the metadata copy
+        upload_queue.dirty.disk_consistent_lsn = disk_consistent_lsn;
+
+        // Serialize into a sink up front: a failure here is returned to the caller immediately
+        // instead of looking like a retryable error later in perform_upload_task.
+        let void = std::io::sink();
+        serde_json::to_writer(void, &upload_queue.dirty).context("serialize index_part.json")?;
+
+        let index_part = &upload_queue.dirty;
 
         info!(
             "scheduling metadata upload up to consistent LSN {disk_consistent_lsn} with {} files ({} changed)",
-            upload_queue.latest_files.len(),
+            index_part.layer_metadata.len(),
             upload_queue.latest_files_changes_since_metadata_upload_scheduled,
         );
 
-        let index_part = IndexPart::from(&*upload_queue);
-        let op = UploadOp::UploadMetadata(Box::new(index_part), disk_consistent_lsn);
+        let op = UploadOp::UploadMetadata {
+            uploaded: Box::new(index_part.clone()),
+        };
         self.metric_begin(&op);
         upload_queue.queued_operations.push_back(op);
         upload_queue.latest_files_changes_since_metadata_upload_scheduled = 0;
 
         // Launch the task immediately, if possible
         self.launch_queued_tasks(upload_queue);
+        Ok(())
     }
 
     pub(crate) async fn schedule_reparenting_and_wait(
@@ -675,16 +685,16 @@ impl RemoteTimelineClient {
             let mut guard = self.upload_queue.lock().unwrap();
             let upload_queue = guard.initialized_mut()?;
 
-            let Some(prev) = upload_queue.latest_metadata.ancestor_timeline() else {
+            let Some(prev) = upload_queue.dirty.metadata.ancestor_timeline() else {
                 return Err(anyhow::anyhow!(
                     "cannot reparent without a current ancestor"
                 ));
             };
 
-            upload_queue.latest_metadata.reparent(new_parent);
-            upload_queue.latest_lineage.record_previous_ancestor(&prev);
+            upload_queue.dirty.metadata.reparent(new_parent);
+            upload_queue.dirty.lineage.record_previous_ancestor(&prev);
 
-            self.schedule_index_upload(upload_queue);
+            self.schedule_index_upload(upload_queue)?;
 
             self.schedule_barrier0(upload_queue)
         };
@@ -705,16 +715,17 @@ impl RemoteTimelineClient {
             let mut guard = self.upload_queue.lock().unwrap();
             let upload_queue = guard.initialized_mut()?;
 
-            upload_queue.latest_metadata.detach_from_ancestor(&adopted);
-            upload_queue.latest_lineage.record_detaching(&adopted);
+            upload_queue.dirty.metadata.detach_from_ancestor(&adopted);
+            upload_queue.dirty.lineage.record_detaching(&adopted);
 
             for layer in layers {
                 upload_queue
-                    .latest_files
+                    .dirty
+                    .layer_metadata
                     .insert(layer.layer_desc().layer_name(), layer.metadata());
             }
 
-            self.schedule_index_upload(upload_queue);
+            self.schedule_index_upload(upload_queue)?;
 
             let barrier = self.schedule_barrier0(upload_queue);
             self.launch_queued_tasks(upload_queue);
@@ -746,7 +757,8 @@ impl RemoteTimelineClient {
         let metadata = layer.metadata();
 
         upload_queue
-            .latest_files
+            .dirty
+            .layer_metadata
             .insert(layer.layer_desc().layer_name(), metadata.clone());
         upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
 
@@ -776,8 +788,8 @@ impl RemoteTimelineClient {
         let mut guard = self.upload_queue.lock().unwrap();
         let upload_queue = guard.initialized_mut()?;
 
-        let with_metadata =
-            self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names.iter().cloned());
+        let with_metadata = self
+            .schedule_unlinking_of_layers_from_index_part0(upload_queue, names.iter().cloned())?;
 
         self.schedule_deletion_of_unlinked0(upload_queue, with_metadata);
 
@@ -801,7 +813,7 @@ impl RemoteTimelineClient {
 
         let names = gc_layers.iter().map(|x| x.layer_desc().layer_name());
 
-        self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names);
+        self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names)?;
 
         self.launch_queued_tasks(upload_queue);
 
@@ -814,7 +826,7 @@ impl RemoteTimelineClient {
         self: &Arc<Self>,
         upload_queue: &mut UploadQueueInitialized,
         names: I,
-    ) -> Vec<(LayerName, LayerFileMetadata)>
+    ) -> anyhow::Result<Vec<(LayerName, LayerFileMetadata)>>
     where
         I: IntoIterator<Item = LayerName>,
     {
@@ -824,7 +836,7 @@ impl RemoteTimelineClient {
         let with_metadata: Vec<_> = names
             .into_iter()
             .filter_map(|name| {
-                let meta = upload_queue.latest_files.remove(&name);
+                let meta = upload_queue.dirty.layer_metadata.remove(&name);
 
                 if let Some(meta) = meta {
                     upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
@@ -856,10 +868,10 @@ impl RemoteTimelineClient {
         // index_part update, because that needs to be uploaded before we can actually delete the
         // files.
         if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
-            self.schedule_index_upload(upload_queue);
+            self.schedule_index_upload(upload_queue)?;
         }
 
-        with_metadata
+        Ok(with_metadata)
     }
 
     /// Schedules deletion for layer files which have previously been unlinked from the
@@ -950,7 +962,7 @@ impl RemoteTimelineClient {
 
         let names = compacted_from.iter().map(|x| x.layer_desc().layer_name());
 
-        self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names);
+        self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names)?;
         self.launch_queued_tasks(upload_queue);
 
         Ok(())
@@ -1085,7 +1097,7 @@ impl RemoteTimelineClient {
             let deleted_at = Utc::now().naive_utc();
             stopped.deleted_at = SetDeletedFlagProgress::InProgress(deleted_at);
 
-            let mut index_part = IndexPart::from(&stopped.upload_queue_for_deletion);
+            let mut index_part = stopped.upload_queue_for_deletion.dirty.clone();
             index_part.deleted_at = Some(deleted_at);
             index_part
         };
@@ -1296,7 +1308,8 @@ impl RemoteTimelineClient {
 
             stopped
                 .upload_queue_for_deletion
-                .latest_files
+                .dirty
+                .layer_metadata
                 .drain()
                 .map(|(file_name, meta)| {
                     remote_layer_path(
@@ -1433,7 +1446,7 @@ impl RemoteTimelineClient {
                     // Can always be scheduled.
                     true
                 }
-                UploadOp::UploadMetadata(_, _) => {
+                UploadOp::UploadMetadata { .. } => {
                     // These can only be performed after all the preceding operations
                     // have finished.
                     upload_queue.inprogress_tasks.is_empty()
@@ -1475,7 +1488,7 @@ impl RemoteTimelineClient {
                 UploadOp::UploadLayer(_, _) => {
                     upload_queue.num_inprogress_layer_uploads += 1;
                 }
-                UploadOp::UploadMetadata(_, _) => {
+                UploadOp::UploadMetadata { .. } => {
                     upload_queue.num_inprogress_metadata_uploads += 1;
                 }
                 UploadOp::Delete(_) => {
@@ -1584,22 +1597,13 @@ impl RemoteTimelineClient {
                     )
                     .await
                 }
-                UploadOp::UploadMetadata(ref index_part, _lsn) => {
-                    let mention_having_future_layers = if cfg!(feature = "testing") {
-                        index_part
-                            .layer_metadata
-                            .keys()
-                            .any(|x| x.is_in_future(*_lsn))
-                    } else {
-                        false
-                    };
-
+                UploadOp::UploadMetadata { ref uploaded } => {
                     let res = upload::upload_index_part(
                         &self.storage_impl,
                         &self.tenant_shard_id,
                         &self.timeline_id,
                         self.generation,
-                        index_part,
+                        uploaded,
                         &self.cancel,
                     )
                     .measure_remote_op(
@@ -1609,10 +1613,21 @@ impl RemoteTimelineClient {
                     )
                     .await;
                     if res.is_ok() {
-                        self.update_remote_physical_size_gauge(Some(index_part));
+                        self.update_remote_physical_size_gauge(Some(uploaded));
+                        let mention_having_future_layers = if cfg!(feature = "testing") {
+                            uploaded
+                                .layer_metadata
+                                .keys()
+                                .any(|x| x.is_in_future(uploaded.metadata.disk_consistent_lsn()))
+                        } else {
+                            false
+                        };
                         if mention_having_future_layers {
                             // find rationale near crate::tenant::timeline::init::cleanup_future_layer
-                            tracing::info!(disk_consistent_lsn=%_lsn, "uploaded an index_part.json with future layers -- this is ok! if shutdown now, expect future layer cleanup");
+                            tracing::info!(
+                                disk_consistent_lsn = %uploaded.metadata.disk_consistent_lsn(),
+                                "uploaded an index_part.json with future layers -- this is ok! if shutdown now, expect future layer cleanup"
+                            );
                         }
                     }
                     res
@@ -1713,11 +1728,23 @@ impl RemoteTimelineClient {
                     upload_queue.num_inprogress_layer_uploads -= 1;
                     None
                 }
-                UploadOp::UploadMetadata(_, lsn) => {
+                UploadOp::UploadMetadata { ref uploaded } => {
                     upload_queue.num_inprogress_metadata_uploads -= 1;
-                    // XXX monotonicity check?
 
-                    upload_queue.projected_remote_consistent_lsn = Some(lsn);
+                    // the task id is reused as a monotonicity check for storing the "clean"
+                    // IndexPart.
+                    let last_updater = upload_queue.clean.1;
+                    let is_later = last_updater.is_some_and(|task_id| task_id < task.task_id);
+                    let monotone = is_later || last_updater.is_none();
+
+                    assert!(monotone, "no two index uploads should be completing at the same time, prev={last_updater:?}, task.task_id={}", task.task_id);
+
+                    // wasteful clone: ideally we would take ownership of `uploaded` instead
+                    upload_queue.clean.0.clone_from(uploaded);
+                    upload_queue.clean.1 = Some(task.task_id);
+
+                    let lsn = upload_queue.clean.0.metadata.disk_consistent_lsn();
+
                     if self.generation.is_none() {
                         // Legacy mode: skip validating generation
                         upload_queue.visible_remote_consistent_lsn.store(lsn);
@@ -1771,7 +1798,7 @@ impl RemoteTimelineClient {
                 RemoteOpKind::Upload,
                 RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size),
             ),
-            UploadOp::UploadMetadata(_, _) => (
+            UploadOp::UploadMetadata { .. } => (
                 RemoteOpFileKind::Index,
                 RemoteOpKind::Upload,
                 DontTrackSize {
@@ -1847,11 +1874,9 @@ impl RemoteTimelineClient {
                     // Deletion is not really perf sensitive so there shouldnt be any problems with cloning a fraction of it.
                     let upload_queue_for_deletion = UploadQueueInitialized {
                         task_counter: 0,
-                        latest_files: initialized.latest_files.clone(),
+                        dirty: initialized.dirty.clone(),
+                        clean: initialized.clean.clone(),
                         latest_files_changes_since_metadata_upload_scheduled: 0,
-                        latest_metadata: initialized.latest_metadata.clone(),
-                        latest_lineage: initialized.latest_lineage.clone(),
-                        projected_remote_consistent_lsn: None,
                         visible_remote_consistent_lsn: initialized
                             .visible_remote_consistent_lsn
                             .clone(),
@@ -1864,7 +1889,6 @@ impl RemoteTimelineClient {
                         dangling_files: HashMap::default(),
                         shutting_down: false,
                         shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
-                        last_aux_file_policy: initialized.last_aux_file_policy,
                     };
 
                     let upload_queue = std::mem::replace(
diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index bd75f980e8..d0385e4aee 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -28,6 +28,7 @@ use crate::TEMP_FILE_SUFFIX;
 use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode, RemotePath};
 use utils::crashsafe::path_with_suffix_extension;
 use utils::id::{TenantId, TimelineId};
+use utils::pausable_failpoint;
 
 use super::index::{IndexPart, LayerFileMetadata};
 use super::{
@@ -152,6 +153,8 @@ async fn download_object<'a>(
 
                 let download = storage.download(src_path, cancel).await?;
 
+                pausable_failpoint!("before-downloading-layer-stream-pausable");
+
                 let mut buf_writer =
                     tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file);
 
@@ -199,6 +202,8 @@ async fn download_object<'a>(
 
                 let mut download = storage.download(src_path, cancel).await?;
 
+                pausable_failpoint!("before-downloading-layer-stream-pausable");
+
                 // TODO: use vectored write (writev) once supported by tokio-epoll-uring.
                 // There's chunks_vectored() on the stream.
                 let (bytes_amount, destination_file) = async {
diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs
index f5d939c747..7d2e9b9a91 100644
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -11,7 +11,6 @@ use utils::id::TimelineId;
 
 use crate::tenant::metadata::TimelineMetadata;
 use crate::tenant::storage_layer::LayerName;
-use crate::tenant::upload_queue::UploadQueueInitialized;
 use crate::tenant::Generation;
 use pageserver_api::shard::ShardIndex;
 
@@ -42,9 +41,13 @@ pub struct IndexPart {
     // 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata.
     // It's duplicated for convenience when reading the serialized structure, but is
     // private because internally we would read from metadata instead.
-    disk_consistent_lsn: Lsn,
+    pub(super) disk_consistent_lsn: Lsn,
 
-    #[serde(rename = "metadata_bytes")]
+    // TODO: later make this "rename" to "alias", rename field as "legacy_metadata"
+    #[serde(
+        rename = "metadata_bytes",
+        with = "crate::tenant::metadata::modern_serde"
+    )]
     pub metadata: TimelineMetadata,
 
     #[serde(default)]
@@ -80,23 +83,15 @@ impl IndexPart {
 
     pub const FILE_NAME: &'static str = "index_part.json";
 
-    fn new(
-        layers_and_metadata: &HashMap<LayerName, LayerFileMetadata>,
-        disk_consistent_lsn: Lsn,
-        metadata: TimelineMetadata,
-        lineage: Lineage,
-        last_aux_file_policy: Option<AuxFilePolicy>,
-    ) -> Self {
-        let layer_metadata = layers_and_metadata.clone();
-
-        Self {
+    pub(crate) fn empty(metadata: TimelineMetadata) -> Self {
+        IndexPart {
             version: Self::LATEST_VERSION,
-            layer_metadata,
-            disk_consistent_lsn,
+            layer_metadata: Default::default(),
+            disk_consistent_lsn: metadata.disk_consistent_lsn(),
             metadata,
             deleted_at: None,
-            lineage,
-            last_aux_file_policy,
+            lineage: Default::default(),
+            last_aux_file_policy: None,
         }
     }
 
@@ -106,7 +101,7 @@ impl IndexPart {
 
     /// If you want this under normal operations, read it from self.metadata:
     /// this method is just for the scrubber to use when validating an index.
-    pub fn get_disk_consistent_lsn(&self) -> Lsn {
+    pub fn duplicated_disk_consistent_lsn(&self) -> Lsn {
         self.disk_consistent_lsn
     }
 
@@ -120,14 +115,7 @@ impl IndexPart {
 
     #[cfg(test)]
     pub(crate) fn example() -> Self {
-        let example_metadata = TimelineMetadata::example();
-        Self::new(
-            &HashMap::new(),
-            example_metadata.disk_consistent_lsn(),
-            example_metadata,
-            Default::default(),
-            Some(AuxFilePolicy::V1),
-        )
+        Self::empty(TimelineMetadata::example())
     }
 
     pub(crate) fn last_aux_file_policy(&self) -> Option<AuxFilePolicy> {
@@ -135,22 +123,6 @@ impl IndexPart {
     }
 }
 
-impl From<&UploadQueueInitialized> for IndexPart {
-    fn from(uq: &UploadQueueInitialized) -> Self {
-        let disk_consistent_lsn = uq.latest_metadata.disk_consistent_lsn();
-        let metadata = uq.latest_metadata.clone();
-        let lineage = uq.latest_lineage.clone();
-
-        Self::new(
-            &uq.latest_files,
-            disk_consistent_lsn,
-            metadata,
-            lineage,
-            uq.last_aux_file_policy,
-        )
-    }
-}
-
 /// Metadata gathered for each of the layer files.
 ///
 /// Fields have to be `Option`s because remote [`IndexPart`]'s can be from different version, which
@@ -236,11 +208,10 @@ impl Lineage {
     /// The queried lsn is most likely the basebackup lsn, and this answers question "is it allowed
     /// to start a read/write primary at this lsn".
     ///
-    /// Returns true if the Lsn was previously a branch point.
+    /// Returns true if the Lsn was previously our branch point.
     pub(crate) fn is_previous_ancestor_lsn(&self, lsn: Lsn) -> bool {
         self.original_ancestor
-            .as_ref()
-            .is_some_and(|(_, ancestor_lsn, _)| lsn == *ancestor_lsn)
+            .is_some_and(|(_, ancestor_lsn, _)| ancestor_lsn == lsn)
     }
 }
 
diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs
index e8e824f415..c4dd184610 100644
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -1,6 +1,7 @@
 //! Helper functions to upload files to remote storage with a RemoteStorage
 
 use anyhow::{bail, Context};
+use bytes::Bytes;
 use camino::Utf8Path;
 use fail::fail_point;
 use pageserver_api::shard::TenantShardId;
@@ -11,10 +12,10 @@ use tokio::io::AsyncSeekExt;
 use tokio_util::sync::CancellationToken;
 use utils::{backoff, pausable_failpoint};
 
+use super::index::IndexPart;
 use super::Generation;
 use crate::tenant::remote_timeline_client::{
-    index::IndexPart, remote_index_path, remote_initdb_archive_path,
-    remote_initdb_preserved_archive_path,
+    remote_index_path, remote_initdb_archive_path, remote_initdb_preserved_archive_path,
 };
 use remote_storage::{GenericRemoteStorage, RemotePath, TimeTravelError};
 use utils::id::{TenantId, TimelineId};
@@ -27,7 +28,7 @@ pub(crate) async fn upload_index_part<'a>(
     tenant_shard_id: &TenantShardId,
     timeline_id: &TimelineId,
     generation: Generation,
-    index_part: &'a IndexPart,
+    index_part: &IndexPart,
     cancel: &CancellationToken,
 ) -> anyhow::Result<()> {
     tracing::trace!("uploading new index part");
@@ -37,16 +38,16 @@ pub(crate) async fn upload_index_part<'a>(
     });
     pausable_failpoint!("before-upload-index-pausable");
 
-    let index_part_bytes = index_part
-        .to_s3_bytes()
-        .context("serialize index part file into bytes")?;
-    let index_part_size = index_part_bytes.len();
-    let index_part_bytes = bytes::Bytes::from(index_part_bytes);
+    // FIXME: a serialization failure only surfaces here, at upload time, which is too late
+    let serialized = index_part.to_s3_bytes()?;
+    let serialized = Bytes::from(serialized);
+
+    let index_part_size = serialized.len();
 
     let remote_path = remote_index_path(tenant_shard_id, timeline_id, generation);
     storage
         .upload_storage_object(
-            futures::stream::once(futures::future::ready(Ok(index_part_bytes))),
+            futures::stream::once(futures::future::ready(Ok(serialized))),
             index_part_size,
             &remote_path,
             cancel,
diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs
index 5c915d6b53..62803c7838 100644
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -1000,7 +1000,7 @@ impl<'a> TenantDownloader<'a> {
             layer.name,
             layer.metadata.file_size
         );
-        let downloaded_bytes = match download_layer_file(
+        let downloaded_bytes = download_layer_file(
             self.conf,
             self.remote_storage,
             *tenant_shard_id,
@@ -1011,8 +1011,9 @@ impl<'a> TenantDownloader<'a> {
             &self.secondary_state.cancel,
             ctx,
         )
-        .await
-        {
+        .await;
+
+        let downloaded_bytes = match downloaded_bytes {
             Ok(bytes) => bytes,
             Err(DownloadError::NotFound) => {
                 // A heatmap might be out of date and refer to a layer that doesn't exist any more.
diff --git a/pageserver/src/tenant/secondary/scheduler.rs b/pageserver/src/tenant/secondary/scheduler.rs
index 0ec1c7872a..28cf2125df 100644
--- a/pageserver/src/tenant/secondary/scheduler.rs
+++ b/pageserver/src/tenant/secondary/scheduler.rs
@@ -334,8 +334,11 @@ where
 
         let tenant_shard_id = job.get_tenant_shard_id();
         let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) {
-            tracing::info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
-                           "Command already running, waiting for it");
+            tracing::info!(
+                tenant_id=%tenant_shard_id.tenant_id,
+                shard_id=%tenant_shard_id.shard_slug(),
+                "Command already running, waiting for it"
+            );
             barrier
         } else {
             let running = self.spawn_now(job);
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index 9ccf20c0d4..0b3f841ccf 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -318,7 +318,7 @@ pub(crate) struct LayerFringe {
 #[derive(Debug)]
 struct LayerKeyspace {
     layer: ReadableLayer,
-    target_keyspace: KeySpace,
+    target_keyspace: Vec<KeySpace>,
 }
 
 impl LayerFringe {
@@ -336,6 +336,7 @@ impl LayerFringe {
         };
 
         let removed = self.layers.remove_entry(&read_desc.layer_id);
+
         match removed {
             Some((
                 _,
@@ -343,7 +344,15 @@ impl LayerFringe {
                     layer,
                     target_keyspace,
                 },
-            )) => Some((layer, target_keyspace, read_desc.lsn_range)),
+            )) => {
+                let mut keyspace = KeySpaceRandomAccum::new();
+                for ks in target_keyspace {
+                    for part in ks.ranges {
+                        keyspace.add_range(part);
+                    }
+                }
+                Some((layer, keyspace.consume_keyspace(), read_desc.lsn_range))
+            }
             None => unreachable!("fringe internals are always consistent"),
         }
     }
@@ -358,7 +367,7 @@ impl LayerFringe {
         let entry = self.layers.entry(layer_id.clone());
         match entry {
             Entry::Occupied(mut entry) => {
-                entry.get_mut().target_keyspace.merge(&keyspace);
+                entry.get_mut().target_keyspace.push(keyspace);
             }
             Entry::Vacant(entry) => {
                 self.planned_reads_by_lsn.push(ReadDesc {
@@ -367,7 +376,7 @@ impl LayerFringe {
                 });
                 entry.insert(LayerKeyspace {
                     layer,
-                    target_keyspace: keyspace,
+                    target_keyspace: vec![keyspace],
                 });
             }
         }
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index 1b3802840f..999e2e8679 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -478,6 +478,23 @@ impl DeltaLayerWriterInner {
         key_end: Key,
         timeline: &Arc<Timeline>,
         ctx: &RequestContext,
+    ) -> anyhow::Result<ResidentLayer> {
+        let temp_path = self.path.clone();
+        let result = self.finish0(key_end, timeline, ctx).await;
+        if result.is_err() {
+            tracing::info!(%temp_path, "cleaning up temporary file after error during writing");
+            if let Err(e) = std::fs::remove_file(&temp_path) {
+                tracing::warn!(error=%e, %temp_path, "error cleaning up temporary layer file after error during writing");
+            }
+        }
+        result
+    }
+
+    async fn finish0(
+        self,
+        key_end: Key,
+        timeline: &Arc<Timeline>,
+        ctx: &RequestContext,
     ) -> anyhow::Result<ResidentLayer> {
         let index_start_blk =
             ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
@@ -651,19 +668,11 @@ impl DeltaLayerWriter {
         timeline: &Arc<Timeline>,
         ctx: &RequestContext,
     ) -> anyhow::Result<ResidentLayer> {
-        let inner = self.inner.take().unwrap();
-        let temp_path = inner.path.clone();
-        let result = inner.finish(key_end, timeline, ctx).await;
-        // The delta layer files can sometimes be really large. Clean them up.
-        if result.is_err() {
-            tracing::warn!(
-                "Cleaning up temporary delta file {temp_path} after error during writing"
-            );
-            if let Err(e) = std::fs::remove_file(&temp_path) {
-                tracing::warn!("Error cleaning up temporary delta layer file {temp_path}: {e:?}")
-            }
-        }
-        result
+        self.inner
+            .take()
+            .unwrap()
+            .finish(key_end, timeline, ctx)
+            .await
     }
 }
 
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index 8394b33f19..285618b146 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -917,26 +917,57 @@ impl Drop for ImageLayerWriter {
 
 #[cfg(test)]
 mod test {
+    use std::time::Duration;
+
     use bytes::Bytes;
     use pageserver_api::{
         key::Key,
         shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize},
     };
-    use utils::{id::TimelineId, lsn::Lsn};
+    use utils::{
+        generation::Generation,
+        id::{TenantId, TimelineId},
+        lsn::Lsn,
+    };
 
-    use crate::{tenant::harness::TenantHarness, DEFAULT_PG_VERSION};
+    use crate::{
+        tenant::{config::TenantConf, harness::TenantHarness},
+        DEFAULT_PG_VERSION,
+    };
 
     use super::ImageLayerWriter;
 
     #[tokio::test]
     async fn image_layer_rewrite() {
-        let harness = TenantHarness::create("test_image_layer_rewrite").unwrap();
-        let (tenant, ctx) = harness.load().await;
-
+        let tenant_conf = TenantConf {
+            gc_period: Duration::ZERO,
+            compaction_period: Duration::ZERO,
+            ..TenantConf::default()
+        };
+        let tenant_id = TenantId::generate();
+        let mut gen = Generation::new(0xdead0001);
+        let mut get_next_gen = || {
+            let ret = gen;
+            gen = gen.next();
+            ret
+        };
         // The LSN at which we will create an image layer to filter
         let lsn = Lsn(0xdeadbeef0000);
-
         let timeline_id = TimelineId::generate();
+
+        //
+        // Create an unsharded parent with a layer.
+        //
+
+        let harness = TenantHarness::create_custom(
+            "test_image_layer_rewrite--parent",
+            tenant_conf.clone(),
+            tenant_id,
+            ShardIdentity::unsharded(),
+            get_next_gen(),
+        )
+        .unwrap();
+        let (tenant, ctx) = harness.load().await;
         let timeline = tenant
             .create_test_timeline(timeline_id, lsn, DEFAULT_PG_VERSION, &ctx)
             .await
@@ -971,9 +1002,47 @@ mod test {
         };
         let original_size = resident.metadata().file_size;
 
+        //
+        // Create child shards and do the rewrite, exercising filter().
+        // TODO: abstraction in TenantHarness for splits.
+        //
+
         // Filter for various shards: this exercises cases like values at start of key range, end of key
         // range, middle of key range.
-        for shard_number in 0..4 {
+        let shard_count = ShardCount::new(4);
+        for shard_number in 0..shard_count.count() {
+            //
+            // mimic the shard split
+            //
+            let shard_identity = ShardIdentity::new(
+                ShardNumber(shard_number),
+                shard_count,
+                ShardStripeSize(0x8000),
+            )
+            .unwrap();
+            let harness = TenantHarness::create_custom(
+                Box::leak(Box::new(format!(
+                    "test_image_layer_rewrite--child{}",
+                    shard_identity.shard_slug()
+                ))),
+                tenant_conf.clone(),
+                tenant_id,
+                shard_identity,
+                // NB: in reality, the shards would each fork off their own gen number sequence from the parent.
+                // But here, all we care about is that the gen number is unique.
+                get_next_gen(),
+            )
+            .unwrap();
+            let (tenant, ctx) = harness.load().await;
+            let timeline = tenant
+                .create_test_timeline(timeline_id, lsn, DEFAULT_PG_VERSION, &ctx)
+                .await
+                .unwrap();
+
+            //
+            // use filter() and make assertions
+            //
+
             let mut filtered_writer = ImageLayerWriter::new(
                 harness.conf,
                 timeline_id,
@@ -985,15 +1054,6 @@ mod test {
             .await
             .unwrap();
 
-            // TenantHarness gave us an unsharded tenant, but we'll use a sharded ShardIdentity
-            // to exercise filter()
-            let shard_identity = ShardIdentity::new(
-                ShardNumber(shard_number),
-                ShardCount::new(4),
-                ShardStripeSize(0x8000),
-            )
-            .unwrap();
-
             let wrote_keys = resident
                 .filter(&shard_identity, &mut filtered_writer, &ctx)
                 .await
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index 3ac799c69a..18f9ba4ef8 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -277,9 +277,10 @@ impl Layer {
 
         let downloaded = resident.expect("just initialized");
 
-        // if the rename works, the path is as expected
-        // TODO: sync system call
-        std::fs::rename(temp_path, owner.local_path())
+        // We never want to overwrite an existing file, so we use `RENAME_NOREPLACE`.
+        // TODO: this leaves the temp file in place if the rename fails, risking us running
+        // out of space. Should we clean it up here or does the calling context deal with this?
+        utils::fs_ext::rename_noreplace(temp_path.as_std_path(), owner.local_path().as_std_path())
             .with_context(|| format!("rename temporary file as correct path for {owner}"))?;
 
         Ok(ResidentLayer { downloaded, owner })
@@ -366,7 +367,10 @@ impl Layer {
             .0
             .get_or_maybe_download(true, Some(ctx))
             .await
-            .map_err(|err| GetVectoredError::Other(anyhow::anyhow!(err)))?;
+            .map_err(|err| match err {
+                DownloadError::DownloadCancelled => GetVectoredError::Cancelled,
+                other => GetVectoredError::Other(anyhow::anyhow!(other)),
+            })?;
 
         self.0
             .access_stats
@@ -1158,6 +1162,11 @@ impl LayerInner {
                 let consecutive_failures =
                     1 + self.consecutive_failures.fetch_add(1, Ordering::Relaxed);
 
+                if timeline.cancel.is_cancelled() {
+                    // If we're shutting down, drop out before logging the error
+                    return Err(e);
+                }
+
                 tracing::error!(consecutive_failures, "layer file download failed: {e:#}");
 
                 let backoff = utils::backoff::exponential_backoff_duration_seconds(
diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index bf2d8a47b4..a6dfa84f35 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -380,21 +380,28 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                 let res = tenant
                     .gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &cancel, &ctx)
                     .await;
-                if let Err(e) = res {
-                    let wait_duration = backoff::exponential_backoff_duration_seconds(
-                        error_run_count + 1,
-                        1.0,
-                        MAX_BACKOFF_SECS,
-                    );
-                    error_run_count += 1;
-                    let wait_duration = Duration::from_secs_f64(wait_duration);
-                    error!(
+                match res {
+                    Ok(_) => {
+                        error_run_count = 0;
+                        period
+                    }
+                    Err(crate::tenant::GcError::TenantCancelled) => {
+                        return;
+                    }
+                    Err(e) => {
+                        let wait_duration = backoff::exponential_backoff_duration_seconds(
+                            error_run_count + 1,
+                            1.0,
+                            MAX_BACKOFF_SECS,
+                        );
+                        error_run_count += 1;
+                        let wait_duration = Duration::from_secs_f64(wait_duration);
+
+                        error!(
                         "Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}",
                     );
-                    wait_duration
-                } else {
-                    error_run_count = 0;
-                    period
+                        wait_duration
+                    }
                 }
             };
 
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index d4f6e25843..4c46c4e635 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -102,7 +102,6 @@ use crate::metrics::{
 };
 use crate::pgdatadir_mapping::CalculateLogicalSizeError;
 use crate::tenant::config::TenantConfOpt;
-use pageserver_api::key::{is_inherited_key, is_rel_fsm_block_key, is_rel_vm_block_key};
 use pageserver_api::reltag::RelTag;
 use pageserver_api::shard::ShardIndex;
 
@@ -131,14 +130,17 @@ use self::layer_manager::LayerManager;
 use self::logical_size::LogicalSize;
 use self::walreceiver::{WalReceiver, WalReceiverConf};
 
-use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline};
 use super::{config::TenantConf, storage_layer::VectoredValueReconstructState};
 use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
 use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
 use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer};
+use super::{
+    secondary::heatmap::{HeatMapLayer, HeatMapTimeline},
+    GcError,
+};
 
 #[derive(Debug, PartialEq, Eq, Clone, Copy)]
-pub(super) enum FlushLoopState {
+pub(crate) enum FlushLoopState {
     NotStarted,
     Running {
         #[cfg(test)]
@@ -496,15 +498,11 @@ pub(crate) enum PageReconstructError {
     Other(#[from] anyhow::Error),
 
     #[error("Ancestor LSN wait error: {0}")]
-    AncestorLsnTimeout(#[from] WaitLsnError),
+    AncestorLsnTimeout(WaitLsnError),
 
     #[error("timeline shutting down")]
     Cancelled,
 
-    /// The ancestor of this is being stopped
-    #[error("ancestor timeline {0} is being stopped")]
-    AncestorStopping(TimelineId),
-
     /// An error happened replaying WAL records
     #[error(transparent)]
     WalRedo(anyhow::Error),
@@ -569,7 +567,7 @@ impl PageReconstructError {
         match self {
             Other(_) => false,
             AncestorLsnTimeout(_) => false,
-            Cancelled | AncestorStopping(_) => true,
+            Cancelled => true,
             WalRedo(_) => false,
             MissingKey { .. } => false,
         }
@@ -577,7 +575,7 @@ impl PageReconstructError {
 }
 
 #[derive(thiserror::Error, Debug)]
-enum CreateImageLayersError {
+pub(crate) enum CreateImageLayersError {
     #[error("timeline shutting down")]
     Cancelled,
 
@@ -591,17 +589,35 @@ enum CreateImageLayersError {
     Other(#[from] anyhow::Error),
 }
 
-#[derive(thiserror::Error, Debug)]
-enum FlushLayerError {
+#[derive(thiserror::Error, Debug, Clone)]
+pub(crate) enum FlushLayerError {
     /// Timeline cancellation token was cancelled
     #[error("timeline shutting down")]
     Cancelled,
 
+    /// We tried to flush a layer while the Timeline is in an unexpected state
+    #[error("cannot flush frozen layers when flush_loop is not running, state is {0:?}")]
+    NotRunning(FlushLoopState),
+
+    // `Arc` the non-`Clone` error types below: we must be `Clone` because the flush error is
+    // propagated from the flush loop via a watch channel, where we can only borrow it.
     #[error(transparent)]
-    CreateImageLayersError(CreateImageLayersError),
+    CreateImageLayersError(Arc<CreateImageLayersError>),
 
     #[error(transparent)]
-    Other(#[from] anyhow::Error),
+    Other(#[from] Arc<anyhow::Error>),
+}
+
+impl FlushLayerError {
+    /// When crossing from generic anyhow errors to this error type, explicitly check for timeline
+    /// cancellation so that benign shutdown errors are not logged as warn/err.
+    fn from_anyhow(timeline: &Timeline, err: anyhow::Error) -> Self {
+        if timeline.cancel.is_cancelled() {
+            Self::Cancelled
+        } else {
+            Self::Other(Arc::new(err))
+        }
+    }
 }
 
 #[derive(thiserror::Error, Debug)]
@@ -627,17 +643,17 @@ pub(crate) enum GetVectoredError {
 
 #[derive(thiserror::Error, Debug)]
 pub(crate) enum GetReadyAncestorError {
-    #[error("ancestor timeline {0} is being stopped")]
-    AncestorStopping(TimelineId),
-
     #[error("Ancestor LSN wait error: {0}")]
     AncestorLsnTimeout(#[from] WaitLsnError),
 
+    #[error("Bad state on timeline {timeline_id}: {state:?}")]
+    BadState {
+        timeline_id: TimelineId,
+        state: TimelineState,
+    },
+
     #[error("Cancelled")]
     Cancelled,
-
-    #[error(transparent)]
-    Other(#[from] anyhow::Error),
 }
 
 #[derive(Clone, Copy)]
@@ -672,8 +688,8 @@ pub(crate) enum WaitLsnError {
     Shutdown,
 
     // Called on an timeline not in active state or shutting down
-    #[error("Bad state (not active)")]
-    BadState,
+    #[error("Bad timeline state: {0:?}")]
+    BadState(TimelineState),
 
     // Timeout expired while waiting for LSN to catch up with goal.
     #[error("{0}")]
@@ -696,7 +712,7 @@ impl From<CreateImageLayersError> for FlushLayerError {
     fn from(e: CreateImageLayersError) -> Self {
         match e {
             CreateImageLayersError::Cancelled => FlushLayerError::Cancelled,
-            any => FlushLayerError::CreateImageLayersError(any),
+            any => FlushLayerError::CreateImageLayersError(Arc::new(any)),
         }
     }
 }
@@ -736,10 +752,9 @@ impl From<GetReadyAncestorError> for PageReconstructError {
     fn from(e: GetReadyAncestorError) -> Self {
         use GetReadyAncestorError::*;
         match e {
-            AncestorStopping(tid) => PageReconstructError::AncestorStopping(tid),
             AncestorLsnTimeout(wait_err) => PageReconstructError::AncestorLsnTimeout(wait_err),
+            bad_state @ BadState { .. } => PageReconstructError::Other(anyhow::anyhow!(bad_state)),
             Cancelled => PageReconstructError::Cancelled,
-            Other(other) => PageReconstructError::Other(other),
         }
     }
 }
@@ -1171,9 +1186,7 @@ impl Timeline {
 
                 use PageReconstructError::*;
                 match block {
-                    Err(Cancelled | AncestorStopping(_)) => {
-                        return Err(GetVectoredError::Cancelled)
-                    }
+                    Err(Cancelled) => return Err(GetVectoredError::Cancelled),
                     Err(MissingKey(_))
                         if NON_INHERITED_RANGE.contains(&key)
                             || NON_INHERITED_SPARSE_RANGE.contains(&key) =>
@@ -1448,10 +1461,11 @@ impl Timeline {
         who_is_waiting: WaitLsnWaiter<'_>,
         ctx: &RequestContext, /* Prepare for use by cancellation */
     ) -> Result<(), WaitLsnError> {
-        if self.cancel.is_cancelled() {
+        let state = self.current_state();
+        if self.cancel.is_cancelled() || matches!(state, TimelineState::Stopping) {
             return Err(WaitLsnError::Shutdown);
-        } else if !self.is_active() {
-            return Err(WaitLsnError::BadState);
+        } else if !matches!(state, TimelineState::Active) {
+            return Err(WaitLsnError::BadState(state));
         }
 
         if cfg!(debug_assertions) {
@@ -1547,13 +1561,13 @@ impl Timeline {
 
     /// Flush to disk all data that was written with the put_* functions
     #[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))]
-    pub(crate) async fn freeze_and_flush(&self) -> anyhow::Result<()> {
+    pub(crate) async fn freeze_and_flush(&self) -> Result<(), FlushLayerError> {
         self.freeze_and_flush0().await
     }
 
     // This exists to provide a non-span creating version of `freeze_and_flush` we can call without
     // polluting the span hierarchy.
-    pub(crate) async fn freeze_and_flush0(&self) -> anyhow::Result<()> {
+    pub(crate) async fn freeze_and_flush0(&self) -> Result<(), FlushLayerError> {
         let to_lsn = self.freeze_inmem_layer(false).await;
         self.flush_frozen_layers_and_wait(to_lsn).await
     }
@@ -2735,11 +2749,6 @@ impl Timeline {
             self.current_logical_size.initialized.add_permits(1);
         }
 
-        enum BackgroundCalculationError {
-            Cancelled,
-            Other(anyhow::Error),
-        }
-
         let try_once = |attempt: usize| {
             let background_ctx = &background_ctx;
             let self_ref = &self;
@@ -2757,10 +2766,10 @@ impl Timeline {
                         (Some(permit), StartCircumstances::AfterBackgroundTasksRateLimit)
                     }
                     _ = self_ref.cancel.cancelled() => {
-                        return Err(BackgroundCalculationError::Cancelled);
+                        return Err(CalculateLogicalSizeError::Cancelled);
                     }
                     _ = cancel.cancelled() => {
-                        return Err(BackgroundCalculationError::Cancelled);
+                        return Err(CalculateLogicalSizeError::Cancelled);
                     },
                     () = skip_concurrency_limiter.cancelled() => {
                         // Some action that is part of a end user interaction requested logical size
@@ -2778,28 +2787,21 @@ impl Timeline {
                     crate::metrics::initial_logical_size::START_CALCULATION.retry(circumstances)
                 };
 
-                match self_ref
+                let calculated_size = self_ref
                     .logical_size_calculation_task(
                         initial_part_end,
                         LogicalSizeCalculationCause::Initial,
                         background_ctx,
                     )
-                    .await
-                {
-                    Ok(calculated_size) => Ok((calculated_size, metrics_guard)),
-                    Err(CalculateLogicalSizeError::Cancelled) => {
-                        Err(BackgroundCalculationError::Cancelled)
-                    }
-                    Err(CalculateLogicalSizeError::Other(err)) => {
-                        if let Some(PageReconstructError::AncestorStopping(_)) =
-                            err.root_cause().downcast_ref()
-                        {
-                            Err(BackgroundCalculationError::Cancelled)
-                        } else {
-                            Err(BackgroundCalculationError::Other(err))
-                        }
-                    }
-                }
+                    .await?;
+
+                self_ref
+                    .trigger_aux_file_size_computation(initial_part_end, background_ctx)
+                    .await?;
+
+                // TODO: add aux file size to logical size
+
+                Ok((calculated_size, metrics_guard))
             }
         };
 
@@ -2810,8 +2812,11 @@ impl Timeline {
 
                 match try_once(attempt).await {
                     Ok(res) => return ControlFlow::Continue(res),
-                    Err(BackgroundCalculationError::Cancelled) => return ControlFlow::Break(()),
-                    Err(BackgroundCalculationError::Other(e)) => {
+                    Err(CalculateLogicalSizeError::Cancelled) => return ControlFlow::Break(()),
+                    Err(
+                        e @ (CalculateLogicalSizeError::Decode(_)
+                        | CalculateLogicalSizeError::PageRead(_)),
+                    ) => {
                         warn!(attempt, "initial size calculation failed: {e:?}");
                         // exponential back-off doesn't make sense at these long intervals;
                         // use fixed retry interval with generous jitter instead
@@ -3188,17 +3193,21 @@ impl Timeline {
             }
 
             // Recurse into ancestor if needed
-            if is_inherited_key(key) && Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn {
-                trace!(
-                    "going into ancestor {}, cont_lsn is {}",
-                    timeline.ancestor_lsn,
-                    cont_lsn
-                );
+            if let Some(ancestor_timeline) = timeline.ancestor_timeline.as_ref() {
+                if key.is_inherited_key() && Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn {
+                    trace!(
+                        "going into ancestor {}, cont_lsn is {}",
+                        timeline.ancestor_lsn,
+                        cont_lsn
+                    );
 
-                timeline_owned = timeline.get_ready_ancestor_timeline(ctx).await?;
-                timeline = &*timeline_owned;
-                prev_lsn = None;
-                continue 'outer;
+                    timeline_owned = timeline
+                        .get_ready_ancestor_timeline(ancestor_timeline, ctx)
+                        .await?;
+                    timeline = &*timeline_owned;
+                    prev_lsn = None;
+                    continue 'outer;
+                }
             }
 
             let guard = timeline.layers.read().await;
@@ -3347,10 +3356,10 @@ impl Timeline {
                 break None;
             }
 
-            // Not fully retrieved but no ancestor timeline.
-            if timeline.ancestor_timeline.is_none() {
+            let Some(ancestor_timeline) = timeline.ancestor_timeline.as_ref() else {
+                // Not fully retrieved but no ancestor timeline.
                 break Some(keyspace);
-            }
+            };
 
             // Now we see if there are keys covered by the image layer but does not exist in the
             // image layer, which means that the key does not exist.
@@ -3370,7 +3379,7 @@ impl Timeline {
             // Take the min to avoid reconstructing a page with data newer than request Lsn.
             cont_lsn = std::cmp::min(Lsn(request_lsn.0 + 1), Lsn(timeline.ancestor_lsn.0 + 1));
             timeline_owned = timeline
-                .get_ready_ancestor_timeline(ctx)
+                .get_ready_ancestor_timeline(ancestor_timeline, ctx)
                 .await
                 .map_err(GetVectoredError::GetReadyAncestorError)?;
             timeline = &*timeline_owned;
@@ -3542,13 +3551,9 @@ impl Timeline {
 
     async fn get_ready_ancestor_timeline(
         &self,
+        ancestor: &Arc<Timeline>,
         ctx: &RequestContext,
     ) -> Result<Arc<Timeline>, GetReadyAncestorError> {
-        let ancestor = match self.get_ancestor_timeline() {
-            Ok(timeline) => timeline,
-            Err(e) => return Err(GetReadyAncestorError::from(e)),
-        };
-
         // It's possible that the ancestor timeline isn't active yet, or
         // is active but hasn't yet caught up to the branch point. Wait
         // for it.
@@ -3576,16 +3581,14 @@ impl Timeline {
         match ancestor.wait_to_become_active(ctx).await {
             Ok(()) => {}
             Err(TimelineState::Stopping) => {
-                return Err(GetReadyAncestorError::AncestorStopping(
-                    ancestor.timeline_id,
-                ));
+                // If an ancestor is stopping, it means the tenant is stopping: handle this the same as if this timeline was stopping.
+                return Err(GetReadyAncestorError::Cancelled);
             }
             Err(state) => {
-                return Err(GetReadyAncestorError::Other(anyhow::anyhow!(
-                    "Timeline {} will not become active. Current state: {:?}",
-                    ancestor.timeline_id,
-                    &state,
-                )));
+                return Err(GetReadyAncestorError::BadState {
+                    timeline_id: ancestor.timeline_id,
+                    state,
+                });
             }
         }
         ancestor
@@ -3594,21 +3597,17 @@ impl Timeline {
             .map_err(|e| match e {
                 e @ WaitLsnError::Timeout(_) => GetReadyAncestorError::AncestorLsnTimeout(e),
                 WaitLsnError::Shutdown => GetReadyAncestorError::Cancelled,
-                e @ WaitLsnError::BadState => GetReadyAncestorError::Other(anyhow::anyhow!(e)),
+                WaitLsnError::BadState(state) => GetReadyAncestorError::BadState {
+                    timeline_id: ancestor.timeline_id,
+                    state,
+                },
             })?;
 
-        Ok(ancestor)
+        Ok(ancestor.clone())
     }
 
-    pub(crate) fn get_ancestor_timeline(&self) -> anyhow::Result<Arc<Timeline>> {
-        let ancestor = self.ancestor_timeline.as_ref().with_context(|| {
-            format!(
-                "Ancestor is missing. Timeline id: {} Ancestor id {:?}",
-                self.timeline_id,
-                self.get_ancestor_timeline_id(),
-            )
-        })?;
-        Ok(Arc::clone(ancestor))
+    pub(crate) fn get_ancestor_timeline(&self) -> Option<Arc<Timeline>> {
+        self.ancestor_timeline.clone()
     }
 
     pub(crate) fn get_shard_identity(&self) -> &ShardIdentity {
@@ -3717,7 +3716,9 @@ impl Timeline {
                         return;
                     }
                     err @ Err(
-                        FlushLayerError::Other(_) | FlushLayerError::CreateImageLayersError(_),
+                        FlushLayerError::NotRunning(_)
+                        | FlushLayerError::Other(_)
+                        | FlushLayerError::CreateImageLayersError(_),
                     ) => {
                         error!("could not flush frozen layer: {err:?}");
                         break err.map(|_| ());
@@ -3763,7 +3764,10 @@ impl Timeline {
     /// `last_record_lsn` may be higher than the highest LSN of a frozen layer: if this is the case,
     /// it means no data will be written between the top of the highest frozen layer and to_lsn,
     /// e.g. because this tenant shard has ingested up to to_lsn and not written any data locally for that part of the WAL.
-    async fn flush_frozen_layers_and_wait(&self, last_record_lsn: Lsn) -> anyhow::Result<()> {
+    async fn flush_frozen_layers_and_wait(
+        &self,
+        last_record_lsn: Lsn,
+    ) -> Result<(), FlushLayerError> {
         let mut rx = self.layer_flush_done_tx.subscribe();
 
         // Increment the flush cycle counter and wake up the flush task.
@@ -3774,7 +3778,7 @@ impl Timeline {
 
         let flush_loop_state = { *self.flush_loop_state.lock().unwrap() };
         if !matches!(flush_loop_state, FlushLoopState::Running { .. }) {
-            anyhow::bail!("cannot flush frozen layers when flush_loop is not running, state is {flush_loop_state:?}")
+            return Err(FlushLayerError::NotRunning(flush_loop_state));
         }
 
         self.layer_flush_start_tx.send_modify(|(counter, lsn)| {
@@ -3787,14 +3791,11 @@ impl Timeline {
             {
                 let (last_result_counter, last_result) = &*rx.borrow();
                 if *last_result_counter >= my_flush_request {
-                    if let Err(_err) = last_result {
+                    if let Err(err) = last_result {
                         // We already logged the original error in
                         // flush_loop. We cannot propagate it to the caller
                         // here, because it might not be Cloneable
-                        anyhow::bail!(
-                            "Could not flush frozen layer. Request id: {}",
-                            my_flush_request
-                        );
+                        return Err(err.clone());
                     } else {
                         return Ok(());
                     }
@@ -3803,7 +3804,7 @@ impl Timeline {
             trace!("waiting for flush to complete");
             tokio::select! {
                 rx_e = rx.changed() => {
-                    rx_e?;
+                    rx_e.map_err(|_| FlushLayerError::NotRunning(*self.flush_loop_state.lock().unwrap()))?;
                 },
                 // Cancellation safety: we are not leaving an I/O in-flight for the flush, we're just ignoring
                 // the notification from [`flush_loop`] that it completed.
@@ -3875,31 +3876,36 @@ impl Timeline {
                     EnumSet::empty(),
                     ctx,
                 )
-                .await?;
+                .await
+                .map_err(|e| FlushLayerError::from_anyhow(self, e))?;
 
             if self.cancel.is_cancelled() {
                 return Err(FlushLayerError::Cancelled);
             }
 
+            // FIXME(auxfilesv2): supporting multiple metadata key partitions might also need initdb support.
+            // This code path will not be hit during regression tests. After #7099 we have a single partition
+            // with two key ranges. If someone wants to fix initdb optimization in the future, this might need
+            // to be fixed.
+
             // For metadata, always create delta layers.
             let delta_layer = if !metadata_partition.parts.is_empty() {
                 assert_eq!(
                     metadata_partition.parts.len(),
                     1,
-                    "currently sparse keyspace should only contain a single aux file keyspace"
+                    "currently sparse keyspace should only contain a single metadata keyspace"
                 );
                 let metadata_keyspace = &metadata_partition.parts[0];
-                assert_eq!(
-                    metadata_keyspace.0.ranges.len(),
-                    1,
-                    "aux file keyspace should be a single range"
-                );
                 self.create_delta_layer(
                     &frozen_layer,
-                    Some(metadata_keyspace.0.ranges[0].clone()),
+                    Some(
+                        metadata_keyspace.0.ranges.first().unwrap().start
+                            ..metadata_keyspace.0.ranges.last().unwrap().end,
+                    ),
                     ctx,
                 )
-                .await?
+                .await
+                .map_err(|e| FlushLayerError::from_anyhow(self, e))?
             } else {
                 None
             };
@@ -3926,7 +3932,11 @@ impl Timeline {
             // Normal case, write out a L0 delta layer file.
             // `create_delta_layer` will not modify the layer map.
             // We will remove frozen layer and add delta layer in one atomic operation later.
-            let Some(layer) = self.create_delta_layer(&frozen_layer, None, ctx).await? else {
+            let Some(layer) = self
+                .create_delta_layer(&frozen_layer, None, ctx)
+                .await
+                .map_err(|e| FlushLayerError::from_anyhow(self, e))?
+            else {
                 panic!("delta layer cannot be empty if no filter is applied");
             };
             (
@@ -3959,7 +3969,8 @@ impl Timeline {
 
             if self.set_disk_consistent_lsn(disk_consistent_lsn) {
                 // Schedule remote uploads that will reflect our new disk_consistent_lsn
-                self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?;
+                self.schedule_uploads(disk_consistent_lsn, layers_to_upload)
+                    .map_err(|e| FlushLayerError::from_anyhow(self, e))?;
             }
             // release lock on 'layers'
         };
@@ -4257,7 +4268,7 @@ impl Timeline {
                                 // Unfortunately we cannot do this for the main fork, or for
                                 // any metadata keys, keys, as that would lead to actual data
                                 // loss.
-                                if is_rel_fsm_block_key(img_key) || is_rel_vm_block_key(img_key) {
+                                if img_key.is_rel_fsm_block_key() || img_key.is_rel_vm_block_key() {
                                     warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}");
                                     ZERO_PAGE.clone()
                                 } else {
@@ -4307,6 +4318,7 @@ impl Timeline {
         ctx: &RequestContext,
         img_range: Range<Key>,
         mode: ImageLayerCreationMode,
+        start: Key,
     ) -> Result<ImageLayerCreationOutcome, CreateImageLayersError> {
         assert!(!matches!(mode, ImageLayerCreationMode::Initial));
 
@@ -4315,39 +4327,43 @@ impl Timeline {
         let data = self
             .get_vectored_impl(partition.clone(), lsn, &mut reconstruct_state, ctx)
             .await?;
-        let (data, total_kb_retrieved, total_key_retrieved) = {
+        let (data, total_kb_retrieved, total_keys_retrieved) = {
             let mut new_data = BTreeMap::new();
             let mut total_kb_retrieved = 0;
-            let mut total_key_retrieved = 0;
+            let mut total_keys_retrieved = 0;
             for (k, v) in data {
                 let v = v.map_err(CreateImageLayersError::PageReconstructError)?;
                 total_kb_retrieved += KEY_SIZE + v.len();
-                total_key_retrieved += 1;
+                total_keys_retrieved += 1;
                 new_data.insert(k, v);
             }
-            (new_data, total_kb_retrieved / 1024, total_key_retrieved)
+            (new_data, total_kb_retrieved / 1024, total_keys_retrieved)
         };
-        let delta_file_accessed = reconstruct_state.get_delta_layers_visited();
+        let delta_files_accessed = reconstruct_state.get_delta_layers_visited();
 
-        let trigger_generation = delta_file_accessed as usize >= MAX_AUX_FILE_V2_DELTAS;
+        let trigger_generation = delta_files_accessed as usize >= MAX_AUX_FILE_V2_DELTAS;
         debug!(
-            "generate image layers for metadata keys: trigger_generation={trigger_generation}, \
-                delta_file_accessed={delta_file_accessed}, total_kb_retrieved={total_kb_retrieved}, \
-                total_key_retrieved={total_key_retrieved}"
+            trigger_generation,
+            delta_files_accessed,
+            total_kb_retrieved,
+            total_keys_retrieved,
+            "generate metadata images"
         );
+
         if !trigger_generation && mode == ImageLayerCreationMode::Try {
             return Ok(ImageLayerCreationOutcome {
                 image: None,
                 next_start_key: img_range.end,
             });
         }
-        let has_keys = !data.is_empty();
+        let mut wrote_any_image = false;
         for (k, v) in data {
-            // Even if the value is empty (deleted), we do not delete it for now until we can ensure vectored get
-            // considers this situation properly.
-            // if v.is_empty() {
-            //     continue;
-            // }
+            if v.is_empty() {
+                // the key has been deleted, it does not need an image
+                // in metadata keyspace, an empty image == tombstone
+                continue;
+            }
+            wrote_any_image = true;
 
             // No need to handle sharding b/c metadata keys are always on the 0-th shard.
 
@@ -4355,16 +4371,26 @@ impl Timeline {
             // on the normal data path either.
             image_layer_writer.put_image(k, v, ctx).await?;
         }
-        Ok(ImageLayerCreationOutcome {
-            image: if has_keys {
-                let image_layer = image_layer_writer.finish(self, ctx).await?;
-                Some(image_layer)
-            } else {
-                tracing::debug!("no data in range {}-{}", img_range.start, img_range.end);
-                None
-            },
-            next_start_key: img_range.end,
-        })
+
+        if wrote_any_image {
+            // Normal path: we have written some data into the new image layer for this
+            // partition, so flush it to disk.
+            let image_layer = image_layer_writer.finish(self, ctx).await?;
+            Ok(ImageLayerCreationOutcome {
+                image: Some(image_layer),
+                next_start_key: img_range.end,
+            })
+        } else {
+            // Special case: the image layer may be empty if this is a sharded tenant and the
+            // partition does not cover any keys owned by this shard. In this case, to ensure
+            // we don't leave gaps between image layers, leave `start` where it is, so that the next
+            // layer we write will cover the key range that we just scanned.
+            tracing::debug!("no data in range {}-{}", img_range.start, img_range.end);
+            Ok(ImageLayerCreationOutcome {
+                image: None,
+                next_start_key: start,
+            })
+        }
     }
 
     #[tracing::instrument(skip_all, fields(%lsn, %mode))]
@@ -4474,6 +4500,7 @@ impl Timeline {
                         ctx,
                         img_range,
                         mode,
+                        start,
                     )
                     .await?;
                 start = next_start_key;
@@ -4835,7 +4862,7 @@ impl Timeline {
     /// Currently, we don't make any attempt at removing unneeded page versions
     /// within a layer file. We can only remove the whole file if it's fully
     /// obsolete.
-    pub(super) async fn gc(&self) -> anyhow::Result<GcResult> {
+    pub(super) async fn gc(&self) -> Result<GcResult, GcError> {
         // this is most likely the background tasks, but it might be the spawned task from
         // immediate_gc
         let _g = tokio::select! {
@@ -4848,7 +4875,7 @@ impl Timeline {
 
         // Is the timeline being deleted?
         if self.is_stopping() {
-            anyhow::bail!("timeline is Stopping");
+            return Err(GcError::TimelineCancelled);
         }
 
         let (horizon_cutoff, pitr_cutoff, retain_lsns) = {
@@ -4906,7 +4933,7 @@ impl Timeline {
         pitr_cutoff: Lsn,
         retain_lsns: Vec<Lsn>,
         new_gc_cutoff: Lsn,
-    ) -> anyhow::Result<GcResult> {
+    ) -> Result<GcResult, GcError> {
         // FIXME: if there is an ongoing detach_from_ancestor, we should just skip gc
 
         let now = SystemTime::now();
@@ -4928,12 +4955,15 @@ impl Timeline {
         // The GC cutoff should only ever move forwards.
         let waitlist = {
             let write_guard = self.latest_gc_cutoff_lsn.lock_for_write();
-            ensure!(
-                *write_guard <= new_gc_cutoff,
-                "Cannot move GC cutoff LSN backwards (was {}, new {})",
-                *write_guard,
-                new_gc_cutoff
-            );
+            if *write_guard > new_gc_cutoff {
+                return Err(GcError::BadLsn {
+                    why: format!(
+                        "Cannot move GC cutoff LSN backwards (was {}, new {})",
+                        *write_guard, new_gc_cutoff
+                    ),
+                });
+            }
+
             write_guard.store_and_unlock(new_gc_cutoff)
         };
         waitlist.wait().await;
@@ -5042,7 +5072,14 @@ impl Timeline {
             // This unconditionally schedules also an index_part.json update, even though, we will
             // be doing one a bit later with the unlinked gc'd layers.
             let disk_consistent_lsn = self.disk_consistent_lsn.load();
-            self.schedule_uploads(disk_consistent_lsn, None)?;
+            self.schedule_uploads(disk_consistent_lsn, None)
+                .map_err(|e| {
+                    if self.cancel.is_cancelled() {
+                        GcError::TimelineCancelled
+                    } else {
+                        GcError::Remote(e)
+                    }
+                })?;
 
             let gc_layers = layers_to_remove
                 .iter()
@@ -5051,7 +5088,15 @@ impl Timeline {
 
             result.layers_removed = gc_layers.len() as u64;
 
-            self.remote_client.schedule_gc_update(&gc_layers)?;
+            self.remote_client
+                .schedule_gc_update(&gc_layers)
+                .map_err(|e| {
+                    if self.cancel.is_cancelled() {
+                        GcError::TimelineCancelled
+                    } else {
+                        GcError::Remote(e)
+                    }
+                })?;
 
             guard.finish_gc_timeline(&gc_layers);
 
@@ -5066,7 +5111,7 @@ impl Timeline {
             result.layers_removed, new_gc_cutoff
         );
 
-        result.elapsed = now.elapsed()?;
+        result.elapsed = now.elapsed().unwrap_or(Duration::ZERO);
         Ok(result)
     }
 
@@ -5358,6 +5403,133 @@ impl Timeline {
             shard_count: self.tenant_shard_id.shard_count,
         }
     }
+
+    #[cfg(test)]
+    pub(super) fn force_advance_lsn(self: &Arc<Timeline>, new_lsn: Lsn) {
+        self.last_record_lsn.advance(new_lsn);
+    }
+
+    /// Force create an image layer and place it into the layer map.
+    ///
+    /// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`]
+    /// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are placed into the layer map in one run.
+    #[cfg(test)]
+    pub(super) async fn force_create_image_layer(
+        self: &Arc<Timeline>,
+        lsn: Lsn,
+        mut images: Vec<(Key, Bytes)>,
+        check_start_lsn: Option<Lsn>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        let last_record_lsn = self.get_last_record_lsn();
+        assert!(
+            lsn <= last_record_lsn,
+            "advance last record lsn before inserting a layer, lsn={lsn}, last_record_lsn={last_record_lsn}"
+        );
+        if let Some(check_start_lsn) = check_start_lsn {
+            assert!(lsn >= check_start_lsn);
+        }
+        images.sort_unstable_by(|(ka, _), (kb, _)| ka.cmp(kb));
+        let min_key = *images.first().map(|(k, _)| k).unwrap();
+        let max_key = images.last().map(|(k, _)| k).unwrap().next();
+        let mut image_layer_writer = ImageLayerWriter::new(
+            self.conf,
+            self.timeline_id,
+            self.tenant_shard_id,
+            &(min_key..max_key),
+            lsn,
+            ctx,
+        )
+        .await?;
+        for (key, img) in images {
+            image_layer_writer.put_image(key, img, ctx).await?;
+        }
+        let image_layer = image_layer_writer.finish(self, ctx).await?;
+
+        {
+            let mut guard = self.layers.write().await;
+            guard.force_insert_layer(image_layer);
+        }
+
+        Ok(())
+    }
+
+    /// Force create a delta layer and place it into the layer map.
+    ///
+    /// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`]
+    /// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are placed into the layer map in one run.
+    #[cfg(test)]
+    pub(super) async fn force_create_delta_layer(
+        self: &Arc<Timeline>,
+        mut deltas: Vec<(Key, Lsn, Value)>,
+        check_start_lsn: Option<Lsn>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        let last_record_lsn = self.get_last_record_lsn();
+        deltas.sort_unstable_by(|(ka, la, _), (kb, lb, _)| (ka, la).cmp(&(kb, lb)));
+        let min_key = *deltas.first().map(|(k, _, _)| k).unwrap();
+        let max_key = deltas.last().map(|(k, _, _)| k).unwrap().next();
+        let min_lsn = *deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap();
+        let max_lsn = *deltas.iter().map(|(_, lsn, _)| lsn).max().unwrap();
+        assert!(
+            max_lsn <= last_record_lsn,
+            "advance last record lsn before inserting a layer, max_lsn={max_lsn}, last_record_lsn={last_record_lsn}"
+        );
+        let end_lsn = Lsn(max_lsn.0 + 1);
+        if let Some(check_start_lsn) = check_start_lsn {
+            assert!(min_lsn >= check_start_lsn);
+        }
+        let mut delta_layer_writer = DeltaLayerWriter::new(
+            self.conf,
+            self.timeline_id,
+            self.tenant_shard_id,
+            min_key,
+            min_lsn..end_lsn,
+            ctx,
+        )
+        .await?;
+        for (key, lsn, val) in deltas {
+            delta_layer_writer.put_value(key, lsn, val, ctx).await?;
+        }
+        let delta_layer = delta_layer_writer.finish(max_key, self, ctx).await?;
+
+        {
+            let mut guard = self.layers.write().await;
+            guard.force_insert_layer(delta_layer);
+        }
+
+        Ok(())
+    }
+
+    /// Return all (key, image) pairs stored in image layers at exactly `lsn`, sorted.
+    #[cfg(test)]
+    pub(crate) async fn inspect_image_layers(
+        self: &Arc<Timeline>,
+        lsn: Lsn,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<Vec<(Key, Bytes)>> {
+        let mut all_data = Vec::new();
+        let guard = self.layers.read().await;
+        for layer in guard.layer_map().iter_historic_layers() {
+            if !layer.is_delta() && layer.image_layer_lsn() == lsn {
+                let layer = guard.get_from_desc(&layer);
+                let mut reconstruct_data = ValuesReconstructState::default();
+                layer
+                    .get_values_reconstruct_data(
+                        KeySpace::single(Key::MIN..Key::MAX),
+                        lsn..Lsn(lsn.0 + 1),
+                        &mut reconstruct_data,
+                        ctx,
+                    )
+                    .await?;
+                for (k, v) in reconstruct_data.keys {
+                    all_data.push((k, v?.img.unwrap().1));
+                }
+            }
+        }
+        all_data.sort();
+        Ok(all_data)
+    }
 }
 
 type TraversalPathItem = (ValueReconstructResult, Lsn, TraversalId);
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 07a12f535a..d8de6aee7c 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -133,8 +133,7 @@ impl Timeline {
                         },
                         &image_ctx,
                     )
-                    .await
-                    .map_err(anyhow::Error::from)?;
+                    .await?;
 
                 self.upload_new_image_layers(image_layers)?;
                 partitioning.parts.len()
@@ -422,48 +421,6 @@ impl Timeline {
             return Ok(CompactLevel0Phase1Result::default());
         }
 
-        // This failpoint is used together with `test_duplicate_layers` integration test.
-        // It returns the compaction result exactly the same layers as input to compaction.
-        // We want to ensure that this will not cause any problem when updating the layer map
-        // after the compaction is finished.
-        //
-        // Currently, there are two rare edge cases that will cause duplicated layers being
-        // inserted.
-        // 1. The compaction job is inturrupted / did not finish successfully. Assume we have file 1, 2, 3, 4, which
-        //    is compacted to 5, but the page server is shut down, next time we start page server we will get a layer
-        //    map containing 1, 2, 3, 4, and 5, whereas 5 has the same content as 4. If we trigger L0 compation at this
-        //    point again, it is likely that we will get a file 6 which has the same content and the key range as 5,
-        //    and this causes an overwrite. This is acceptable because the content is the same, and we should do a
-        //    layer replace instead of the normal remove / upload process.
-        // 2. The input workload pattern creates exactly n files that are sorted, non-overlapping and is of target file
-        //    size length. Compaction will likely create the same set of n files afterwards.
-        //
-        // This failpoint is a superset of both of the cases.
-        if cfg!(feature = "testing") {
-            let active = (|| {
-                ::fail::fail_point!("compact-level0-phase1-return-same", |_| true);
-                false
-            })();
-
-            if active {
-                let mut new_layers = Vec::with_capacity(level0_deltas.len());
-                for delta in &level0_deltas {
-                    // we are just faking these layers as being produced again for this failpoint
-                    new_layers.push(
-                        delta
-                            .download_and_keep_resident()
-                            .await
-                            .context("download layer for failpoint")?,
-                    );
-                }
-                tracing::info!("compact-level0-phase1-return-same"); // so that we can check if we hit the failpoint
-                return Ok(CompactLevel0Phase1Result {
-                    new_layers,
-                    deltas_to_compact: level0_deltas,
-                });
-            }
-        }
-
         // Gather the files to compact in this iteration.
         //
         // Start with the oldest Level 0 delta file, and collect any other
diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs
index e6ddabe5b5..4fc89330ba 100644
--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -1,6 +1,6 @@
 use std::sync::Arc;
 
-use super::{layer_manager::LayerManager, Timeline};
+use super::{layer_manager::LayerManager, FlushLayerError, Timeline};
 use crate::{
     context::{DownloadBehavior, RequestContext},
     task_mgr::TaskKind,
@@ -23,7 +23,7 @@ pub(crate) enum Error {
     #[error("shutting down, please retry later")]
     ShuttingDown,
     #[error("flushing failed")]
-    FlushAncestor(#[source] anyhow::Error),
+    FlushAncestor(#[source] FlushLayerError),
     #[error("layer download failed")]
     RewrittenDeltaDownloadFailed(#[source] anyhow::Error),
     #[error("copying LSN prefix locally failed")]
diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs
index 884b71df75..b78c98a506 100644
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -255,6 +255,13 @@ impl LayerManager {
         updates.flush()
     }
 
+    #[cfg(test)]
+    pub(crate) fn force_insert_layer(&mut self, layer: ResidentLayer) {
+        let mut updates = self.layer_map.batch_update();
+        Self::insert_historic_layer(layer.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
+        updates.flush()
+    }
+
     /// Helper function to insert a layer into the layer map and file manager.
     fn insert_historic_layer(
         layer: Layer,
diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs
index 02f87303d1..50c977a950 100644
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -3,12 +3,10 @@ use super::storage_layer::ResidentLayer;
 use crate::tenant::metadata::TimelineMetadata;
 use crate::tenant::remote_timeline_client::index::IndexPart;
 use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
-use crate::tenant::remote_timeline_client::index::Lineage;
 use std::collections::{HashMap, VecDeque};
 use std::fmt::Debug;
 
 use chrono::NaiveDateTime;
-use pageserver_api::models::AuxFilePolicy;
 use std::sync::Arc;
 use tracing::info;
 use utils::lsn::AtomicLsn;
@@ -45,34 +43,25 @@ pub(crate) struct UploadQueueInitialized {
     /// Counter to assign task IDs
     pub(crate) task_counter: u64,
 
-    /// All layer files stored in the remote storage, taking into account all
-    /// in-progress and queued operations
-    pub(crate) latest_files: HashMap<LayerName, LayerFileMetadata>,
+    /// The next uploaded index_part.json; assumed to be dirty.
+    ///
+    /// Should not be read directly, except for layer file updates. Instead you should add a
+    /// projected field.
+    pub(crate) dirty: IndexPart,
+
+    /// The latest remote persisted IndexPart.
+    ///
+    /// Each completed metadata upload will update this. The second item is the task_id which last
+    /// updated the value, used to ensure we never store an older value over a newer one.
+    pub(crate) clean: (IndexPart, Option<u64>),
 
     /// How many file uploads or deletions been scheduled, since the
     /// last (scheduling of) metadata index upload?
     pub(crate) latest_files_changes_since_metadata_upload_scheduled: u64,
 
-    /// Metadata stored in the remote storage, taking into account all
-    /// in-progress and queued operations.
-    /// DANGER: do not return to outside world, e.g., safekeepers.
-    pub(crate) latest_metadata: TimelineMetadata,
-
-    /// Part of the flattened "next" `index_part.json`.
-    pub(crate) latest_lineage: Lineage,
-
-    /// The last aux file policy used on this timeline.
-    pub(crate) last_aux_file_policy: Option<AuxFilePolicy>,
-
-    /// `disk_consistent_lsn` from the last metadata file that was successfully
-    /// uploaded. `Lsn(0)` if nothing was uploaded yet.
-    /// Unlike `latest_files` or `latest_metadata`, this value is never ahead.
-    /// Safekeeper can rely on it to make decisions for WAL storage.
-    ///
-    /// visible_remote_consistent_lsn is only updated after our generation has been validated with
+    /// The Lsn is only updated after our generation has been validated with
     /// the control plane (unlesss a timeline's generation is None, in which case
     /// we skip validation)
-    pub(crate) projected_remote_consistent_lsn: Option<Lsn>,
     pub(crate) visible_remote_consistent_lsn: Arc<AtomicLsn>,
 
     // Breakdown of different kinds of tasks currently in-progress
@@ -118,7 +107,8 @@ impl UploadQueueInitialized {
     }
 
     pub(super) fn get_last_remote_consistent_lsn_projected(&self) -> Option<Lsn> {
-        self.projected_remote_consistent_lsn
+        let lsn = self.clean.0.metadata.disk_consistent_lsn();
+        self.clean.1.map(|_| lsn)
     }
 }
 
@@ -174,13 +164,12 @@ impl UploadQueue {
 
         info!("initializing upload queue for empty remote");
 
+        let index_part = IndexPart::empty(metadata.clone());
+
         let state = UploadQueueInitialized {
-            // As described in the doc comment, it's ok for `latest_files` and `latest_metadata` to be ahead.
-            latest_files: HashMap::new(),
+            dirty: index_part.clone(),
+            clean: (index_part, None),
             latest_files_changes_since_metadata_upload_scheduled: 0,
-            latest_metadata: metadata.clone(),
-            latest_lineage: Lineage::default(),
-            projected_remote_consistent_lsn: None,
             visible_remote_consistent_lsn: Arc::new(AtomicLsn::new(0)),
             // what follows are boring default initializations
             task_counter: 0,
@@ -193,7 +182,6 @@ impl UploadQueue {
             dangling_files: HashMap::new(),
             shutting_down: false,
             shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
-            last_aux_file_policy: Default::default(),
         };
 
         *self = UploadQueue::Initialized(state);
@@ -211,22 +199,15 @@ impl UploadQueue {
             }
         }
 
-        let mut files = HashMap::with_capacity(index_part.layer_metadata.len());
-        for (layer_name, layer_metadata) in &index_part.layer_metadata {
-            files.insert(layer_name.to_owned(), layer_metadata.clone());
-        }
-
         info!(
             "initializing upload queue with remote index_part.disk_consistent_lsn: {}",
             index_part.metadata.disk_consistent_lsn()
         );
 
         let state = UploadQueueInitialized {
-            latest_files: files,
+            dirty: index_part.clone(),
+            clean: (index_part.clone(), None),
             latest_files_changes_since_metadata_upload_scheduled: 0,
-            latest_metadata: index_part.metadata.clone(),
-            latest_lineage: index_part.lineage.clone(),
-            projected_remote_consistent_lsn: Some(index_part.metadata.disk_consistent_lsn()),
             visible_remote_consistent_lsn: Arc::new(
                 index_part.metadata.disk_consistent_lsn().into(),
             ),
@@ -241,7 +222,6 @@ impl UploadQueue {
             dangling_files: HashMap::new(),
             shutting_down: false,
             shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
-            last_aux_file_policy: index_part.last_aux_file_policy(),
         };
 
         *self = UploadQueue::Initialized(state);
@@ -298,13 +278,16 @@ pub(crate) enum UploadOp {
     /// Upload a layer file
     UploadLayer(ResidentLayer, LayerFileMetadata),
 
-    /// Upload the metadata file
-    UploadMetadata(Box<IndexPart>, Lsn),
+    /// Upload an index_part.json file
+    UploadMetadata {
+        /// The next [`UploadQueueInitialized::clean`] after this upload succeeds.
+        uploaded: Box<IndexPart>,
+    },
 
     /// Delete layer files
     Delete(Delete),
 
-    /// Barrier. When the barrier operation is reached,
+    /// Barrier. When the barrier operation is reached, the channel is closed.
     Barrier(tokio::sync::watch::Sender<()>),
 
     /// Shutdown; upon encountering this operation no new operations will be spawned, otherwise
@@ -322,8 +305,12 @@ impl std::fmt::Display for UploadOp {
                     layer, metadata.file_size, metadata.generation
                 )
             }
-            UploadOp::UploadMetadata(_, lsn) => {
-                write!(f, "UploadMetadata(lsn: {})", lsn)
+            UploadOp::UploadMetadata { uploaded, .. } => {
+                write!(
+                    f,
+                    "UploadMetadata(lsn: {})",
+                    uploaded.metadata.disk_consistent_lsn()
+                )
             }
             UploadOp::Delete(delete) => {
                 write!(f, "Delete({} layers)", delete.layers.len())
diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index b68f3a0e89..04d9386fab 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -344,21 +344,21 @@ macro_rules! with_file {
 
 impl VirtualFile {
     /// Open a file in read-only mode. Like File::open.
-    pub async fn open(
-        path: &Utf8Path,
+    pub async fn open<P: AsRef<Utf8Path>>(
+        path: P,
         ctx: &RequestContext,
     ) -> Result<VirtualFile, std::io::Error> {
-        Self::open_with_options(path, OpenOptions::new().read(true), ctx).await
+        Self::open_with_options(path.as_ref(), OpenOptions::new().read(true), ctx).await
     }
 
     /// Create a new file for writing. If the file exists, it will be truncated.
     /// Like File::create.
-    pub async fn create(
-        path: &Utf8Path,
+    pub async fn create<P: AsRef<Utf8Path>>(
+        path: P,
         ctx: &RequestContext,
     ) -> Result<VirtualFile, std::io::Error> {
         Self::open_with_options(
-            path,
+            path.as_ref(),
             OpenOptions::new().write(true).create(true).truncate(true),
             ctx,
         )
@@ -370,12 +370,13 @@ impl VirtualFile {
     /// Note: If any custom flags were set in 'open_options' through OpenOptionsExt,
     /// they will be applied also when the file is subsequently re-opened, not only
     /// on the first time. Make sure that's sane!
-    pub async fn open_with_options(
-        path: &Utf8Path,
+    pub async fn open_with_options<P: AsRef<Utf8Path>>(
+        path: P,
         open_options: &OpenOptions,
         _ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */
     ) -> Result<VirtualFile, std::io::Error> {
-        let path_str = path.to_string();
+        let path_ref = path.as_ref();
+        let path_str = path_ref.to_string();
         let parts = path_str.split('/').collect::<Vec<&str>>();
         let (tenant_id, shard_id, timeline_id) =
             if parts.len() > 5 && parts[parts.len() - 5] == TENANTS_SEGMENT_NAME {
@@ -401,7 +402,7 @@ impl VirtualFile {
         // where our caller doesn't get to use the returned VirtualFile before its
         // slot gets re-used by someone else.
         let file = observe_duration!(StorageIoOperation::Open, {
-            open_options.open(path.as_std_path()).await?
+            open_options.open(path_ref.as_std_path()).await?
         });
 
         // Strip all options other than read and write.
@@ -417,7 +418,7 @@ impl VirtualFile {
         let vfile = VirtualFile {
             handle: RwLock::new(handle),
             pos: 0,
-            path: path.to_path_buf(),
+            path: path_ref.to_path_buf(),
             open_options: reopen_options,
             tenant_id,
             shard_id,
diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index 79f075b877..4f26f2f6d1 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -234,6 +234,7 @@ impl WalIngest {
                         modification,
                         &parsed_xact,
                         info == pg_constants::XLOG_XACT_COMMIT,
+                        decoded.origin_id,
                         ctx,
                     )
                     .await?;
@@ -246,6 +247,7 @@ impl WalIngest {
                         modification,
                         &parsed_xact,
                         info == pg_constants::XLOG_XACT_COMMIT_PREPARED,
+                        decoded.origin_id,
                         ctx,
                     )
                     .await?;
@@ -375,6 +377,18 @@ impl WalIngest {
                     self.checkpoint.oldestActiveXid = xlrec.oldest_running_xid;
                 }
             }
+            pg_constants::RM_REPLORIGIN_ID => {
+                let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
+                if info == pg_constants::XLOG_REPLORIGIN_SET {
+                    let xlrec = crate::walrecord::XlReploriginSet::decode(&mut buf);
+                    modification
+                        .set_replorigin(xlrec.node_id, xlrec.remote_lsn)
+                        .await?
+                } else if info == pg_constants::XLOG_REPLORIGIN_DROP {
+                    let xlrec = crate::walrecord::XlReploriginDrop::decode(&mut buf);
+                    modification.drop_replorigin(xlrec.node_id).await?
+                }
+            }
             _x => {
                 // TODO: should probably log & fail here instead of blindly
                 // doing something without understanding the protocol
@@ -1178,6 +1192,7 @@ impl WalIngest {
         modification: &mut DatadirModification<'_>,
         parsed: &XlXactParsedRecord,
         is_commit: bool,
+        origin_id: u16,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
         // Record update of CLOG pages
@@ -1243,6 +1258,11 @@ impl WalIngest {
                 }
             }
         }
+        if origin_id != 0 {
+            modification
+                .set_replorigin(origin_id, parsed.origin_lsn)
+                .await?;
+        }
         Ok(())
     }
 
diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs
index 02f6f49694..205f8dee4d 100644
--- a/pageserver/src/walrecord.rs
+++ b/pageserver/src/walrecord.rs
@@ -9,10 +9,10 @@ use postgres_ffi::pg_constants;
 use postgres_ffi::BLCKSZ;
 use postgres_ffi::{BlockNumber, TimestampTz};
 use postgres_ffi::{MultiXactId, MultiXactOffset, MultiXactStatus, Oid, TransactionId};
-use postgres_ffi::{XLogRecord, XLOG_SIZE_OF_XLOG_RECORD};
+use postgres_ffi::{RepOriginId, XLogRecord, XLOG_SIZE_OF_XLOG_RECORD};
 use serde::{Deserialize, Serialize};
 use tracing::*;
-use utils::bin_ser::DeserializeError;
+use utils::{bin_ser::DeserializeError, lsn::Lsn};
 
 /// Each update to a page is represented by a NeonWalRecord. It can be a wrapper
 /// around a PostgreSQL WAL record, or a custom neon-specific "record".
@@ -116,6 +116,7 @@ pub struct DecodedWALRecord {
 
     pub blocks: Vec<DecodedBkpBlock>,
     pub main_data_offset: usize,
+    pub origin_id: u16,
 }
 
 #[repr(C)]
@@ -573,6 +574,7 @@ pub struct XlXactParsedRecord {
     pub subxacts: Vec<TransactionId>,
 
     pub xnodes: Vec<RelFileNode>,
+    pub origin_lsn: Lsn,
 }
 
 impl XlXactParsedRecord {
@@ -651,6 +653,11 @@ impl XlXactParsedRecord {
             debug!("XLOG_XACT_COMMIT-XACT_XINFO_HAS_TWOPHASE xid {}", xid);
         }
 
+        let origin_lsn = if xinfo & pg_constants::XACT_XINFO_HAS_ORIGIN != 0 {
+            Lsn(buf.get_u64_le())
+        } else {
+            Lsn::INVALID
+        };
         XlXactParsedRecord {
             xid,
             info,
@@ -660,6 +667,7 @@ impl XlXactParsedRecord {
             ts_id,
             subxacts,
             xnodes,
+            origin_lsn,
         }
     }
 }
@@ -810,6 +818,36 @@ impl XlRunningXacts {
     }
 }
 
+#[repr(C)]
+#[derive(Debug)]
+pub struct XlReploriginDrop {
+    pub node_id: RepOriginId,
+}
+
+impl XlReploriginDrop {
+    pub fn decode(buf: &mut Bytes) -> XlReploriginDrop {
+        XlReploriginDrop {
+            node_id: buf.get_u16_le(),
+        }
+    }
+}
+
+#[repr(C)]
+#[derive(Debug)]
+pub struct XlReploriginSet {
+    pub remote_lsn: Lsn,
+    pub node_id: RepOriginId,
+}
+
+impl XlReploriginSet {
+    pub fn decode(buf: &mut Bytes) -> XlReploriginSet {
+        XlReploriginSet {
+            remote_lsn: Lsn(buf.get_u64_le()),
+            node_id: buf.get_u16_le(),
+        }
+    }
+}
+
 /// Main routine to decode a WAL record and figure out which blocks are modified
 //
 // See xlogrecord.h for details
@@ -844,6 +882,7 @@ pub fn decode_wal_record(
     let mut rnode_dbnode: u32 = 0;
     let mut rnode_relnode: u32 = 0;
     let mut got_rnode = false;
+    let mut origin_id: u16 = 0;
 
     let mut buf = record.clone();
 
@@ -891,7 +930,7 @@ pub fn decode_wal_record(
 
             pg_constants::XLR_BLOCK_ID_ORIGIN => {
                 // RepOriginId is uint16
-                buf.advance(2);
+                origin_id = buf.get_u16_le();
             }
 
             pg_constants::XLR_BLOCK_ID_TOPLEVEL_XID => {
@@ -1088,6 +1127,7 @@ pub fn decode_wal_record(
     decoded.xl_info = xlogrec.xl_info;
     decoded.xl_rmid = xlogrec.xl_rmid;
     decoded.record = record;
+    decoded.origin_id = origin_id;
     decoded.main_data_offset = main_data_offset;
 
     Ok(())
diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs
index 3decea0c6d..d562540bde 100644
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -20,7 +20,6 @@
 
 /// Process lifecycle and abstracction for the IPC protocol.
 mod process;
-pub use process::Kind as ProcessKind;
 
 /// Code to apply [`NeonWalRecord`]s.
 pub(crate) mod apply_neon;
@@ -34,7 +33,6 @@ use crate::repository::Key;
 use crate::walrecord::NeonWalRecord;
 use anyhow::Context;
 use bytes::{Bytes, BytesMut};
-use pageserver_api::key::key_to_rel_block;
 use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus};
 use pageserver_api::shard::TenantShardId;
 use std::sync::Arc;
@@ -55,7 +53,7 @@ pub struct PostgresRedoManager {
     tenant_shard_id: TenantShardId,
     conf: &'static PageServerConf,
     last_redo_at: std::sync::Mutex<Option<Instant>>,
-    /// The current [`process::Process`] that is used by new redo requests.
+    /// The current [`process::WalRedoProcess`] that is used by new redo requests.
     /// We use [`heavier_once_cell`] for coalescing the spawning, but the redo
     /// requests don't use the [`heavier_once_cell::Guard`] to keep ahold of the
     /// their process object; we use [`Arc::clone`] for that.
@@ -67,7 +65,7 @@ pub struct PostgresRedoManager {
     /// still be using the old redo process. But, those other tasks will most likely
     /// encounter an error as well, and errors are an unexpected condition anyway.
     /// So, probably we could get rid of the `Arc` in the future.
-    redo_process: heavier_once_cell::OnceCell<Arc<process::Process>>,
+    redo_process: heavier_once_cell::OnceCell<Arc<process::WalRedoProcess>>,
 }
 
 ///
@@ -208,30 +206,35 @@ impl PostgresRedoManager {
     ) -> anyhow::Result<Bytes> {
         *(self.last_redo_at.lock().unwrap()) = Some(Instant::now());
 
-        let (rel, blknum) = key_to_rel_block(key).context("invalid record")?;
+        let (rel, blknum) = key.to_rel_block().context("invalid record")?;
         const MAX_RETRY_ATTEMPTS: u32 = 1;
         let mut n_attempts = 0u32;
         loop {
-            let proc: Arc<process::Process> = match self.redo_process.get_or_init_detached().await {
-                Ok(guard) => Arc::clone(&guard),
-                Err(permit) => {
-                    // don't hold poison_guard, the launch code can bail
-                    let start = Instant::now();
-                    let proc = Arc::new(
-                        process::Process::launch(self.conf, self.tenant_shard_id, pg_version)
+            let proc: Arc<process::WalRedoProcess> =
+                match self.redo_process.get_or_init_detached().await {
+                    Ok(guard) => Arc::clone(&guard),
+                    Err(permit) => {
+                        // don't hold poison_guard, the launch code can bail
+                        let start = Instant::now();
+                        let proc = Arc::new(
+                            process::WalRedoProcess::launch(
+                                self.conf,
+                                self.tenant_shard_id,
+                                pg_version,
+                            )
                             .context("launch walredo process")?,
-                    );
-                    let duration = start.elapsed();
-                    WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
-                    info!(
-                        duration_ms = duration.as_millis(),
-                        pid = proc.id(),
-                        "launched walredo process"
-                    );
-                    self.redo_process.set(Arc::clone(&proc), permit);
-                    proc
-                }
-            };
+                        );
+                        let duration = start.elapsed();
+                        WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
+                        info!(
+                            duration_ms = duration.as_millis(),
+                            pid = proc.id(),
+                            "launched walredo process"
+                        );
+                        self.redo_process.set(Arc::clone(&proc), permit);
+                        proc
+                    }
+                };
 
             let started_at = std::time::Instant::now();
 
@@ -362,10 +365,10 @@ impl PostgresRedoManager {
         &self,
         key: Key,
         page: &mut BytesMut,
-        _record_lsn: Lsn,
+        record_lsn: Lsn,
         record: &NeonWalRecord,
     ) -> anyhow::Result<()> {
-        apply_neon::apply_in_neon(record, key, page)?;
+        apply_neon::apply_in_neon(record, record_lsn, key, page)?;
 
         Ok(())
     }
diff --git a/pageserver/src/walredo/apply_neon.rs b/pageserver/src/walredo/apply_neon.rs
index 247704e2a5..24e8d8b01c 100644
--- a/pageserver/src/walredo/apply_neon.rs
+++ b/pageserver/src/walredo/apply_neon.rs
@@ -3,7 +3,7 @@ use crate::walrecord::NeonWalRecord;
 use anyhow::Context;
 use byteorder::{ByteOrder, LittleEndian};
 use bytes::{BufMut, BytesMut};
-use pageserver_api::key::{key_to_rel_block, key_to_slru_block, Key};
+use pageserver_api::key::Key;
 use pageserver_api::reltag::SlruKind;
 use postgres_ffi::pg_constants;
 use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM;
@@ -14,6 +14,7 @@ use postgres_ffi::v14::nonrelfile_utils::{
 use postgres_ffi::BLCKSZ;
 use tracing::*;
 use utils::bin_ser::BeSer;
+use utils::lsn::Lsn;
 
 /// Can this request be served by neon redo functions
 /// or we need to pass it to wal-redo postgres process?
@@ -32,6 +33,7 @@ pub(crate) fn can_apply_in_neon(rec: &NeonWalRecord) -> bool {
 
 pub(crate) fn apply_in_neon(
     record: &NeonWalRecord,
+    lsn: Lsn,
     key: Key,
     page: &mut BytesMut,
 ) -> Result<(), anyhow::Error> {
@@ -48,7 +50,7 @@ pub(crate) fn apply_in_neon(
             flags,
         } => {
             // sanity check that this is modifying the correct relation
-            let (rel, blknum) = key_to_rel_block(key).context("invalid record")?;
+            let (rel, blknum) = key.to_rel_block().context("invalid record")?;
             assert!(
                 rel.forknum == VISIBILITYMAP_FORKNUM,
                 "ClearVisibilityMapFlags record on unexpected rel {}",
@@ -67,6 +69,7 @@ pub(crate) fn apply_in_neon(
                 let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..];
 
                 map[map_byte as usize] &= !(flags << map_offset);
+                postgres_ffi::page_set_lsn(page, lsn);
             }
 
             // Repeat for 'old_heap_blkno', if any
@@ -80,12 +83,13 @@ pub(crate) fn apply_in_neon(
                 let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..];
 
                 map[map_byte as usize] &= !(flags << map_offset);
+                postgres_ffi::page_set_lsn(page, lsn);
             }
         }
         // Non-relational WAL records are handled here, with custom code that has the
         // same effects as the corresponding Postgres WAL redo function.
         NeonWalRecord::ClogSetCommitted { xids, timestamp } => {
-            let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?;
+            let (slru_kind, segno, blknum) = key.to_slru_block().context("invalid record")?;
             assert_eq!(
                 slru_kind,
                 SlruKind::Clog,
@@ -130,7 +134,7 @@ pub(crate) fn apply_in_neon(
             }
         }
         NeonWalRecord::ClogSetAborted { xids } => {
-            let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?;
+            let (slru_kind, segno, blknum) = key.to_slru_block().context("invalid record")?;
             assert_eq!(
                 slru_kind,
                 SlruKind::Clog,
@@ -160,7 +164,7 @@ pub(crate) fn apply_in_neon(
             }
         }
         NeonWalRecord::MultixactOffsetCreate { mid, moff } => {
-            let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?;
+            let (slru_kind, segno, blknum) = key.to_slru_block().context("invalid record")?;
             assert_eq!(
                 slru_kind,
                 SlruKind::MultiXactOffsets,
@@ -192,7 +196,7 @@ pub(crate) fn apply_in_neon(
             LittleEndian::write_u32(&mut page[offset..offset + 4], *moff);
         }
         NeonWalRecord::MultixactMembersCreate { moff, members } => {
-            let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?;
+            let (slru_kind, segno, blknum) = key.to_slru_block().context("invalid record")?;
             assert_eq!(
                 slru_kind,
                 SlruKind::MultiXactMembers,
@@ -285,7 +289,7 @@ mod test {
         let mut page = BytesMut::from_iter(base_image);
 
         for record in deltas {
-            apply_in_neon(&record, file_path, &mut page)?;
+            apply_in_neon(&record, Lsn(8), file_path, &mut page)?;
         }
 
         let reconstructed = AuxFilesDirectory::des(&page)?;
diff --git a/pageserver/src/walredo/process.rs b/pageserver/src/walredo/process.rs
index 02c9c04bf1..5b0af334ee 100644
--- a/pageserver/src/walredo/process.rs
+++ b/pageserver/src/walredo/process.rs
@@ -1,64 +1,184 @@
-/// Layer of indirection previously used to support multiple implementations.
-/// Subject to removal: <https://github.com/neondatabase/neon/issues/7753>
-use std::time::Duration;
-
-use bytes::Bytes;
-use pageserver_api::{reltag::RelTag, shard::TenantShardId};
-use tracing::warn;
-use utils::lsn::Lsn;
-
-use crate::{config::PageServerConf, walrecord::NeonWalRecord};
-
 mod no_leak_child;
 /// The IPC protocol that pageserver and walredo process speak over their shared pipe.
 mod protocol;
 
-mod process_impl {
-    pub(super) mod process_async;
+use self::no_leak_child::NoLeakChild;
+use crate::{
+    config::PageServerConf,
+    metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
+    walrecord::NeonWalRecord,
+};
+use anyhow::Context;
+use bytes::Bytes;
+use pageserver_api::{reltag::RelTag, shard::TenantShardId};
+use postgres_ffi::BLCKSZ;
+#[cfg(feature = "testing")]
+use std::sync::atomic::AtomicUsize;
+use std::{
+    collections::VecDeque,
+    process::{Command, Stdio},
+    time::Duration,
+};
+use tokio::io::{AsyncReadExt, AsyncWriteExt};
+use tracing::{debug, error, instrument, Instrument};
+use utils::{lsn::Lsn, poison::Poison};
+
+pub struct WalRedoProcess {
+    #[allow(dead_code)]
+    conf: &'static PageServerConf,
+    tenant_shard_id: TenantShardId,
+    // Some() on construction, only becomes None on Drop.
+    child: Option<NoLeakChild>,
+    stdout: tokio::sync::Mutex<Poison<ProcessOutput>>,
+    stdin: tokio::sync::Mutex<Poison<ProcessInput>>,
+    /// Counter to separate same sized walredo inputs failing at the same millisecond.
+    #[cfg(feature = "testing")]
+    dump_sequence: AtomicUsize,
 }
 
-#[derive(
-    Clone,
-    Copy,
-    Debug,
-    PartialEq,
-    Eq,
-    strum_macros::EnumString,
-    strum_macros::Display,
-    strum_macros::IntoStaticStr,
-    serde_with::DeserializeFromStr,
-    serde_with::SerializeDisplay,
-)]
-#[strum(serialize_all = "kebab-case")]
-#[repr(u8)]
-pub enum Kind {
-    Sync,
-    Async,
+struct ProcessInput {
+    stdin: tokio::process::ChildStdin,
+    n_requests: usize,
 }
 
-pub(crate) struct Process(process_impl::process_async::WalRedoProcess);
+struct ProcessOutput {
+    stdout: tokio::process::ChildStdout,
+    pending_responses: VecDeque<Option<Bytes>>,
+    n_processed_responses: usize,
+}
 
-impl Process {
-    #[inline(always)]
-    pub fn launch(
+impl WalRedoProcess {
+    //
+    // Start postgres binary in special WAL redo mode.
+    //
+    #[instrument(skip_all,fields(pg_version=pg_version))]
+    pub(crate) fn launch(
         conf: &'static PageServerConf,
         tenant_shard_id: TenantShardId,
         pg_version: u32,
     ) -> anyhow::Result<Self> {
-        if conf.walredo_process_kind != Kind::Async {
-            warn!(
-                configured = %conf.walredo_process_kind,
-                "the walredo_process_kind setting has been turned into a no-op, using async implementation"
-            );
-        }
-        Ok(Self(process_impl::process_async::WalRedoProcess::launch(
+        crate::span::debug_assert_current_span_has_tenant_id();
+
+        let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
+        let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;
+
+        use no_leak_child::NoLeakChildCommandExt;
+        // Start postgres itself
+        let child = Command::new(pg_bin_dir_path.join("postgres"))
+            // the first arg must be --wal-redo so the child process enters into walredo mode
+            .arg("--wal-redo")
+            // the child doesn't process this arg, but, having it in the argv helps identify the
+            // walredo process for a particular tenant when debugging a pageserver
+            .args(["--tenant-shard-id", &format!("{tenant_shard_id}")])
+            .stdin(Stdio::piped())
+            .stderr(Stdio::piped())
+            .stdout(Stdio::piped())
+            .env_clear()
+            .env("LD_LIBRARY_PATH", &pg_lib_dir_path)
+            .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
+            // NB: The redo process is not trusted after we sent it the first
+            // walredo work. Before that, it is trusted. Specifically, we trust
+            // it to
+            // 1. close all file descriptors except stdin, stdout, stderr because
+            //    pageserver might not be 100% diligent in setting FD_CLOEXEC on all
+            //    the files it opens, and
+            // 2. to use seccomp to sandbox itself before processing the first
+            //    walredo request.
+            .spawn_no_leak_child(tenant_shard_id)
+            .context("spawn process")?;
+        WAL_REDO_PROCESS_COUNTERS.started.inc();
+        let mut child = scopeguard::guard(child, |child| {
+            error!("killing wal-redo-postgres process due to a problem during launch");
+            child.kill_and_wait(WalRedoKillCause::Startup);
+        });
+
+        let stdin = child.stdin.take().unwrap();
+        let stdout = child.stdout.take().unwrap();
+        let stderr = child.stderr.take().unwrap();
+        let stderr = tokio::process::ChildStderr::from_std(stderr)
+            .context("convert to tokio::ChildStderr")?;
+        let stdin =
+            tokio::process::ChildStdin::from_std(stdin).context("convert to tokio::ChildStdin")?;
+        let stdout = tokio::process::ChildStdout::from_std(stdout)
+            .context("convert to tokio::ChildStdout")?;
+
+        // all fallible operations post-spawn are complete, so get rid of the guard
+        let child = scopeguard::ScopeGuard::into_inner(child);
+
+        tokio::spawn(
+            async move {
+                scopeguard::defer! {
+                    debug!("wal-redo-postgres stderr_logger_task finished");
+                    crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
+                }
+                debug!("wal-redo-postgres stderr_logger_task started");
+                crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
+
+                use tokio::io::AsyncBufReadExt;
+                let mut stderr_lines = tokio::io::BufReader::new(stderr);
+                let mut buf = Vec::new();
+                let res = loop {
+                    buf.clear();
+                    // TODO we don't trust the process to cap its stderr length.
+                    // Currently it can do unbounded Vec allocation.
+                    match stderr_lines.read_until(b'\n', &mut buf).await {
+                        Ok(0) => break Ok(()), // eof
+                        Ok(num_bytes) => {
+                            let output = String::from_utf8_lossy(&buf[..num_bytes]);
+                            error!(%output, "received output");
+                        }
+                        Err(e) => {
+                            break Err(e);
+                        }
+                    }
+                };
+                match res {
+                    Ok(()) => (),
+                    Err(e) => {
+                        error!(error=?e, "failed to read from walredo stderr");
+                    }
+                }
+            }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
+        );
+
+        Ok(Self {
             conf,
             tenant_shard_id,
-            pg_version,
-        )?))
+            child: Some(child),
+            stdin: tokio::sync::Mutex::new(Poison::new(
+                "stdin",
+                ProcessInput {
+                    stdin,
+                    n_requests: 0,
+                },
+            )),
+            stdout: tokio::sync::Mutex::new(Poison::new(
+                "stdout",
+                ProcessOutput {
+                    stdout,
+                    pending_responses: VecDeque::new(),
+                    n_processed_responses: 0,
+                },
+            )),
+            #[cfg(feature = "testing")]
+            dump_sequence: AtomicUsize::default(),
+        })
     }
 
-    #[inline(always)]
+    pub(crate) fn id(&self) -> u32 {
+        self.child
+            .as_ref()
+            .expect("must not call this during Drop")
+            .id()
+    }
+
+    /// Apply given WAL records ('records') over an old page image. Returns
+    /// new page image.
+    ///
+    /// # Cancel-Safety
+    ///
+    /// Cancellation safe.
+    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
     pub(crate) async fn apply_wal_records(
         &self,
         rel: RelTag,
@@ -67,12 +187,191 @@ impl Process {
         records: &[(Lsn, NeonWalRecord)],
         wal_redo_timeout: Duration,
     ) -> anyhow::Result<Bytes> {
-        self.0
-            .apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout)
-            .await
+        let tag = protocol::BufferTag { rel, blknum };
+
+        // Serialize all the messages to send the WAL redo process first.
+        //
+        // This could be problematic if there are millions of records to replay,
+        // but in practice the number of records is usually so small that it doesn't
+        // matter, and it's better to keep this code simple.
+        //
+        // Most requests start with a before-image with BLCKSZ bytes, followed by
+        // some other WAL records. Start with a buffer that can hold that
+        // comfortably.
+        let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
+        protocol::build_begin_redo_for_block_msg(tag, &mut writebuf);
+        if let Some(img) = base_img {
+            protocol::build_push_page_msg(tag, img, &mut writebuf);
+        }
+        for (lsn, rec) in records.iter() {
+            if let NeonWalRecord::Postgres {
+                will_init: _,
+                rec: postgres_rec,
+            } = rec
+            {
+                protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf);
+            } else {
+                anyhow::bail!("tried to pass neon wal record to postgres WAL redo");
+            }
+        }
+        protocol::build_get_page_msg(tag, &mut writebuf);
+        WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
+
+        let Ok(res) =
+            tokio::time::timeout(wal_redo_timeout, self.apply_wal_records0(&writebuf)).await
+        else {
+            anyhow::bail!("WAL redo timed out");
+        };
+
+        if res.is_err() {
+            // Not all of these can be caused by this particular input; however, they are so
+            // rare in tests that we capture all of them.
+            self.record_and_log(&writebuf);
+        }
+
+        res
     }
 
-    pub(crate) fn id(&self) -> u32 {
-        self.0.id()
+    /// # Cancel-Safety
+    ///
+    /// When not polled to completion (e.g. because in `tokio::select!` another
+    /// branch becomes ready before this future), concurrent and subsequent
+    /// calls may fail due to [`utils::poison::Poison::check_and_arm`] calls.
+    /// Dispose of this process instance and create a new one.
+    async fn apply_wal_records0(&self, writebuf: &[u8]) -> anyhow::Result<Bytes> {
+        let request_no = {
+            let mut lock_guard = self.stdin.lock().await;
+            let mut poison_guard = lock_guard.check_and_arm()?;
+            let input = poison_guard.data_mut();
+            input
+                .stdin
+                .write_all(writebuf)
+                .await
+                .context("write to walredo stdin")?;
+            let request_no = input.n_requests;
+            input.n_requests += 1;
+            poison_guard.disarm();
+            request_no
+        };
+
+        // To improve walredo performance we separate sending requests from receiving
+        // responses. They are protected by different mutexes (input and output).
+        // If threads T1, T2, T3 send requests D1, D2, D3 to the walredo process,
+        // there is no guarantee that T1 will be the first to be granted the output mutex.
+        // To address this, we maintain the number of sent requests, the number of processed
+        // responses, and a ring buffer of pending responses. After sending its request
+        // (under the input mutex), a thread remembers its request number. Then it releases
+        // the input mutex, locks the output mutex, and reads responses into the ring buffer
+        // up to and including its own request number. Then it takes the corresponding element
+        // from the pending-responses ring buffer and truncates all empty elements from the
+        // front, advancing the processed-responses number.
+
+        let mut lock_guard = self.stdout.lock().await;
+        let mut poison_guard = lock_guard.check_and_arm()?;
+        let output = poison_guard.data_mut();
+        let n_processed_responses = output.n_processed_responses;
+        while n_processed_responses + output.pending_responses.len() <= request_no {
+            // We expect the WAL redo process to respond with an 8k page image. We read it
+            // into this buffer.
+            let mut resultbuf = vec![0; BLCKSZ.into()];
+            output
+                .stdout
+                .read_exact(&mut resultbuf)
+                .await
+                .context("read walredo stdout")?;
+            output
+                .pending_responses
+                .push_back(Some(Bytes::from(resultbuf)));
+        }
+        // Replace our request's response with None in `pending_responses`.
+        // Then make space in the ring buffer by clearing out any sequence of contiguous
+        // `None`'s from the front of `pending_responses`.
+        // NB: We can't simply pop_front(): the front may hold another request's response if
+        // another requester grabbed the output mutex before us:
+        // T1: grab input mutex
+        // T1: send request_no 23
+        // T1: release input mutex
+        // T2: grab input mutex
+        // T2: send request_no 24
+        // T2: release input mutex
+        // T2: grab output mutex
+        // T2: n_processed_responses + output.pending_responses.len() <= request_no
+        //            23                                0                   24
+        // T2: enters poll loop that reads stdout
+        // T2: put response for 23 into pending_responses
+        // T2: put response for 24 into pending_responses
+        // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back
+        // T2: takes its response_24
+        // pending_responses now looks like this: Front Some(response_23) None Back
+        // T2: does the while loop below
+        // pending_responses now looks like this: Front Some(response_23) None Back
+        // T2: releases output mutex
+        // T1: grabs output mutex
+        // T1: n_processed_responses + output.pending_responses.len() > request_no
+        //            23                                2                   23
+        // T1: skips poll loop that reads stdout
+        // T1: takes its response_23
+        // pending_responses now looks like this: Front None None Back
+        // T1: does the while loop below
+        // pending_responses now looks like this: Front Back
+        // n_processed_responses now has value 25
+        let res = output.pending_responses[request_no - n_processed_responses]
+            .take()
+            .expect("we own this request_no, nobody else is supposed to take it");
+        while let Some(front) = output.pending_responses.front() {
+            if front.is_none() {
+                output.pending_responses.pop_front();
+                output.n_processed_responses += 1;
+            } else {
+                break;
+            }
+        }
+        poison_guard.disarm();
+        Ok(res)
+    }
+
+    #[cfg(feature = "testing")]
+    fn record_and_log(&self, writebuf: &[u8]) {
+        use std::sync::atomic::Ordering;
+
+        let millis = std::time::SystemTime::now()
+            .duration_since(std::time::SystemTime::UNIX_EPOCH)
+            .unwrap()
+            .as_millis();
+
+        let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed);
+
+        // these files will be collected to an allure report
+        let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
+
+        let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);
+
+        use std::io::Write;
+        let res = std::fs::OpenOptions::new()
+            .write(true)
+            .create_new(true)
+            .read(true)
+            .open(path)
+            .and_then(|mut f| f.write_all(writebuf));
+
+        // trip up allowed_errors
+        if let Err(e) = res {
+            tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}");
+        } else {
+            tracing::error!(filename, "erroring walredo input saved");
+        }
+    }
+
+    #[cfg(not(feature = "testing"))]
+    fn record_and_log(&self, _: &[u8]) {}
+}
+
+impl Drop for WalRedoProcess {
+    fn drop(&mut self) {
+        self.child
+            .take()
+            .expect("we only do this once")
+            .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
+        // no way to wait for stderr_logger_task from Drop because that is async only
     }
 }
diff --git a/pageserver/src/walredo/process/process_impl/process_async.rs b/pageserver/src/walredo/process/process_impl/process_async.rs
deleted file mode 100644
index 262858b033..0000000000
--- a/pageserver/src/walredo/process/process_impl/process_async.rs
+++ /dev/null
@@ -1,374 +0,0 @@
-use self::no_leak_child::NoLeakChild;
-use crate::{
-    config::PageServerConf,
-    metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
-    walrecord::NeonWalRecord,
-    walredo::process::{no_leak_child, protocol},
-};
-use anyhow::Context;
-use bytes::Bytes;
-use pageserver_api::{reltag::RelTag, shard::TenantShardId};
-use postgres_ffi::BLCKSZ;
-#[cfg(feature = "testing")]
-use std::sync::atomic::AtomicUsize;
-use std::{
-    collections::VecDeque,
-    process::{Command, Stdio},
-    time::Duration,
-};
-use tokio::io::{AsyncReadExt, AsyncWriteExt};
-use tracing::{debug, error, instrument, Instrument};
-use utils::{lsn::Lsn, poison::Poison};
-
-pub struct WalRedoProcess {
-    #[allow(dead_code)]
-    conf: &'static PageServerConf,
-    tenant_shard_id: TenantShardId,
-    // Some() on construction, only becomes None on Drop.
-    child: Option<NoLeakChild>,
-    stdout: tokio::sync::Mutex<Poison<ProcessOutput>>,
-    stdin: tokio::sync::Mutex<Poison<ProcessInput>>,
-    /// Counter to separate same sized walredo inputs failing at the same millisecond.
-    #[cfg(feature = "testing")]
-    dump_sequence: AtomicUsize,
-}
-
-struct ProcessInput {
-    stdin: tokio::process::ChildStdin,
-    n_requests: usize,
-}
-
-struct ProcessOutput {
-    stdout: tokio::process::ChildStdout,
-    pending_responses: VecDeque<Option<Bytes>>,
-    n_processed_responses: usize,
-}
-
-impl WalRedoProcess {
-    //
-    // Start postgres binary in special WAL redo mode.
-    //
-    #[instrument(skip_all,fields(pg_version=pg_version))]
-    pub(crate) fn launch(
-        conf: &'static PageServerConf,
-        tenant_shard_id: TenantShardId,
-        pg_version: u32,
-    ) -> anyhow::Result<Self> {
-        crate::span::debug_assert_current_span_has_tenant_id();
-
-        let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
-        let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;
-
-        use no_leak_child::NoLeakChildCommandExt;
-        // Start postgres itself
-        let child = Command::new(pg_bin_dir_path.join("postgres"))
-            // the first arg must be --wal-redo so the child process enters into walredo mode
-            .arg("--wal-redo")
-            // the child doesn't process this arg, but, having it in the argv helps indentify the
-            // walredo process for a particular tenant when debugging a pagserver
-            .args(["--tenant-shard-id", &format!("{tenant_shard_id}")])
-            .stdin(Stdio::piped())
-            .stderr(Stdio::piped())
-            .stdout(Stdio::piped())
-            .env_clear()
-            .env("LD_LIBRARY_PATH", &pg_lib_dir_path)
-            .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
-            // NB: The redo process is not trusted after we sent it the first
-            // walredo work. Before that, it is trusted. Specifically, we trust
-            // it to
-            // 1. close all file descriptors except stdin, stdout, stderr because
-            //    pageserver might not be 100% diligent in setting FD_CLOEXEC on all
-            //    the files it opens, and
-            // 2. to use seccomp to sandbox itself before processing the first
-            //    walredo request.
-            .spawn_no_leak_child(tenant_shard_id)
-            .context("spawn process")?;
-        WAL_REDO_PROCESS_COUNTERS.started.inc();
-        let mut child = scopeguard::guard(child, |child| {
-            error!("killing wal-redo-postgres process due to a problem during launch");
-            child.kill_and_wait(WalRedoKillCause::Startup);
-        });
-
-        let stdin = child.stdin.take().unwrap();
-        let stdout = child.stdout.take().unwrap();
-        let stderr = child.stderr.take().unwrap();
-        let stderr = tokio::process::ChildStderr::from_std(stderr)
-            .context("convert to tokio::ChildStderr")?;
-        let stdin =
-            tokio::process::ChildStdin::from_std(stdin).context("convert to tokio::ChildStdin")?;
-        let stdout = tokio::process::ChildStdout::from_std(stdout)
-            .context("convert to tokio::ChildStdout")?;
-
-        // all fallible operations post-spawn are complete, so get rid of the guard
-        let child = scopeguard::ScopeGuard::into_inner(child);
-
-        tokio::spawn(
-            async move {
-                scopeguard::defer! {
-                    debug!("wal-redo-postgres stderr_logger_task finished");
-                    crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
-                }
-                debug!("wal-redo-postgres stderr_logger_task started");
-                crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
-
-                use tokio::io::AsyncBufReadExt;
-                let mut stderr_lines = tokio::io::BufReader::new(stderr);
-                let mut buf = Vec::new();
-                let res = loop {
-                    buf.clear();
-                    // TODO we don't trust the process to cap its stderr length.
-                    // Currently it can do unbounded Vec allocation.
-                    match stderr_lines.read_until(b'\n', &mut buf).await {
-                        Ok(0) => break Ok(()), // eof
-                        Ok(num_bytes) => {
-                            let output = String::from_utf8_lossy(&buf[..num_bytes]);
-                            error!(%output, "received output");
-                        }
-                        Err(e) => {
-                            break Err(e);
-                        }
-                    }
-                };
-                match res {
-                    Ok(()) => (),
-                    Err(e) => {
-                        error!(error=?e, "failed to read from walredo stderr");
-                    }
-                }
-            }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
-        );
-
-        Ok(Self {
-            conf,
-            tenant_shard_id,
-            child: Some(child),
-            stdin: tokio::sync::Mutex::new(Poison::new(
-                "stdin",
-                ProcessInput {
-                    stdin,
-                    n_requests: 0,
-                },
-            )),
-            stdout: tokio::sync::Mutex::new(Poison::new(
-                "stdout",
-                ProcessOutput {
-                    stdout,
-                    pending_responses: VecDeque::new(),
-                    n_processed_responses: 0,
-                },
-            )),
-            #[cfg(feature = "testing")]
-            dump_sequence: AtomicUsize::default(),
-        })
-    }
-
-    pub(crate) fn id(&self) -> u32 {
-        self.child
-            .as_ref()
-            .expect("must not call this during Drop")
-            .id()
-    }
-
-    /// Apply given WAL records ('records') over an old page image. Returns
-    /// new page image.
-    ///
-    /// # Cancel-Safety
-    ///
-    /// Cancellation safe.
-    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
-    pub(crate) async fn apply_wal_records(
-        &self,
-        rel: RelTag,
-        blknum: u32,
-        base_img: &Option<Bytes>,
-        records: &[(Lsn, NeonWalRecord)],
-        wal_redo_timeout: Duration,
-    ) -> anyhow::Result<Bytes> {
-        let tag = protocol::BufferTag { rel, blknum };
-
-        // Serialize all the messages to send the WAL redo process first.
-        //
-        // This could be problematic if there are millions of records to replay,
-        // but in practice the number of records is usually so small that it doesn't
-        // matter, and it's better to keep this code simple.
-        //
-        // Most requests start with a before-image with BLCKSZ bytes, followed by
-        // by some other WAL records. Start with a buffer that can hold that
-        // comfortably.
-        let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
-        protocol::build_begin_redo_for_block_msg(tag, &mut writebuf);
-        if let Some(img) = base_img {
-            protocol::build_push_page_msg(tag, img, &mut writebuf);
-        }
-        for (lsn, rec) in records.iter() {
-            if let NeonWalRecord::Postgres {
-                will_init: _,
-                rec: postgres_rec,
-            } = rec
-            {
-                protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf);
-            } else {
-                anyhow::bail!("tried to pass neon wal record to postgres WAL redo");
-            }
-        }
-        protocol::build_get_page_msg(tag, &mut writebuf);
-        WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
-
-        let Ok(res) =
-            tokio::time::timeout(wal_redo_timeout, self.apply_wal_records0(&writebuf)).await
-        else {
-            anyhow::bail!("WAL redo timed out");
-        };
-
-        if res.is_err() {
-            // not all of these can be caused by this particular input, however these are so rare
-            // in tests so capture all.
-            self.record_and_log(&writebuf);
-        }
-
-        res
-    }
-
-    /// # Cancel-Safety
-    ///
-    /// When not polled to completion (e.g. because in `tokio::select!` another
-    /// branch becomes ready before this future), concurrent and subsequent
-    /// calls may fail due to [`utils::poison::Poison::check_and_arm`] calls.
-    /// Dispose of this process instance and create a new one.
-    async fn apply_wal_records0(&self, writebuf: &[u8]) -> anyhow::Result<Bytes> {
-        let request_no = {
-            let mut lock_guard = self.stdin.lock().await;
-            let mut poison_guard = lock_guard.check_and_arm()?;
-            let input = poison_guard.data_mut();
-            input
-                .stdin
-                .write_all(writebuf)
-                .await
-                .context("write to walredo stdin")?;
-            let request_no = input.n_requests;
-            input.n_requests += 1;
-            poison_guard.disarm();
-            request_no
-        };
-
-        // To improve walredo performance we separate sending requests and receiving
-        // responses. Them are protected by different mutexes (output and input).
-        // If thread T1, T2, T3 send requests D1, D2, D3 to walredo process
-        // then there is not warranty that T1 will first granted output mutex lock.
-        // To address this issue we maintain number of sent requests, number of processed
-        // responses and ring buffer with pending responses. After sending response
-        // (under input mutex), threads remembers request number. Then it releases
-        // input mutex, locks output mutex and fetch in ring buffer all responses until
-        // its stored request number. The it takes correspondent element from
-        // pending responses ring buffer and truncate all empty elements from the front,
-        // advancing processed responses number.
-
-        let mut lock_guard = self.stdout.lock().await;
-        let mut poison_guard = lock_guard.check_and_arm()?;
-        let output = poison_guard.data_mut();
-        let n_processed_responses = output.n_processed_responses;
-        while n_processed_responses + output.pending_responses.len() <= request_no {
-            // We expect the WAL redo process to respond with an 8k page image. We read it
-            // into this buffer.
-            let mut resultbuf = vec![0; BLCKSZ.into()];
-            output
-                .stdout
-                .read_exact(&mut resultbuf)
-                .await
-                .context("read walredo stdout")?;
-            output
-                .pending_responses
-                .push_back(Some(Bytes::from(resultbuf)));
-        }
-        // Replace our request's response with None in `pending_responses`.
-        // Then make space in the ring buffer by clearing out any seqence of contiguous
-        // `None`'s from the front of `pending_responses`.
-        // NB: We can't pop_front() because other requests' responses because another
-        // requester might have grabbed the output mutex before us:
-        // T1: grab input mutex
-        // T1: send request_no 23
-        // T1: release input mutex
-        // T2: grab input mutex
-        // T2: send request_no 24
-        // T2: release input mutex
-        // T2: grab output mutex
-        // T2: n_processed_responses + output.pending_responses.len() <= request_no
-        //            23                                0                   24
-        // T2: enters poll loop that reads stdout
-        // T2: put response for 23 into pending_responses
-        // T2: put response for 24 into pending_resposnes
-        // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back
-        // T2: takes its response_24
-        // pending_responses now looks like this: Front Some(response_23) None Back
-        // T2: does the while loop below
-        // pending_responses now looks like this: Front Some(response_23) None Back
-        // T2: releases output mutex
-        // T1: grabs output mutex
-        // T1: n_processed_responses + output.pending_responses.len() > request_no
-        //            23                                2                   23
-        // T1: skips poll loop that reads stdout
-        // T1: takes its response_23
-        // pending_responses now looks like this: Front None None Back
-        // T2: does the while loop below
-        // pending_responses now looks like this: Front Back
-        // n_processed_responses now has value 25
-        let res = output.pending_responses[request_no - n_processed_responses]
-            .take()
-            .expect("we own this request_no, nobody else is supposed to take it");
-        while let Some(front) = output.pending_responses.front() {
-            if front.is_none() {
-                output.pending_responses.pop_front();
-                output.n_processed_responses += 1;
-            } else {
-                break;
-            }
-        }
-        poison_guard.disarm();
-        Ok(res)
-    }
-
-    #[cfg(feature = "testing")]
-    fn record_and_log(&self, writebuf: &[u8]) {
-        use std::sync::atomic::Ordering;
-
-        let millis = std::time::SystemTime::now()
-            .duration_since(std::time::SystemTime::UNIX_EPOCH)
-            .unwrap()
-            .as_millis();
-
-        let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed);
-
-        // these files will be collected to an allure report
-        let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
-
-        let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);
-
-        use std::io::Write;
-        let res = std::fs::OpenOptions::new()
-            .write(true)
-            .create_new(true)
-            .read(true)
-            .open(path)
-            .and_then(|mut f| f.write_all(writebuf));
-
-        // trip up allowed_errors
-        if let Err(e) = res {
-            tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}");
-        } else {
-            tracing::error!(filename, "erroring walredo input saved");
-        }
-    }
-
-    #[cfg(not(feature = "testing"))]
-    fn record_and_log(&self, _: &[u8]) {}
-}
-
-impl Drop for WalRedoProcess {
-    fn drop(&mut self) {
-        self.child
-            .take()
-            .expect("we only do this once")
-            .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
-        // no way to wait for stderr_logger_task from Drop because that is async only
-    }
-}
diff --git a/pgxn/neon/neon_walreader.c b/pgxn/neon/neon_walreader.c
index e43f4d9d96..60eb8e1fc9 100644
--- a/pgxn/neon/neon_walreader.c
+++ b/pgxn/neon/neon_walreader.c
@@ -184,8 +184,8 @@ NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, Ti
 	}
 	else if (state->wre_errno == ENOENT)
 	{
-		nwr_log(LOG, "local read failed as segment at %X/%X doesn't exist, attempting remote",
-				LSN_FORMAT_ARGS(startptr));
+		nwr_log(LOG, "local read at %X/%X len %zu failed as segment file doesn't exist, attempting remote",
+				LSN_FORMAT_ARGS(startptr), count);
 		return NeonWALReadRemote(state, buf, startptr, count, tli);
 	}
 	else
@@ -614,6 +614,7 @@ NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size coun
 		uint32		startoff;
 		int			segbytes;
 		int			readbytes;
+		XLogSegNo	lastRemovedSegNo;
 
 		startoff = XLogSegmentOffset(recptr, state->segcxt.ws_segsize);
 
@@ -689,6 +690,23 @@ NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size coun
 			return false;
 		}
 
+		/*
+		 * Recheck that the segment hasn't been removed while we were reading
+		 * it.
+		 */
+		lastRemovedSegNo = XLogGetLastRemovedSegno();
+		if (state->seg.ws_segno <= lastRemovedSegNo)
+		{
+			char		fname[MAXFNAMELEN];
+
+			state->wre_errno = ENOENT;
+
+			XLogFileName(fname, tli, state->seg.ws_segno, state->segcxt.ws_segsize);
+			snprintf(state->err_msg, sizeof(state->err_msg), "WAL segment %s has been removed during the read, lastRemovedSegNo " UINT64_FORMAT,
+					 fname, lastRemovedSegNo);
+			return false;
+		}
+
 		/* Update state for read */
 		recptr += readbytes;
 		nbytes -= readbytes;
diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs
index 4d074f98a5..634ec9042c 100644
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -452,7 +452,7 @@ pub struct ApiLocks<K> {
 
 #[derive(Debug, thiserror::Error)]
 pub enum ApiLockError {
-    #[error("permit could not be acquired")]
+    #[error("timeout acquiring resource permit")]
     TimeoutError(#[from] tokio::time::error::Elapsed),
 }
 
@@ -504,7 +504,7 @@ impl<K: Hash + Eq + Clone> ApiLocks<K> {
                     .clone()
             }
         };
-        let permit = semaphore.acquire_deadline(now + self.timeout).await;
+        let permit = semaphore.acquire_timeout(self.timeout).await;
 
         self.metrics
             .semaphore_acquire_seconds
diff --git a/proxy/src/rate_limiter/limit_algorithm.rs b/proxy/src/rate_limiter/limit_algorithm.rs
index 072fdb80b0..3842ce269e 100644
--- a/proxy/src/rate_limiter/limit_algorithm.rs
+++ b/proxy/src/rate_limiter/limit_algorithm.rs
@@ -3,7 +3,7 @@ use parking_lot::Mutex;
 use std::{pin::pin, sync::Arc, time::Duration};
 use tokio::{
     sync::Notify,
-    time::{error::Elapsed, timeout_at, Instant},
+    time::{error::Elapsed, Instant},
 };
 
 use self::aimd::Aimd;
@@ -80,7 +80,7 @@ pub struct LimiterInner {
 }
 
 impl LimiterInner {
-    fn update(&mut self, latency: Duration, outcome: Option<Outcome>) {
+    fn update_limit(&mut self, latency: Duration, outcome: Option<Outcome>) {
         if let Some(outcome) = outcome {
             let sample = Sample {
                 latency,
@@ -92,12 +92,12 @@ impl LimiterInner {
     }
 
     fn take(&mut self, ready: &Notify) -> Option<()> {
-        if self.available > 1 {
+        if self.available >= 1 {
             self.available -= 1;
             self.in_flight += 1;
 
             // tell the next in the queue that there is a permit ready
-            if self.available > 1 {
+            if self.available >= 1 {
                 ready.notify_one();
             }
             Some(())
@@ -157,16 +157,12 @@ impl DynamicLimiter {
     }
 
     /// Try to acquire a concurrency [Token], waiting for `duration` if there are none available.
-    ///
-    /// Returns `None` if there are none available after `duration`.
     pub async fn acquire_timeout(self: &Arc<Self>, duration: Duration) -> Result<Token, Elapsed> {
-        self.acquire_deadline(Instant::now() + duration).await
+        tokio::time::timeout(duration, self.acquire()).await?
     }
 
-    /// Try to acquire a concurrency [Token], waiting until `deadline` if there are none available.
-    ///
-    /// Returns `None` if there are none available after `deadline`.
-    pub async fn acquire_deadline(self: &Arc<Self>, deadline: Instant) -> Result<Token, Elapsed> {
+    /// Try to acquire a concurrency [Token].
+    async fn acquire(self: &Arc<Self>) -> Result<Token, Elapsed> {
         if self.config.initial_limit == 0 {
             // If the rate limiter is disabled, we can always acquire a token.
             Ok(Token::disabled())
@@ -174,22 +170,16 @@ impl DynamicLimiter {
             let mut notified = pin!(self.ready.notified());
             let mut ready = notified.as_mut().enable();
             loop {
-                let mut limit = None;
                 if ready {
                     let mut inner = self.inner.lock();
                     if inner.take(&self.ready).is_some() {
                         break Ok(Token::new(self.clone()));
-                    }
-                    limit = Some(inner.limit);
-                }
-                match timeout_at(deadline, notified.as_mut()).await {
-                    Ok(()) => ready = true,
-                    Err(e) => {
-                        let limit = limit.unwrap_or_else(|| self.inner.lock().limit);
-                        tracing::info!(limit, "could not acquire token in time");
-                        break Err(e);
+                    } else {
+                        notified.set(self.ready.notified());
                     }
                 }
+                notified.as_mut().await;
+                ready = true;
             }
         }
     }
@@ -208,14 +198,14 @@ impl DynamicLimiter {
 
         let mut inner = self.inner.lock();
 
-        inner.update(start.elapsed(), outcome);
+        inner.update_limit(start.elapsed(), outcome);
+
+        inner.in_flight -= 1;
         if inner.in_flight < inner.limit {
             inner.available = inner.limit - inner.in_flight;
             // At least 1 permit is now available
             self.ready.notify_one();
         }
-
-        inner.in_flight -= 1;
     }
 
     /// The current state of the limiter.
diff --git a/proxy/src/rate_limiter/limit_algorithm/aimd.rs b/proxy/src/rate_limiter/limit_algorithm/aimd.rs
index 370d4be802..ccc9c42420 100644
--- a/proxy/src/rate_limiter/limit_algorithm/aimd.rs
+++ b/proxy/src/rate_limiter/limit_algorithm/aimd.rs
@@ -51,7 +51,9 @@ impl LimitAlgorithm for Aimd {
                 // E.g. round(2 * 0.9) = 2, but floor(2 * 0.9) = 1
                 let limit = limit.floor() as usize;
 
-                limit.clamp(self.min, self.max)
+                let limit = limit.clamp(self.min, self.max);
+                tracing::info!(limit, "limit decreased");
+                limit
             }
         }
     }
@@ -67,6 +69,53 @@ mod tests {
 
     use super::*;
 
+    #[tokio::test(start_paused = true)]
+    async fn increase_decrease() {
+        let config = RateLimiterConfig {
+            initial_limit: 1,
+            algorithm: RateLimitAlgorithm::Aimd {
+                conf: Aimd {
+                    min: 1,
+                    max: 2,
+                    inc: 10,
+                    dec: 0.5,
+                    utilisation: 0.8,
+                },
+            },
+        };
+
+        let limiter = DynamicLimiter::new(config);
+
+        let token = limiter
+            .acquire_timeout(Duration::from_millis(1))
+            .await
+            .unwrap();
+        token.release(Outcome::Success);
+
+        assert_eq!(limiter.state().limit(), 2);
+
+        let token = limiter
+            .acquire_timeout(Duration::from_millis(1))
+            .await
+            .unwrap();
+        token.release(Outcome::Success);
+        assert_eq!(limiter.state().limit(), 2);
+
+        let token = limiter
+            .acquire_timeout(Duration::from_millis(1))
+            .await
+            .unwrap();
+        token.release(Outcome::Overload);
+        assert_eq!(limiter.state().limit(), 1);
+
+        let token = limiter
+            .acquire_timeout(Duration::from_millis(1))
+            .await
+            .unwrap();
+        token.release(Outcome::Overload);
+        assert_eq!(limiter.state().limit(), 1);
+    }
+
     #[tokio::test(start_paused = true)]
     async fn should_decrease_limit_on_overload() {
         let config = RateLimiterConfig {
@@ -85,7 +134,7 @@ mod tests {
         let limiter = DynamicLimiter::new(config);
 
         let token = limiter
-            .acquire_timeout(Duration::from_millis(1))
+            .acquire_timeout(Duration::from_millis(100))
             .await
             .unwrap();
         token.release(Outcome::Overload);
@@ -93,6 +142,41 @@ mod tests {
         assert_eq!(limiter.state().limit(), 5, "overload: decrease");
     }
 
+    #[tokio::test(start_paused = true)]
+    async fn acquire_timeout_times_out() {
+        let config = RateLimiterConfig {
+            initial_limit: 1,
+            algorithm: RateLimitAlgorithm::Aimd {
+                conf: Aimd {
+                    min: 1,
+                    max: 2,
+                    inc: 10,
+                    dec: 0.5,
+                    utilisation: 0.8,
+                },
+            },
+        };
+
+        let limiter = DynamicLimiter::new(config);
+
+        let token = limiter
+            .acquire_timeout(Duration::from_millis(1))
+            .await
+            .unwrap();
+        let now = tokio::time::Instant::now();
+        limiter
+            .acquire_timeout(Duration::from_secs(1))
+            .await
+            .err()
+            .unwrap();
+
+        assert!(now.elapsed() >= Duration::from_secs(1));
+
+        token.release(Outcome::Success);
+
+        assert_eq!(limiter.state().limit(), 2);
+    }
+
     #[tokio::test(start_paused = true)]
     async fn should_increase_limit_on_success_when_using_gt_util_threshold() {
         let config = RateLimiterConfig {
diff --git a/s3_scrubber/Cargo.toml b/s3_scrubber/Cargo.toml
index e56bd43fb8..48b50ca21c 100644
--- a/s3_scrubber/Cargo.toml
+++ b/s3_scrubber/Cargo.toml
@@ -11,6 +11,7 @@ either.workspace = true
 tokio-rustls.workspace = true
 anyhow.workspace = true
 hex.workspace = true
+humantime.workspace = true
 thiserror.workspace = true
 rand.workspace = true
 bytes.workspace = true
diff --git a/s3_scrubber/src/checks.rs b/s3_scrubber/src/checks.rs
index 134afa53da..44fb53696c 100644
--- a/s3_scrubber/src/checks.rs
+++ b/s3_scrubber/src/checks.rs
@@ -1,7 +1,7 @@
 use std::collections::{HashMap, HashSet};
 
 use anyhow::Context;
-use aws_sdk_s3::{types::ObjectIdentifier, Client};
+use aws_sdk_s3::Client;
 use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
 use pageserver_api::shard::ShardIndex;
 use tracing::{error, info, warn};
@@ -70,7 +70,7 @@ pub(crate) fn branch_cleanup_and_check_errors(
 
     match s3_data {
         Some(s3_data) => {
-            result.garbage_keys.extend(s3_data.keys_to_remove);
+            result.garbage_keys.extend(s3_data.unknown_keys);
 
             match s3_data.blob_data {
                 BlobDataParseResult::Parsed {
@@ -93,12 +93,12 @@ pub(crate) fn branch_cleanup_and_check_errors(
                     }
 
                     if index_part.metadata.disk_consistent_lsn()
-                        != index_part.get_disk_consistent_lsn()
+                        != index_part.duplicated_disk_consistent_lsn()
                     {
                         result.errors.push(format!(
                             "Mismatching disk_consistent_lsn in TimelineMetadata ({}) and in the index_part ({})",
                             index_part.metadata.disk_consistent_lsn(),
-                            index_part.get_disk_consistent_lsn(),
+                            index_part.duplicated_disk_consistent_lsn(),
                         ))
                     }
 
@@ -240,7 +240,12 @@ impl TenantObjectListing {
 #[derive(Debug)]
 pub(crate) struct S3TimelineBlobData {
     pub(crate) blob_data: BlobDataParseResult,
-    pub(crate) keys_to_remove: Vec<String>,
+
+    // Index objects that were not used when loading `blob_data`, e.g. those from old generations
+    pub(crate) unused_index_keys: Vec<String>,
+
+    // Objects whose keys were not recognized at all, i.e. not layer files, not indices
+    pub(crate) unknown_keys: Vec<String>,
 }
 
 #[derive(Debug)]
@@ -276,12 +281,12 @@ pub(crate) async fn list_timeline_blobs(
     let mut s3_layers = HashSet::new();
 
     let mut errors = Vec::new();
-    let mut keys_to_remove = Vec::new();
+    let mut unknown_keys = Vec::new();
 
     let mut timeline_dir_target = s3_root.timeline_root(&id);
     timeline_dir_target.delimiter = String::new();
 
-    let mut index_parts: Vec<ObjectIdentifier> = Vec::new();
+    let mut index_part_keys: Vec<String> = Vec::new();
     let mut initdb_archive: bool = false;
 
     let mut stream = std::pin::pin!(stream_listing(s3_client, &timeline_dir_target));
@@ -292,16 +297,16 @@ pub(crate) async fn list_timeline_blobs(
         let blob_name = key.strip_prefix(&timeline_dir_target.prefix_in_bucket);
         match blob_name {
             Some(name) if name.starts_with("index_part.json") => {
-                tracing::info!("Index key {key}");
-                index_parts.push(obj)
+                tracing::debug!("Index key {key}");
+                index_part_keys.push(key.to_owned())
             }
             Some("initdb.tar.zst") => {
-                tracing::info!("initdb archive {key}");
+                tracing::debug!("initdb archive {key}");
                 initdb_archive = true;
             }
             Some(maybe_layer_name) => match parse_layer_object_name(maybe_layer_name) {
                 Ok((new_layer, gen)) => {
-                    tracing::info!("Parsed layer key: {} {:?}", new_layer, gen);
+                    tracing::debug!("Parsed layer key: {} {:?}", new_layer, gen);
                     s3_layers.insert((new_layer, gen));
                 }
                 Err(e) => {
@@ -309,37 +314,37 @@ pub(crate) async fn list_timeline_blobs(
                     errors.push(
                         format!("S3 list response got an object with key {key} that is not a layer name: {e}"),
                     );
-                    keys_to_remove.push(key.to_string());
+                    unknown_keys.push(key.to_string());
                 }
             },
             None => {
-                tracing::info!("Peculiar key {}", key);
+                tracing::warn!("Unknown key {}", key);
                 errors.push(format!("S3 list response got an object with odd key {key}"));
-                keys_to_remove.push(key.to_string());
+                unknown_keys.push(key.to_string());
             }
         }
     }
 
-    if index_parts.is_empty() && s3_layers.is_empty() && initdb_archive {
-        tracing::info!(
+    if index_part_keys.is_empty() && s3_layers.is_empty() && initdb_archive {
+        tracing::debug!(
             "Timeline is empty apart from initdb archive: expected post-deletion state."
         );
         return Ok(S3TimelineBlobData {
             blob_data: BlobDataParseResult::Relic,
-            keys_to_remove: Vec::new(),
+            unused_index_keys: index_part_keys,
+            unknown_keys: Vec::new(),
         });
     }
 
     // Choose the index_part with the highest generation
-    let (index_part_object, index_part_generation) = match index_parts
+    let (index_part_object, index_part_generation) = match index_part_keys
         .iter()
-        .filter_map(|k| {
-            let key = k.key();
+        .filter_map(|key| {
             // Stripping the index key to the last part, because RemotePath doesn't
             // like absolute paths, and depending on prefix_in_bucket it's possible
             // for the keys we read back to start with a slash.
             let basename = key.rsplit_once('/').unwrap().1;
-            parse_remote_index_path(RemotePath::from_string(basename).unwrap()).map(|g| (k, g))
+            parse_remote_index_path(RemotePath::from_string(basename).unwrap()).map(|g| (key, g))
         })
         .max_by_key(|i| i.1)
         .map(|(k, g)| (k.clone(), g))
@@ -347,15 +352,18 @@ pub(crate) async fn list_timeline_blobs(
         Some((key, gen)) => (Some(key), gen),
         None => {
             // Legacy/missing case: one or zero index parts, which did not have a generation
-            (index_parts.pop(), Generation::none())
+            (index_part_keys.pop(), Generation::none())
         }
     };
 
-    if index_part_object.is_none() {
-        errors.push("S3 list response got no index_part.json file".to_string());
+    match index_part_object.as_ref() {
+        Some(selected) => index_part_keys.retain(|k| k != selected),
+        None => {
+            errors.push("S3 list response got no index_part.json file".to_string());
+        }
     }
 
-    if let Some(index_part_object_key) = index_part_object.as_ref().map(|object| object.key()) {
+    if let Some(index_part_object_key) = index_part_object.as_ref() {
         let index_part_bytes = download_object_with_retries(
             s3_client,
             &timeline_dir_target.bucket_name,
@@ -372,17 +380,14 @@ pub(crate) async fn list_timeline_blobs(
                         index_part_generation,
                         s3_layers,
                     },
-                    keys_to_remove,
+                    unused_index_keys: index_part_keys,
+                    unknown_keys,
                 })
             }
             Err(index_parse_error) => errors.push(format!(
                 "index_part.json body parsing error: {index_parse_error}"
             )),
         }
-    } else {
-        errors.push(format!(
-            "Index part object {index_part_object:?} has no key"
-        ));
     }
 
     if errors.is_empty() {
@@ -393,6 +398,7 @@ pub(crate) async fn list_timeline_blobs(
 
     Ok(S3TimelineBlobData {
         blob_data: BlobDataParseResult::Incorrect(errors),
-        keys_to_remove,
+        unused_index_keys: index_part_keys,
+        unknown_keys,
     })
 }
diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs
index e0f99ecd9c..64273432fc 100644
--- a/s3_scrubber/src/lib.rs
+++ b/s3_scrubber/src/lib.rs
@@ -4,6 +4,7 @@ pub mod checks;
 pub mod cloud_admin_api;
 pub mod garbage;
 pub mod metadata_stream;
+pub mod pageserver_physical_gc;
 pub mod scan_pageserver_metadata;
 pub mod scan_safekeeper_metadata;
 pub mod tenant_snapshot;
@@ -396,7 +397,7 @@ async fn download_object_with_retries(
             .await
         {
             Ok(bytes_read) => {
-                tracing::info!("Downloaded {bytes_read} bytes for object object with key {key}");
+                tracing::debug!("Downloaded {bytes_read} bytes for object {key}");
                 return Ok(body_buf);
             }
             Err(e) => {
diff --git a/s3_scrubber/src/main.rs b/s3_scrubber/src/main.rs
index e49c280b99..ade8ef7d7a 100644
--- a/s3_scrubber/src/main.rs
+++ b/s3_scrubber/src/main.rs
@@ -2,11 +2,13 @@ use anyhow::bail;
 use camino::Utf8PathBuf;
 use pageserver_api::shard::TenantShardId;
 use s3_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode};
+use s3_scrubber::pageserver_physical_gc::GcMode;
 use s3_scrubber::scan_pageserver_metadata::scan_metadata;
 use s3_scrubber::tenant_snapshot::SnapshotDownloader;
 use s3_scrubber::{
-    init_logging, scan_safekeeper_metadata::scan_safekeeper_metadata, BucketConfig, ConsoleConfig,
-    NodeKind, TraversingDepth,
+    init_logging, pageserver_physical_gc::pageserver_physical_gc,
+    scan_safekeeper_metadata::scan_safekeeper_metadata, BucketConfig, ConsoleConfig, NodeKind,
+    TraversingDepth,
 };
 
 use clap::{Parser, Subcommand};
@@ -62,6 +64,14 @@ enum Command {
         #[arg(short, long)]
         output_path: Utf8PathBuf,
     },
+    PageserverPhysicalGc {
+        #[arg(long = "tenant-id", num_args = 0..)]
+        tenant_ids: Vec<TenantShardId>,
+        #[arg(long = "min-age")]
+        min_age: humantime::Duration,
+        #[arg(short, long, default_value_t = GcMode::IndicesOnly)]
+        mode: GcMode,
+    },
 }
 
 #[tokio::main]
@@ -75,6 +85,7 @@ async fn main() -> anyhow::Result<()> {
         Command::FindGarbage { .. } => "find-garbage",
         Command::PurgeGarbage { .. } => "purge-garbage",
         Command::TenantSnapshot { .. } => "tenant-snapshot",
+        Command::PageserverPhysicalGc { .. } => "pageserver-physical-gc",
     };
     let _guard = init_logging(&format!(
         "{}_{}_{}_{}.log",
@@ -178,5 +189,15 @@ async fn main() -> anyhow::Result<()> {
                 SnapshotDownloader::new(bucket_config, tenant_id, output_path, concurrency)?;
             downloader.download().await
         }
+        Command::PageserverPhysicalGc {
+            tenant_ids,
+            min_age,
+            mode,
+        } => {
+            let summary =
+                pageserver_physical_gc(bucket_config, tenant_ids, min_age.into(), mode).await?;
+            println!("{}", serde_json::to_string(&summary).unwrap());
+            Ok(())
+        }
     }
 }
diff --git a/s3_scrubber/src/pageserver_physical_gc.rs b/s3_scrubber/src/pageserver_physical_gc.rs
new file mode 100644
index 0000000000..0146433128
--- /dev/null
+++ b/s3_scrubber/src/pageserver_physical_gc.rs
@@ -0,0 +1,239 @@
+use std::time::{Duration, UNIX_EPOCH};
+
+use crate::checks::{list_timeline_blobs, BlobDataParseResult};
+use crate::metadata_stream::{stream_tenant_timelines, stream_tenants};
+use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId};
+use aws_sdk_s3::Client;
+use futures_util::{StreamExt, TryStreamExt};
+use pageserver::tenant::remote_timeline_client::parse_remote_index_path;
+use pageserver::tenant::IndexPart;
+use pageserver_api::shard::TenantShardId;
+use remote_storage::RemotePath;
+use serde::Serialize;
+use tracing::{info_span, Instrument};
+use utils::generation::Generation;
+
+#[derive(Serialize, Default)]
+pub struct GcSummary {
+    indices_deleted: usize,
+    remote_storage_errors: usize,
+}
+
+#[derive(clap::ValueEnum, Debug, Clone, Copy)]
+pub enum GcMode {
+    // Delete nothing
+    DryRun,
+
+    // Enable only removing old-generation indices
+    IndicesOnly,
+    // Enable all forms of GC
+    // TODO: this will be used when shard split ancestor layer deletion is added
+    // All,
+}
+
+impl std::fmt::Display for GcMode {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            GcMode::DryRun => write!(f, "dry-run"),
+            GcMode::IndicesOnly => write!(f, "indices-only"),
+        }
+    }
+}
+
+async fn maybe_delete_index(
+    s3_client: &Client,
+    bucket_config: &BucketConfig,
+    min_age: &Duration,
+    latest_gen: Generation,
+    key: &str,
+    mode: GcMode,
+    summary: &mut GcSummary,
+) {
+    // Validation: we will only delete things that parse cleanly
+    let basename = key.rsplit_once('/').unwrap().1;
+    let candidate_generation =
+        match parse_remote_index_path(RemotePath::from_string(basename).unwrap()) {
+            Some(g) => g,
+            None => {
+                if basename == IndexPart::FILE_NAME {
+                    // A legacy pre-generation index
+                    Generation::none()
+                } else {
+                    // A strange key: we will not delete this because we don't understand it.
+                    tracing::warn!("Bad index key");
+                    return;
+                }
+            }
+        };
+
+    // Validation: we will only delete indices more than one generation old, to avoid interfering
+    // in typical migrations, even if they are very long running.
+    if candidate_generation >= latest_gen {
+        // This shouldn't happen: when we loaded metadata, it should have selected the latest
+        // generation already, and only populated [`S3TimelineBlobData::unused_index_keys`]
+        // with older generations.
+        tracing::warn!("Deletion candidate is >= latest generation, this is a bug!");
+        return;
+    } else if candidate_generation.next() == latest_gen {
+        // Skip deleting the index of the generation immediately preceding the latest.
+        return;
+    }
+
+    // Validation: we will only delete indices older than `min_age`, so that during incidents
+    // we will have easy access to recent indices.
+    let age: Duration = match s3_client
+        .head_object()
+        .bucket(&bucket_config.bucket)
+        .key(key)
+        .send()
+        .await
+    {
+        Ok(response) => match response.last_modified {
+            None => {
+                tracing::warn!("Missing last_modified");
+                summary.remote_storage_errors += 1;
+                return;
+            }
+            Some(last_modified) => {
+                let last_modified =
+                    UNIX_EPOCH + Duration::from_secs_f64(last_modified.as_secs_f64());
+                match last_modified.elapsed() {
+                    Ok(e) => e,
+                    Err(_) => {
+                        tracing::warn!("Bad last_modified time: {last_modified:?}");
+                        return;
+                    }
+                }
+            }
+        },
+        Err(e) => {
+            tracing::warn!("Failed to HEAD {key}: {e}");
+            summary.remote_storage_errors += 1;
+            return;
+        }
+    };
+    if &age < min_age {
+        tracing::info!(
+            "Skipping young object {} < {}",
+            age.as_secs_f64(),
+            min_age.as_secs_f64()
+        );
+        return;
+    }
+
+    if matches!(mode, GcMode::DryRun) {
+        tracing::info!("Dry run: would delete this key");
+        return;
+    }
+
+    // All validations passed: erase the object
+    match s3_client
+        .delete_object()
+        .bucket(&bucket_config.bucket)
+        .key(key)
+        .send()
+        .await
+    {
+        Ok(_) => {
+            tracing::info!("Successfully deleted index");
+            summary.indices_deleted += 1;
+        }
+        Err(e) => {
+            tracing::warn!("Failed to delete index: {e}");
+            summary.remote_storage_errors += 1;
+        }
+    }
+}
+
+/// Physical garbage collection: removing unused S3 objects.  This is distinct from the garbage collection
+/// done inside the pageserver, which operates at a higher level (keys, layers).  This type of garbage collection
+/// is about removing:
+/// - Objects that were uploaded but never referenced in the remote index (e.g. because of a shutdown between
+///   uploading a layer and uploading an index)
+/// - Index objects from historic generations
+///
+/// This type of GC is not necessary for correctness: rather it serves to reduce wasted storage capacity, and
+/// make sure that object listings don't get slowed down by large numbers of garbage objects.
+pub async fn pageserver_physical_gc(
+    bucket_config: BucketConfig,
+    tenant_ids: Vec<TenantShardId>,
+    min_age: Duration,
+    mode: GcMode,
+) -> anyhow::Result<GcSummary> {
+    let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver)?;
+
+    let tenants = if tenant_ids.is_empty() {
+        futures::future::Either::Left(stream_tenants(&s3_client, &target))
+    } else {
+        futures::future::Either::Right(futures::stream::iter(tenant_ids.into_iter().map(Ok)))
+    };
+
+    // How many tenants to process in parallel.  We need to be mindful of pageservers
+    // accessing the same per tenant prefixes, so use a lower setting than pageservers.
+    const CONCURRENCY: usize = 32;
+
+    // Generate a stream of TenantShardTimelineId
+    let timelines = tenants.map_ok(|t| stream_tenant_timelines(&s3_client, &target, t));
+    let timelines = timelines.try_buffered(CONCURRENCY);
+    let timelines = timelines.try_flatten();
+
+    // GC one timeline's unused index objects, producing a per-timeline GcSummary
+    async fn gc_timeline(
+        s3_client: &Client,
+        bucket_config: &BucketConfig,
+        min_age: &Duration,
+        target: &RootTarget,
+        mode: GcMode,
+        ttid: TenantShardTimelineId,
+    ) -> anyhow::Result<GcSummary> {
+        let mut summary = GcSummary::default();
+        let data = list_timeline_blobs(s3_client, ttid, target).await?;
+
+        let (latest_gen, candidates) = match &data.blob_data {
+            BlobDataParseResult::Parsed {
+                index_part: _index_part,
+                index_part_generation,
+                s3_layers: _s3_layers,
+            } => (*index_part_generation, data.unused_index_keys),
+            BlobDataParseResult::Relic => {
+                // Post-deletion tenant location: don't try and GC it.
+                return Ok(summary);
+            }
+            BlobDataParseResult::Incorrect(reasons) => {
+                // Our primary purpose isn't to report on bad data, but log this rather than skipping silently
+                tracing::warn!("Skipping timeline {ttid}, bad metadata: {reasons:?}");
+                return Ok(summary);
+            }
+        };
+
+        for key in candidates {
+            maybe_delete_index(
+                s3_client,
+                bucket_config,
+                min_age,
+                latest_gen,
+                &key,
+                mode,
+                &mut summary,
+            )
+            .instrument(info_span!("maybe_delete_index", %ttid, ?latest_gen, key))
+            .await;
+        }
+
+        Ok(summary)
+    }
+    let timelines = timelines
+        .map_ok(|ttid| gc_timeline(&s3_client, &bucket_config, &min_age, &target, mode, ttid));
+    let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY));
+
+    let mut summary = GcSummary::default();
+
+    while let Some(i) = timelines.next().await {
+        let tl_summary = i?;
+
+        summary.indices_deleted += tl_summary.indices_deleted;
+        summary.remote_storage_errors += tl_summary.remote_storage_errors;
+    }
+
+    Ok(summary)
+}
diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs
index aee3898ac7..7476654426 100644
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -29,13 +29,12 @@ use safekeeper::defaults::{
     DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES,
     DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR,
 };
-use safekeeper::remove_wal;
+use safekeeper::http;
 use safekeeper::wal_service;
 use safekeeper::GlobalTimelines;
 use safekeeper::SafeKeeperConf;
 use safekeeper::{broker, WAL_SERVICE_RUNTIME};
 use safekeeper::{control_file, BROKER_RUNTIME};
-use safekeeper::{http, WAL_REMOVER_RUNTIME};
 use safekeeper::{wal_backup, HTTP_RUNTIME};
 use storage_broker::DEFAULT_ENDPOINT;
 use utils::auth::{JwtAuth, Scope, SwappableJwtAuth};
@@ -441,14 +440,6 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
         .map(|res| ("broker main".to_owned(), res));
     tasks_handles.push(Box::pin(broker_task_handle));
 
-    let conf_ = conf.clone();
-    let wal_remover_handle = current_thread_rt
-        .as_ref()
-        .unwrap_or_else(|| WAL_REMOVER_RUNTIME.handle())
-        .spawn(remove_wal::task_main(conf_))
-        .map(|res| ("WAL remover".to_owned(), res));
-    tasks_handles.push(Box::pin(wal_remover_handle));
-
     set_build_info_metric(GIT_VERSION, BUILD_TAG);
 
     // TODO: update tokio-stream, convert to real async Stream with
diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs
index fe9f2e6899..e9bb5202da 100644
--- a/safekeeper/src/control_file.rs
+++ b/safekeeper/src/control_file.rs
@@ -2,7 +2,7 @@
 
 use anyhow::{bail, ensure, Context, Result};
 use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
-use camino::Utf8PathBuf;
+use camino::{Utf8Path, Utf8PathBuf};
 use tokio::fs::File;
 use tokio::io::AsyncWriteExt;
 use utils::crashsafe::durable_rename;
@@ -12,9 +12,9 @@ use std::ops::Deref;
 use std::path::Path;
 use std::time::Instant;
 
-use crate::control_file_upgrade::upgrade_control_file;
 use crate::metrics::PERSIST_CONTROL_FILE_SECONDS;
 use crate::state::TimelinePersistentState;
+use crate::{control_file_upgrade::upgrade_control_file, timeline::get_timeline_dir};
 use utils::{bin_ser::LeSer, id::TenantTimelineId};
 
 use crate::SafeKeeperConf;
@@ -43,7 +43,7 @@ pub trait Storage: Deref<Target = TimelinePersistentState> {
 pub struct FileStorage {
     // save timeline dir to avoid reconstructing it every time
     timeline_dir: Utf8PathBuf,
-    conf: SafeKeeperConf,
+    no_sync: bool,
 
     /// Last state persisted to disk.
     state: TimelinePersistentState,
@@ -54,13 +54,12 @@ pub struct FileStorage {
 impl FileStorage {
     /// Initialize storage by loading state from disk.
     pub fn restore_new(ttid: &TenantTimelineId, conf: &SafeKeeperConf) -> Result<FileStorage> {
-        let timeline_dir = conf.timeline_dir(ttid);
-
-        let state = Self::load_control_file_conf(conf, ttid)?;
+        let timeline_dir = get_timeline_dir(conf, ttid);
+        let state = Self::load_control_file_from_dir(&timeline_dir)?;
 
         Ok(FileStorage {
             timeline_dir,
-            conf: conf.clone(),
+            no_sync: conf.no_sync,
             state,
             last_persist_at: Instant::now(),
         })
@@ -74,7 +73,7 @@ impl FileStorage {
     ) -> Result<FileStorage> {
         let store = FileStorage {
             timeline_dir,
-            conf: conf.clone(),
+            no_sync: conf.no_sync,
             state,
             last_persist_at: Instant::now(),
         };
@@ -102,12 +101,9 @@ impl FileStorage {
         upgrade_control_file(buf, version)
     }
 
-    /// Load control file for given ttid at path specified by conf.
-    pub fn load_control_file_conf(
-        conf: &SafeKeeperConf,
-        ttid: &TenantTimelineId,
-    ) -> Result<TimelinePersistentState> {
-        let path = conf.timeline_dir(ttid).join(CONTROL_FILE_NAME);
+    /// Load control file from given directory.
+    pub fn load_control_file_from_dir(timeline_dir: &Utf8Path) -> Result<TimelinePersistentState> {
+        let path = timeline_dir.join(CONTROL_FILE_NAME);
         Self::load_control_file(path)
     }
 
@@ -203,7 +199,7 @@ impl Storage for FileStorage {
         })?;
 
         let control_path = self.timeline_dir.join(CONTROL_FILE_NAME);
-        durable_rename(&control_partial_path, &control_path, !self.conf.no_sync).await?;
+        durable_rename(&control_partial_path, &control_path, !self.no_sync).await?;
 
         // update internal state
         self.state = s.clone();
@@ -233,12 +229,13 @@ mod test {
         conf: &SafeKeeperConf,
         ttid: &TenantTimelineId,
     ) -> Result<(FileStorage, TimelinePersistentState)> {
-        fs::create_dir_all(conf.timeline_dir(ttid))
+        let timeline_dir = get_timeline_dir(conf, ttid);
+        fs::create_dir_all(&timeline_dir)
             .await
             .expect("failed to create timeline dir");
         Ok((
             FileStorage::restore_new(ttid, conf)?,
-            FileStorage::load_control_file_conf(conf, ttid)?,
+            FileStorage::load_control_file_from_dir(&timeline_dir)?,
         ))
     }
 
@@ -246,11 +243,11 @@ mod test {
         conf: &SafeKeeperConf,
         ttid: &TenantTimelineId,
     ) -> Result<(FileStorage, TimelinePersistentState)> {
-        fs::create_dir_all(conf.timeline_dir(ttid))
+        let timeline_dir = get_timeline_dir(conf, ttid);
+        fs::create_dir_all(&timeline_dir)
             .await
             .expect("failed to create timeline dir");
         let state = TimelinePersistentState::empty();
-        let timeline_dir = conf.timeline_dir(ttid);
         let storage = FileStorage::create_new(timeline_dir, conf, state.clone())?;
         Ok((storage, state))
     }
@@ -291,7 +288,7 @@ mod test {
                 .await
                 .expect("failed to persist state");
         }
-        let control_path = conf.timeline_dir(&ttid).join(CONTROL_FILE_NAME);
+        let control_path = get_timeline_dir(&conf, &ttid).join(CONTROL_FILE_NAME);
         let mut data = fs::read(&control_path).await.unwrap();
         data[0] += 1; // change the first byte of the file to fail checksum validation
         fs::write(&control_path, &data)
diff --git a/safekeeper/src/copy_timeline.rs b/safekeeper/src/copy_timeline.rs
index 3023d4e2cb..51cf4db6b5 100644
--- a/safekeeper/src/copy_timeline.rs
+++ b/safekeeper/src/copy_timeline.rs
@@ -15,10 +15,10 @@ use crate::{
     control_file::{FileStorage, Storage},
     pull_timeline::{create_temp_timeline_dir, load_temp_timeline, validate_temp_timeline},
     state::TimelinePersistentState,
-    timeline::{Timeline, TimelineError},
+    timeline::{FullAccessTimeline, Timeline, TimelineError},
     wal_backup::copy_s3_segments,
     wal_storage::{wal_file_paths, WalReader},
-    GlobalTimelines, SafeKeeperConf,
+    GlobalTimelines,
 };
 
 // we don't want to have more than 10 segments on disk after copy, because they take space
@@ -46,12 +46,14 @@ pub async fn handle_request(request: Request) -> Result<()> {
         }
     }
 
+    let source_tli = request.source.full_access_guard().await?;
+
     let conf = &GlobalTimelines::get_global_config();
     let ttid = request.destination_ttid;
 
     let (_tmp_dir, tli_dir_path) = create_temp_timeline_dir(conf, ttid).await?;
 
-    let (mem_state, state) = request.source.get_state().await;
+    let (mem_state, state) = source_tli.get_state().await;
     let start_lsn = state.timeline_start_lsn;
     if start_lsn == Lsn::INVALID {
         bail!("timeline is not initialized");
@@ -60,7 +62,7 @@ pub async fn handle_request(request: Request) -> Result<()> {
 
     {
         let commit_lsn = mem_state.commit_lsn;
-        let flush_lsn = request.source.get_flush_lsn().await;
+        let flush_lsn = source_tli.get_flush_lsn().await;
 
         info!(
             "collected info about source timeline: start_lsn={}, backup_lsn={}, commit_lsn={}, flush_lsn={}",
@@ -127,10 +129,8 @@ pub async fn handle_request(request: Request) -> Result<()> {
     .await?;
 
     copy_disk_segments(
-        conf,
-        &state,
+        &source_tli,
         wal_seg_size,
-        &request.source.ttid,
         new_backup_lsn,
         request.until_lsn,
         &tli_dir_path,
@@ -159,21 +159,13 @@ pub async fn handle_request(request: Request) -> Result<()> {
 }
 
 async fn copy_disk_segments(
-    conf: &SafeKeeperConf,
-    persisted_state: &TimelinePersistentState,
+    tli: &FullAccessTimeline,
     wal_seg_size: usize,
-    source_ttid: &TenantTimelineId,
     start_lsn: Lsn,
     end_lsn: Lsn,
     tli_dir_path: &Utf8PathBuf,
 ) -> Result<()> {
-    let mut wal_reader = WalReader::new(
-        conf.workdir.clone(),
-        conf.timeline_dir(source_ttid),
-        persisted_state,
-        start_lsn,
-        true,
-    )?;
+    let mut wal_reader = tli.get_walreader(start_lsn).await?;
 
     let mut buf = [0u8; MAX_SEND_SIZE];
 
diff --git a/safekeeper/src/debug_dump.rs b/safekeeper/src/debug_dump.rs
index b50f2e1158..062ff4b3db 100644
--- a/safekeeper/src/debug_dump.rs
+++ b/safekeeper/src/debug_dump.rs
@@ -10,6 +10,7 @@ use std::sync::Arc;
 use anyhow::bail;
 use anyhow::Result;
 use camino::Utf8Path;
+use camino::Utf8PathBuf;
 use chrono::{DateTime, Utc};
 use postgres_ffi::XLogSegNo;
 use postgres_ffi::MAX_SEND_SIZE;
@@ -26,7 +27,8 @@ use crate::safekeeper::TermHistory;
 use crate::send_wal::WalSenderState;
 use crate::state::TimelineMemState;
 use crate::state::TimelinePersistentState;
-use crate::wal_storage::WalReader;
+use crate::timeline::get_timeline_dir;
+use crate::timeline::FullAccessTimeline;
 use crate::GlobalTimelines;
 use crate::SafeKeeperConf;
 
@@ -68,6 +70,7 @@ pub struct Response {
 pub struct TimelineDumpSer {
     pub tli: Arc<crate::timeline::Timeline>,
     pub args: Args,
+    pub timeline_dir: Utf8PathBuf,
     pub runtime: Arc<tokio::runtime::Runtime>,
 }
 
@@ -85,14 +88,20 @@ impl Serialize for TimelineDumpSer {
     where
         S: serde::Serializer,
     {
-        let dump = self
-            .runtime
-            .block_on(build_from_tli_dump(self.tli.clone(), self.args.clone()));
+        let dump = self.runtime.block_on(build_from_tli_dump(
+            &self.tli,
+            &self.args,
+            &self.timeline_dir,
+        ));
         dump.serialize(serializer)
     }
 }
 
-async fn build_from_tli_dump(timeline: Arc<crate::timeline::Timeline>, args: Args) -> Timeline {
+async fn build_from_tli_dump(
+    timeline: &Arc<crate::timeline::Timeline>,
+    args: &Args,
+    timeline_dir: &Utf8Path,
+) -> Timeline {
     let control_file = if args.dump_control_file {
         let mut state = timeline.get_state().await.1;
         if !args.dump_term_history {
@@ -112,7 +121,8 @@ async fn build_from_tli_dump(timeline: Arc<crate::timeline::Timeline>, args: Arg
     let disk_content = if args.dump_disk_content {
         // build_disk_content can fail, but we don't want to fail the whole
         // request because of that.
-        build_disk_content(&timeline.timeline_dir).ok()
+        // Note: timeline can be in offloaded state, this is not a problem.
+        build_disk_content(timeline_dir).ok()
     } else {
         None
     };
@@ -186,6 +196,7 @@ pub struct FileInfo {
 pub async fn build(args: Args) -> Result<Response> {
     let start_time = Utc::now();
     let timelines_count = GlobalTimelines::timelines_count();
+    let config = GlobalTimelines::get_global_config();
 
     let ptrs_snapshot = if args.tenant_id.is_some() && args.timeline_id.is_some() {
         // If both tenant_id and timeline_id are specified, we can just get the
@@ -223,12 +234,11 @@ pub async fn build(args: Args) -> Result<Response> {
         timelines.push(TimelineDumpSer {
             tli,
             args: args.clone(),
+            timeline_dir: get_timeline_dir(&config, &ttid),
             runtime: runtime.clone(),
         });
     }
 
-    let config = GlobalTimelines::get_global_config();
-
     Ok(Response {
         start_time,
         finish_time: Utc::now(),
@@ -316,27 +326,19 @@ pub struct TimelineDigest {
 }
 
 pub async fn calculate_digest(
-    tli: &Arc<crate::timeline::Timeline>,
+    tli: &FullAccessTimeline,
     request: TimelineDigestRequest,
 ) -> Result<TimelineDigest> {
     if request.from_lsn > request.until_lsn {
         bail!("from_lsn is greater than until_lsn");
     }
 
-    let conf = GlobalTimelines::get_global_config();
     let (_, persisted_state) = tli.get_state().await;
-
     if persisted_state.timeline_start_lsn > request.from_lsn {
         bail!("requested LSN is before the start of the timeline");
     }
 
-    let mut wal_reader = WalReader::new(
-        conf.workdir.clone(),
-        tli.timeline_dir.clone(),
-        &persisted_state,
-        request.from_lsn,
-        true,
-    )?;
+    let mut wal_reader = tli.get_walreader(request.from_lsn).await?;
 
     let mut hasher = Sha256::new();
     let mut buf = [0u8; MAX_SEND_SIZE];
diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs
index 4aacd3421d..1e29b21fac 100644
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -85,11 +85,11 @@ impl From<TermSwitchApiEntry> for TermLsn {
     }
 }
 
-/// Augment AcceptorState with epoch for convenience
+/// Augment AcceptorState with last_log_term for convenience
 #[derive(Debug, Serialize, Deserialize)]
 pub struct AcceptorStateStatus {
     pub term: Term,
-    pub epoch: Term,
+    pub epoch: Term, // aka last_log_term
     pub term_history: Vec<TermSwitchApiEntry>,
 }
 
@@ -130,7 +130,7 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
     let (inmem, state) = tli.get_state().await;
     let flush_lsn = tli.get_flush_lsn().await;
 
-    let epoch = state.acceptor_state.get_epoch(flush_lsn);
+    let last_log_term = state.acceptor_state.get_last_log_term(flush_lsn);
     let term_history = state
         .acceptor_state
         .term_history
@@ -143,7 +143,7 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
         .collect();
     let acc_state = AcceptorStateStatus {
         term: state.acceptor_state.term,
-        epoch,
+        epoch: last_log_term,
         term_history,
     };
 
@@ -249,6 +249,10 @@ async fn timeline_digest_handler(request: Request<Body>) -> Result<Response<Body
     };
 
     let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
+    let tli = tli
+        .full_access_guard()
+        .await
+        .map_err(ApiError::InternalServerError)?;
 
     let response = debug_dump::calculate_digest(&tli, request)
         .await
@@ -268,8 +272,12 @@ async fn timeline_files_handler(request: Request<Body>) -> Result<Response<Body>
     let filename: String = parse_request_param(&request, "filename")?;
 
     let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
+    let tli = tli
+        .full_access_guard()
+        .await
+        .map_err(ApiError::InternalServerError)?;
 
-    let filepath = tli.timeline_dir.join(filename);
+    let filepath = tli.get_timeline_dir().join(filename);
     let mut file = File::open(&filepath)
         .await
         .map_err(|e| ApiError::InternalServerError(e.into()))?;
@@ -287,7 +295,7 @@ async fn timeline_files_handler(request: Request<Body>) -> Result<Response<Body>
         .map_err(|e| ApiError::InternalServerError(e.into()))
 }
 
-/// Force persist control file and remove old WAL.
+/// Force persist control file.
 async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permission(&request, None)?;
 
@@ -297,13 +305,13 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<
     );
 
     let tli = GlobalTimelines::get(ttid)?;
-    tli.maybe_persist_control_file(true)
+    tli.write_shared_state()
+        .await
+        .sk
+        .state
+        .flush()
         .await
         .map_err(ApiError::InternalServerError)?;
-    tli.remove_old_wal()
-        .await
-        .map_err(ApiError::InternalServerError)?;
-
     json_response(StatusCode::OK, ())
 }
 
diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs
index 32d5889803..27e54776e0 100644
--- a/safekeeper/src/json_ctrl.rs
+++ b/safekeeper/src/json_ctrl.rs
@@ -6,8 +6,6 @@
 //! modifications in tests.
 //!
 
-use std::sync::Arc;
-
 use anyhow::Context;
 use bytes::Bytes;
 use postgres_backend::QueryError;
@@ -23,7 +21,7 @@ use crate::safekeeper::{
 };
 use crate::safekeeper::{Term, TermHistory, TermLsn};
 use crate::state::TimelinePersistentState;
-use crate::timeline::Timeline;
+use crate::timeline::FullAccessTimeline;
 use crate::GlobalTimelines;
 use postgres_backend::PostgresBackend;
 use postgres_ffi::encode_logical_message;
@@ -104,8 +102,8 @@ pub async fn handle_json_ctrl<IO: AsyncRead + AsyncWrite + Unpin>(
 async fn prepare_safekeeper(
     ttid: TenantTimelineId,
     pg_version: u32,
-) -> anyhow::Result<Arc<Timeline>> {
-    GlobalTimelines::create(
+) -> anyhow::Result<FullAccessTimeline> {
+    let tli = GlobalTimelines::create(
         ttid,
         ServerInfo {
             pg_version,
@@ -115,10 +113,16 @@ async fn prepare_safekeeper(
         Lsn::INVALID,
         Lsn::INVALID,
     )
-    .await
+    .await?;
+
+    tli.full_access_guard().await
 }
 
-async fn send_proposer_elected(tli: &Arc<Timeline>, term: Term, lsn: Lsn) -> anyhow::Result<()> {
+async fn send_proposer_elected(
+    tli: &FullAccessTimeline,
+    term: Term,
+    lsn: Lsn,
+) -> anyhow::Result<()> {
     // add new term to existing history
     let history = tli.get_state().await.1.acceptor_state.term_history;
     let history = history.up_to(lsn.checked_sub(1u64).unwrap());
@@ -147,7 +151,7 @@ pub struct InsertedWAL {
 /// Extend local WAL with new LogicalMessage record. To do that,
 /// create AppendRequest with new WAL and pass it to safekeeper.
 pub async fn append_logical_message(
-    tli: &Arc<Timeline>,
+    tli: &FullAccessTimeline,
     msg: &AppendLogicalMessage,
 ) -> anyhow::Result<InsertedWAL> {
     let wal_data = encode_logical_message(&msg.lm_prefix, &msg.lm_message);
@@ -165,7 +169,7 @@ pub async fn append_logical_message(
     let append_request = ProposerAcceptorMessage::AppendRequest(AppendRequest {
         h: AppendRequestHeader {
             term: msg.term,
-            epoch_start_lsn: begin_lsn,
+            term_start_lsn: begin_lsn,
             begin_lsn,
             end_lsn,
             commit_lsn,
diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs
index 8d8d2cf23e..1a56ff736c 100644
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -7,10 +7,7 @@ use tokio::runtime::Runtime;
 use std::time::Duration;
 use storage_broker::Uri;
 
-use utils::{
-    auth::SwappableJwtAuth,
-    id::{NodeId, TenantId, TenantTimelineId},
-};
+use utils::{auth::SwappableJwtAuth, id::NodeId};
 
 mod auth;
 pub mod broker;
@@ -89,15 +86,6 @@ pub struct SafeKeeperConf {
 }
 
 impl SafeKeeperConf {
-    pub fn tenant_dir(&self, tenant_id: &TenantId) -> Utf8PathBuf {
-        self.workdir.join(tenant_id.to_string())
-    }
-
-    pub fn timeline_dir(&self, ttid: &TenantTimelineId) -> Utf8PathBuf {
-        self.tenant_dir(&ttid.tenant_id)
-            .join(ttid.timeline_id.to_string())
-    }
-
     pub fn is_wal_backup_enabled(&self) -> bool {
         self.remote_storage.is_some() && self.wal_backup_enabled
     }
diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs
index f7cc40f58a..7b41c98cb8 100644
--- a/safekeeper/src/pull_timeline.rs
+++ b/safekeeper/src/pull_timeline.rs
@@ -17,7 +17,7 @@ use utils::{
 use crate::{
     control_file, debug_dump,
     http::routes::TimelineStatus,
-    timeline::{Timeline, TimelineError},
+    timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError},
     wal_storage::{self, Storage},
     GlobalTimelines, SafeKeeperConf,
 };
@@ -283,13 +283,13 @@ pub async fn load_temp_timeline(
     }
 
     // Move timeline dir to the correct location
-    let timeline_path = conf.timeline_dir(&ttid);
+    let timeline_path = get_timeline_dir(conf, &ttid);
 
     info!(
         "moving timeline {} from {} to {}",
         ttid, tmp_path, timeline_path
     );
-    tokio::fs::create_dir_all(conf.tenant_dir(&ttid.tenant_id)).await?;
+    tokio::fs::create_dir_all(get_tenant_dir(conf, &ttid.tenant_id)).await?;
     tokio::fs::rename(tmp_path, &timeline_path).await?;
 
     let tli = GlobalTimelines::load_timeline(&guard, ttid)
diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs
index 03cfa882c4..7943a2fd86 100644
--- a/safekeeper/src/receive_wal.rs
+++ b/safekeeper/src/receive_wal.rs
@@ -6,7 +6,7 @@ use crate::handler::SafekeeperPostgresHandler;
 use crate::safekeeper::AcceptorProposerMessage;
 use crate::safekeeper::ProposerAcceptorMessage;
 use crate::safekeeper::ServerInfo;
-use crate::timeline::Timeline;
+use crate::timeline::FullAccessTimeline;
 use crate::wal_service::ConnectionId;
 use crate::GlobalTimelines;
 use anyhow::{anyhow, Context};
@@ -213,7 +213,7 @@ impl SafekeeperPostgresHandler {
         &mut self,
         pgb: &mut PostgresBackend<IO>,
     ) -> Result<(), QueryError> {
-        let mut tli: Option<Arc<Timeline>> = None;
+        let mut tli: Option<FullAccessTimeline> = None;
         if let Err(end) = self.handle_start_wal_push_guts(pgb, &mut tli).await {
             // Log the result and probably send it to the client, closing the stream.
             let handle_end_fut = pgb.handle_copy_stream_end(end);
@@ -233,7 +233,7 @@ impl SafekeeperPostgresHandler {
     pub async fn handle_start_wal_push_guts<IO: AsyncRead + AsyncWrite + Unpin>(
         &mut self,
         pgb: &mut PostgresBackend<IO>,
-        tli: &mut Option<Arc<Timeline>>,
+        tli: &mut Option<FullAccessTimeline>,
     ) -> Result<(), CopyStreamHandlerEnd> {
         // Notify the libpq client that it's allowed to send `CopyData` messages
         pgb.write_message(&BeMessage::CopyBothResponse).await?;
@@ -323,7 +323,7 @@ struct NetworkReader<'a, IO> {
 impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> {
     async fn read_first_message(
         &mut self,
-    ) -> Result<(Arc<Timeline>, ProposerAcceptorMessage), CopyStreamHandlerEnd> {
+    ) -> Result<(FullAccessTimeline, ProposerAcceptorMessage), CopyStreamHandlerEnd> {
         // Receive information about server to create timeline, if not yet.
         let next_msg = read_message(self.pgb_reader).await?;
         let tli = match next_msg {
@@ -337,7 +337,10 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> {
                     system_id: greeting.system_id,
                     wal_seg_size: greeting.wal_seg_size,
                 };
-                GlobalTimelines::create(self.ttid, server_info, Lsn::INVALID, Lsn::INVALID).await?
+                let tli =
+                    GlobalTimelines::create(self.ttid, server_info, Lsn::INVALID, Lsn::INVALID)
+                        .await?;
+                tli.full_access_guard().await?
             }
             _ => {
                 return Err(CopyStreamHandlerEnd::Other(anyhow::anyhow!(
@@ -353,7 +356,7 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> {
         msg_tx: Sender<ProposerAcceptorMessage>,
         msg_rx: Receiver<ProposerAcceptorMessage>,
         reply_tx: Sender<AcceptorProposerMessage>,
-        tli: Arc<Timeline>,
+        tli: FullAccessTimeline,
         next_msg: ProposerAcceptorMessage,
     ) -> Result<(), CopyStreamHandlerEnd> {
         *self.acceptor_handle = Some(WalAcceptor::spawn(
@@ -448,7 +451,7 @@ const KEEPALIVE_INTERVAL: Duration = Duration::from_secs(1);
 /// replies to reply_tx; reading from socket and writing to disk in parallel is
 /// beneficial for performance, this struct provides writing to disk part.
 pub struct WalAcceptor {
-    tli: Arc<Timeline>,
+    tli: FullAccessTimeline,
     msg_rx: Receiver<ProposerAcceptorMessage>,
     reply_tx: Sender<AcceptorProposerMessage>,
     conn_id: Option<ConnectionId>,
@@ -461,7 +464,7 @@ impl WalAcceptor {
     ///
     /// conn_id None means WalAcceptor is used by recovery initiated at this safekeeper.
     pub fn spawn(
-        tli: Arc<Timeline>,
+        tli: FullAccessTimeline,
         msg_rx: Receiver<ProposerAcceptorMessage>,
         reply_tx: Sender<AcceptorProposerMessage>,
         conn_id: Option<ConnectionId>,
diff --git a/safekeeper/src/recovery.rs b/safekeeper/src/recovery.rs
index dfa1892c40..80a630b1e1 100644
--- a/safekeeper/src/recovery.rs
+++ b/safekeeper/src/recovery.rs
@@ -2,7 +2,7 @@
 //! provide it, i.e. safekeeper lags too much.
 
 use std::time::SystemTime;
-use std::{fmt, pin::pin, sync::Arc};
+use std::{fmt, pin::pin};
 
 use anyhow::{bail, Context};
 use futures::StreamExt;
@@ -21,6 +21,7 @@ use utils::{id::NodeId, lsn::Lsn, postgres_client::wal_stream_connection_config}
 
 use crate::receive_wal::{WalAcceptor, REPLY_QUEUE_SIZE};
 use crate::safekeeper::{AppendRequest, AppendRequestHeader};
+use crate::timeline::FullAccessTimeline;
 use crate::{
     http::routes::TimelineStatus,
     receive_wal::MSG_QUEUE_SIZE,
@@ -28,14 +29,14 @@ use crate::{
         AcceptorProposerMessage, ProposerAcceptorMessage, ProposerElected, Term, TermHistory,
         TermLsn, VoteRequest,
     },
-    timeline::{PeerInfo, Timeline},
+    timeline::PeerInfo,
     SafeKeeperConf,
 };
 
 /// Entrypoint for per timeline task which always runs, checking whether
 /// recovery for this safekeeper is needed and starting it if so.
 #[instrument(name = "recovery task", skip_all, fields(ttid = %tli.ttid))]
-pub async fn recovery_main(tli: Arc<Timeline>, conf: SafeKeeperConf) {
+pub async fn recovery_main(tli: FullAccessTimeline, conf: SafeKeeperConf) {
     info!("started");
 
     let cancel = tli.cancel.clone();
@@ -47,6 +48,87 @@ pub async fn recovery_main(tli: Arc<Timeline>, conf: SafeKeeperConf) {
     }
 }
 
+/// Should we start fetching WAL from a peer safekeeper, and if yes, from
+/// which? Answer is yes, i.e. .donors is not empty if 1) there is something
+/// to fetch, and we can do that without running elections; 2) there is no
+/// actively streaming compute, as we don't want to compete with it.
+///
+/// If donor(s) are chosen, their last_log_term is guaranteed to be equal
+/// to their term, so we are sure such a leader has actually been elected.
+///
+/// All possible donors are returned so that we could keep connection to the
+/// current one if it is good even if it slightly lags behind.
+///
+/// Note that term conditions above might be not met, but safekeepers are
+/// still not aligned on last flush_lsn. Generally in this case until
+/// elections are run it is not possible to say which safekeeper should
+/// recover from which one -- history which would be committed is different
+/// depending on assembled quorum (e.g. classic picture 8 from Raft paper).
+/// Thus we don't try to predict it here.
+async fn recovery_needed(
+    tli: &FullAccessTimeline,
+    heartbeat_timeout: Duration,
+) -> RecoveryNeededInfo {
+    let ss = tli.read_shared_state().await;
+    let term = ss.sk.state.acceptor_state.term;
+    let last_log_term = ss.sk.get_last_log_term();
+    let flush_lsn = ss.sk.flush_lsn();
+    // note that peers contain myself, but that's ok -- we are interested only in peers which are strictly ahead of us.
+    let mut peers = ss.get_peers(heartbeat_timeout);
+    // Sort by <last log term, lsn> pairs.
+    peers.sort_by(|p1, p2| {
+        let tl1 = TermLsn {
+            term: p1.last_log_term,
+            lsn: p1.flush_lsn,
+        };
+        let tl2 = TermLsn {
+            term: p2.last_log_term,
+            lsn: p2.flush_lsn,
+        };
+        tl2.cmp(&tl1) // desc
+    });
+    let num_streaming_computes = tli.get_walreceivers().get_num_streaming();
+    let donors = if num_streaming_computes > 0 {
+        vec![] // If there is a streaming compute, don't try to recover to not intervene.
+    } else {
+        peers
+            .iter()
+            .filter_map(|candidate| {
+                // Are we interested in this candidate?
+                let candidate_tl = TermLsn {
+                    term: candidate.last_log_term,
+                    lsn: candidate.flush_lsn,
+                };
+                let my_tl = TermLsn {
+                    term: last_log_term,
+                    lsn: flush_lsn,
+                };
+                if my_tl < candidate_tl {
+                    // Yes, we are interested. Can we pull from it without
+                    // (re)running elections? It is possible if 1) his term
+                    // is equal to his last_log_term so we could act on
+                    // behalf of leader of this term (we must be sure he was
+                    // ever elected) and 2) our term is not higher, or we'll refuse data.
+                    if candidate.term == candidate.last_log_term && candidate.term >= term {
+                        Some(Donor::from(candidate))
+                    } else {
+                        None
+                    }
+                } else {
+                    None
+                }
+            })
+            .collect()
+    };
+    RecoveryNeededInfo {
+        term,
+        last_log_term,
+        flush_lsn,
+        peers,
+        num_streaming_computes,
+        donors,
+    }
+}
 /// Result of Timeline::recovery_needed, contains donor(s) if recovery needed and
 /// fields to explain the choice.
 #[derive(Debug)]
@@ -113,10 +195,10 @@ impl From<&PeerInfo> for Donor {
 const CHECK_INTERVAL_MS: u64 = 2000;
 
 /// Check regularly whether we need to start recovery.
-async fn recovery_main_loop(tli: Arc<Timeline>, conf: SafeKeeperConf) {
+async fn recovery_main_loop(tli: FullAccessTimeline, conf: SafeKeeperConf) {
     let check_duration = Duration::from_millis(CHECK_INTERVAL_MS);
     loop {
-        let recovery_needed_info = tli.recovery_needed(conf.heartbeat_timeout).await;
+        let recovery_needed_info = recovery_needed(&tli, conf.heartbeat_timeout).await;
         match recovery_needed_info.donors.first() {
             Some(donor) => {
                 info!(
@@ -146,7 +228,7 @@ async fn recovery_main_loop(tli: Arc<Timeline>, conf: SafeKeeperConf) {
 /// Recover from the specified donor. Returns message explaining normal finish
 /// reason or error.
 async fn recover(
-    tli: Arc<Timeline>,
+    tli: FullAccessTimeline,
     donor: &Donor,
     conf: &SafeKeeperConf,
 ) -> anyhow::Result<String> {
@@ -232,7 +314,7 @@ async fn recover(
 
 // Pull WAL from donor, assuming handshake is already done.
 async fn recovery_stream(
-    tli: Arc<Timeline>,
+    tli: FullAccessTimeline,
     donor: &Donor,
     start_streaming_at: Lsn,
     conf: &SafeKeeperConf,
@@ -316,7 +398,7 @@ async fn network_io(
     physical_stream: ReplicationStream,
     msg_tx: Sender<ProposerAcceptorMessage>,
     donor: Donor,
-    tli: Arc<Timeline>,
+    tli: FullAccessTimeline,
     conf: SafeKeeperConf,
 ) -> anyhow::Result<Option<String>> {
     let mut physical_stream = pin!(physical_stream);
@@ -337,7 +419,7 @@ async fn network_io(
             ReplicationMessage::XLogData(xlog_data) => {
                 let ar_hdr = AppendRequestHeader {
                     term: donor.term,
-                    epoch_start_lsn: Lsn::INVALID, // unused
+                    term_start_lsn: Lsn::INVALID, // unused
                     begin_lsn: Lsn(xlog_data.wal_start()),
                     end_lsn: Lsn(xlog_data.wal_start()) + xlog_data.data().len() as u64,
                     commit_lsn: Lsn::INVALID, // do not attempt to advance, peer communication anyway does it
@@ -365,7 +447,7 @@ async fn network_io(
             }
             ReplicationMessage::PrimaryKeepAlive(_) => {
                 // keepalive means nothing is being streamed for a while. Check whether we need to stop.
-                let recovery_needed_info = tli.recovery_needed(conf.heartbeat_timeout).await;
+                let recovery_needed_info = recovery_needed(&tli, conf.heartbeat_timeout).await;
                 // do current donors still contain one we currently connected to?
                 if !recovery_needed_info
                     .donors
diff --git a/safekeeper/src/remove_wal.rs b/safekeeper/src/remove_wal.rs
index 3400eee9b7..b661e48cb5 100644
--- a/safekeeper/src/remove_wal.rs
+++ b/safekeeper/src/remove_wal.rs
@@ -1,41 +1,25 @@
-//! Thread removing old WAL.
+use utils::lsn::Lsn;
 
-use std::time::Duration;
+use crate::timeline_manager::StateSnapshot;
 
-use tokio::time::sleep;
-use tracing::*;
+/// Get oldest LSN we still need to keep. We hold WAL till it is consumed
+/// by all of 1) pageserver (remote_consistent_lsn) 2) peers 3) s3
+/// offloading.
+/// While it would be safe to use in-memory values to determine the horizon,
+/// we use the persisted ones so that possible normal states are less surprising.
+/// All segments covering LSNs before horizon_lsn can be removed.
+pub fn calc_horizon_lsn(state: &StateSnapshot, extra_horizon_lsn: Option<Lsn>) -> Lsn {
+    use std::cmp::min;
 
-use crate::{GlobalTimelines, SafeKeeperConf};
-
-pub async fn task_main(_conf: SafeKeeperConf) -> anyhow::Result<()> {
-    let wal_removal_interval = Duration::from_millis(5000);
-    loop {
-        let now = tokio::time::Instant::now();
-        let tlis = GlobalTimelines::get_all();
-        for tli in &tlis {
-            let ttid = tli.ttid;
-            async {
-                if let Err(e) = tli.maybe_persist_control_file(false).await {
-                    warn!("failed to persist control file: {e}");
-                }
-                if let Err(e) = tli.remove_old_wal().await {
-                    error!("failed to remove WAL: {}", e);
-                }
-            }
-            .instrument(info_span!("WAL removal", ttid = %ttid))
-            .await;
-        }
-
-        let elapsed = now.elapsed();
-        let total_timelines = tlis.len();
-
-        if elapsed > wal_removal_interval {
-            info!(
-                "WAL removal is too long, processed {} timelines in {:?}",
-                total_timelines, elapsed
-            );
-        }
-
-        sleep(wal_removal_interval).await;
+    let mut horizon_lsn = min(
+        state.cfile_remote_consistent_lsn,
+        state.cfile_peer_horizon_lsn,
+    );
+    // we don't want to remove WAL that is not yet offloaded to s3
+    horizon_lsn = min(horizon_lsn, state.cfile_backup_lsn);
+    if let Some(extra_horizon_lsn) = extra_horizon_lsn {
+        horizon_lsn = min(horizon_lsn, extra_horizon_lsn);
     }
+
+    horizon_lsn
 }
diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs
index 2a620f5fef..563dbbe315 100644
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -10,7 +10,6 @@ use std::cmp::max;
 use std::cmp::min;
 use std::fmt;
 use std::io::Read;
-use std::time::Duration;
 use storage_broker::proto::SafekeeperTimelineInfo;
 
 use tracing::*;
@@ -188,8 +187,8 @@ pub struct AcceptorState {
 }
 
 impl AcceptorState {
-    /// acceptor's epoch is the term of the highest entry in the log
-    pub fn get_epoch(&self, flush_lsn: Lsn) -> Term {
+    /// acceptor's last_log_term is the term of the highest entry in the log
+    pub fn get_last_log_term(&self, flush_lsn: Lsn) -> Term {
         let th = self.term_history.up_to(flush_lsn);
         match th.0.last() {
             Some(e) => e.term,
@@ -305,9 +304,9 @@ pub struct AppendRequest {
 pub struct AppendRequestHeader {
     // safekeeper's current term; if it is higher than proposer's, the compute is out of date.
     pub term: Term,
-    // TODO: remove this field, it in unused -- LSN of term switch can be taken
-    // from ProposerElected (as well as from term history).
-    pub epoch_start_lsn: Lsn,
+    // TODO: remove this field from the protocol, it is unused -- LSN of term
+    // switch can be taken from ProposerElected (as well as from term history).
+    pub term_start_lsn: Lsn,
     /// start position of message in WAL
     pub begin_lsn: Lsn,
     /// end position of message in WAL
@@ -326,9 +325,10 @@ pub struct AppendResponse {
     // Current term of the safekeeper; if it is higher than proposer's, the
     // compute is out of date.
     pub term: Term,
-    // NOTE: this is physical end of wal on safekeeper; currently it doesn't
-    // make much sense without taking epoch into account, as history can be
-    // diverged.
+    // Flushed end of WAL on safekeeper; one should always be mindful of which
+    // term history this value comes from, either checking history directly or
+    // observing term being set to one for which WAL truncation is known to have
+    // happened.
     pub flush_lsn: Lsn,
     // We report back our awareness about which WAL is committed, as this is
     // a criterion for walproposer --sync mode exit
@@ -482,8 +482,8 @@ impl AcceptorProposerMessage {
 /// - messages from broker peers
 pub struct SafeKeeper<CTRL: control_file::Storage, WAL: wal_storage::Storage> {
     /// LSN since the proposer safekeeper currently talking to appends WAL;
-    /// determines epoch switch point.
-    pub epoch_start_lsn: Lsn,
+    /// determines last_log_term switch point.
+    pub term_start_lsn: Lsn,
 
     pub state: TimelineState<CTRL>, // persistent state storage
     pub wal_store: WAL,
@@ -511,7 +511,7 @@ where
         }
 
         Ok(SafeKeeper {
-            epoch_start_lsn: Lsn(0),
+            term_start_lsn: Lsn(0),
             state: TimelineState::new(state),
             wal_store,
             node_id,
@@ -531,8 +531,10 @@ where
         self.state.acceptor_state.term
     }
 
-    pub fn get_epoch(&self) -> Term {
-        self.state.acceptor_state.get_epoch(self.flush_lsn())
+    pub fn get_last_log_term(&self) -> Term {
+        self.state
+            .acceptor_state
+            .get_last_log_term(self.flush_lsn())
     }
 
     /// wal_store wrapper avoiding commit_lsn <= flush_lsn violation when we don't have WAL yet.
@@ -713,7 +715,7 @@ where
         // proceed, but to prevent commit_lsn surprisingly going down we should
         // either refuse the session (simpler) or skip the part we already have
         // from the stream (can be implemented).
-        if msg.term == self.get_epoch() && self.flush_lsn() > msg.start_streaming_at {
+        if msg.term == self.get_last_log_term() && self.flush_lsn() > msg.start_streaming_at {
             bail!("refusing ProposerElected which is going to overwrite correct WAL: term={}, flush_lsn={}, start_streaming_at={}; restarting the handshake should help",
                    msg.term, self.flush_lsn(), msg.start_streaming_at)
         }
@@ -788,7 +790,7 @@ where
         // Cache LSN where term starts to immediately fsync control file with
         // commit_lsn once we reach it -- sync-safekeepers finishes when
         // persisted commit_lsn on majority of safekeepers aligns.
-        self.epoch_start_lsn = match msg.term_history.0.last() {
+        self.term_start_lsn = match msg.term_history.0.last() {
             None => bail!("proposer elected with empty term history"),
             Some(term_lsn_start) => term_lsn_start.lsn,
         };
@@ -814,35 +816,17 @@ where
 
         self.state.inmem.commit_lsn = commit_lsn;
 
-        // If new commit_lsn reached epoch switch, force sync of control
+        // If new commit_lsn reached term switch, force sync of control
         // file: walproposer in sync mode is very interested when this
         // happens. Note: this is for sync-safekeepers mode only, as
-        // otherwise commit_lsn might jump over epoch_start_lsn.
-        if commit_lsn >= self.epoch_start_lsn && self.state.commit_lsn < self.epoch_start_lsn {
+        // otherwise commit_lsn might jump over term_start_lsn.
+        if commit_lsn >= self.term_start_lsn && self.state.commit_lsn < self.term_start_lsn {
             self.state.flush().await?;
         }
 
         Ok(())
     }
 
-    /// Persist control file if there is something to save and enough time
-    /// passed after the last save.
-    pub async fn maybe_persist_inmem_control_file(&mut self, force: bool) -> Result<bool> {
-        const CF_SAVE_INTERVAL: Duration = Duration::from_secs(300);
-        if !force && self.state.pers.last_persist_at().elapsed() < CF_SAVE_INTERVAL {
-            return Ok(false);
-        }
-        let need_persist = self.state.inmem.commit_lsn > self.state.commit_lsn
-            || self.state.inmem.backup_lsn > self.state.backup_lsn
-            || self.state.inmem.peer_horizon_lsn > self.state.peer_horizon_lsn
-            || self.state.inmem.remote_consistent_lsn > self.state.remote_consistent_lsn;
-        if need_persist {
-            self.state.flush().await?;
-            trace!("saved control file: {CF_SAVE_INTERVAL:?} passed");
-        }
-        Ok(need_persist)
-    }
-
     /// Handle request to append WAL.
     #[allow(clippy::comparison_chain)]
     async fn handle_append_request(
@@ -933,7 +917,7 @@ where
             // Note: the check is too restrictive, generally we can update local
             // commit_lsn if our history matches (is part of) history of advanced
             // commit_lsn provider.
-            if sk_info.last_log_term == self.get_epoch() {
+            if sk_info.last_log_term == self.get_last_log_term() {
                 self.update_commit_lsn(Lsn(sk_info.commit_lsn)).await?;
             }
         }
@@ -1079,7 +1063,7 @@ mod tests {
     }
 
     #[tokio::test]
-    async fn test_epoch_switch() {
+    async fn test_last_log_term_switch() {
         let storage = InMemoryState {
             persisted_state: test_sk_state(),
         };
@@ -1089,7 +1073,7 @@ mod tests {
 
         let mut ar_hdr = AppendRequestHeader {
             term: 1,
-            epoch_start_lsn: Lsn(3),
+            term_start_lsn: Lsn(3),
             begin_lsn: Lsn(1),
             end_lsn: Lsn(2),
             commit_lsn: Lsn(0),
@@ -1114,14 +1098,14 @@ mod tests {
             .await
             .unwrap();
 
-        // check that AppendRequest before epochStartLsn doesn't switch epoch
+        // check that AppendRequest before term_start_lsn doesn't switch last_log_term.
         let resp = sk
             .process_msg(&ProposerAcceptorMessage::AppendRequest(append_request))
             .await;
         assert!(resp.is_ok());
-        assert_eq!(sk.get_epoch(), 0);
+        assert_eq!(sk.get_last_log_term(), 0);
 
-        // but record at epochStartLsn does the switch
+        // but record at term_start_lsn does the switch
         ar_hdr.begin_lsn = Lsn(2);
         ar_hdr.end_lsn = Lsn(3);
         append_request = AppendRequest {
@@ -1133,7 +1117,7 @@ mod tests {
             .await;
         assert!(resp.is_ok());
         sk.wal_store.truncate_wal(Lsn(3)).await.unwrap(); // imitate the complete record at 3 %)
-        assert_eq!(sk.get_epoch(), 1);
+        assert_eq!(sk.get_last_log_term(), 1);
     }
 
     #[test]
diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs
index 5a9745e1c9..df75893838 100644
--- a/safekeeper/src/send_wal.rs
+++ b/safekeeper/src/send_wal.rs
@@ -5,7 +5,7 @@ use crate::handler::SafekeeperPostgresHandler;
 use crate::metrics::RECEIVED_PS_FEEDBACKS;
 use crate::receive_wal::WalReceivers;
 use crate::safekeeper::{Term, TermLsn};
-use crate::timeline::Timeline;
+use crate::timeline::FullAccessTimeline;
 use crate::wal_service::ConnectionId;
 use crate::wal_storage::WalReader;
 use crate::GlobalTimelines;
@@ -387,8 +387,10 @@ impl SafekeeperPostgresHandler {
         term: Option<Term>,
     ) -> Result<(), QueryError> {
         let tli = GlobalTimelines::get(self.ttid).map_err(|e| QueryError::Other(e.into()))?;
+        let full_access = tli.full_access_guard().await?;
+
         if let Err(end) = self
-            .handle_start_replication_guts(pgb, start_pos, term, tli.clone())
+            .handle_start_replication_guts(pgb, start_pos, term, full_access)
             .await
         {
             let info = tli.get_safekeeper_info(&self.conf).await;
@@ -405,7 +407,7 @@ impl SafekeeperPostgresHandler {
         pgb: &mut PostgresBackend<IO>,
         start_pos: Lsn,
         term: Option<Term>,
-        tli: Arc<Timeline>,
+        tli: FullAccessTimeline,
     ) -> Result<(), CopyStreamHandlerEnd> {
         let appname = self.appname.clone();
 
@@ -448,14 +450,7 @@ impl SafekeeperPostgresHandler {
         // switch to copy
         pgb.write_message(&BeMessage::CopyBothResponse).await?;
 
-        let (_, persisted_state) = tli.get_state().await;
-        let wal_reader = WalReader::new(
-            self.conf.workdir.clone(),
-            self.conf.timeline_dir(&tli.ttid),
-            &persisted_state,
-            start_pos,
-            self.conf.is_wal_backup_enabled(),
-        )?;
+        let wal_reader = tli.get_walreader(start_pos).await?;
 
         // Split to concurrently receive and send data; replies are generally
         // not synchronized with sends, so this avoids deadlocks.
@@ -532,7 +527,7 @@ impl EndWatch {
 /// A half driving sending WAL.
 struct WalSender<'a, IO> {
     pgb: &'a mut PostgresBackend<IO>,
-    tli: Arc<Timeline>,
+    tli: FullAccessTimeline,
     appname: Option<String>,
     // Position since which we are sending next chunk.
     start_pos: Lsn,
@@ -741,7 +736,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
 struct ReplyReader<IO> {
     reader: PostgresBackendReader<IO>,
     ws_guard: Arc<WalSenderGuard>,
-    tli: Arc<Timeline>,
+    tli: FullAccessTimeline,
 }
 
 impl<IO: AsyncRead + AsyncWrite + Unpin> ReplyReader<IO> {
diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs
index f30c503382..148a7e90bd 100644
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -3,14 +3,14 @@
 
 use anyhow::{anyhow, bail, Result};
 use camino::Utf8PathBuf;
-use postgres_ffi::XLogSegNo;
 use serde::{Deserialize, Serialize};
 use tokio::fs;
 use tokio_util::sync::CancellationToken;
+use utils::id::TenantId;
 
 use std::cmp::max;
 use std::ops::{Deref, DerefMut};
-use std::sync::atomic::{AtomicBool, Ordering};
+use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
 use std::sync::Arc;
 use std::time::Duration;
 use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
@@ -26,7 +26,6 @@ use storage_broker::proto::SafekeeperTimelineInfo;
 use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
 
 use crate::receive_wal::WalReceivers;
-use crate::recovery::{recovery_main, Donor, RecoveryNeededInfo};
 use crate::safekeeper::{
     AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, ServerInfo, Term, TermLsn,
     INVALID_TERM,
@@ -38,8 +37,8 @@ use crate::wal_backup::{self};
 use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION};
 
 use crate::metrics::FullTimelineInfo;
-use crate::wal_storage::Storage as wal_storage_iface;
-use crate::{debug_dump, timeline_manager, wal_backup_partial, wal_storage};
+use crate::wal_storage::{Storage as wal_storage_iface, WalReader};
+use crate::{debug_dump, timeline_manager, wal_storage};
 use crate::{GlobalTimelines, SafeKeeperConf};
 
 /// Things safekeeper should know about timeline state on peers.
@@ -169,7 +168,6 @@ pub struct SharedState {
     pub(crate) sk: SafeKeeper<control_file::FileStorage, wal_storage::PhysicalStorage>,
     /// In memory list containing state of peers sent in latest messages from them.
     pub(crate) peers_info: PeersInfo,
-    pub(crate) last_removed_segno: XLogSegNo,
 }
 
 impl SharedState {
@@ -197,33 +195,33 @@ impl SharedState {
 
         // We don't want to write anything to disk, because we may have existing timeline there.
         // These functions should not change anything on disk.
-        let timeline_dir = conf.timeline_dir(ttid);
-        let control_store = control_file::FileStorage::create_new(timeline_dir, conf, state)?;
+        let timeline_dir = get_timeline_dir(conf, ttid);
+        let control_store =
+            control_file::FileStorage::create_new(timeline_dir.clone(), conf, state)?;
         let wal_store =
-            wal_storage::PhysicalStorage::new(ttid, conf.timeline_dir(ttid), conf, &control_store)?;
+            wal_storage::PhysicalStorage::new(ttid, timeline_dir, conf, &control_store)?;
         let sk = SafeKeeper::new(control_store, wal_store, conf.my_id)?;
 
         Ok(Self {
             sk,
             peers_info: PeersInfo(vec![]),
-            last_removed_segno: 0,
         })
     }
 
     /// Restore SharedState from control file. If file doesn't exist, bails out.
     fn restore(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Result<Self> {
+        let timeline_dir = get_timeline_dir(conf, ttid);
         let control_store = control_file::FileStorage::restore_new(ttid, conf)?;
         if control_store.server.wal_seg_size == 0 {
             bail!(TimelineError::UninitializedWalSegSize(*ttid));
         }
 
         let wal_store =
-            wal_storage::PhysicalStorage::new(ttid, conf.timeline_dir(ttid), conf, &control_store)?;
+            wal_storage::PhysicalStorage::new(ttid, timeline_dir, conf, &control_store)?;
 
         Ok(Self {
             sk: SafeKeeper::new(control_store, wal_store, conf.my_id)?,
             peers_info: PeersInfo(vec![]),
-            last_removed_segno: 0,
         })
     }
 
@@ -244,7 +242,7 @@ impl SharedState {
                 timeline_id: ttid.timeline_id.as_ref().to_owned(),
             }),
             term: self.sk.state.acceptor_state.term,
-            last_log_term: self.sk.get_epoch(),
+            last_log_term: self.sk.get_last_log_term(),
             flush_lsn: self.sk.flush_lsn().0,
             // note: this value is not flushed to control file yet and can be lost
             commit_lsn: self.sk.state.inmem.commit_lsn.0,
@@ -275,24 +273,6 @@ impl SharedState {
             .cloned()
             .collect()
     }
-
-    /// Get oldest segno we still need to keep. We hold WAL till it is consumed
-    /// by all of 1) pageserver (remote_consistent_lsn) 2) peers 3) s3
-    /// offloading.
-    /// While it is safe to use inmem values for determining horizon,
-    /// we use persistent to make possible normal states less surprising.
-    fn get_horizon_segno(&self, extra_horizon_lsn: Option<Lsn>) -> XLogSegNo {
-        let state = &self.sk.state;
-
-        use std::cmp::min;
-        let mut horizon_lsn = min(state.remote_consistent_lsn, state.peer_horizon_lsn);
-        // we don't want to remove WAL that is not yet offloaded to s3
-        horizon_lsn = min(horizon_lsn, state.backup_lsn);
-        if let Some(extra_horizon_lsn) = extra_horizon_lsn {
-            horizon_lsn = min(horizon_lsn, extra_horizon_lsn);
-        }
-        horizon_lsn.segment_number(state.server.wal_seg_size as usize)
-    }
 }
 
 #[derive(Debug, thiserror::Error)]
@@ -349,22 +329,15 @@ pub struct Timeline {
     mutex: RwLock<SharedState>,
     walsenders: Arc<WalSenders>,
     walreceivers: Arc<WalReceivers>,
+    timeline_dir: Utf8PathBuf,
 
     /// Delete/cancel will trigger this, background tasks should drop out as soon as it fires
     pub(crate) cancel: CancellationToken,
 
-    /// Directory where timeline state is stored.
-    pub timeline_dir: Utf8PathBuf,
-
-    /// Should we keep WAL on disk for active replication connections.
-    /// Especially useful for sharding, when different shards process WAL
-    /// with different speed.
-    // TODO: add `Arc<SafeKeeperConf>` here instead of adding each field separately.
-    walsenders_keep_horizon: bool,
-
     // timeline_manager controlled state
     pub(crate) broker_active: AtomicBool,
     pub(crate) wal_backup_active: AtomicBool,
+    pub(crate) last_removed_segno: AtomicU64,
 }
 
 impl Timeline {
@@ -394,10 +367,10 @@ impl Timeline {
             walsenders: WalSenders::new(walreceivers.clone()),
             walreceivers,
             cancel: CancellationToken::default(),
-            timeline_dir: conf.timeline_dir(&ttid),
-            walsenders_keep_horizon: conf.walsenders_keep_horizon,
+            timeline_dir: get_timeline_dir(conf, &ttid),
             broker_active: AtomicBool::new(false),
             wal_backup_active: AtomicBool::new(false),
+            last_removed_segno: AtomicU64::new(0),
         })
     }
 
@@ -430,10 +403,10 @@ impl Timeline {
             walsenders: WalSenders::new(walreceivers.clone()),
             walreceivers,
             cancel: CancellationToken::default(),
-            timeline_dir: conf.timeline_dir(&ttid),
-            walsenders_keep_horizon: conf.walsenders_keep_horizon,
+            timeline_dir: get_timeline_dir(conf, &ttid),
             broker_active: AtomicBool::new(false),
             wal_backup_active: AtomicBool::new(false),
+            last_removed_segno: AtomicU64::new(0),
         })
     }
 
@@ -494,15 +467,6 @@ impl Timeline {
             conf.clone(),
             broker_active_set,
         ));
-
-        // Start recovery task which always runs on the timeline.
-        if conf.peer_recovery_enabled {
-            tokio::spawn(recovery_main(self.clone(), conf.clone()));
-        }
-        // TODO: migrate to timeline_manager
-        if conf.is_wal_backup_enabled() && conf.partial_backup_enabled {
-            tokio::spawn(wal_backup_partial::main_task(self.clone(), conf.clone()));
-        }
     }
 
     /// Delete timeline from disk completely, by removing timeline directory.
@@ -555,36 +519,6 @@ impl Timeline {
         self.mutex.read().await
     }
 
-    /// Returns true if walsender should stop sending WAL to pageserver. We
-    /// terminate it if remote_consistent_lsn reached commit_lsn and there is no
-    /// computes. While there might be nothing to stream already, we learn about
-    /// remote_consistent_lsn update through replication feedback, and we want
-    /// to stop pushing to the broker if pageserver is fully caughtup.
-    pub async fn should_walsender_stop(&self, reported_remote_consistent_lsn: Lsn) -> bool {
-        if self.is_cancelled() {
-            return true;
-        }
-        let shared_state = self.read_shared_state().await;
-        if self.walreceivers.get_num() == 0 {
-            return shared_state.sk.state.inmem.commit_lsn == Lsn(0) || // no data at all yet
-            reported_remote_consistent_lsn >= shared_state.sk.state.inmem.commit_lsn;
-        }
-        false
-    }
-
-    /// Ensure that current term is t, erroring otherwise, and lock the state.
-    pub async fn acquire_term(&self, t: Term) -> Result<ReadGuardSharedState> {
-        let ss = self.read_shared_state().await;
-        if ss.sk.state.acceptor_state.term != t {
-            bail!(
-                "failed to acquire term {}, current term {}",
-                t,
-                ss.sk.state.acceptor_state.term
-            );
-        }
-        Ok(ss)
-    }
-
     /// Returns commit_lsn watch channel.
     pub fn get_commit_lsn_watch_rx(&self) -> watch::Receiver<Lsn> {
         self.commit_lsn_watch_rx.clone()
@@ -600,28 +534,6 @@ impl Timeline {
         self.shared_state_version_rx.clone()
     }
 
-    /// Pass arrived message to the safekeeper.
-    pub async fn process_msg(
-        self: &Arc<Self>,
-        msg: &ProposerAcceptorMessage,
-    ) -> Result<Option<AcceptorProposerMessage>> {
-        if self.is_cancelled() {
-            bail!(TimelineError::Cancelled(self.ttid));
-        }
-
-        let mut rmsg: Option<AcceptorProposerMessage>;
-        {
-            let mut shared_state = self.write_shared_state().await;
-            rmsg = shared_state.sk.process_msg(msg).await?;
-
-            // if this is AppendResponse, fill in proper hot standby feedback.
-            if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg {
-                resp.hs_feedback = self.walsenders.get_hotstandby().hs_feedback;
-            }
-        }
-        Ok(rmsg)
-    }
-
     /// Returns wal_seg_size.
     pub async fn get_wal_seg_size(&self) -> usize {
         self.read_shared_state().await.get_wal_seg_size()
@@ -672,97 +584,11 @@ impl Timeline {
         Ok(())
     }
 
-    /// Update in memory remote consistent lsn.
-    pub async fn update_remote_consistent_lsn(self: &Arc<Self>, candidate: Lsn) {
-        let mut shared_state = self.write_shared_state().await;
-        shared_state.sk.state.inmem.remote_consistent_lsn =
-            max(shared_state.sk.state.inmem.remote_consistent_lsn, candidate);
-    }
-
     pub async fn get_peers(&self, conf: &SafeKeeperConf) -> Vec<PeerInfo> {
         let shared_state = self.read_shared_state().await;
         shared_state.get_peers(conf.heartbeat_timeout)
     }
 
-    /// Should we start fetching WAL from a peer safekeeper, and if yes, from
-    /// which? Answer is yes, i.e. .donors is not empty if 1) there is something
-    /// to fetch, and we can do that without running elections; 2) there is no
-    /// actively streaming compute, as we don't want to compete with it.
-    ///
-    /// If donor(s) are choosen, theirs last_log_term is guaranteed to be equal
-    /// to its last_log_term so we are sure such a leader ever had been elected.
-    ///
-    /// All possible donors are returned so that we could keep connection to the
-    /// current one if it is good even if it slightly lags behind.
-    ///
-    /// Note that term conditions above might be not met, but safekeepers are
-    /// still not aligned on last flush_lsn. Generally in this case until
-    /// elections are run it is not possible to say which safekeeper should
-    /// recover from which one -- history which would be committed is different
-    /// depending on assembled quorum (e.g. classic picture 8 from Raft paper).
-    /// Thus we don't try to predict it here.
-    pub async fn recovery_needed(&self, heartbeat_timeout: Duration) -> RecoveryNeededInfo {
-        let ss = self.read_shared_state().await;
-        let term = ss.sk.state.acceptor_state.term;
-        let last_log_term = ss.sk.get_epoch();
-        let flush_lsn = ss.sk.flush_lsn();
-        // note that peers contain myself, but that's ok -- we are interested only in peers which are strictly ahead of us.
-        let mut peers = ss.get_peers(heartbeat_timeout);
-        // Sort by <last log term, lsn> pairs.
-        peers.sort_by(|p1, p2| {
-            let tl1 = TermLsn {
-                term: p1.last_log_term,
-                lsn: p1.flush_lsn,
-            };
-            let tl2 = TermLsn {
-                term: p2.last_log_term,
-                lsn: p2.flush_lsn,
-            };
-            tl2.cmp(&tl1) // desc
-        });
-        let num_streaming_computes = self.walreceivers.get_num_streaming();
-        let donors = if num_streaming_computes > 0 {
-            vec![] // If there is a streaming compute, don't try to recover to not intervene.
-        } else {
-            peers
-                .iter()
-                .filter_map(|candidate| {
-                    // Are we interested in this candidate?
-                    let candidate_tl = TermLsn {
-                        term: candidate.last_log_term,
-                        lsn: candidate.flush_lsn,
-                    };
-                    let my_tl = TermLsn {
-                        term: last_log_term,
-                        lsn: flush_lsn,
-                    };
-                    if my_tl < candidate_tl {
-                        // Yes, we are interested. Can we pull from it without
-                        // (re)running elections? It is possible if 1) his term
-                        // is equal to his last_log_term so we could act on
-                        // behalf of leader of this term (we must be sure he was
-                        // ever elected) and 2) our term is not higher, or we'll refuse data.
-                        if candidate.term == candidate.last_log_term && candidate.term >= term {
-                            Some(Donor::from(candidate))
-                        } else {
-                            None
-                        }
-                    } else {
-                        None
-                    }
-                })
-                .collect()
-        };
-        RecoveryNeededInfo {
-            term,
-            last_log_term,
-            flush_lsn,
-            peers,
-            num_streaming_computes,
-            donors,
-        }
-    }
-
     pub fn get_walsenders(&self) -> &Arc<WalSenders> {
         &self.walsenders
     }
@@ -776,58 +602,6 @@ impl Timeline {
         self.read_shared_state().await.sk.wal_store.flush_lsn()
     }
 
-    /// Delete WAL segments from disk that are no longer needed. This is determined
-    /// based on pageserver's remote_consistent_lsn and local backup_lsn/peer_lsn.
-    pub async fn remove_old_wal(self: &Arc<Self>) -> Result<()> {
-        if self.is_cancelled() {
-            bail!(TimelineError::Cancelled(self.ttid));
-        }
-
-        // If enabled, we use LSN of the most lagging walsender as a WAL removal horizon.
-        // This allows to get better read speed for pageservers that are lagging behind,
-        // at the cost of keeping more WAL on disk.
-        let replication_horizon_lsn = if self.walsenders_keep_horizon {
-            self.walsenders.laggard_lsn()
-        } else {
-            None
-        };
-
-        let horizon_segno: XLogSegNo;
-        let remover = {
-            let shared_state = self.read_shared_state().await;
-            horizon_segno = shared_state.get_horizon_segno(replication_horizon_lsn);
-            if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno {
-                return Ok(()); // nothing to do
-            }
-
-            // release the lock before removing
-            shared_state.sk.wal_store.remove_up_to(horizon_segno - 1)
-        };
-
-        // delete old WAL files
-        remover.await?;
-
-        // update last_removed_segno
-        let mut shared_state = self.write_shared_state().await;
-        if shared_state.last_removed_segno != horizon_segno {
-            shared_state.last_removed_segno = horizon_segno;
-        } else {
-            shared_state.skip_update = true;
-        }
-        Ok(())
-    }
-
-    /// Persist control file if there is something to save and enough time
-    /// passed after the last save. This helps to keep remote_consistent_lsn up
-    /// to date so that storage nodes restart doesn't cause many pageserver ->
-    /// safekeeper reconnections.
-    pub async fn maybe_persist_control_file(self: &Arc<Self>, force: bool) -> Result<()> {
-        let mut guard = self.write_shared_state().await;
-        let changed = guard.sk.maybe_persist_inmem_control_file(force).await?;
-        guard.skip_update = !changed;
-        Ok(())
-    }
-
     /// Gather timeline data for metrics.
     pub async fn info_for_metrics(&self) -> Option<FullTimelineInfo> {
         if self.is_cancelled() {
@@ -843,8 +617,8 @@ impl Timeline {
             wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed),
             timeline_is_active: self.broker_active.load(Ordering::Relaxed),
             num_computes: self.walreceivers.get_num() as u32,
-            last_removed_segno: state.last_removed_segno,
-            epoch_start_lsn: state.sk.epoch_start_lsn,
+            last_removed_segno: self.last_removed_segno.load(Ordering::Relaxed),
+            epoch_start_lsn: state.sk.term_start_lsn,
             mem_state: state.sk.state.inmem.clone(),
             persisted_state: state.sk.state.clone(),
             flush_lsn: state.sk.wal_store.flush_lsn(),
@@ -866,8 +640,8 @@ impl Timeline {
             wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed),
             active: self.broker_active.load(Ordering::Relaxed),
             num_computes: self.walreceivers.get_num() as u32,
-            last_removed_segno: state.last_removed_segno,
-            epoch_start_lsn: state.sk.epoch_start_lsn,
+            last_removed_segno: self.last_removed_segno.load(Ordering::Relaxed),
+            epoch_start_lsn: state.sk.term_start_lsn,
             mem_state: state.sk.state.inmem.clone(),
             write_lsn,
             write_record_lsn,
@@ -889,6 +663,110 @@ impl Timeline {
         state.sk.state.finish_change(&persistent_state).await?;
         Ok(res)
     }
+
+    /// Get the timeline guard for reading/writing WAL files.
+    /// TODO: if WAL files are not present on disk (evicted), they will be
+    /// downloaded from S3. Also there will be logic for preventing eviction
+    /// while someone is holding FullAccessTimeline guard.
+    pub async fn full_access_guard(self: &Arc<Self>) -> Result<FullAccessTimeline> {
+        if self.is_cancelled() {
+            bail!(TimelineError::Cancelled(self.ttid));
+        }
+        Ok(FullAccessTimeline { tli: self.clone() })
+    }
+}
+
+/// This is a guard that allows reading/writing the on-disk timeline state.
+/// All tasks that are using the disk should use this guard.
+#[derive(Clone)]
+pub struct FullAccessTimeline {
+    pub tli: Arc<Timeline>,
+}
+
+impl Deref for FullAccessTimeline {
+    type Target = Arc<Timeline>;
+
+    fn deref(&self) -> &Self::Target {
+        &self.tli
+    }
+}
+
+impl FullAccessTimeline {
+    /// Returns true if walsender should stop sending WAL to pageserver. We
+    /// terminate it if remote_consistent_lsn reached commit_lsn and there is no
+    /// computes. While there might be nothing to stream already, we learn about
+    /// remote_consistent_lsn update through replication feedback, and we want
+    /// to stop pushing to the broker if pageserver is fully caughtup.
+    pub async fn should_walsender_stop(&self, reported_remote_consistent_lsn: Lsn) -> bool {
+        if self.is_cancelled() {
+            return true;
+        }
+        let shared_state = self.read_shared_state().await;
+        if self.walreceivers.get_num() == 0 {
+            return shared_state.sk.state.inmem.commit_lsn == Lsn(0) || // no data at all yet
+            reported_remote_consistent_lsn >= shared_state.sk.state.inmem.commit_lsn;
+        }
+        false
+    }
+
+    /// Ensure that current term is t, erroring otherwise, and lock the state.
+    pub async fn acquire_term(&self, t: Term) -> Result<ReadGuardSharedState> {
+        let ss = self.read_shared_state().await;
+        if ss.sk.state.acceptor_state.term != t {
+            bail!(
+                "failed to acquire term {}, current term {}",
+                t,
+                ss.sk.state.acceptor_state.term
+            );
+        }
+        Ok(ss)
+    }
+
+    /// Pass arrived message to the safekeeper.
+    pub async fn process_msg(
+        &self,
+        msg: &ProposerAcceptorMessage,
+    ) -> Result<Option<AcceptorProposerMessage>> {
+        if self.is_cancelled() {
+            bail!(TimelineError::Cancelled(self.ttid));
+        }
+
+        let mut rmsg: Option<AcceptorProposerMessage>;
+        {
+            let mut shared_state = self.write_shared_state().await;
+            rmsg = shared_state.sk.process_msg(msg).await?;
+
+            // if this is AppendResponse, fill in proper hot standby feedback.
+            if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg {
+                resp.hs_feedback = self.walsenders.get_hotstandby().hs_feedback;
+            }
+        }
+        Ok(rmsg)
+    }
+
+    pub async fn get_walreader(&self, start_lsn: Lsn) -> Result<WalReader> {
+        let (_, persisted_state) = self.get_state().await;
+        let enable_remote_read = GlobalTimelines::get_global_config().is_wal_backup_enabled();
+
+        WalReader::new(
+            &self.ttid,
+            self.timeline_dir.clone(),
+            &persisted_state,
+            start_lsn,
+            enable_remote_read,
+        )
+    }
+
+    pub fn get_timeline_dir(&self) -> Utf8PathBuf {
+        self.timeline_dir.clone()
+    }
+
+    /// Update in memory remote consistent lsn.
+    pub async fn update_remote_consistent_lsn(&self, candidate: Lsn) {
+        let mut shared_state = self.write_shared_state().await;
+        shared_state.sk.state.inmem.remote_consistent_lsn =
+            max(shared_state.sk.state.inmem.remote_consistent_lsn, candidate);
+    }
 }
 
 /// Deletes directory and it's contents. Returns false if directory does not exist.
@@ -899,3 +777,16 @@ async fn delete_dir(path: &Utf8PathBuf) -> Result<bool> {
         Err(e) => Err(e.into()),
     }
 }
+
+/// Get a path to the tenant directory. If you just need to get a timeline directory,
+/// use FullAccessTimeline::get_timeline_dir instead.
+pub(crate) fn get_tenant_dir(conf: &SafeKeeperConf, tenant_id: &TenantId) -> Utf8PathBuf {
+    conf.workdir.join(tenant_id.to_string())
+}
+
+/// Get a path to the timeline directory. If you need to read WAL files from disk,
+/// use FullAccessTimeline::get_timeline_dir instead. This function does not check
+/// timeline eviction status and WAL files might not be present on disk.
+pub(crate) fn get_timeline_dir(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Utf8PathBuf {
+    get_tenant_dir(conf, &ttid.tenant_id).join(ttid.timeline_id.to_string())
+}
diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs
index ed544352f9..7174d843fc 100644
--- a/safekeeper/src/timeline_manager.rs
+++ b/safekeeper/src/timeline_manager.rs
@@ -3,23 +3,42 @@
 //! It watches for changes in the timeline state and decides when to spawn or kill background tasks.
 //! It also can manage some reactive state, like should the timeline be active for broker pushes or not.
 
-use std::{sync::Arc, time::Duration};
+use std::{
+    sync::Arc,
+    time::{Duration, Instant},
+};
 
-use tracing::{info, instrument, warn};
+use postgres_ffi::XLogSegNo;
+use tokio::task::{JoinError, JoinHandle};
+use tracing::{info, info_span, instrument, warn, Instrument};
 use utils::lsn::Lsn;
 
 use crate::{
+    control_file::Storage,
     metrics::{MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL},
+    recovery::recovery_main,
+    remove_wal::calc_horizon_lsn,
+    send_wal::WalSenders,
     timeline::{PeerInfo, ReadGuardSharedState, Timeline},
-    timelines_set::TimelinesSet,
+    timelines_set::{TimelineSetGuard, TimelinesSet},
     wal_backup::{self, WalBackupTaskHandle},
-    SafeKeeperConf,
+    wal_backup_partial, SafeKeeperConf,
 };
 
 pub struct StateSnapshot {
+    // inmem values
     pub commit_lsn: Lsn,
     pub backup_lsn: Lsn,
     pub remote_consistent_lsn: Lsn,
+
+    // persistent control file values
+    pub cfile_peer_horizon_lsn: Lsn,
+    pub cfile_remote_consistent_lsn: Lsn,
+    pub cfile_backup_lsn: Lsn,
+
+    // misc
+    pub cfile_last_persist_at: Instant,
+    pub inmem_flush_pending: bool,
     pub peers: Vec<PeerInfo>,
 }
 
@@ -30,17 +49,34 @@ impl StateSnapshot {
             commit_lsn: read_guard.sk.state.inmem.commit_lsn,
             backup_lsn: read_guard.sk.state.inmem.backup_lsn,
             remote_consistent_lsn: read_guard.sk.state.inmem.remote_consistent_lsn,
+            cfile_peer_horizon_lsn: read_guard.sk.state.peer_horizon_lsn,
+            cfile_remote_consistent_lsn: read_guard.sk.state.remote_consistent_lsn,
+            cfile_backup_lsn: read_guard.sk.state.backup_lsn,
+            cfile_last_persist_at: read_guard.sk.state.pers.last_persist_at(),
+            inmem_flush_pending: Self::has_unflushed_inmem_state(&read_guard),
             peers: read_guard.get_peers(heartbeat_timeout),
         }
     }
+
+    fn has_unflushed_inmem_state(read_guard: &ReadGuardSharedState) -> bool {
+        let state = &read_guard.sk.state;
+        state.inmem.commit_lsn > state.commit_lsn
+            || state.inmem.backup_lsn > state.backup_lsn
+            || state.inmem.peer_horizon_lsn > state.peer_horizon_lsn
+            || state.inmem.remote_consistent_lsn > state.remote_consistent_lsn
+    }
 }
 
 /// Control how often the manager task should wake up to check updates.
 /// There is no need to check for updates more often than this.
 const REFRESH_INTERVAL: Duration = Duration::from_millis(300);
 
+/// How often to save the control file if there is no other activity.
+const CF_SAVE_INTERVAL: Duration = Duration::from_secs(300);
+
 /// This task gets spawned alongside each timeline and is responsible for managing the timeline's
 /// background tasks.
+/// Be careful, this task is not respawned on panic, so it should not panic.
 #[instrument(name = "manager", skip_all, fields(ttid = %tli.ttid))]
 pub async fn main_task(
     tli: Arc<Timeline>,
@@ -55,20 +91,50 @@ pub async fn main_task(
         }
     };
 
-    // sets whether timeline is active for broker pushes or not
-    let mut tli_broker_active = broker_active_set.guard(tli.clone());
-
-    let ttid = tli.ttid;
+    // configuration & dependencies
     let wal_seg_size = tli.get_wal_seg_size().await;
     let heartbeat_timeout = conf.heartbeat_timeout;
-
-    let mut state_version_rx = tli.get_state_version_rx();
-
+    let walsenders = tli.get_walsenders();
     let walreceivers = tli.get_walreceivers();
+
+    // current state
+    let mut state_version_rx = tli.get_state_version_rx();
     let mut num_computes_rx = walreceivers.get_num_rx();
+    let mut tli_broker_active = broker_active_set.guard(tli.clone());
+    let mut last_removed_segno = 0 as XLogSegNo;
 
     // list of background tasks
     let mut backup_task: Option<WalBackupTaskHandle> = None;
+    let mut recovery_task: Option<JoinHandle<()>> = None;
+    let mut partial_backup_task: Option<JoinHandle<()>> = None;
+    let mut wal_removal_task: Option<JoinHandle<anyhow::Result<u64>>> = None;
+
+    // Start recovery task which always runs on the timeline.
+    if conf.peer_recovery_enabled {
+        match tli.full_access_guard().await {
+            Ok(tli) => {
+                recovery_task = Some(tokio::spawn(recovery_main(tli, conf.clone())));
+            }
+            Err(e) => {
+                warn!("failed to start recovery task: {:?}", e);
+            }
+        }
+    }
+
+    // Start partial backup task which always runs on the timeline.
+    if conf.is_wal_backup_enabled() && conf.partial_backup_enabled {
+        match tli.full_access_guard().await {
+            Ok(tli) => {
+                partial_backup_task = Some(tokio::spawn(wal_backup_partial::main_task(
+                    tli,
+                    conf.clone(),
+                )));
+            }
+            Err(e) => {
+                warn!("failed to start partial backup task: {:?}", e);
+            }
+        }
+    }
 
     let last_state = 'outer: loop {
         MANAGER_ITERATIONS_TOTAL.inc();
@@ -76,47 +142,36 @@ pub async fn main_task(
         let state_snapshot = StateSnapshot::new(tli.read_shared_state().await, heartbeat_timeout);
         let num_computes = *num_computes_rx.borrow();
 
-        let is_wal_backup_required =
-            wal_backup::is_wal_backup_required(wal_seg_size, num_computes, &state_snapshot);
+        let is_wal_backup_required = update_backup(
+            &conf,
+            &tli,
+            wal_seg_size,
+            num_computes,
+            &state_snapshot,
+            &mut backup_task,
+        )
+        .await;
 
-        if conf.is_wal_backup_enabled() {
-            wal_backup::update_task(
-                &conf,
-                ttid,
-                is_wal_backup_required,
-                &state_snapshot,
-                &mut backup_task,
-            )
-            .await;
-        }
+        let _is_active = update_is_active(
+            is_wal_backup_required,
+            num_computes,
+            &state_snapshot,
+            &mut tli_broker_active,
+            &tli,
+        );
 
-        let is_active = is_wal_backup_required
-            || num_computes > 0
-            || state_snapshot.remote_consistent_lsn < state_snapshot.commit_lsn;
+        let next_cfile_save = update_control_file_save(&state_snapshot, &tli).await;
 
-        // update the broker timeline set
-        if tli_broker_active.set(is_active) {
-            // write log if state has changed
-            info!(
-                "timeline active={} now, remote_consistent_lsn={}, commit_lsn={}",
-                is_active, state_snapshot.remote_consistent_lsn, state_snapshot.commit_lsn,
-            );
-
-            MANAGER_ACTIVE_CHANGES.inc();
-
-            if !is_active {
-                // TODO: maybe use tokio::spawn?
-                if let Err(e) = tli.maybe_persist_control_file(false).await {
-                    warn!("control file save in update_status failed: {:?}", e);
-                }
-            }
-        }
-
-        // update the state in Arc<Timeline>
-        tli.wal_backup_active
-            .store(backup_task.is_some(), std::sync::atomic::Ordering::Relaxed);
-        tli.broker_active
-            .store(is_active, std::sync::atomic::Ordering::Relaxed);
+        update_wal_removal(
+            &conf,
+            walsenders,
+            &tli,
+            wal_seg_size,
+            &state_snapshot,
+            last_removed_segno,
+            &mut wal_removal_task,
+        )
+        .await;
 
         // wait until something changes. tx channels are stored under Arc, so they will not be
         // dropped until the manager task is finished.
@@ -135,11 +190,192 @@ pub async fn main_task(
             _ = num_computes_rx.changed() => {
                 // number of connected computes was updated
             }
+            _ = async {
+                if let Some(timeout) = next_cfile_save {
+                    tokio::time::sleep_until(timeout).await
+                } else {
+                    futures::future::pending().await
+                }
+            } => {
+                // it's time to save the control file
+            }
+            res = async {
+                if let Some(task) = &mut wal_removal_task {
+                    task.await
+                } else {
+                    futures::future::pending().await
+                }
+            } => {
+                // WAL removal task finished
+                wal_removal_task = None;
+                update_wal_removal_end(res, &tli, &mut last_removed_segno);
+            }
         }
     };
 
     // shutdown background tasks
     if conf.is_wal_backup_enabled() {
-        wal_backup::update_task(&conf, ttid, false, &last_state, &mut backup_task).await;
+        wal_backup::update_task(&conf, &tli, false, &last_state, &mut backup_task).await;
+    }
+
+    if let Some(recovery_task) = recovery_task {
+        if let Err(e) = recovery_task.await {
+            warn!("recovery task failed: {:?}", e);
+        }
+    }
+
+    if let Some(partial_backup_task) = partial_backup_task {
+        if let Err(e) = partial_backup_task.await {
+            warn!("partial backup task failed: {:?}", e);
+        }
+    }
+
+    if let Some(wal_removal_task) = wal_removal_task {
+        let res = wal_removal_task.await;
+        update_wal_removal_end(res, &tli, &mut last_removed_segno);
     }
 }
+
+/// Spawns/kills backup task and returns true if backup is required.
+async fn update_backup(
+    conf: &SafeKeeperConf,
+    tli: &Arc<Timeline>,
+    wal_seg_size: usize,
+    num_computes: usize,
+    state: &StateSnapshot,
+    backup_task: &mut Option<WalBackupTaskHandle>,
+) -> bool {
+    let is_wal_backup_required =
+        wal_backup::is_wal_backup_required(wal_seg_size, num_computes, state);
+
+    if conf.is_wal_backup_enabled() {
+        wal_backup::update_task(conf, tli, is_wal_backup_required, state, backup_task).await;
+    }
+
+    // update the state in Arc<Timeline>
+    tli.wal_backup_active
+        .store(backup_task.is_some(), std::sync::atomic::Ordering::Relaxed);
+    is_wal_backup_required
+}
+
+/// Updates the is_active flag and returns its value.
+fn update_is_active(
+    is_wal_backup_required: bool,
+    num_computes: usize,
+    state: &StateSnapshot,
+    tli_broker_active: &mut TimelineSetGuard,
+    tli: &Arc<Timeline>,
+) -> bool {
+    let is_active = is_wal_backup_required
+        || num_computes > 0
+        || state.remote_consistent_lsn < state.commit_lsn;
+
+    // update the broker timeline set
+    if tli_broker_active.set(is_active) {
+        // write log if state has changed
+        info!(
+            "timeline active={} now, remote_consistent_lsn={}, commit_lsn={}",
+            is_active, state.remote_consistent_lsn, state.commit_lsn,
+        );
+
+        MANAGER_ACTIVE_CHANGES.inc();
+    }
+
+    // update the state in Arc<Timeline>
+    tli.broker_active
+        .store(is_active, std::sync::atomic::Ordering::Relaxed);
+    is_active
+}
+
+/// Save control file if needed. Returns Instant if we should persist the control file in the future.
+async fn update_control_file_save(
+    state: &StateSnapshot,
+    tli: &Arc<Timeline>,
+) -> Option<tokio::time::Instant> {
+    if !state.inmem_flush_pending {
+        return None;
+    }
+
+    if state.cfile_last_persist_at.elapsed() > CF_SAVE_INTERVAL {
+        let mut write_guard = tli.write_shared_state().await;
+        // this could be moved to the background since it blocks the manager task, but flush()
+        // should be fast enough for that not to be a problem for now
+        if let Err(e) = write_guard.sk.state.flush().await {
+            warn!("failed to save control file: {:?}", e);
+        }
+
+        None
+    } else {
+        // we should wait until next CF_SAVE_INTERVAL
+        Some((state.cfile_last_persist_at + CF_SAVE_INTERVAL).into())
+    }
+}
+
+/// Spawns WAL removal task if needed.
+async fn update_wal_removal(
+    conf: &SafeKeeperConf,
+    walsenders: &Arc<WalSenders>,
+    tli: &Arc<Timeline>,
+    wal_seg_size: usize,
+    state: &StateSnapshot,
+    last_removed_segno: u64,
+    wal_removal_task: &mut Option<JoinHandle<anyhow::Result<u64>>>,
+) {
+    if wal_removal_task.is_some() {
+        // WAL removal is already in progress
+        return;
+    }
+
+    // If enabled, we use LSN of the most lagging walsender as a WAL removal horizon.
+    // This gives better read speed to pageservers that are lagging behind,
+    // at the cost of keeping more WAL on disk.
+    let replication_horizon_lsn = if conf.walsenders_keep_horizon {
+        walsenders.laggard_lsn()
+    } else {
+        None
+    };
+
+    let removal_horizon_lsn = calc_horizon_lsn(state, replication_horizon_lsn);
+    let removal_horizon_segno = removal_horizon_lsn
+        .segment_number(wal_seg_size)
+        .saturating_sub(1);
+
+    if removal_horizon_segno > last_removed_segno {
+        // we need to remove WAL
+        let remover = crate::wal_storage::Storage::remove_up_to(
+            &tli.read_shared_state().await.sk.wal_store,
+            removal_horizon_segno,
+        );
+        *wal_removal_task = Some(tokio::spawn(
+            async move {
+                remover.await?;
+                Ok(removal_horizon_segno)
+            }
+            .instrument(info_span!("WAL removal", ttid=%tli.ttid)),
+        ));
+    }
+}
+
+/// Update the state after WAL removal task finished.
+fn update_wal_removal_end(
+    res: Result<anyhow::Result<u64>, JoinError>,
+    tli: &Arc<Timeline>,
+    last_removed_segno: &mut u64,
+) {
+    let new_last_removed_segno = match res {
+        Ok(Ok(segno)) => segno,
+        Err(e) => {
+            warn!("WAL removal task failed: {:?}", e);
+            return;
+        }
+        Ok(Err(e)) => {
+            warn!("WAL removal task failed: {:?}", e);
+            return;
+        }
+    };
+
+    *last_removed_segno = new_last_removed_segno;
+    // update the state in Arc<Timeline>
+    tli.last_removed_segno
+        .store(new_last_removed_segno, std::sync::atomic::Ordering::Relaxed);
+}
diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs
index 8d37bd6371..45e08ede3c 100644
--- a/safekeeper/src/timelines_global_map.rs
+++ b/safekeeper/src/timelines_global_map.rs
@@ -3,7 +3,7 @@
 //! all from the disk on startup and keeping them in memory.
 
 use crate::safekeeper::ServerInfo;
-use crate::timeline::{Timeline, TimelineError};
+use crate::timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError};
 use crate::timelines_set::TimelinesSet;
 use crate::SafeKeeperConf;
 use anyhow::{bail, Context, Result};
@@ -127,7 +127,7 @@ impl GlobalTimelines {
             state.get_dependencies()
         };
 
-        let timelines_dir = conf.tenant_dir(&tenant_id);
+        let timelines_dir = get_tenant_dir(&conf, &tenant_id);
         for timelines_dir_entry in std::fs::read_dir(&timelines_dir)
             .with_context(|| format!("failed to list timelines dir {}", timelines_dir))?
         {
@@ -348,11 +348,7 @@ impl GlobalTimelines {
             }
             Err(_) => {
                 // Timeline is not memory, but it may still exist on disk in broken state.
-                let dir_path = TIMELINES_STATE
-                    .lock()
-                    .unwrap()
-                    .get_conf()
-                    .timeline_dir(ttid);
+                let dir_path = get_timeline_dir(TIMELINES_STATE.lock().unwrap().get_conf(), ttid);
                 let dir_existed = delete_dir(dir_path)?;
 
                 Ok(TimelineDeleteForceResult {
@@ -401,13 +397,10 @@ impl GlobalTimelines {
         // Note that we could concurrently create new timelines while we were deleting them,
         // so the directory may be not empty. In this case timelines will have bad state
         // and timeline background jobs can panic.
-        delete_dir(
-            TIMELINES_STATE
-                .lock()
-                .unwrap()
-                .get_conf()
-                .tenant_dir(tenant_id),
-        )?;
+        delete_dir(get_tenant_dir(
+            TIMELINES_STATE.lock().unwrap().get_conf(),
+            tenant_id,
+        ))?;
 
         // FIXME: we temporarily disabled removing timelines from the map, see `delete_force`
         // let tlis_after_delete = Self::get_all_for_tenant(*tenant_id);
diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs
index 84680557f9..58591aecfa 100644
--- a/safekeeper/src/wal_backup.rs
+++ b/safekeeper/src/wal_backup.rs
@@ -30,9 +30,9 @@ use tracing::*;
 use utils::{id::TenantTimelineId, lsn::Lsn};
 
 use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS, WAL_BACKUP_TASKS};
-use crate::timeline::{PeerInfo, Timeline};
+use crate::timeline::{FullAccessTimeline, PeerInfo, Timeline};
 use crate::timeline_manager::StateSnapshot;
-use crate::{GlobalTimelines, SafeKeeperConf, WAL_BACKUP_RUNTIME};
+use crate::{SafeKeeperConf, WAL_BACKUP_RUNTIME};
 
 use once_cell::sync::OnceCell;
 
@@ -63,13 +63,13 @@ pub fn is_wal_backup_required(
 /// is running, kill it.
 pub async fn update_task(
     conf: &SafeKeeperConf,
-    ttid: TenantTimelineId,
+    tli: &Arc<Timeline>,
     need_backup: bool,
     state: &StateSnapshot,
     entry: &mut Option<WalBackupTaskHandle>,
 ) {
     let (offloader, election_dbg_str) =
-        determine_offloader(&state.peers, state.backup_lsn, ttid, conf);
+        determine_offloader(&state.peers, state.backup_lsn, tli.ttid, conf);
     let elected_me = Some(conf.my_id) == offloader;
 
     let should_task_run = need_backup && elected_me;
@@ -80,15 +80,8 @@ pub async fn update_task(
             info!("elected for backup: {}", election_dbg_str);
 
             let (shutdown_tx, shutdown_rx) = mpsc::channel(1);
-            let timeline_dir = conf.timeline_dir(&ttid);
 
-            let async_task = backup_task_main(
-                ttid,
-                timeline_dir,
-                conf.workdir.clone(),
-                conf.backup_parallel_jobs,
-                shutdown_rx,
-            );
+            let async_task = backup_task_main(tli.clone(), conf.backup_parallel_jobs, shutdown_rx);
 
             let handle = if conf.current_thread_runtime {
                 tokio::spawn(async_task)
@@ -198,39 +191,32 @@ pub fn init_remote_storage(conf: &SafeKeeperConf) {
 }
 
 struct WalBackupTask {
-    timeline: Arc<Timeline>,
+    timeline: FullAccessTimeline,
     timeline_dir: Utf8PathBuf,
-    workspace_dir: Utf8PathBuf,
     wal_seg_size: usize,
     parallel_jobs: usize,
     commit_lsn_watch_rx: watch::Receiver<Lsn>,
 }
 
 /// Offload single timeline.
-#[instrument(name = "WAL backup", skip_all, fields(ttid = %ttid))]
-async fn backup_task_main(
-    ttid: TenantTimelineId,
-    timeline_dir: Utf8PathBuf,
-    workspace_dir: Utf8PathBuf,
-    parallel_jobs: usize,
-    mut shutdown_rx: Receiver<()>,
-) {
+#[instrument(name = "WAL backup", skip_all, fields(ttid = %tli.ttid))]
+async fn backup_task_main(tli: Arc<Timeline>, parallel_jobs: usize, mut shutdown_rx: Receiver<()>) {
     let _guard = WAL_BACKUP_TASKS.guard();
 
+    let tli = match tli.full_access_guard().await {
+        Ok(tli) => tli,
+        Err(e) => {
+            error!("backup error: {}", e);
+            return;
+        }
+    };
     info!("started");
-    let res = GlobalTimelines::get(ttid);
-    if let Err(e) = res {
-        error!("backup error: {}", e);
-        return;
-    }
-    let tli = res.unwrap();
 
     let mut wb = WalBackupTask {
         wal_seg_size: tli.get_wal_seg_size().await,
         commit_lsn_watch_rx: tli.get_commit_lsn_watch_rx(),
+        timeline_dir: tli.get_timeline_dir(),
         timeline: tli,
-        timeline_dir,
-        workspace_dir,
         parallel_jobs,
     };
 
@@ -297,7 +283,6 @@ impl WalBackupTask {
                 commit_lsn,
                 self.wal_seg_size,
                 &self.timeline_dir,
-                &self.workspace_dir,
                 self.parallel_jobs,
             )
             .await
@@ -319,18 +304,18 @@ impl WalBackupTask {
 }
 
 async fn backup_lsn_range(
-    timeline: &Arc<Timeline>,
+    timeline: &FullAccessTimeline,
     backup_lsn: &mut Lsn,
     end_lsn: Lsn,
     wal_seg_size: usize,
     timeline_dir: &Utf8Path,
-    workspace_dir: &Utf8Path,
     parallel_jobs: usize,
 ) -> Result<()> {
     if parallel_jobs < 1 {
         anyhow::bail!("parallel_jobs must be >= 1");
     }
 
+    let remote_timeline_path = remote_timeline_path(&timeline.ttid)?;
     let start_lsn = *backup_lsn;
     let segments = get_segments(start_lsn, end_lsn, wal_seg_size);
 
@@ -343,7 +328,11 @@ async fn backup_lsn_range(
     loop {
         let added_task = match iter.next() {
             Some(s) => {
-                uploads.push_back(backup_single_segment(s, timeline_dir, workspace_dir));
+                uploads.push_back(backup_single_segment(
+                    s,
+                    timeline_dir,
+                    &remote_timeline_path,
+                ));
                 true
             }
             None => false,
@@ -381,18 +370,10 @@ async fn backup_lsn_range(
 async fn backup_single_segment(
     seg: &Segment,
     timeline_dir: &Utf8Path,
-    workspace_dir: &Utf8Path,
+    remote_timeline_path: &RemotePath,
 ) -> Result<Segment> {
     let segment_file_path = seg.file_path(timeline_dir)?;
-    let remote_segment_path = segment_file_path
-        .strip_prefix(workspace_dir)
-        .context("Failed to strip workspace dir prefix")
-        .and_then(RemotePath::new)
-        .with_context(|| {
-            format!(
-                "Failed to resolve remote part of path {segment_file_path:?} for base {workspace_dir:?}",
-            )
-        })?;
+    let remote_segment_path = seg.remote_path(remote_timeline_path);
 
     let res = backup_object(&segment_file_path, &remote_segment_path, seg.size()).await;
     if res.is_ok() {
@@ -430,6 +411,10 @@ impl Segment {
         Ok(timeline_dir.join(self.object_name()))
     }
 
+    pub fn remote_path(self, remote_timeline_path: &RemotePath) -> RemotePath {
+        remote_timeline_path.join(self.object_name())
+    }
+
     pub fn size(self) -> usize {
         (u64::from(self.end_lsn) - u64::from(self.start_lsn)) as usize
     }
@@ -530,8 +515,7 @@ pub async fn read_object(
 /// when called.
 pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> {
     let storage = get_configured_remote_storage();
-    let ttid_path = Utf8Path::new(&ttid.tenant_id.to_string()).join(ttid.timeline_id.to_string());
-    let remote_path = RemotePath::new(&ttid_path)?;
+    let remote_path = remote_timeline_path(ttid)?;
 
     // see DEFAULT_MAX_KEYS_PER_LIST_RESPONSE
     // const Option unwrap is not stable, otherwise it would be const.
@@ -613,15 +597,17 @@ pub async fn copy_s3_segments(
         .as_ref()
         .unwrap();
 
-    let relative_dst_path =
-        Utf8Path::new(&dst_ttid.tenant_id.to_string()).join(dst_ttid.timeline_id.to_string());
-
-    let remote_path = RemotePath::new(&relative_dst_path)?;
+    let remote_dst_path = remote_timeline_path(dst_ttid)?;
 
     let cancel = CancellationToken::new();
 
     let files = storage
-        .list(Some(&remote_path), ListingMode::NoDelimiter, None, &cancel)
+        .list(
+            Some(&remote_dst_path),
+            ListingMode::NoDelimiter,
+            None,
+            &cancel,
+        )
         .await?
         .keys;
 
@@ -635,9 +621,6 @@ pub async fn copy_s3_segments(
         uploaded_segments
     );
 
-    let relative_src_path =
-        Utf8Path::new(&src_ttid.tenant_id.to_string()).join(src_ttid.timeline_id.to_string());
-
     for segno in from_segment..to_segment {
         if segno % SEGMENTS_PROGRESS_REPORT_INTERVAL == 0 {
             info!("copied all segments from {} until {}", from_segment, segno);
@@ -649,8 +632,8 @@ pub async fn copy_s3_segments(
         }
         debug!("copying segment {}", segment_name);
 
-        let from = RemotePath::new(&relative_src_path.join(&segment_name))?;
-        let to = RemotePath::new(&relative_dst_path.join(&segment_name))?;
+        let from = remote_timeline_path(src_ttid)?.join(&segment_name);
+        let to = remote_dst_path.join(&segment_name);
 
         storage.copy_object(&from, &to, &cancel).await?;
     }
@@ -661,3 +644,8 @@ pub async fn copy_s3_segments(
     );
     Ok(())
 }
+
+/// Get S3 (remote_storage) prefix path used for timeline files.
+pub fn remote_timeline_path(ttid: &TenantTimelineId) -> Result<RemotePath> {
+    RemotePath::new(&Utf8Path::new(&ttid.tenant_id.to_string()).join(ttid.timeline_id.to_string()))
+}
diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs
index 29e944bff3..6c0f35095b 100644
--- a/safekeeper/src/wal_backup_partial.rs
+++ b/safekeeper/src/wal_backup_partial.rs
@@ -18,22 +18,21 @@
 //! This way control file stores information about all potentially existing
 //! remote partial segments and can clean them up after uploading a newer version.
 
-use std::sync::Arc;
-
 use camino::Utf8PathBuf;
 use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI};
 use rand::Rng;
 use remote_storage::RemotePath;
 use serde::{Deserialize, Serialize};
 
-use tracing::{debug, error, info, instrument};
+use tracing::{debug, error, info, instrument, warn};
 use utils::lsn::Lsn;
 
 use crate::{
     metrics::{PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS},
     safekeeper::Term,
-    timeline::Timeline,
-    wal_backup, SafeKeeperConf,
+    timeline::FullAccessTimeline,
+    wal_backup::{self, remote_timeline_path},
+    SafeKeeperConf,
 };
 
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
@@ -83,10 +82,10 @@ impl State {
 
 struct PartialBackup {
     wal_seg_size: usize,
-    tli: Arc<Timeline>,
+    tli: FullAccessTimeline,
     conf: SafeKeeperConf,
     local_prefix: Utf8PathBuf,
-    remote_prefix: Utf8PathBuf,
+    remote_timeline_path: RemotePath,
 
     state: State,
 }
@@ -153,7 +152,7 @@ impl PartialBackup {
         let backup_bytes = flush_lsn.segment_offset(self.wal_seg_size);
 
         let local_path = self.local_prefix.join(self.local_segment_name(segno));
-        let remote_path = RemotePath::new(self.remote_prefix.join(&prepared.name).as_ref())?;
+        let remote_path = self.remote_timeline_path.join(&prepared.name);
 
         // Upload first `backup_bytes` bytes of the segment to the remote storage.
         wal_backup::backup_partial_segment(&local_path, &remote_path, backup_bytes).await?;
@@ -253,7 +252,7 @@ impl PartialBackup {
         info!("deleting objects: {:?}", segments_to_delete);
         let mut objects_to_delete = vec![];
         for seg in segments_to_delete.iter() {
-            let remote_path = RemotePath::new(self.remote_prefix.join(seg).as_ref())?;
+            let remote_path = self.remote_timeline_path.join(seg);
             objects_to_delete.push(remote_path);
         }
 
@@ -273,7 +272,7 @@ impl PartialBackup {
 }
 
 #[instrument(name = "Partial backup", skip_all, fields(ttid = %tli.ttid))]
-pub async fn main_task(tli: Arc<Timeline>, conf: SafeKeeperConf) {
+pub async fn main_task(tli: FullAccessTimeline, conf: SafeKeeperConf) {
     debug!("started");
     let await_duration = conf.partial_backup_timeout;
 
@@ -289,11 +288,11 @@ pub async fn main_task(tli: Arc<Timeline>, conf: SafeKeeperConf) {
     let mut flush_lsn_rx = tli.get_term_flush_lsn_watch_rx();
     let wal_seg_size = tli.get_wal_seg_size().await;
 
-    let local_prefix = tli.timeline_dir.clone();
-    let remote_prefix = match tli.timeline_dir.strip_prefix(&conf.workdir) {
-        Ok(path) => path.to_owned(),
+    let local_prefix = tli.get_timeline_dir();
+    let remote_timeline_path = match remote_timeline_path(&tli.ttid) {
+        Ok(path) => path,
         Err(e) => {
-            error!("failed to strip workspace dir prefix: {:?}", e);
+            error!("failed to create remote path: {:?}", e);
             return;
         }
     };
@@ -304,12 +303,28 @@ pub async fn main_task(tli: Arc<Timeline>, conf: SafeKeeperConf) {
         state: persistent_state.partial_backup,
         conf,
         local_prefix,
-        remote_prefix,
+        remote_timeline_path,
     };
 
     debug!("state: {:?}", backup.state);
 
+    // The general idea is that each safekeeper keeps only one partial segment
+    // both in remote storage and in local state. If this is not true, something
+    // went wrong.
+    const MAX_SIMULTANEOUS_SEGMENTS: usize = 10;
+
     'outer: loop {
+        if backup.state.segments.len() > MAX_SIMULTANEOUS_SEGMENTS {
+            warn!(
+                "too many segments in control_file state, running gc: {}",
+                backup.state.segments.len()
+            );
+
+            backup.gc().await.unwrap_or_else(|e| {
+                error!("failed to run gc: {:#}", e);
+            });
+        }
+
         // wait until we have something to upload
         let uploaded_segment = backup.state.uploaded_segment();
         if let Some(seg) = &uploaded_segment {
diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs
index 6bc8c7c3f9..45e27e1951 100644
--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -25,7 +25,7 @@ use utils::crashsafe::durable_rename;
 
 use crate::metrics::{time_io_closure, WalStorageMetrics, REMOVED_WAL_SEGMENTS};
 use crate::state::TimelinePersistentState;
-use crate::wal_backup::read_object;
+use crate::wal_backup::{read_object, remote_timeline_path};
 use crate::SafeKeeperConf;
 use postgres_ffi::waldecoder::WalStreamDecoder;
 use postgres_ffi::XLogFileName;
@@ -536,7 +536,7 @@ async fn remove_segments_from_disk(
 }
 
 pub struct WalReader {
-    workdir: Utf8PathBuf,
+    remote_path: RemotePath,
     timeline_dir: Utf8PathBuf,
     wal_seg_size: usize,
     pos: Lsn,
@@ -558,7 +558,7 @@ pub struct WalReader {
 
 impl WalReader {
     pub fn new(
-        workdir: Utf8PathBuf,
+        ttid: &TenantTimelineId,
         timeline_dir: Utf8PathBuf,
         state: &TimelinePersistentState,
         start_pos: Lsn,
@@ -586,7 +586,7 @@ impl WalReader {
         }
 
         Ok(Self {
-            workdir,
+            remote_path: remote_timeline_path(ttid)?,
             timeline_dir,
             wal_seg_size: state.server.wal_seg_size as usize,
             pos: start_pos,
@@ -684,7 +684,7 @@ impl WalReader {
         let xlogoff = self.pos.segment_offset(self.wal_seg_size);
         let segno = self.pos.segment_number(self.wal_seg_size);
         let wal_file_name = XLogFileName(PG_TLI, segno, self.wal_seg_size);
-        let wal_file_path = self.timeline_dir.join(wal_file_name);
+        let wal_file_path = self.timeline_dir.join(&wal_file_name);
 
         // Try to open local file, if we may have WAL locally
         if self.pos >= self.local_start_lsn {
@@ -712,16 +712,7 @@ impl WalReader {
 
         // Try to open remote file, if remote reads are enabled
         if self.enable_remote_read {
-            let remote_wal_file_path = wal_file_path
-                .strip_prefix(&self.workdir)
-                .context("Failed to strip workdir prefix")
-                .and_then(RemotePath::new)
-                .with_context(|| {
-                    format!(
-                        "Failed to resolve remote part of path {:?} for base {:?}",
-                        wal_file_path, self.workdir,
-                    )
-                })?;
+            let remote_wal_file_path = self.remote_path.join(&wal_file_name);
             return read_object(&remote_wal_file_path, xlogoff as u64).await;
         }
 
diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index 604ad6fbaa..bbb6d2cb32 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -142,52 +142,6 @@ async fn handle_tenant_create(
     )
 }
 
-// For tenant and timeline deletions, which both implement an "initially return 202, then 404 once
-// we're done" semantic, we wrap with a retry loop to expose a simpler API upstream.  This avoids
-// needing to track a "deleting" state for tenants.
-async fn deletion_wrapper<R, F>(service: Arc<Service>, f: F) -> Result<Response<Body>, ApiError>
-where
-    R: std::future::Future<Output = Result<StatusCode, ApiError>> + Send + 'static,
-    F: Fn(Arc<Service>) -> R + Send + Sync + 'static,
-{
-    let started_at = Instant::now();
-    // To keep deletion reasonably snappy for small tenants, initially check after 1 second if deletion
-    // completed.
-    let mut retry_period = Duration::from_secs(1);
-    // On subsequent retries, wait longer.
-    let max_retry_period = Duration::from_secs(5);
-    // Enable callers with a 30 second request timeout to reliably get a response
-    let max_wait = Duration::from_secs(25);
-
-    loop {
-        let status = f(service.clone()).await?;
-        match status {
-            StatusCode::ACCEPTED => {
-                tracing::info!("Deletion accepted, waiting to try again...");
-                tokio::time::sleep(retry_period).await;
-                retry_period = max_retry_period;
-            }
-            StatusCode::NOT_FOUND => {
-                tracing::info!("Deletion complete");
-                return json_response(StatusCode::OK, ());
-            }
-            _ => {
-                tracing::warn!("Unexpected status {status}");
-                return json_response(status, ());
-            }
-        }
-
-        let now = Instant::now();
-        if now + retry_period > started_at + max_wait {
-            tracing::info!("Deletion timed out waiting for 404");
-            // REQUEST_TIMEOUT would be more appropriate, but CONFLICT is already part of
-            // the pageserver's swagger definition for this endpoint, and has the same desired
-            // effect of causing the control plane to retry later.
-            return json_response(StatusCode::CONFLICT, ());
-        }
-    }
-}
-
 async fn handle_tenant_location_config(
     service: Arc<Service>,
     mut req: Request<Body>,
@@ -283,13 +237,17 @@ async fn handle_tenant_delete(
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
     check_permissions(&req, Scope::PageServerApi)?;
 
-    deletion_wrapper(service, move |service| async move {
-        service
-            .tenant_delete(tenant_id)
-            .await
-            .and_then(map_reqwest_hyper_status)
-    })
-    .await
+    let status_code = service
+        .tenant_delete(tenant_id)
+        .await
+        .and_then(map_reqwest_hyper_status)?;
+
+    if status_code == StatusCode::NOT_FOUND {
+        // The pageserver uses 404 for successful deletion, but we use 200
+        json_response(StatusCode::OK, ())
+    } else {
+        json_response(status_code, ())
+    }
 }
 
 async fn handle_tenant_timeline_create(
@@ -317,6 +275,51 @@ async fn handle_tenant_timeline_delete(
 
     let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
 
+    // For timeline deletions, which implement an "initially return 202, then 404 once
+    // we're done" semantic, we wrap with a retry loop to expose a simpler API upstream.
+    async fn deletion_wrapper<R, F>(service: Arc<Service>, f: F) -> Result<Response<Body>, ApiError>
+    where
+        R: std::future::Future<Output = Result<StatusCode, ApiError>> + Send + 'static,
+        F: Fn(Arc<Service>) -> R + Send + Sync + 'static,
+    {
+        let started_at = Instant::now();
+        // To keep deletion reasonably snappy for small tenants, initially check after 1 second if deletion
+        // completed.
+        let mut retry_period = Duration::from_secs(1);
+        // On subsequent retries, wait longer.
+        let max_retry_period = Duration::from_secs(5);
+        // Enable callers with a 30 second request timeout to reliably get a response
+        let max_wait = Duration::from_secs(25);
+
+        loop {
+            let status = f(service.clone()).await?;
+            match status {
+                StatusCode::ACCEPTED => {
+                    tracing::info!("Deletion accepted, waiting to try again...");
+                    tokio::time::sleep(retry_period).await;
+                    retry_period = max_retry_period;
+                }
+                StatusCode::NOT_FOUND => {
+                    tracing::info!("Deletion complete");
+                    return json_response(StatusCode::OK, ());
+                }
+                _ => {
+                    tracing::warn!("Unexpected status {status}");
+                    return json_response(status, ());
+                }
+            }
+
+            let now = Instant::now();
+            if now + retry_period > started_at + max_wait {
+                tracing::info!("Deletion timed out waiting for 404");
+                // REQUEST_TIMEOUT would be more appropriate, but CONFLICT is already part of
+                // the pageserver's swagger definition for this endpoint, and has the same desired
+                // effect of causing the control plane to retry later.
+                return json_response(StatusCode::CONFLICT, ());
+            }
+        }
+    }
+
     deletion_wrapper(service, move |service| async move {
         service
             .tenant_timeline_delete(tenant_id, timeline_id)
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index f914f4e0bb..756dc10a2a 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -2376,61 +2376,80 @@ impl Service {
         let _tenant_lock =
             trace_exclusive_lock(&self.tenant_op_locks, tenant_id, TenantOperations::Delete).await;
 
-        self.ensure_attached_wait(tenant_id).await?;
-
-        // TODO: refactor into helper
-        let targets = {
-            let locked = self.inner.read().unwrap();
-            let mut targets = Vec::new();
-
+        // Detach all shards
+        let (detach_waiters, shard_ids, node) = {
+            let mut shard_ids = Vec::new();
+            let mut detach_waiters = Vec::new();
+            let mut locked = self.inner.write().unwrap();
+            let (nodes, tenants, scheduler) = locked.parts_mut();
             for (tenant_shard_id, shard) in
-                locked.tenants.range(TenantShardId::tenant_range(tenant_id))
+                tenants.range_mut(TenantShardId::tenant_range(tenant_id))
             {
-                let node_id = shard.intent.get_attached().ok_or_else(|| {
-                    ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled"))
-                })?;
-                let node = locked
-                    .nodes
-                    .get(&node_id)
-                    .expect("Pageservers may not be deleted while referenced");
+                shard_ids.push(*tenant_shard_id);
 
-                targets.push((*tenant_shard_id, node.clone()));
+                // Update the tenant's intent to remove all attachments
+                shard.policy = PlacementPolicy::Detached;
+                shard
+                    .schedule(scheduler, &mut ScheduleContext::default())
+                    .expect("De-scheduling is infallible");
+                debug_assert!(shard.intent.get_attached().is_none());
+                debug_assert!(shard.intent.get_secondary().is_empty());
+
+                if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) {
+                    detach_waiters.push(waiter);
+                }
             }
-            targets
+
+            // Pick an arbitrary node to use for remote deletions (does not have to be where the tenant
+            // was attached, just has to be able to see the S3 content)
+            let node_id = scheduler.schedule_shard(&[], &ScheduleContext::default())?;
+            let node = nodes
+                .get(&node_id)
+                .expect("Pageservers may not be deleted while lock is active");
+            (detach_waiters, shard_ids, node.clone())
         };
 
-        // Phase 1: delete on the pageservers
-        let mut any_pending = false;
-        for (tenant_shard_id, node) in targets {
-            let client = PageserverClient::new(
-                node.get_id(),
-                node.base_url(),
-                self.config.jwt_token.as_deref(),
-            );
-            // TODO: this, like many other places, requires proper retry handling for 503, timeout: those should not
-            // surface immediately as an error to our caller.
-            let status = client.tenant_delete(tenant_shard_id).await.map_err(|e| {
-                ApiError::InternalServerError(anyhow::anyhow!(
-                    "Error deleting shard {tenant_shard_id} on node {node}: {e}",
-                ))
-            })?;
-            tracing::info!(
-                "Shard {tenant_shard_id} on node {node}, delete returned {}",
-                status
-            );
-            if status == StatusCode::ACCEPTED {
-                any_pending = true;
-            }
+        if let Err(e) = self.await_waiters(detach_waiters, RECONCILE_TIMEOUT).await {
+            // Failing to detach shouldn't hold up deletion, e.g. if a node is offline we should be able
+            // to use some other node to run the remote deletion.
+            tracing::warn!("Failed to detach some locations: {e}");
         }
 
-        if any_pending {
-            // Caller should call us again later.  When we eventually see 404s from
-            // all the shards, we may proceed to delete our records of the tenant.
-            tracing::info!(
-                "Tenant {} has some shards pending deletion, returning 202",
-                tenant_id
-            );
-            return Ok(StatusCode::ACCEPTED);
+        let locations = shard_ids
+            .into_iter()
+            .map(|s| (s, node.clone()))
+            .collect::<Vec<_>>();
+        let results = self.tenant_for_shards_api(
+            locations,
+            |tenant_shard_id, client| async move { client.tenant_delete(tenant_shard_id).await },
+            1,
+            3,
+            RECONCILE_TIMEOUT,
+            &self.cancel,
+        )
+        .await;
+        for result in results {
+            match result {
+                Ok(StatusCode::ACCEPTED) => {
+                    // This could happen if we failed to detach above, and hit a pageserver where the tenant
+                    // is still attached: it will accept the deletion in the background
+                    tracing::warn!(
+                        "Unexpectedly still attached on {}, client should retry",
+                        node
+                    );
+                    return Ok(StatusCode::ACCEPTED);
+                }
+                Ok(_) => {}
+                Err(mgmt_api::Error::Cancelled) => {
+                    return Err(ApiError::ShuttingDown);
+                }
+                Err(e) => {
+                    // This is unexpected: remote deletion should be infallible, unless the object store
+                    // at large is unavailable.
+                    tracing::error!("Error deleting via node {}: {e}", node);
+                    return Err(ApiError::InternalServerError(anyhow::anyhow!(e)));
+                }
+            }
         }
 
         // Fall through: deletion of the tenant on pageservers is complete, we may proceed to drop
diff --git a/test_runner/fixtures/common_types.py b/test_runner/fixtures/common_types.py
index e9be765669..147264762c 100644
--- a/test_runner/fixtures/common_types.py
+++ b/test_runner/fixtures/common_types.py
@@ -72,6 +72,18 @@ class Lsn:
     def segment_lsn(self, seg_sz: int = DEFAULT_WAL_SEG_SIZE) -> "Lsn":
         return Lsn(self.lsn_int - (self.lsn_int % seg_sz))
 
+    def segno(self, seg_sz: int = DEFAULT_WAL_SEG_SIZE) -> int:
+        return self.lsn_int // seg_sz
+
+    def segment_name(self, seg_sz: int = DEFAULT_WAL_SEG_SIZE) -> str:
+        # WAL file name is TTTTTTTTXXXXXXXXYYYYYYYY (timeline, "xlog id", segment within that
+        # id), mirroring PostgreSQL's XLogFileName(). The timeline is hard-coded to 1 here.
+        segno = self.segno(seg_sz)
+        # Each xlog id covers 4 GiB of WAL: 256 segments at the default 16 MiB segment size,
+        # but a different count when a non-default seg_sz is passed.
+        high_bits, low_bits = divmod(segno, 0x100000000 // seg_sz)
+        return f"00000001{high_bits:08X}{low_bits:08X}"
+
 
 @dataclass(frozen=True)
 class Key:
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index b8ef63faa9..a25b8bfca1 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -973,6 +973,9 @@ class NeonEnvBuilder:
             for pageserver in self.env.pageservers:
                 pageserver.assert_no_errors()
 
+            for safekeeper in self.env.safekeepers:
+                safekeeper.assert_no_errors()
+
             self.env.storage_controller.assert_no_errors()
 
         try:
@@ -3813,6 +3816,9 @@ class Safekeeper(LogUtils):
         self.running = False
         return self
 
+    def assert_no_errors(self):
+        assert not self.log_contains("manager task finished prematurely")
+
     def append_logical_message(
         self, tenant_id: TenantId, timeline_id: TimelineId, request: Dict[str, Any]
     ) -> Dict[str, Any]:
@@ -3898,6 +3904,15 @@ class Safekeeper(LogUtils):
         """
         cli = self.http_client()
 
+        target_segment_file = lsn.segment_name()
+
+        def are_segments_removed():
+            segments = self.list_segments(tenant_id, timeline_id)
+            log.info(
+                f"waiting for all segments before {target_segment_file} to be removed from sk {self.id}, current segments: {segments}"
+            )
+            assert all(target_segment_file <= s for s in segments)
+
         def are_lsns_advanced():
             stat = cli.timeline_status(tenant_id, timeline_id)
             log.info(
@@ -3909,6 +3924,7 @@ class Safekeeper(LogUtils):
         # pageserver to this safekeeper
         wait_until(30, 1, are_lsns_advanced)
         cli.checkpoint(tenant_id, timeline_id)
+        wait_until(30, 1, are_segments_removed)
 
     def wait_until_paused(self, failpoint: str):
         msg = f"at failpoint {failpoint}"
@@ -3982,6 +3998,30 @@ class S3Scrubber:
         )
         log.info(f"tenant-snapshot output: {stdout}")
 
+    def pageserver_physical_gc(
+        self, min_age_secs: int, tenant_ids: Optional[list[TenantId]] = None
+    ):
+        """
+        Run the scrubber's `pageserver-physical-gc` command and return its stdout parsed as JSON.
+        """
+        args = ["pageserver-physical-gc", "--min-age", f"{min_age_secs}s"]
+        for tenant_id in tenant_ids or []:
+            args.extend(["--tenant-id", str(tenant_id)])
+
+        stdout = self.scrubber_cli(
+            args,
+            timeout=30,
+        )
+        try:
+            return json.loads(stdout)
+        except Exception:
+            # Log the raw output so a decode failure can be diagnosed, then re-raise.
+            log.error(
+                "Failed to decode JSON output from `pageserver-physical-gc`.  Dumping stdout:"
+            )
+            log.error(stdout)
+            raise
+
 
 def _get_test_dir(request: FixtureRequest, top_output_dir: Path, prefix: str) -> Path:
     """Compute the path to a working directory for an individual test."""
diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py
index ad8bbe2021..ef412cade7 100755
--- a/test_runner/fixtures/pageserver/allowed_errors.py
+++ b/test_runner/fixtures/pageserver/allowed_errors.py
@@ -66,7 +66,7 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
     ".*query handler for 'pagestream.*failed: Timeline .* is not active",  # timeline delete in progress
     ".*task iteration took longer than the configured period.*",
     # these can happen anytime we do compactions from background task and shutdown pageserver
-    r".*ERROR.*ancestor timeline \S+ is being stopped",
+    ".*could not compact.*cancelled.*",
     # this is expected given our collaborative shutdown approach for the UploadQueue
     ".*Compaction failed.*, retrying in .*: Other\\(queue is in state Stopped.*",
     ".*Compaction failed.*, retrying in .*: ShuttingDown",
diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py
index ee18c53b52..6f6526d3fc 100644
--- a/test_runner/fixtures/remote_storage.py
+++ b/test_runner/fixtures/remote_storage.py
@@ -171,6 +171,8 @@ class S3Storage:
     """Is this MOCK_S3 (false) or REAL_S3 (true)"""
     real: bool
     endpoint: Optional[str] = None
+    """Timeout in the format parsed by the humantime crate, for example "1s"."""
+    custom_timeout: Optional[str] = None
 
     def access_env_vars(self) -> Dict[str, str]:
         if self.aws_profile is not None:
@@ -208,6 +210,9 @@ class S3Storage:
         if self.endpoint is not None:
             rv["endpoint"] = self.endpoint
 
+        if self.custom_timeout is not None:
+            rv["timeout"] = self.custom_timeout
+
         return rv
 
     def to_toml_inline_table(self) -> str:
diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py
index a5480f557f..11e6fef28f 100644
--- a/test_runner/fixtures/safekeeper/http.py
+++ b/test_runner/fixtures/safekeeper/http.py
@@ -19,7 +19,8 @@ class Walreceiver:
 
 @dataclass
 class SafekeeperTimelineStatus:
-    acceptor_epoch: int
+    term: int
+    last_log_term: int
     pg_version: int  # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2
     flush_lsn: Lsn
     commit_lsn: Lsn
@@ -156,7 +157,8 @@ class SafekeeperHttpClient(requests.Session):
         resj = res.json()
         walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]]
         return SafekeeperTimelineStatus(
-            acceptor_epoch=resj["acceptor_state"]["epoch"],
+            term=resj["acceptor_state"]["term"],
+            last_log_term=resj["acceptor_state"]["epoch"],
             pg_version=resj["pg_info"]["pg_version"],
             flush_lsn=Lsn(resj["flush_lsn"]),
             commit_lsn=Lsn(resj["commit_lsn"]),
diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py
index 4850a5c688..49dcb9b86a 100644
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -81,11 +81,19 @@ page_cache_size=10
 
     non_vectored_sum = metrics.query_one("pageserver_layers_visited_per_read_global_sum")
     non_vectored_count = metrics.query_one("pageserver_layers_visited_per_read_global_count")
-    non_vectored_average = non_vectored_sum.value / non_vectored_count.value
-
+    if non_vectored_count.value != 0:
+        non_vectored_average = non_vectored_sum.value / non_vectored_count.value
+    else:
+        non_vectored_average = 0
     vectored_sum = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_sum")
     vectored_count = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_count")
-    vectored_average = vectored_sum.value / vectored_count.value
+    if vectored_count.value > 0:
+        assert vectored_sum.value > 0
+        vectored_average = vectored_sum.value / vectored_count.value
+    else:
+        # special case: running local tests with default legacy configuration
+        assert vectored_sum.value == 0
+        vectored_average = 0
 
     log.info(f"{non_vectored_average=} {vectored_average=}")
 
@@ -230,7 +238,7 @@ def test_uploads_and_deletions(
     # https://github.com/neondatabase/neon/issues/7707
     # https://github.com/neondatabase/neon/issues/7759
     allowed_errors = [
-        ".*duplicated L1 layer.*",
+        ".*/checkpoint.*rename temporary file as correct path for.*",  # EEXIST
         ".*delta layer created with.*duplicate values.*",
         ".*assertion failed: self.lsn_range.start <= lsn.*",
         ".*HTTP request handler task panicked: task.*panicked.*",
diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py
index b137fb3a5c..4a25dfd874 100644
--- a/test_runner/regress/test_ondemand_download.py
+++ b/test_runner/regress/test_ondemand_download.py
@@ -3,8 +3,10 @@
 
 import time
 from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor
 from typing import Any, DefaultDict, Dict, Tuple
 
+import pytest
 from fixtures.common_types import Lsn
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
@@ -13,7 +15,7 @@ from fixtures.neon_fixtures import (
     last_flush_lsn_upload,
     wait_for_last_flush_lsn,
 )
-from fixtures.pageserver.http import PageserverHttpClient
+from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
 from fixtures.pageserver.utils import (
     assert_tenant_state,
     wait_for_last_record_lsn,
@@ -21,7 +23,7 @@ from fixtures.pageserver.utils import (
     wait_for_upload_queue_empty,
     wait_until_tenant_active,
 )
-from fixtures.remote_storage import RemoteStorageKind
+from fixtures.remote_storage import RemoteStorageKind, S3Storage, s3_storage
 from fixtures.utils import query_scalar, wait_until
 
 
@@ -402,7 +404,7 @@ def test_download_remote_layers_api(
     env.pageserver.allowed_errors.extend(
         [
             ".*download failed: downloading evicted layer file failed.*",
-            f".*initial_size_calculation.*{tenant_id}.*{timeline_id}.*initial size calculation failed: downloading evicted layer file failed",
+            f".*initial_size_calculation.*{tenant_id}.*{timeline_id}.*initial size calculation failed.*downloading evicted layer file failed",
         ]
     )
 
@@ -656,5 +658,200 @@ def test_compaction_downloads_on_demand_with_image_creation(neon_env_builder: Ne
     assert dict(kinds_after) == {"Delta": 4, "Image": 1}
 
 
+def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBuilder):
+    """
+    Demonstrates that tenant shutdown will cancel on-demand download and secondary doing warmup.
+    """
+    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
+
+    # turn off background tasks so that they don't interfere with the downloads
+    env = neon_env_builder.init_start(
+        initial_tenant_conf={
+            "gc_period": "0s",
+            "compaction_period": "0s",
+        }
+    )
+    client = env.pageserver.http_client()
+    failpoint = "before-downloading-layer-stream-pausable"
+    client.configure_failpoints((failpoint, "pause"))
+
+    env.pageserver.allowed_errors.extend(
+        [
+            ".*downloading failed, possibly for shutdown.*",
+        ]
+    )
+
+    info = client.layer_map_info(env.initial_tenant, env.initial_timeline)
+    assert len(info.delta_layers()) == 1
+
+    layer = info.delta_layers()[0]
+
+    client.tenant_heatmap_upload(env.initial_tenant)
+
+    # evict the initdb layer so we can download it
+    client.evict_layer(env.initial_tenant, env.initial_timeline, layer.layer_file_name)
+
+    with ThreadPoolExecutor(max_workers=2) as exec:
+        download = exec.submit(
+            client.download_layer,
+            env.initial_tenant,
+            env.initial_timeline,
+            layer.layer_file_name,
+        )
+
+        _, offset = wait_until(
+            20, 0.5, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}")
+        )
+
+        location_conf = {"mode": "Detached", "tenant_conf": {}}
+        # assume detach removes the layers
+        detach = exec.submit(client.tenant_location_conf, env.initial_tenant, location_conf)
+
+        _, offset = wait_until(
+            20,
+            0.5,
+            lambda: env.pageserver.assert_log_contains(
+                "closing is taking longer than expected", offset
+            ),
+        )
+
+        client.configure_failpoints((failpoint, "off"))
+
+        with pytest.raises(
+            PageserverApiException, match="downloading failed, possibly for shutdown"
+        ):
+            download.result()
+
+        env.pageserver.assert_log_contains(".*downloading failed, possibly for shutdown.*")
+
+        detach.result()
+
+        client.configure_failpoints((failpoint, "pause"))
+
+        _, offset = wait_until(
+            20,
+            0.5,
+            lambda: env.pageserver.assert_log_contains(f"cfg failpoint: {failpoint} pause", offset),
+        )
+
+        location_conf = {
+            "mode": "Secondary",
+            "secondary_conf": {"warm": True},
+            "tenant_conf": {},
+        }
+
+        client.tenant_location_conf(env.initial_tenant, location_conf)
+
+        warmup = exec.submit(client.tenant_secondary_download, env.initial_tenant, wait_ms=30000)
+
+        _, offset = wait_until(
+            20,
+            0.5,
+            lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}", offset),
+        )
+
+        client.configure_failpoints((failpoint, "off"))
+        location_conf = {"mode": "Detached", "tenant_conf": {}}
+        client.tenant_location_conf(env.initial_tenant, location_conf)
+
+        client.configure_failpoints((failpoint, "off"))
+
+        # here we have nothing in the log, but we see that the warmup and conf location update worked
+        warmup.result()
+
+
+def test_layer_download_timeouted(neon_env_builder: NeonEnvBuilder):
+    """
+    Pause using a pausable_failpoint longer than the client timeout to simulate the timeout happening.
+    """
+    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
+    assert isinstance(neon_env_builder.pageserver_remote_storage, S3Storage)
+    neon_env_builder.pageserver_remote_storage.custom_timeout = "1s"
+
+    # turn off background tasks so that they don't interfere with the downloads
+    env = neon_env_builder.init_start(
+        initial_tenant_conf={
+            "gc_period": "0s",
+            "compaction_period": "0s",
+        }
+    )
+    client = env.pageserver.http_client()
+    failpoint = "before-downloading-layer-stream-pausable"
+    client.configure_failpoints((failpoint, "pause"))
+
+    info = client.layer_map_info(env.initial_tenant, env.initial_timeline)
+    assert len(info.delta_layers()) == 1
+
+    layer = info.delta_layers()[0]
+
+    client.tenant_heatmap_upload(env.initial_tenant)
+
+    # evict so we can download it
+    client.evict_layer(env.initial_tenant, env.initial_timeline, layer.layer_file_name)
+
+    with ThreadPoolExecutor(max_workers=2) as exec:
+        download = exec.submit(
+            client.download_layer,
+            env.initial_tenant,
+            env.initial_timeline,
+            layer.layer_file_name,
+        )
+
+        _, offset = wait_until(
+            20, 0.5, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}")
+        )
+        # ensure enough time while paused to trip the timeout
+        time.sleep(2)
+
+        client.configure_failpoints((failpoint, "off"))
+        download.result()
+
+        _, offset = env.pageserver.assert_log_contains(
+            ".*failed, will retry \\(attempt 0\\): timeout.*"
+        )
+        _, offset = env.pageserver.assert_log_contains(".*succeeded after [0-9]+ retries.*", offset)
+
+        client.evict_layer(env.initial_tenant, env.initial_timeline, layer.layer_file_name)
+
+        client.configure_failpoints((failpoint, "pause"))
+
+        # capture the next offset for a new synchronization with the failpoint
+        _, offset = wait_until(
+            20,
+            0.5,
+            lambda: env.pageserver.assert_log_contains(f"cfg failpoint: {failpoint} pause", offset),
+        )
+
+        location_conf = {
+            "mode": "Secondary",
+            "secondary_conf": {"warm": True},
+            "tenant_conf": {},
+        }
+
+        client.tenant_location_conf(
+            env.initial_tenant,
+            location_conf,
+        )
+
+        started = time.time()
+
+        warmup = exec.submit(client.tenant_secondary_download, env.initial_tenant, wait_ms=30000)
+        # ensure enough time while paused to trip the timeout
+        time.sleep(2)
+
+        client.configure_failpoints((failpoint, "off"))
+
+        warmup.result()
+
+        elapsed = time.time() - started
+
+        _, offset = env.pageserver.assert_log_contains(
+            ".*failed, will retry \\(attempt 0\\): timeout.*", offset
+        )
+        _, offset = env.pageserver.assert_log_contains(".*succeeded after [0-9]+ retries.*", offset)
+
+        # f-string so the measured duration actually appears in the failure message
+        assert elapsed < 30, f"too long passed: {elapsed=}"
+
+
 def stringify(conf: Dict[str, Any]) -> Dict[str, str]:
     return dict(map(lambda x: (x[0], str(x[1])), conf.items()))
diff --git a/test_runner/regress/test_duplicate_layers.py b/test_runner/regress/test_pageserver_crash_consistency.py
similarity index 66%
rename from test_runner/regress/test_duplicate_layers.py
rename to test_runner/regress/test_pageserver_crash_consistency.py
index 0ebb99c712..3831d2f917 100644
--- a/test_runner/regress/test_duplicate_layers.py
+++ b/test_runner/regress/test_pageserver_crash_consistency.py
@@ -12,42 +12,14 @@ from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
 from requests.exceptions import ConnectionError
 
 
-def test_duplicate_layers(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
-    env = neon_env_builder.init_start()
-    pageserver_http = env.pageserver.http_client()
-
-    # use a failpoint to return all L0s as L1s
-    message = ".*duplicated L1 layer layer=.*"
-    env.pageserver.allowed_errors.append(message)
-
-    # Use aggressive compaction and checkpoint settings
-    tenant_id, _ = env.neon_cli.create_tenant(
-        conf={
-            "checkpoint_distance": f"{1024 ** 2}",
-            "compaction_target_size": f"{1024 ** 2}",
-            "compaction_period": "5 s",
-            "compaction_threshold": "3",
-        }
-    )
-
-    pageserver_http.configure_failpoints(("compact-level0-phase1-return-same", "return"))
-
-    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
-    connstr = endpoint.connstr(options="-csynchronous_commit=off")
-    pg_bin.run_capture(["pgbench", "-i", "-s1", connstr])
-
-    time.sleep(10)  # let compaction to be performed
-    env.pageserver.assert_log_contains("compact-level0-phase1-return-same")
-
-
-def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
+def test_local_only_layers_after_crash(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
     """
-    Test sets fail point at the end of first compaction phase: after
-    flushing new L1 layer but before deletion of L0 layers.
+    Test case for docs/rfcs/027-crash-consistent-layer-map-through-index-part.md.
 
-    The L1 used to be overwritten, but with crash-consistency via remote
-    index_part.json, we end up deleting the not yet uploaded L1 layer on
-    startup.
+    Simulate crash after compaction has written layers to disk
+    but before they have been uploaded/linked into remote index_part.json.
+
+    Startup handles this situation by deleting the not yet uploaded L1 layer files.
     """
     neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
 
@@ -126,13 +98,6 @@ def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin)
     # give time for log flush
     time.sleep(1)
 
-    message = f".*duplicated L1 layer layer={l1_found}"
-    found_msg = env.pageserver.log_contains(message)
-    # resident or evicted, it should not be overwritten, however it should had been non-existing at startup
-    assert (
-        found_msg is None
-    ), "layer should had been removed during startup, did it live on as evicted?"
-
     assert env.pageserver.layer_exists(tenant_id, timeline_id, l1_found), "the L1 reappears"
 
     wait_for_upload_queue_empty(pageserver_http, tenant_id, timeline_id)
@@ -141,3 +106,6 @@ def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin)
         tenant_id, timeline_id, l1_found.to_str()
     )
     assert uploaded.exists(), "the L1 is uploaded"
+
+
+# TODO: same test for L0s produced by ingest.
diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py
index 759e845927..4ce53df214 100644
--- a/test_runner/regress/test_pageserver_restart.py
+++ b/test_runner/regress/test_pageserver_restart.py
@@ -163,11 +163,6 @@ def test_pageserver_chaos(
 
     env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
 
-    # these can happen, if we shutdown at a good time. to be fixed as part of #5172.
-    message = ".*duplicated L1 layer layer=.*"
-    for ps in env.pageservers:
-        ps.allowed_errors.append(message)
-
     # Use a tiny checkpoint distance, to create a lot of layers quickly.
     # That allows us to stress the compaction and layer flushing logic more.
     tenant, _ = env.neon_cli.create_tenant(
diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py
index 25a3f8521c..5bfa9cce8c 100644
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -15,7 +15,7 @@ from fixtures.pageserver.utils import (
     tenant_delete_wait_completed,
     wait_for_upload_queue_empty,
 )
-from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, S3Storage
+from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, S3Storage, s3_storage
 from fixtures.utils import wait_until
 from fixtures.workload import Workload
 
@@ -73,7 +73,7 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int):
     """
     neon_env_builder.num_pageservers = 3
     neon_env_builder.enable_pageserver_remote_storage(
-        remote_storage_kind=RemoteStorageKind.MOCK_S3,
+        remote_storage_kind=s3_storage(),
     )
     env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
 
@@ -100,10 +100,6 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int):
             ]
         )
 
-        # these can happen, if we shutdown at a good time. to be fixed as part of #5172.
-        message = ".*duplicated L1 layer layer=.*"
-        ps.allowed_errors.append(message)
-
     workload = Workload(env, tenant_id, timeline_id)
     workload.init(env.pageservers[0].id)
     workload.write_rows(256, env.pageservers[0].id)
@@ -215,6 +211,13 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int):
                 )
                 workload.validate(pageserver.id)
 
+    # Having done a bunch of attach/detach cycles, we will have generated some index garbage: check
+    # that the scrubber sees it and cleans it up.  We do this before the final attach+validate pass,
+    # to also validate that the scrubber isn't breaking anything.
+    gc_summary = S3Scrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=1)
+    assert gc_summary["remote_storage_errors"] == 0
+    assert gc_summary["indices_deleted"] > 0
+
     # Attach all pageservers
     for ps in env.pageservers:
         location_conf = {"mode": "AttachedMulti", "secondary_conf": None, "tenant_conf": {}}
@@ -227,10 +230,11 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int):
     # Detach all pageservers
     for ps in env.pageservers:
         location_conf = {"mode": "Detached", "secondary_conf": None, "tenant_conf": {}}
+        assert ps.list_layers(tenant_id, timeline_id) != []
         ps.tenant_location_configure(tenant_id, location_conf)
 
-    # Confirm that all local disk state was removed on detach
-    # TODO
+        # Confirm that all local disk state was removed on detach
+        assert ps.list_layers(tenant_id, timeline_id) == []
 
 
 def test_live_migration(neon_env_builder: NeonEnvBuilder):
diff --git a/test_runner/regress/test_s3_scrubber.py b/test_runner/regress/test_s3_scrubber.py
index 8981000c24..6baba190f3 100644
--- a/test_runner/regress/test_s3_scrubber.py
+++ b/test_runner/regress/test_s3_scrubber.py
@@ -3,7 +3,7 @@ import shutil
 from typing import Optional
 
 import pytest
-from fixtures.common_types import TenantShardId
+from fixtures.common_types import TenantId, TenantShardId, TimelineId
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
     S3Scrubber,
@@ -109,3 +109,52 @@ def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count:
 
     # Check we can read everything
     workload.validate()
+
+
+@pytest.mark.parametrize("shard_count", [None, 4])
+def test_scrubber_physical_gc(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]):
+    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
+    neon_env_builder.num_pageservers = 2
+
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    tenant_id = TenantId.generate()
+    timeline_id = TimelineId.generate()
+    env.neon_cli.create_tenant(tenant_id, timeline_id, shard_count=shard_count)
+
+    workload = Workload(env, tenant_id, timeline_id)
+    workload.init()
+
+    # We will end up with an index per shard, per cycle, plus one for the initial startup
+    n_cycles = 4
+    expect_indices_per_shard = n_cycles + 1
+    shard_count = 1 if shard_count is None else shard_count
+
+    # For each cycle, detach and attach the tenant to bump the generation, and do some writes to generate uploads
+    for _i in range(0, n_cycles):
+        env.storage_controller.tenant_policy_update(tenant_id, {"placement": "Detached"})
+        env.storage_controller.reconcile_until_idle()
+
+        env.storage_controller.tenant_policy_update(tenant_id, {"placement": {"Attached": 0}})
+        env.storage_controller.reconcile_until_idle()
+
+        # This write includes remote upload, will generate an index in this generation
+        workload.write_rows(1)
+
+    # With a high min_age, the scrubber should decline to delete anything
+    gc_summary = S3Scrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=3600)
+    assert gc_summary["remote_storage_errors"] == 0
+    assert gc_summary["indices_deleted"] == 0
+
+    # If targeting a different tenant, the scrubber shouldn't do anything
+    gc_summary = S3Scrubber(neon_env_builder).pageserver_physical_gc(
+        min_age_secs=1, tenant_ids=[TenantId.generate()]
+    )
+    assert gc_summary["remote_storage_errors"] == 0
+    assert gc_summary["indices_deleted"] == 0
+
+    #  With a low min_age, the scrubber should go ahead and clean up all but the latest 2 generations
+    gc_summary = S3Scrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=1)
+    assert gc_summary["remote_storage_errors"] == 0
+    assert gc_summary["indices_deleted"] == (expect_indices_per_shard - 2) * shard_count
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 3a9a522f3f..2031feaa83 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -7,6 +7,7 @@ from typing import Any, Dict, List, Union
 
 import pytest
 from fixtures.common_types import TenantId, TenantShardId, TimelineId
+from fixtures.compute_reconfigure import ComputeReconfigure
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnv,
@@ -18,6 +19,8 @@ from fixtures.neon_fixtures import (
 from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.utils import (
     MANY_SMALL_LAYERS_TENANT_CONFIG,
+    assert_prefix_empty,
+    assert_prefix_not_empty,
     enable_remote_storage_versioning,
     list_prefix,
     remote_storage_delete_key,
@@ -839,6 +842,86 @@ def test_storage_controller_tenant_conf(neon_env_builder: NeonEnvBuilder):
     env.storage_controller.consistency_check()
 
 
+def test_storage_controller_tenant_deletion(
+    neon_env_builder: NeonEnvBuilder,
+    compute_reconfigure_listener: ComputeReconfigure,
+):
+    """
+    Validate that:
+    - Deleting a tenant deletes all its shards
+    - Deletion does not require the compute notification hook to be responsive
+    - Deleting a tenant also removes all secondary locations
+    """
+    neon_env_builder.num_pageservers = 4  # NOTE(review): presumably 2 shards x (attached + secondary) - confirm
+    neon_env_builder.enable_pageserver_remote_storage(s3_storage())  # inspected by the prefix assertions below
+    neon_env_builder.control_plane_compute_hook_api = (  # send compute notifications to the listener fixture
+        compute_reconfigure_listener.control_plane_compute_hook_api
+    )
+
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    tenant_id = TenantId.generate()
+    timeline_id = TimelineId.generate()
+    env.neon_cli.create_tenant(
+        tenant_id, timeline_id, shard_count=2, placement_policy='{"Attached":1}'
+    )  # NOTE(review): '{"Attached":1}' presumably requests one secondary per shard - confirm
+
+    # Ensure all the locations are configured, including secondaries
+    env.storage_controller.reconcile_until_idle()
+
+    shard_ids = [  # shard ids parsed from the storage controller's locate() response
+        TenantShardId.parse(shard["shard_id"]) for shard in env.storage_controller.locate(tenant_id)
+    ]
+
+    # Assert attachments all have local content
+    for shard_id in shard_ids:
+        pageserver = env.get_tenant_pageserver(shard_id)
+        assert pageserver.tenant_dir(shard_id).exists()
+
+    # Assert all shards have some content in remote storage
+    for shard_id in shard_ids:
+        assert_prefix_not_empty(
+            neon_env_builder.pageserver_remote_storage,
+            prefix="/".join(
+                (
+                    "tenants",
+                    str(shard_id),
+                )
+            ),
+        )
+
+    # Break the compute hook: we are checking that deletion does not depend on the compute hook being available
+    def break_hook():
+        raise RuntimeError("Unexpected call to compute hook")
+
+    compute_reconfigure_listener.register_on_notify(break_hook)
+
+    # No retry loop: deletion should complete in one shot without polling for 202 responses, because
+    # it cleanly detaches all the shards first, and then deletes them in remote storage
+    env.storage_controller.pageserver_api().tenant_delete(tenant_id)
+
+    # Assert no pageservers have any local content
+    for pageserver in env.pageservers:
+        for shard_id in shard_ids:
+            assert not pageserver.tenant_dir(shard_id).exists()
+
+    for shard_id in shard_ids:  # counterpart of the earlier not-empty check, now run after deletion
+        assert_prefix_empty(
+            neon_env_builder.pageserver_remote_storage,
+            prefix="/".join(
+                (
+                    "tenants",
+                    str(shard_id),
+                )
+            ),
+        )
+
+    # Assert the tenant is not visible in storage controller API
+    with pytest.raises(StorageControllerApiException):
+        env.storage_controller.tenant_describe(tenant_id)
+
+
 class Failure:
     pageserver_id: int
 
diff --git a/test_runner/regress/test_subscriber_restart.py b/test_runner/regress/test_subscriber_restart.py
new file mode 100644
index 0000000000..d7f3962620
--- /dev/null
+++ b/test_runner/regress/test_subscriber_restart.py
@@ -0,0 +1,57 @@
+import threading
+import time
+
+from fixtures.neon_fixtures import NeonEnv
+from fixtures.utils import wait_until
+
+
+# This test checks that a logical replication subscriber is able to correctly restart replication without receiving duplicates.
+# It requires tracking information about replication origins at page server side
+def test_subscriber_restart(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+    env.neon_cli.create_branch("publisher")
+    pub = env.endpoints.create("publisher")
+    pub.start()
+
+    env.neon_cli.create_branch("subscriber")
+    sub = env.endpoints.create("subscriber")
+    sub.start()
+
+    n_records = 100000  # rows inserted by the background publisher thread
+    n_restarts = 100  # number of subscriber stop/start cycles
+
+    def check_that_changes_propagated():  # reads scur and n_records from the enclosing scope when called
+        scur.execute("SELECT count(*) FROM t")
+        res = scur.fetchall()
+        assert res[0][0] == n_records
+
+    def insert_data(pub):  # thread target; opens its own publisher cursor
+        with pub.cursor() as pcur:
+            for i in range(0, n_records):
+                pcur.execute("INSERT into t values (%s,random()*100000)", (i,))
+
+    with pub.cursor() as pcur:
+        with sub.cursor() as scur:
+            pcur.execute("CREATE TABLE t (pk integer primary key, sk integer)")
+            pcur.execute("CREATE PUBLICATION pub FOR TABLE t")
+            scur.execute("CREATE TABLE t (pk integer primary key, sk integer)")
+            # scur.execute("CREATE INDEX on t(sk)") # slowdown applying WAL at replica
+            pub_conn = f"host=localhost port={pub.pg_port} dbname=postgres user=cloud_admin"
+            query = f"CREATE SUBSCRIPTION sub CONNECTION '{pub_conn}' PUBLICATION pub"
+            scur.execute(query)
+            time.sleep(2)  # let initial table sync complete
+
+        thread = threading.Thread(target=insert_data, args=(pub,), daemon=True)  # inserts run during the restarts
+        thread.start()
+
+        for _ in range(n_restarts):
+            # restart subscriber
+            # time.sleep(2)
+            sub.stop("immediate")
+            sub.start()
+
+        thread.join()  # every bulk insert has been issued once this returns
+        pcur.execute(f"INSERT into t values ({n_records}, 0)")  # one extra row with pk == n_records
+        n_records += 1  # the check below now expects the extra row as well
+        with sub.cursor() as scur:
+            wait_until(10, 0.5, check_that_changes_propagated)  # NOTE(review): presumably 10 tries, 0.5s apart - confirm
diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py
index 3fc44de6fa..fa7cead1bd 100644
--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -54,9 +54,26 @@ def test_tenant_delete_smoke(
 
     # first try to delete non existing tenant
     tenant_id = TenantId.generate()
-    env.pageserver.allowed_errors.append(f".*NotFound: tenant {tenant_id}.*")
-    with pytest.raises(PageserverApiException, match=f"NotFound: tenant {tenant_id}"):
-        ps_http.tenant_delete(tenant_id=tenant_id)
+    env.pageserver.allowed_errors.append(".*NotFound.*")
+    env.pageserver.allowed_errors.append(".*simulated failure.*")
+
+    # Check that deleting a non-existent tenant gives the expected result: this is a loop because we
+    # may need to retry on some remote storage errors injected by the test harness
+    while True:
+        try:
+            ps_http.tenant_delete(tenant_id=tenant_id)
+        except PageserverApiException as e:
+            if e.status_code == 500:
+                # This test uses failure injection, which can produce 500s as the pageserver expects
+                # the object store to always be available, and the ListObjects during deletion is generally
+                # an infallible operation
+                assert "simulated failure of remote operation" in e.message
+            elif e.status_code == 404:
+                # This is our expected result: trying to erase a non-existent tenant gives us 404
+                assert "NotFound" in e.message
+                break
+            else:
+                raise
 
     env.neon_cli.create_tenant(
         tenant_id=tenant_id,
@@ -88,6 +105,9 @@ def test_tenant_delete_smoke(
 
         parent = timeline
 
+    # Upload a heatmap so that we exercise deletion of that too
+    ps_http.tenant_heatmap_upload(tenant_id)
+
     iterations = poll_for_remote_storage_iterations(remote_storage_kind)
 
     assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 2
diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py
index 12a4730e69..871351b2d5 100644
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -302,7 +302,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
 
     # gc should not try to even start on a timeline that doesn't exist
     with pytest.raises(
-        expected_exception=PageserverApiException, match="gc target timeline does not exist"
+        expected_exception=PageserverApiException, match="NotFound: Timeline not found"
     ):
         bogus_timeline_id = TimelineId.generate()
         pageserver_http.timeline_gc(tenant_id, bogus_timeline_id, 0)
@@ -310,7 +310,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
     env.pageserver.allowed_errors.extend(
         [
             # the error will be printed to the log too
-            ".*gc target timeline does not exist.*",
+            ".*NotFound: Timeline not found.*",
             # Timelines get stopped during detach, ignore the gc calls that error, witnessing that
             ".*InternalServerError\\(timeline is Stopping.*",
         ]
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index cff13e74ee..dce30f5388 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -17,6 +17,7 @@ import psycopg2
 import psycopg2.errors
 import psycopg2.extras
 import pytest
+import requests
 from fixtures.broker import NeonBroker
 from fixtures.common_types import Lsn, TenantId, TimelineId
 from fixtures.log_helper import log
@@ -841,7 +842,7 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
 
     # fetch something sensible from status
     tli_status = wa_http_cli.timeline_status(tenant_id, timeline_id)
-    epoch = tli_status.acceptor_epoch
+    term = tli_status.term
     timeline_start_lsn = tli_status.timeline_start_lsn
 
     if auth_enabled:
@@ -862,8 +863,8 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
     endpoint.safe_psql("insert into t values(10)")
 
     tli_status = wa_http_cli.timeline_status(tenant_id, timeline_id)
-    epoch_after_reboot = tli_status.acceptor_epoch
-    assert epoch_after_reboot > epoch
+    term_after_reboot = tli_status.term
+    assert term_after_reboot > term
 
     # and timeline_start_lsn stays the same
     assert tli_status.timeline_start_lsn == timeline_start_lsn
@@ -1104,11 +1105,11 @@ def cmp_sk_wal(sks: List[Safekeeper], tenant_id: TenantId, timeline_id: Timeline
     # First check that term / flush_lsn are the same: it is easier to
     # report/understand if WALs are different due to that.
     statuses = [sk_http_cli.timeline_status(tenant_id, timeline_id) for sk_http_cli in sk_http_clis]
-    term_flush_lsns = [(s.acceptor_epoch, s.flush_lsn) for s in statuses]
+    term_flush_lsns = [(s.last_log_term, s.flush_lsn) for s in statuses]
     for tfl, sk in zip(term_flush_lsns[1:], sks[1:]):
         assert (
             term_flush_lsns[0] == tfl
-        ), f"(term, flush_lsn) are not equal on sks {sks[0].id} and {sk.id}: {term_flush_lsns[0]} != {tfl}"
+        ), f"(last_log_term, flush_lsn) are not equal on sks {sks[0].id} and {sk.id}: {term_flush_lsns[0]} != {tfl}"
 
     # check that WALs are identic.
     segs = [sk.list_segments(tenant_id, timeline_id) for sk in sks]
@@ -1867,6 +1868,65 @@ def test_pull_timeline_gc(neon_env_builder: NeonEnvBuilder):
     assert digests[0] == digests[1], f"digest on src is {digests[0]} but on dst is {digests[1]}"
 
 
+# Test pull_timeline while concurrently changing term on the donor:
+# 1) Start pull_timeline, listing files to fetch.
+# 2) Change term on the donor
+# 3) Finish pull_timeline.
+#
+# Currently (until proper membership change procedure), we want pull_timeline
+# to fetch the log up to <last_log_term, flush_lsn>. This is unsafe if term
+# changes during the procedure (unless timeline is locked all the time but we
+# don't want that): recipient might end up with mix of WAL from different
+# histories. Thus the schedule above is expected to fail. Later we'd allow
+# pull_timeline to only initialize timeline to any valid state (up to
+# commit_lsn), holding switch to fully new configuration until it recovers
+# enough, so it won't be affected by term change anymore.
+#
+# Expected to fail while term check is not implemented.
+@pytest.mark.xfail
+def test_pull_timeline_term_change(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.num_safekeepers = 3
+    neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage())
+    env = neon_env_builder.init_start()
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    (src_sk, dst_sk) = (env.safekeepers[0], env.safekeepers[2])  # donor, recipient
+
+    log.info("use only first 2 safekeepers, 3rd will be seeded")
+    ep = env.endpoints.create("main")
+    ep.active_safekeepers = [1, 2]  # NOTE(review): presumably safekeeper ids, leaving the 3rd one out - confirm
+    ep.start()
+    ep.safe_psql("create table t(key int, value text)")
+    ep.safe_psql("insert into t select generate_series(1, 1000), 'pear'")
+
+    dst_http = dst_sk.http_client()
+    # run pull_timeline which will halt before downloading files
+    dst_http.configure_failpoints(("sk-pull-timeline-after-list-pausable", "pause"))
+    pt_handle = PropagatingThread(  # NOTE(review): join() presumably re-raises the thread's exception - confirm
+        target=dst_sk.pull_timeline, args=([src_sk], tenant_id, timeline_id)
+    )
+    pt_handle.start()
+    dst_sk.wait_until_paused("sk-pull-timeline-after-list-pausable")
+
+    src_http = src_sk.http_client()
+    term_before = src_http.timeline_status(tenant_id, timeline_id).term  # donor term while the pull is paused
+
+    # restart compute to bump term
+    ep.stop()
+    ep = env.endpoints.create("main")
+    ep.active_safekeepers = [1, 2]
+    ep.start()
+    ep.safe_psql("insert into t select generate_series(1, 100), 'pear'")
+
+    term_after = src_http.timeline_status(tenant_id, timeline_id).term
+    assert term_after > term_before, f"term_after={term_after}, term_before={term_before}"
+
+    dst_http.configure_failpoints(("sk-pull-timeline-after-list-pausable", "off"))  # let the paused pull resume
+    with pytest.raises(requests.exceptions.HTTPError):  # the pull must fail now that the donor term changed
+        pt_handle.join()
+
+
 # In this test we check for excessive START_REPLICATION and START_WAL_PUSH queries
 # when compute is active, but there are no writes to the timeline. In that case
 # pageserver should maintain a single connection to safekeeper and don't attempt