Add dependabot

Our dependencies don't necessarily receive updates unless people run into issues. This should help keep our dependencies in check a little bit better.

Signed-off-by: Tristan Partin <tristan@neon.tech>
Disallow archived timelines to be detached or reparented (#9578)
2026-05-16 20:50:37 +00:00 · 2024-10-30 19:48:16 -05:00 · 2024-10-30 17:04:57 +01:00 · 2024-10-30 09:58:29 -05:00 · 2024-10-30 14:46:39 +03:00 · 2024-10-30 11:07:02 +01:00
203 changed files with 9174 additions and 3852 deletions
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -0,0 +1,116 @@
+version: 2  # Dependabot configuration file format version
+
+updates:  # one entry per package ecosystem and directory
+  - package-ecosystem: cargo  # repository-root entries are checked daily
+    directory: /
+    schedule:
+      interval: daily
+    rebase-strategy: auto  # rebase open pull requests automatically
+
+  - package-ecosystem: docker
+    directory: /
+    schedule:
+      interval: daily
+    rebase-strategy: auto
+
+  - package-ecosystem: github-actions
+    directory: /
+    schedule:
+      interval: daily
+    rebase-strategy: auto
+
+  - package-ecosystem: pip
+    directory: /
+    schedule:
+      interval: daily
+    rebase-strategy: auto
+
+  - package-ecosystem: docker  # pg_clients entries below are checked weekly
+    directory: test_runner/pg_clients/csharp/npgsql
+    schedule:
+      interval: weekly
+    rebase-strategy: auto
+
+  - package-ecosystem: nuget
+    directory: test_runner/pg_clients/csharp/npgsql
+    schedule:
+      interval: weekly
+    rebase-strategy: auto
+
+  - package-ecosystem: docker
+    directory: test_runner/pg_clients/java/jdbc/
+    schedule:
+      interval: weekly
+    rebase-strategy: auto
+
+  - package-ecosystem: pip
+    directory: test_runner/pg_clients/python/asyncpg/
+    schedule:
+      interval: weekly
+    rebase-strategy: auto
+
+  - package-ecosystem: pip
+    directory: test_runner/pg_clients/python/pg8000/
+    schedule:
+      interval: weekly
+    rebase-strategy: auto
+
+  - package-ecosystem: cargo
+    directory: test_runner/pg_clients/rust/tokio-postgres/
+    schedule:
+      interval: weekly
+    rebase-strategy: auto
+
+  - package-ecosystem: docker
+    directory: test_runner/pg_clients/rust/tokio-postgres/
+    schedule:
+      interval: weekly
+    rebase-strategy: auto
+
+  - package-ecosystem: docker
+    directory: test_runner/pg_clients/swift/PostgresNIOExample/
+    schedule:
+      interval: weekly
+    rebase-strategy: auto
+
+  - package-ecosystem: swift
+    directory: test_runner/pg_clients/swift/PostgresNIOExample/
+    schedule:
+      interval: weekly
+    rebase-strategy: auto
+
+  - package-ecosystem: docker
+    directory: test_runner/pg_clients/swift/PostgresClientKitExample/
+    schedule:
+      interval: weekly
+    rebase-strategy: auto
+
+  - package-ecosystem: swift
+    directory: test_runner/pg_clients/swift/PostgresClientKitExample/
+    schedule:
+      interval: weekly
+    rebase-strategy: auto
+
+  - package-ecosystem: docker
+    directory: test_runner/pg_clients/typescript/postgresql-client/
+    schedule:
+      interval: weekly
+    rebase-strategy: auto
+
+  - package-ecosystem: npm
+    directory: test_runner/pg_clients/typescript/postgresql-client/
+    schedule:
+      interval: weekly
+    rebase-strategy: auto
+
+  - package-ecosystem: docker
+    directory: test_runner/pg_clients/typescript/serverless-driver/
+    schedule:
+      interval: weekly
+    rebase-strategy: auto
+
+  - package-ecosystem: npm
+    directory: test_runner/pg_clients/typescript/serverless-driver/
+    schedule:
+      interval: weekly
+    rebase-strategy: auto
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -53,20 +53,6 @@ jobs:
      BUILD_TAG: ${{ inputs.build-tag }}

    steps:
-      - name: Fix git ownership
-        run: |
-          # Workaround for `fatal: detected dubious ownership in repository at ...`
-          #
-          # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
-          #   Ref https://github.com/actions/checkout/issues/785
-          #
-          git config --global --add safe.directory ${{ github.workspace }}
-          git config --global --add safe.directory ${GITHUB_WORKSPACE}
-          for r in 14 15 16 17; do
-            git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
-            git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
-          done
-
      - uses: actions/checkout@v4
        with:
          submodules: true
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -671,6 +671,10 @@ jobs:
        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      options: --init

+    # Increase timeout to 12h, default timeout is 6h
+    # we have regression in clickbench causing it to run 2-3x longer
+    timeout-minutes: 720
+
    steps:
    - uses: actions/checkout@v4

@@ -716,7 +720,7 @@ jobs:
        test_selection: performance/test_perf_olap.py
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
-        extra_params: -m remote_cluster --timeout 21600 -k test_clickbench
+        extra_params: -m remote_cluster --timeout 43200 -k test_clickbench
        pg_version: ${{ env.DEFAULT_PG_VERSION }}
      env:
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -839,6 +839,7 @@ jobs:
      - name: Build vm image
        run: |
          ./vm-builder \
+            -size=2G \
            -spec=compute/vm-image-spec-${{ matrix.version.debian }}.yaml \
            -src=neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \
            -dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}
@@ -1078,20 +1079,6 @@ jobs:
    runs-on: [ self-hosted, small ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
    steps:
-      - name: Fix git ownership
-        run: |
-          # Workaround for `fatal: detected dubious ownership in repository at ...`
-          #
-          # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
-          #   Ref https://github.com/actions/checkout/issues/785
-          #
-          git config --global --add safe.directory ${{ github.workspace }}
-          git config --global --add safe.directory ${GITHUB_WORKSPACE}
-          for r in 14 15 16 17; do
-            git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
-            git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
-          done
-
      - uses: actions/checkout@v4

      - name: Trigger deploy workflow
@@ -1130,7 +1117,10 @@ jobs:

            gh workflow --repo neondatabase/infra run deploy-proxy-prod.yml --ref main \
              -f deployPgSniRouter=true \
-              -f deployProxy=true \
+              -f deployProxyLink=true \
+              -f deployPrivatelinkProxy=true \
+              -f deployProxyScram=true \
+              -f deployProxyAuthBroker=true \
              -f branch=main \
              -f dockerTag=${{needs.tag.outputs.build-tag}}
          else
--- a/.gitignore
+++ b/.gitignore
@@ -6,6 +6,8 @@ __pycache__/
 test_output/
 .vscode
 .idea
+*.swp
+tags
 neon.iml
 /.neon
 /integration_tests/.neon
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3749,6 +3749,7 @@ dependencies = [
 "tracing",
 "url",
 "utils",
+ "wal_decoder",
 "walkdir",
 "workspace_hack",
 ]
@@ -4186,6 +4187,7 @@ dependencies = [
 "regex",
 "serde",
 "thiserror",
+ "tracing",
 "utils",
 ]

@@ -6272,7 +6274,7 @@ dependencies = [
 [[package]]
 name = "tokio-epoll-uring"
 version = "0.1.0"
-source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#08ccfa94ff5507727bf4d8d006666b5b192e04c6"
+source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#33e00106a268644d02ba0461bbd64476073b0ee1"
 dependencies = [
 "futures",
 "nix 0.26.4",
@@ -6788,7 +6790,7 @@ dependencies = [
 [[package]]
 name = "uring-common"
 version = "0.1.0"
-source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#08ccfa94ff5507727bf4d8d006666b5b192e04c6"
+source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#33e00106a268644d02ba0461bbd64476073b0ee1"
 dependencies = [
 "bytes",
 "io-uring",
@@ -6954,6 +6956,20 @@ dependencies = [
 "utils",
 ]

+[[package]]
+name = "wal_decoder"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "bytes",
+ "pageserver_api",
+ "postgres_ffi",
+ "serde",
+ "tracing",
+ "utils",
+ "workspace_hack",
+]
+
 [[package]]
 name = "walkdir"
 version = "2.3.3"
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -33,6 +33,7 @@ members = [
    "libs/postgres_ffi/wal_craft",
    "libs/vm_monitor",
    "libs/walproposer",
+    "libs/wal_decoder",
 ]

 [workspace.package]
@@ -238,6 +239,7 @@ tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
 utils = { version = "0.1", path = "./libs/utils/" }
 vm_monitor = { version = "0.1", path = "./libs/vm_monitor/" }
 walproposer = { version = "0.1", path = "./libs/walproposer/" }
+wal_decoder = { version = "0.1", path = "./libs/wal_decoder" }

 ## Common library dependency
 workspace_hack = { version = "0.1", path = "./workspace_hack/" }
--- a/Makefile
+++ b/Makefile
@@ -297,7 +297,7 @@ clean: postgres-clean neon-pg-clean-ext
 # This removes everything
 .PHONY: distclean
 distclean:
-	rm -rf $(POSTGRES_INSTALL_DIR)
+	$(RM) -r $(POSTGRES_INSTALL_DIR)
 	$(CARGO_CMD_PREFIX) cargo clean

 .PHONY: fmt
@@ -329,7 +329,7 @@ postgres-%-pgindent: postgres-%-pg-bsd-indent postgres-%-typedefs.list
 		$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/pgindent --typedefs postgres-$*-typedefs-full.list \
 		$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/ \
 		--excludes $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/exclude_file_patterns
-	rm -f pg*.BAK
+	$(RM) pg*.BAK

 # Indent pxgn/neon.
 .PHONY: neon-pgindent
--- a/build-tools.Dockerfile
+++ b/build-tools.Dockerfile
@@ -57,6 +57,18 @@ RUN set -e \
        zstd \
    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

+# sql_exporter
+
+# Keep the version the same as in compute/compute-node.Dockerfile and
+# test_runner/regress/test_compute_metrics.py.
+ENV SQL_EXPORTER_VERSION=0.13.1
+RUN curl -fsSL \
+    "https://github.com/burningalchemist/sql_exporter/releases/download/${SQL_EXPORTER_VERSION}/sql_exporter-${SQL_EXPORTER_VERSION}.linux-$(case "$(uname -m)" in x86_64) echo amd64;; aarch64) echo arm64;; esac).tar.gz" \
+    --output sql_exporter.tar.gz \
+    && mkdir /tmp/sql_exporter \
+    && tar xzvf sql_exporter.tar.gz -C /tmp/sql_exporter --strip-components=1 \
+    && mv /tmp/sql_exporter/sql_exporter /usr/local/bin/sql_exporter
+
 # protobuf-compiler (protoc)
 ENV PROTOC_VERSION=25.1
 RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \
--- a/compute/Makefile
+++ b/compute/Makefile
@@ -22,6 +22,7 @@ sql_exporter.yml: $(jsonnet_files)
 		--output-file etc/$@ \
 		--tla-str collector_name=neon_collector \
 		--tla-str collector_file=neon_collector.yml \
+		--tla-str 'connection_string=postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter' \
 		etc/sql_exporter.jsonnet

 sql_exporter_autoscaling.yml: $(jsonnet_files)
@@ -29,12 +30,12 @@ sql_exporter_autoscaling.yml: $(jsonnet_files)
 		--output-file etc/$@ \
 		--tla-str collector_name=neon_collector_autoscaling \
 		--tla-str collector_file=neon_collector_autoscaling.yml \
-		--tla-str application_name=sql_exporter_autoscaling \
+		--tla-str 'connection_string=postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter_autoscaling' \
 		etc/sql_exporter.jsonnet

 .PHONY: clean
 clean:
-	rm -f \
+	$(RM) \
 		etc/neon_collector.yml \
 		etc/neon_collector_autoscaling.yml \
 		etc/sql_exporter.yml \
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -666,7 +666,7 @@ RUN apt-get update && \
 #
 # Use new version only for v17
 # because Release_2024_09_1 has some backward incompatible changes
-# https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1 
+# https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1
 ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
 RUN case "${PG_VERSION}" in \
    "v17") \
@@ -860,18 +860,98 @@ ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH"
 USER nonroot
 WORKDIR /home/nonroot

-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 is not supported yet by pgrx. Quit" && exit 0;; \
-    esac && \
-    curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
+RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
    chmod +x rustup-init && \
    ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
    rm rustup-init && \
+    case "${PG_VERSION}" in \
+        'v17') \
+            echo 'v17 is not supported yet by pgrx. Quit' && exit 0;; \
+    esac && \
    cargo install --locked --version 0.11.3 cargo-pgrx && \
    /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'

 USER root

+#########################################################################################
+#
+# Layer "rust extensions pgrx12"
+#
+# pgrx started to support Postgres 17 since version 12,
+# but some older extension aren't compatible with it.
+# This layer should be used as a base for new pgrx extensions,
+# and eventually get merged with `rust-extensions-build`
+#
+#########################################################################################
+FROM build-deps AS rust-extensions-build-pgrx12
+ARG PG_VERSION
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
+RUN apt-get update && \
+    apt-get install --no-install-recommends -y curl libclang-dev && \
+    useradd -ms /bin/bash nonroot -b /home
+
+ENV HOME=/home/nonroot
+ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH"
+USER nonroot
+WORKDIR /home/nonroot
+
+RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
+    chmod +x rustup-init && \
+    ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
+    rm rustup-init && \
+    cargo install --locked --version 0.12.6 cargo-pgrx && \
+    /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'
+
+USER root
+
+#########################################################################################
+#
+# Layers "pg-onnx-build" and "pgrag-pg-build"
+# Compile "pgrag" extensions
+#
+#########################################################################################
+
+FROM rust-extensions-build-pgrx12 AS pg-onnx-build
+
+# cmake 3.26 or higher is required, so installing it using pip (bullseye-backports has cmake 3.25).
+# Install it using virtual environment, because Python 3.11 (the default version on Debian 12 (Bookworm)) complains otherwise
+RUN apt-get update && apt-get install -y python3 python3-pip python3-venv && \
+    python3 -m venv venv && \
+    . venv/bin/activate && \
+    python3 -m pip install cmake==3.30.5 && \
+    wget https://github.com/microsoft/onnxruntime/archive/refs/tags/v1.18.1.tar.gz -O onnxruntime.tar.gz && \
+    mkdir onnxruntime-src && cd onnxruntime-src && tar xzf ../onnxruntime.tar.gz --strip-components=1 -C . && \
+    ./build.sh --config Release --parallel --skip_submodule_sync --skip_tests --allow_running_as_root
+
+
+FROM pg-onnx-build AS pgrag-pg-build
+
+RUN apt-get install -y protobuf-compiler && \
+    wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.0.0.tar.gz -O pgrag.tar.gz &&  \
+    echo "2cbe394c1e74fc8bcad9b52d5fbbfb783aef834ca3ce44626cfd770573700bb4 pgrag.tar.gz" | sha256sum --check && \
+    mkdir pgrag-src && cd pgrag-src && tar xzf ../pgrag.tar.gz --strip-components=1 -C . && \
+    \
+    cd exts/rag && \
+    sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
+    cargo pgrx install --release && \
+    echo "trusted = true" >> /usr/local/pgsql/share/extension/rag.control && \
+    \
+    cd ../rag_bge_small_en_v15 && \
+    sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
+    ORT_LIB_LOCATION=/home/nonroot/onnxruntime-src/build/Linux \
+        REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/bge_small_en_v15.onnx \
+        cargo pgrx install --release --features remote_onnx && \
+    echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_bge_small_en_v15.control && \
+    \
+    cd ../rag_jina_reranker_v1_tiny_en && \
+    sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
+    ORT_LIB_LOCATION=/home/nonroot/onnxruntime-src/build/Linux \
+        REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/jina_reranker_v1_tiny_en.onnx \
+        cargo pgrx install --release --features remote_onnx && \
+    echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_jina_reranker_v1_tiny_en.control
+
+
 #########################################################################################
 #
 # Layer "pg-jsonschema-pg-build"
@@ -1041,6 +1121,34 @@ RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_partman.control

+#########################################################################################
+#
+# Layer "pg_mooncake"
+# compile pg_mooncake extension
+#
+#########################################################################################
+FROM rust-extensions-build AS pg-mooncake-build
+ARG PG_VERSION
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
+# The topmost commit in the `neon` branch at the time of writing this (the clone below is not shallow, so this pin stays reachable after the branch advances)
+# https://github.com/Mooncake-Labs/pg_mooncake/commits/neon/
+# https://github.com/Mooncake-Labs/pg_mooncake/commit/568b5a82b5fc16136bdf4ca5aac3e0cc261ab48d
+ENV PG_MOONCAKE_VERSION=568b5a82b5fc16136bdf4ca5aac3e0cc261ab48d
+ENV PATH="/usr/local/pgsql/bin/:$PATH"
+
+RUN case "${PG_VERSION}" in \
+        'v14') \
+            echo "pg_mooncake is not supported on Postgres ${PG_VERSION}" && exit 0;; \
+    esac && \
+    git clone --single-branch --branch neon https://github.com/Mooncake-Labs/pg_mooncake.git pg_mooncake-src && \
+    cd pg_mooncake-src && \
+    git checkout "${PG_MOONCAKE_VERSION}" && \
+    git submodule update --init --depth 1 --recursive && \
+    make BUILD_TYPE=release -j $(getconf _NPROCESSORS_ONLN) && \
+    make BUILD_TYPE=release -j $(getconf _NPROCESSORS_ONLN) install && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_mooncake.control
+
#########################################################################################
#
# Layer "neon-pg-ext-build"
@@ -1059,6 +1167,7 @@ COPY --from=h3-pg-build /h3/usr /
 COPY --from=unit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=vector-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pgjwt-pg-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=pgrag-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-jsonschema-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-graphql-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-tiktoken-pg-build /usr/local/pgsql/ /usr/local/pgsql/
@@ -1084,6 +1193,7 @@ COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql
 COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-partman-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=pg-mooncake-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY pgxn/ pgxn/

 RUN make -j $(getconf _NPROCESSORS_ONLN) \
@@ -1191,7 +1301,10 @@ RUN mold -run cargo build --locked --profile release-line-debug-size-lto --bin l
 #########################################################################################

 FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.1 AS postgres-exporter
-FROM burningalchemist/sql_exporter:0.13 AS sql-exporter
+
+# Keep the version the same as in build-tools.Dockerfile and
+# test_runner/regress/test_compute_metrics.py.
+FROM burningalchemist/sql_exporter:0.13.1 AS sql-exporter

 #########################################################################################
 #
@@ -1247,6 +1360,7 @@ COPY --from=unit-pg-build /postgresql-unit.tar.gz /ext-src/
 COPY --from=vector-pg-build /pgvector.tar.gz /ext-src/
 COPY --from=vector-pg-build /pgvector.patch /ext-src/
 COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src
+#COPY --from=pgrag-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 #COPY --from=pg-jsonschema-pg-build /home/nonroot/pg_jsonschema.tar.gz /ext-src
 #COPY --from=pg-graphql-pg-build /home/nonroot/pg_graphql.tar.gz /ext-src
 #COPY --from=pg-tiktoken-pg-build /home/nonroot/pg_tiktoken.tar.gz /ext-src
--- a/compute/etc/neon_collector.jsonnet
+++ b/compute/etc/neon_collector.jsonnet
@@ -3,6 +3,7 @@
  metrics: [
    import 'sql_exporter/checkpoints_req.libsonnet',
    import 'sql_exporter/checkpoints_timed.libsonnet',
+    import 'sql_exporter/compute_backpressure_throttling_ms.libsonnet',
    import 'sql_exporter/compute_current_lsn.libsonnet',
    import 'sql_exporter/compute_logical_snapshot_files.libsonnet',
    import 'sql_exporter/compute_receive_lsn.libsonnet',
--- a/compute/etc/sql_exporter.jsonnet
+++ b/compute/etc/sql_exporter.jsonnet
@@ -1,4 +1,4 @@
-function(collector_name, collector_file, application_name='sql_exporter') {
+function(collector_name, collector_file, connection_string) {
  // Configuration for sql_exporter for autoscaling-agent
  // Global defaults.
  global: {
@@ -23,7 +23,7 @@ function(collector_name, collector_file, application_name='sql_exporter') {
  target: {
    // Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL)
    // the schema gets dropped or replaced to match the driver expected DSN format.
-    data_source_name: std.format('postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=%s', [application_name]),
+    data_source_name: connection_string,

    // Collectors (referenced by name) to execute on the target.
    // Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
--- a/compute/etc/sql_exporter/checkpoints_timed.libsonnet
+++ b/compute/etc/sql_exporter/checkpoints_timed.libsonnet
@@ -1,7 +1,7 @@
 local neon = import 'neon.libsonnet';

-local pg_stat_bgwriter = importstr 'sql_exporter/checkpoints_req.sql';
-local pg_stat_checkpointer = importstr 'sql_exporter/checkpoints_req.17.sql';
+local pg_stat_bgwriter = importstr 'sql_exporter/checkpoints_timed.sql';
+local pg_stat_checkpointer = importstr 'sql_exporter/checkpoints_timed.17.sql';

 {
  metric_name: 'checkpoints_timed',
--- a/compute/etc/sql_exporter/compute_backpressure_throttling_ms.libsonnet
+++ b/compute/etc/sql_exporter/compute_backpressure_throttling_ms.libsonnet
@@ -0,0 +1,10 @@
+{
+  metric_name: 'compute_backpressure_throttling_ms',
+  type: 'gauge',
+  help: 'Time compute has spent throttled',
+  key_labels: null,
+  values: [
+    'throttled',
+  ],
+  query: importstr 'sql_exporter/compute_backpressure_throttling_ms.sql',
+}
--- a/compute/etc/sql_exporter/compute_backpressure_throttling_ms.sql
+++ b/compute/etc/sql_exporter/compute_backpressure_throttling_ms.sql
@@ -0,0 +1 @@
+SELECT neon.backpressure_throttling_time() AS throttled;
--- a/compute/etc/sql_exporter/retained_wal.sql
+++ b/compute/etc/sql_exporter/retained_wal.sql
@@ -1,5 +1,10 @@
 SELECT
  slot_name,
-  pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::FLOAT8 AS retained_wal
+  pg_wal_lsn_diff(
+    CASE
+      WHEN pg_is_in_recovery() THEN pg_last_wal_replay_lsn()
+      ELSE pg_current_wal_lsn()
+    END,
+    restart_lsn)::FLOAT8 AS retained_wal
 FROM pg_replication_slots
 WHERE active = false;
--- a/compute/vm-image-spec-bookworm.yaml
+++ b/compute/vm-image-spec-bookworm.yaml
@@ -18,7 +18,9 @@ commands:
  - name: pgbouncer
    user: postgres
    sysvInitAction: respawn
-    shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini'
+    # Point stdout at the log device first and only then duplicate stderr onto
+    # it; with `2>&1` written first, stderr would stay on the original stdout.
+    shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini > /dev/virtio-ports/tech.neon.log.0 2>&1'
  - name: local_proxy
    user: postgres
    sysvInitAction: respawn
--- a/compute/vm-image-spec-bullseye.yaml
+++ b/compute/vm-image-spec-bullseye.yaml
@@ -18,7 +18,9 @@ commands:
  - name: pgbouncer
    user: postgres
    sysvInitAction: respawn
-    shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini'
+    # Point stdout at the log device first and only then duplicate stderr onto
+    # it; with `2>&1` written first, stderr would stay on the original stdout.
+    shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini > /dev/virtio-ports/tech.neon.log.0 2>&1'
  - name: local_proxy
    user: postgres
    sysvInitAction: respawn
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1,7 +1,6 @@
 use std::collections::HashMap;
 use std::env;
 use std::fs;
-use std::io::BufRead;
 use std::os::unix::fs::{symlink, PermissionsExt};
 use std::path::Path;
 use std::process::{Command, Stdio};
@@ -365,8 +364,7 @@ impl ComputeNode {
        let pageserver_connect_micros = start_time.elapsed().as_micros() as u64;

        let basebackup_cmd = match lsn {
-            // HACK We don't use compression on first start (Lsn(0)) because there's no API for it
-            Lsn(0) => format!("basebackup {} {}", spec.tenant_id, spec.timeline_id),
+            Lsn(0) => format!("basebackup {} {} --gzip", spec.tenant_id, spec.timeline_id),
            _ => format!(
                "basebackup {} {} {} --gzip",
                spec.tenant_id, spec.timeline_id, lsn
@@ -375,38 +373,16 @@ impl ComputeNode {

        let copyreader = client.copy_out(basebackup_cmd.as_str())?;
        let mut measured_reader = MeasuredReader::new(copyreader);
-
-        // Check the magic number to see if it's a gzip or not. Even though
-        // we might explicitly ask for gzip, an old pageserver with no implementation
-        // of gzip compression might send us uncompressed data. After some time
-        // passes we can assume all pageservers know how to compress and we can
-        // delete this check.
-        //
-        // If the data is not gzip, it will be tar. It will not be mistakenly
-        // recognized as gzip because tar starts with an ascii encoding of a filename,
-        // and 0x1f and 0x8b are unlikely first characters for any filename. Moreover,
-        // we send the "global" directory first from the pageserver, so it definitely
-        // won't be recognized as gzip.
        let mut bufreader = std::io::BufReader::new(&mut measured_reader);
-        let gzip = {
-            let peek = bufreader.fill_buf().unwrap();
-            peek[0] == 0x1f && peek[1] == 0x8b
-        };

        // Read the archive directly from the `CopyOutReader`
        //
        // Set `ignore_zeros` so that unpack() reads all the Copy data and
        // doesn't stop at the end-of-archive marker. Otherwise, if the server
        // sends an Error after finishing the tarball, we will not notice it.
-        if gzip {
-            let mut ar = tar::Archive::new(flate2::read::GzDecoder::new(&mut bufreader));
-            ar.set_ignore_zeros(true);
-            ar.unpack(&self.pgdata)?;
-        } else {
-            let mut ar = tar::Archive::new(&mut bufreader);
-            ar.set_ignore_zeros(true);
-            ar.unpack(&self.pgdata)?;
-        };
+        let mut ar = tar::Archive::new(flate2::read::GzDecoder::new(&mut bufreader));
+        ar.set_ignore_zeros(true);
+        ar.unpack(&self.pgdata)?;

        // Report metrics
        let mut state = self.state.lock().unwrap();
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -1073,10 +1073,10 @@ async fn handle_tenant(subcmd: &TenantCmd, env: &mut local_env::LocalEnv) -> any
                    tenant_id,
                    TimelineCreateRequest {
                        new_timeline_id,
-                        ancestor_timeline_id: None,
-                        ancestor_start_lsn: None,
-                        existing_initdb_timeline_id: None,
-                        pg_version: Some(args.pg_version),
+                        mode: pageserver_api::models::TimelineCreateRequestMode::Bootstrap {
+                            existing_initdb_timeline_id: None,
+                            pg_version: Some(args.pg_version),
+                        },
                    },
                )
                .await?;
@@ -1133,10 +1133,10 @@ async fn handle_timeline(cmd: &TimelineCmd, env: &mut local_env::LocalEnv) -> Re
            let storage_controller = StorageController::from_env(env);
            let create_req = TimelineCreateRequest {
                new_timeline_id,
-                ancestor_timeline_id: None,
-                existing_initdb_timeline_id: None,
-                ancestor_start_lsn: None,
-                pg_version: Some(args.pg_version),
+                mode: pageserver_api::models::TimelineCreateRequestMode::Bootstrap {
+                    existing_initdb_timeline_id: None,
+                    pg_version: Some(args.pg_version),
+                },
            };
            let timeline_info = storage_controller
                .tenant_timeline_create(tenant_id, create_req)
@@ -1189,10 +1189,11 @@ async fn handle_timeline(cmd: &TimelineCmd, env: &mut local_env::LocalEnv) -> Re
            let storage_controller = StorageController::from_env(env);
            let create_req = TimelineCreateRequest {
                new_timeline_id,
-                ancestor_timeline_id: Some(ancestor_timeline_id),
-                existing_initdb_timeline_id: None,
-                ancestor_start_lsn: start_lsn,
-                pg_version: None,
+                mode: pageserver_api::models::TimelineCreateRequestMode::Branch {
+                    ancestor_timeline_id,
+                    ancestor_start_lsn: start_lsn,
+                    pg_version: None,
+                },
            };
            let timeline_info = storage_controller
                .tenant_timeline_create(tenant_id, create_req)
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -17,7 +17,7 @@ use std::time::Duration;

 use anyhow::{bail, Context};
 use camino::Utf8PathBuf;
-use pageserver_api::models::{self, AuxFilePolicy, TenantInfo, TimelineInfo};
+use pageserver_api::models::{self, TenantInfo, TimelineInfo};
 use pageserver_api::shard::TenantShardId;
 use pageserver_client::mgmt_api;
 use postgres_backend::AuthType;
@@ -399,11 +399,6 @@ impl PageServerNode {
                .map(serde_json::from_str)
                .transpose()
                .context("parse `timeline_get_throttle` from json")?,
-            switch_aux_file_policy: settings
-                .remove("switch_aux_file_policy")
-                .map(|x| x.parse::<AuxFilePolicy>())
-                .transpose()
-                .context("Failed to parse 'switch_aux_file_policy'")?,
            lsn_lease_length: settings.remove("lsn_lease_length").map(|x| x.to_string()),
            lsn_lease_length_for_ts: settings
                .remove("lsn_lease_length_for_ts")
@@ -499,11 +494,6 @@ impl PageServerNode {
                    .map(serde_json::from_str)
                    .transpose()
                    .context("parse `timeline_get_throttle` from json")?,
-                switch_aux_file_policy: settings
-                    .remove("switch_aux_file_policy")
-                    .map(|x| x.parse::<AuxFilePolicy>())
-                    .transpose()
-                    .context("Failed to parse 'switch_aux_file_policy'")?,
                lsn_lease_length: settings.remove("lsn_lease_length").map(|x| x.to_string()),
                lsn_lease_length_for_ts: settings
                    .remove("lsn_lease_length_for_ts")
@@ -529,28 +519,6 @@ impl PageServerNode {
        Ok(self.http_client.list_timelines(*tenant_shard_id).await?)
    }

-    pub async fn timeline_create(
-        &self,
-        tenant_shard_id: TenantShardId,
-        new_timeline_id: TimelineId,
-        ancestor_start_lsn: Option<Lsn>,
-        ancestor_timeline_id: Option<TimelineId>,
-        pg_version: Option<u32>,
-        existing_initdb_timeline_id: Option<TimelineId>,
-    ) -> anyhow::Result<TimelineInfo> {
-        let req = models::TimelineCreateRequest {
-            new_timeline_id,
-            ancestor_start_lsn,
-            ancestor_timeline_id,
-            pg_version,
-            existing_initdb_timeline_id,
-        };
-        Ok(self
-            .http_client
-            .timeline_create(tenant_shard_id, &req)
-            .await?)
-    }
-
    /// Import a basebackup prepared using either:
    /// a) `pg_basebackup -F tar`, or
    /// b) The `fullbackup` pageserver endpoint
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -20,7 +20,16 @@ use pageserver_client::mgmt_api::ResponseErrorMessageExt;
 use postgres_backend::AuthType;
 use reqwest::Method;
 use serde::{de::DeserializeOwned, Deserialize, Serialize};
-use std::{fs, net::SocketAddr, path::PathBuf, str::FromStr, sync::OnceLock};
+use std::{
+    ffi::OsStr,
+    fs,
+    net::SocketAddr,
+    path::PathBuf,
+    process::ExitStatus,
+    str::FromStr,
+    sync::OnceLock,
+    time::{Duration, Instant},
+};
 use tokio::process::Command;
 use tracing::instrument;
 use url::Url;
@@ -168,16 +177,6 @@ impl StorageController {
        .expect("non-Unicode path")
    }

-    /// PIDFile for the postgres instance used to store storage controller state
-    fn postgres_pid_file(&self) -> Utf8PathBuf {
-        Utf8PathBuf::from_path_buf(
-            self.env
-                .base_data_dir
-                .join("storage_controller_postgres.pid"),
-        )
-        .expect("non-Unicode path")
-    }
-
    /// Find the directory containing postgres subdirectories, such `bin` and `lib`
    ///
    /// This usually uses STORAGE_CONTROLLER_POSTGRES_VERSION of postgres, but will fall back
@@ -296,6 +295,31 @@ impl StorageController {
            .map_err(anyhow::Error::new)
    }

+    /// Runs `pg_ctl` with `args` as a short-lived child process and returns its exit status.
+    /// Panics if the postgres directories cannot be resolved or `pg_ctl` cannot be spawned or awaited.
+    async fn pg_ctl<I, S>(&self, args: I) -> ExitStatus
+    where
+        I: IntoIterator<Item = S>,
+        S: AsRef<OsStr>,
+    {
+        let pg_ctl_path = self.get_pg_bin_dir().await.unwrap().join("pg_ctl");
+        let lib_dir = self.get_pg_lib_dir().await.unwrap().to_string();
+        let library_env = [
+            ("LD_LIBRARY_PATH".to_owned(), lib_dir.clone()),
+            ("DYLD_LIBRARY_PATH".to_owned(), lib_dir),
+        ];
+
+        let mut child = Command::new(pg_ctl_path)
+            .args(args)
+            .envs(library_env)
+            .spawn()
+            .expect("Failed to spawn pg_ctl, binary_missing?");
+        child
+            .wait()
+            .await
+            .expect("Failed to wait for pg_ctl termination")
+    }
+
    pub async fn start(&self, start_args: NeonStorageControllerStartArgs) -> anyhow::Result<()> {
        let instance_dir = self.storage_controller_instance_dir(start_args.instance_id);
        if let Err(err) = tokio::fs::create_dir(&instance_dir).await {
@@ -404,20 +428,34 @@ impl StorageController {
                db_start_args
            );

-            background_process::start_process(
-                "storage_controller_db",
-                &self.env.base_data_dir,
-                pg_bin_dir.join("pg_ctl").as_std_path(),
-                db_start_args,
-                vec![
-                    ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
-                    ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
-                ],
-                background_process::InitialPidFile::Create(self.postgres_pid_file()),
-                &start_args.start_timeout,
-                || self.pg_isready(&pg_bin_dir, postgres_port),
-            )
-            .await?;
+            let db_start_status = self.pg_ctl(db_start_args).await;
+            let start_timeout: Duration = start_args.start_timeout.into();
+            let db_start_deadline = Instant::now() + start_timeout;
+            if !db_start_status.success() {
+                // `ExitStatus::code()` is `None` when pg_ctl was terminated by a signal, so
+                // report the status through its `Display` impl instead of unwrapping.
+                return Err(anyhow::anyhow!("Failed to start postgres: {db_start_status}"));
+            }
+
+            // pg_ctl has returned successfully: poll `pg_isready` until it reports ready
+            // or the start timeout (counted from pg_ctl's return) has elapsed.
+            loop {
+                if Instant::now() > db_start_deadline {
+                    return Err(anyhow::anyhow!("Timed out waiting for postgres to start"));
+                }
+
+                match self.pg_isready(&pg_bin_dir, postgres_port).await {
+                    Ok(true) => {
+                        tracing::info!("storage controller postgres is now ready");
+                        break;
+                    }
+                    Ok(false) => {}
+                    Err(e) => tracing::warn!("Failed to check postgres status: {e}"),
+                }
+                // Pause after every unsuccessful probe, errors included, so a persistently
+                // failing check neither spins nor floods the log until the deadline.
+                tokio::time::sleep(Duration::from_millis(100)).await;
+            }

            self.setup_database(postgres_port).await?;
        }
@@ -583,15 +621,10 @@ impl StorageController {
        }

        let pg_data_path = self.env.base_data_dir.join("storage_controller_db");
-        let pg_bin_dir = self.get_pg_bin_dir().await?;

        println!("Stopping storage controller database...");
        let pg_stop_args = ["-D", &pg_data_path.to_string_lossy(), "stop"];
-        let stop_status = Command::new(pg_bin_dir.join("pg_ctl"))
-            .args(pg_stop_args)
-            .spawn()?
-            .wait()
-            .await?;
+        let stop_status = self.pg_ctl(pg_stop_args).await;
        if !stop_status.success() {
            match self.is_postgres_running().await {
                Ok(false) => {
@@ -612,14 +645,9 @@ impl StorageController {

    async fn is_postgres_running(&self) -> anyhow::Result<bool> {
        let pg_data_path = self.env.base_data_dir.join("storage_controller_db");
-        let pg_bin_dir = self.get_pg_bin_dir().await?;

        let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"];
-        let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl"))
-            .args(pg_status_args)
-            .spawn()?
-            .wait()
-            .await?;
+        let status_exitcode = self.pg_ctl(pg_status_args).await;

        // pg_ctl status returns this exit code if postgres is not running: in this case it is
        // fine that stop failed.  Otherwise it is an error that stop failed.
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -111,6 +111,11 @@ enum Command {
        #[arg(long)]
        node: NodeId,
    },
+    /// Cancel any ongoing reconciliation for this shard
+    TenantShardCancelReconcile {
+        #[arg(long)]
+        tenant_shard_id: TenantShardId,
+    },
    /// Modify the pageserver tenant configuration of a tenant: this is the configuration structure
    /// that is passed through to pageservers, and does not affect storage controller behavior.
    TenantConfig {
@@ -535,6 +540,15 @@ async fn main() -> anyhow::Result<()> {
                )
                .await?;
        }
+        Command::TenantShardCancelReconcile { tenant_shard_id } => {
+            storcon_client
+                .dispatch::<(), ()>(
+                    Method::PUT,
+                    format!("control/v1/tenant/{tenant_shard_id}/cancel_reconcile"),
+                    None,
+                )
+                .await?;
+        }
        Command::TenantConfig { tenant_id, config } => {
            let tenant_conf = serde_json::from_str(&config)?;

--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -19,6 +19,7 @@ use once_cell::sync::Lazy;
 use prometheus::core::{
    Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec,
 };
+pub use prometheus::local::LocalHistogram;
 pub use prometheus::opts;
 pub use prometheus::register;
 pub use prometheus::Error;
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -250,12 +250,6 @@ pub struct TenantConfigToml {
    // Expresed in multiples of checkpoint distance.
    pub image_layer_creation_check_threshold: u8,

-    /// Switch to a new aux file policy. Switching this flag requires the user has not written any aux file into
-    /// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions.
-    /// There is a `last_aux_file_policy` flag which gets persisted in `index_part.json` once the first aux
-    /// file is written.
-    pub switch_aux_file_policy: crate::models::AuxFilePolicy,
-
    /// The length for an explicit LSN lease request.
    /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval.
    #[serde(with = "humantime_serde")]
@@ -475,7 +469,6 @@ impl Default for TenantConfigToml {
            lazy_slru_download: false,
            timeline_get_throttle: crate::models::ThrottleConfig::disabled(),
            image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
-            switch_aux_file_policy: crate::models::AuxFilePolicy::default_tenant_config(),
            lsn_lease_length: LsnLease::DEFAULT_LENGTH,
            lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS,
        }
--- a/libs/pageserver_api/src/lib.rs
+++ b/libs/pageserver_api/src/lib.rs
@@ -5,9 +5,11 @@ pub mod controller_api;
 pub mod key;
 pub mod keyspace;
 pub mod models;
+pub mod record;
 pub mod reltag;
 pub mod shard;
 /// Public API types
 pub mod upcall_api;
+pub mod value;

 pub mod config;
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -10,7 +10,6 @@ use std::{
    io::{BufRead, Read},
    num::{NonZeroU32, NonZeroU64, NonZeroUsize},
    str::FromStr,
-    sync::atomic::AtomicUsize,
    time::{Duration, SystemTime},
 };

@@ -211,13 +210,30 @@ pub enum TimelineState {
 #[derive(Serialize, Deserialize, Clone)]
 pub struct TimelineCreateRequest {
    pub new_timeline_id: TimelineId,
-    #[serde(default)]
-    pub ancestor_timeline_id: Option<TimelineId>,
-    #[serde(default)]
-    pub existing_initdb_timeline_id: Option<TimelineId>,
-    #[serde(default)]
-    pub ancestor_start_lsn: Option<Lsn>,
-    pub pg_version: Option<u32>,
+    #[serde(flatten)]
+    pub mode: TimelineCreateRequestMode,
+}
+
+#[derive(Serialize, Deserialize, Clone)]
+#[serde(untagged)]
+pub enum TimelineCreateRequestMode {
+    Branch {
+        ancestor_timeline_id: TimelineId,
+        #[serde(default)]
+        ancestor_start_lsn: Option<Lsn>,
+        // TODO: cplane sets this, but, the branching code always
+        // inherits the ancestor's pg_version. Earlier code wasn't
+        // using a flattened enum, so, it was an accepted field, and
+        // we continue to accept it by having it here.
+        pg_version: Option<u32>,
+    },
+    // NB: Bootstrap is all-optional, and thus the serde(untagged) will cause serde to stop at Bootstrap.
+    // (serde picks the first matching enum variant, in declaration order).
+    Bootstrap {
+        #[serde(default)]
+        existing_initdb_timeline_id: Option<TimelineId>,
+        pg_version: Option<u32>,
+    },
 }

 #[derive(Serialize, Deserialize, Clone)]
@@ -292,7 +308,6 @@ pub struct TenantConfig {
    pub lazy_slru_download: Option<bool>,
    pub timeline_get_throttle: Option<ThrottleConfig>,
    pub image_layer_creation_check_threshold: Option<u8>,
-    pub switch_aux_file_policy: Option<AuxFilePolicy>,
    pub lsn_lease_length: Option<String>,
    pub lsn_lease_length_for_ts: Option<String>,
 }
@@ -333,68 +348,6 @@ pub enum AuxFilePolicy {
    CrossValidation,
 }

-impl AuxFilePolicy {
-    pub fn is_valid_migration_path(from: Option<Self>, to: Self) -> bool {
-        matches!(
-            (from, to),
-            (None, _) | (Some(AuxFilePolicy::CrossValidation), AuxFilePolicy::V2)
-        )
-    }
-
-    /// If a tenant writes aux files without setting `switch_aux_policy`, this value will be used.
-    pub fn default_tenant_config() -> Self {
-        Self::V2
-    }
-}
-
-/// The aux file policy memory flag. Users can store `Option<AuxFilePolicy>` into this atomic flag. 0 == unspecified.
-pub struct AtomicAuxFilePolicy(AtomicUsize);
-
-impl AtomicAuxFilePolicy {
-    pub fn new(policy: Option<AuxFilePolicy>) -> Self {
-        Self(AtomicUsize::new(
-            policy.map(AuxFilePolicy::to_usize).unwrap_or_default(),
-        ))
-    }
-
-    pub fn load(&self) -> Option<AuxFilePolicy> {
-        match self.0.load(std::sync::atomic::Ordering::Acquire) {
-            0 => None,
-            other => Some(AuxFilePolicy::from_usize(other)),
-        }
-    }
-
-    pub fn store(&self, policy: Option<AuxFilePolicy>) {
-        self.0.store(
-            policy.map(AuxFilePolicy::to_usize).unwrap_or_default(),
-            std::sync::atomic::Ordering::Release,
-        );
-    }
-}
-
-impl AuxFilePolicy {
-    pub fn to_usize(self) -> usize {
-        match self {
-            Self::V1 => 1,
-            Self::CrossValidation => 2,
-            Self::V2 => 3,
-        }
-    }
-
-    pub fn try_from_usize(this: usize) -> Option<Self> {
-        match this {
-            1 => Some(Self::V1),
-            2 => Some(Self::CrossValidation),
-            3 => Some(Self::V2),
-            _ => None,
-        }
-    }
-
-    pub fn from_usize(this: usize) -> Self {
-        Self::try_from_usize(this).unwrap()
-    }
-}
-
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
 #[serde(tag = "kind")]
 pub enum EvictionPolicy {
@@ -699,6 +652,8 @@ pub struct OffloadedTimelineInfo {
    pub ancestor_timeline_id: Option<TimelineId>,
    /// Whether to retain the branch lsn at the ancestor or not
    pub ancestor_retain_lsn: Option<Lsn>,
+    /// The time point when the timeline was archived
+    pub archived_at: chrono::DateTime<chrono::Utc>,
 }

 /// This represents the output of the "timeline_detail" and "timeline_list" API calls.
@@ -1049,6 +1004,12 @@ pub mod virtual_file {
    }
 }

+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ScanDisposableKeysResponse {
+    pub disposable_count: usize,
+    pub not_disposable_count: usize,
+}
+
 // Wrapped in libpq CopyData
 #[derive(PartialEq, Eq, Debug)]
 pub enum PagestreamFeMessage {
@@ -1608,71 +1569,6 @@ mod tests {
        }
    }

-    #[test]
-    fn test_aux_file_migration_path() {
-        assert!(AuxFilePolicy::is_valid_migration_path(
-            None,
-            AuxFilePolicy::V1
-        ));
-        assert!(AuxFilePolicy::is_valid_migration_path(
-            None,
-            AuxFilePolicy::V2
-        ));
-        assert!(AuxFilePolicy::is_valid_migration_path(
-            None,
-            AuxFilePolicy::CrossValidation
-        ));
-        // Self-migration is not a valid migration path, and the caller should handle it by itself.
-        assert!(!AuxFilePolicy::is_valid_migration_path(
-            Some(AuxFilePolicy::V1),
-            AuxFilePolicy::V1
-        ));
-        assert!(!AuxFilePolicy::is_valid_migration_path(
-            Some(AuxFilePolicy::V2),
-            AuxFilePolicy::V2
-        ));
-        assert!(!AuxFilePolicy::is_valid_migration_path(
-            Some(AuxFilePolicy::CrossValidation),
-            AuxFilePolicy::CrossValidation
-        ));
-        // Migrations not allowed
-        assert!(!AuxFilePolicy::is_valid_migration_path(
-            Some(AuxFilePolicy::CrossValidation),
-            AuxFilePolicy::V1
-        ));
-        assert!(!AuxFilePolicy::is_valid_migration_path(
-            Some(AuxFilePolicy::V1),
-            AuxFilePolicy::V2
-        ));
-        assert!(!AuxFilePolicy::is_valid_migration_path(
-            Some(AuxFilePolicy::V2),
-            AuxFilePolicy::V1
-        ));
-        assert!(!AuxFilePolicy::is_valid_migration_path(
-            Some(AuxFilePolicy::V2),
-            AuxFilePolicy::CrossValidation
-        ));
-        assert!(!AuxFilePolicy::is_valid_migration_path(
-            Some(AuxFilePolicy::V1),
-            AuxFilePolicy::CrossValidation
-        ));
-        // Migrations allowed
-        assert!(AuxFilePolicy::is_valid_migration_path(
-            Some(AuxFilePolicy::CrossValidation),
-            AuxFilePolicy::V2
-        ));
-    }
-
-    #[test]
-    fn test_aux_parse() {
-        assert_eq!(AuxFilePolicy::from_str("V2").unwrap(), AuxFilePolicy::V2);
-        assert_eq!(AuxFilePolicy::from_str("v2").unwrap(), AuxFilePolicy::V2);
-        assert_eq!(
-            AuxFilePolicy::from_str("cross-validation").unwrap(),
-            AuxFilePolicy::CrossValidation
-        );
-    }
-
    #[test]
    fn test_image_compression_algorithm_parsing() {
        use ImageCompressionAlgorithm::*;
--- a/libs/pageserver_api/src/record.rs
+++ b/libs/pageserver_api/src/record.rs
@@ -0,0 +1,113 @@
+//! This module defines the WAL record format used within the pageserver.
+
+use bytes::Bytes;
+use postgres_ffi::walrecord::{describe_postgres_wal_record, MultiXactMember};
+use postgres_ffi::{MultiXactId, MultiXactOffset, TimestampTz, TransactionId};
+use serde::{Deserialize, Serialize};
+use utils::bin_ser::DeserializeError;
+
+/// Each update to a page is represented by a NeonWalRecord. It can be a wrapper
+/// around a PostgreSQL WAL record, or a custom neon-specific "record".
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+pub enum NeonWalRecord {
+    /// Native PostgreSQL WAL record
+    Postgres { will_init: bool, rec: Bytes },
+
+    /// Clear bits in heap visibility map. ('flags' is bitmap of bits to clear)
+    ClearVisibilityMapFlags {
+        new_heap_blkno: Option<u32>,
+        old_heap_blkno: Option<u32>,
+        flags: u8,
+    },
+    /// Mark transaction IDs as committed on a CLOG page
+    ClogSetCommitted {
+        xids: Vec<TransactionId>,
+        timestamp: TimestampTz,
+    },
+    /// Mark transaction IDs as aborted on a CLOG page
+    ClogSetAborted { xids: Vec<TransactionId> },
+    /// Extend multixact offsets SLRU
+    MultixactOffsetCreate {
+        mid: MultiXactId,
+        moff: MultiXactOffset,
+    },
+    /// Extend multixact members SLRU.
+    MultixactMembersCreate {
+        moff: MultiXactOffset,
+        members: Vec<MultiXactMember>,
+    },
+    /// Update the map of AUX files, either writing or dropping an entry
+    AuxFile {
+        file_path: String,
+        content: Option<Bytes>,
+    },
+
+    /// A testing record for unit testing purposes. It supports appending data to an existing image, or clearing it.
+    #[cfg(feature = "testing")]
+    Test {
+        /// Append a string to the image.
+        append: String,
+        /// Clear the image before appending.
+        clear: bool,
+        /// Treat this record as an init record. `clear` should be set to true if this field is set
+        /// to true. This record does not need the history WALs to reconstruct. See [`NeonWalRecord::will_init`] and
+        /// its references in `timeline.rs`.
+        will_init: bool,
+    },
+}
+
+impl NeonWalRecord {
+    /// Does replaying this WAL record initialize the page from scratch, or does
+    /// it need to be applied over the previous image of the page?
+    pub fn will_init(&self) -> bool {
+        // If you change this function, you'll also need to change ValueBytes::will_init
+        match self {
+            NeonWalRecord::Postgres { will_init, rec: _ } => *will_init,
+            #[cfg(feature = "testing")]
+            NeonWalRecord::Test { will_init, .. } => *will_init,
+            // None of the special neon record types currently initialize the page
+            _ => false,
+        }
+    }
+
+    #[cfg(feature = "testing")]
+    pub fn wal_append(s: impl AsRef<str>) -> Self {
+        Self::Test {
+            append: s.as_ref().to_string(),
+            clear: false,
+            will_init: false,
+        }
+    }
+
+    #[cfg(feature = "testing")]
+    pub fn wal_clear() -> Self {
+        Self::Test {
+            append: "".to_string(),
+            clear: true,
+            will_init: false,
+        }
+    }
+
+    #[cfg(feature = "testing")]
+    pub fn wal_init() -> Self {
+        Self::Test {
+            append: "".to_string(),
+            clear: true,
+            will_init: true,
+        }
+    }
+}
+
+/// Renders a WAL record as human-readable text, intended for debugging output.
+///
+/// Native Postgres records are decoded via `describe_postgres_wal_record` (whose error is
+/// propagated); every other variant falls back to its `Debug` representation.
+pub fn describe_wal_record(rec: &NeonWalRecord) -> Result<String, DeserializeError> {
+    let description = if let NeonWalRecord::Postgres { will_init, rec } = rec {
+        let decoded = describe_postgres_wal_record(rec)?;
+        format!("will_init: {}, {}", will_init, decoded)
+    } else {
+        format!("{:?}", rec)
+    };
+    Ok(description)
+}
--- a/libs/pageserver_api/src/value.rs
+++ b/libs/pageserver_api/src/value.rs
@@ -1,13 +1,16 @@
-use crate::walrecord::NeonWalRecord;
-use anyhow::Result;
+//! This module defines the value type used by the storage engine.
+//!
+//! A [`Value`] represents either a completely new value for one Key ([`Value::Image`]),
+//! or a "delta" of how to get from the previous version of the value to the new one
+//! ([`Value::WalRecord`]).
+//!
+//! Note that the [`Value`] type is used for the permanent storage format, so any
+//! changes to it must be backwards compatible.
+
+use crate::record::NeonWalRecord;
 use bytes::Bytes;
 use serde::{Deserialize, Serialize};
-use std::ops::AddAssign;
-use std::time::Duration;

-pub use pageserver_api::key::{Key, KEY_SIZE};
-
-/// A 'value' stored for a one Key.
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
 pub enum Value {
    /// An Image value contains a full copy of the value
@@ -20,10 +23,12 @@ pub enum Value {
 }

 impl Value {
+    #[inline(always)]
    pub fn is_image(&self) -> bool {
        matches!(self, Value::Image(_))
    }

+    #[inline(always)]
    pub fn will_init(&self) -> bool {
        match self {
            Value::Image(_) => true,
@@ -33,17 +38,18 @@ impl Value {
 }

 #[derive(Debug, PartialEq)]
-pub(crate) enum InvalidInput {
+pub enum InvalidInput {
    TooShortValue,
    TooShortPostgresRecord,
 }

 /// We could have a ValueRef where everything is `serde(borrow)`. Before implementing that, lets
 /// use this type for querying if a slice looks some particular way.
-pub(crate) struct ValueBytes;
+pub struct ValueBytes;

 impl ValueBytes {
-    pub(crate) fn will_init(raw: &[u8]) -> Result<bool, InvalidInput> {
+    #[inline(always)]
+    pub fn will_init(raw: &[u8]) -> Result<bool, InvalidInput> {
        if raw.len() < 12 {
            return Err(InvalidInput::TooShortValue);
        }
@@ -79,6 +85,7 @@ impl ValueBytes {
 mod test {
    use super::*;

+    use bytes::Bytes;
    use utils::bin_ser::BeSer;

    macro_rules! roundtrip {
@@ -229,56 +236,3 @@ mod test {
        assert!(!ValueBytes::will_init(&expected).unwrap());
    }
 }
-
-///
-/// Result of performing GC
-///
-#[derive(Default, Serialize, Debug)]
-pub struct GcResult {
-    pub layers_total: u64,
-    pub layers_needed_by_cutoff: u64,
-    pub layers_needed_by_pitr: u64,
-    pub layers_needed_by_branches: u64,
-    pub layers_needed_by_leases: u64,
-    pub layers_not_updated: u64,
-    pub layers_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files.
-
-    #[serde(serialize_with = "serialize_duration_as_millis")]
-    pub elapsed: Duration,
-
-    /// The layers which were garbage collected.
-    ///
-    /// Used in `/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc` to wait for the layers to be
-    /// dropped in tests.
-    #[cfg(feature = "testing")]
-    #[serde(skip)]
-    pub(crate) doomed_layers: Vec<crate::tenant::storage_layer::Layer>,
-}
-
-// helper function for `GcResult`, serializing a `Duration` as an integer number of milliseconds
-fn serialize_duration_as_millis<S>(d: &Duration, serializer: S) -> Result<S::Ok, S::Error>
-where
-    S: serde::Serializer,
-{
-    d.as_millis().serialize(serializer)
-}
-
-impl AddAssign for GcResult {
-    fn add_assign(&mut self, other: Self) {
-        self.layers_total += other.layers_total;
-        self.layers_needed_by_pitr += other.layers_needed_by_pitr;
-        self.layers_needed_by_cutoff += other.layers_needed_by_cutoff;
-        self.layers_needed_by_branches += other.layers_needed_by_branches;
-        self.layers_needed_by_leases += other.layers_needed_by_leases;
-        self.layers_not_updated += other.layers_not_updated;
-        self.layers_removed += other.layers_removed;
-
-        self.elapsed += other.elapsed;
-
-        #[cfg(feature = "testing")]
-        {
-            let mut other = other;
-            self.doomed_layers.append(&mut other.doomed_layers);
-        }
-    }
-}
--- a/libs/postgres_ffi/Cargo.toml
+++ b/libs/postgres_ffi/Cargo.toml
@@ -15,6 +15,7 @@ memoffset.workspace = true
 thiserror.workspace = true
 serde.workspace = true
 utils.workspace = true
+tracing.workspace = true

 [dev-dependencies]
 env_logger.workspace = true
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -36,6 +36,7 @@ macro_rules! postgres_ffi {
            pub mod controlfile_utils;
            pub mod nonrelfile_utils;
            pub mod wal_craft_test_export;
+            pub mod wal_generator;
            pub mod waldecoder_handler;
            pub mod xlog_utils;

@@ -217,6 +218,7 @@ macro_rules! enum_pgversion {

 pub mod pg_constants;
 pub mod relfile_utils;
+pub mod walrecord;

 // Export some widely used datatypes that are unlikely to change across Postgres versions
 pub use v14::bindings::RepOriginId;
--- a/libs/postgres_ffi/src/wal_generator.rs
+++ b/libs/postgres_ffi/src/wal_generator.rs
@@ -0,0 +1,203 @@
+use std::ffi::CStr;
+
+use bytes::{Bytes, BytesMut};
+use crc32c::crc32c_append;
+use utils::lsn::Lsn;
+
+use super::bindings::{XLogLongPageHeaderData, XLogPageHeaderData, XLOG_PAGE_MAGIC};
+use super::xlog_utils::{
+    XlLogicalMessage, XLOG_RECORD_CRC_OFFS, XLOG_SIZE_OF_XLOG_RECORD, XLP_BKP_REMOVABLE,
+    XLP_FIRST_IS_CONTRECORD,
+};
+use super::XLogRecord;
+use crate::pg_constants::{
+    RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE, XLP_LONG_HEADER, XLR_BLOCK_ID_DATA_LONG,
+    XLR_BLOCK_ID_DATA_SHORT,
+};
+use crate::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
+
+/// Generates binary WAL records for use in tests and benchmarks. Currently only generates logical
+/// messages (effectively noops) with a fixed payload. It is used as an iterator which yields
+/// encoded bytes for a single WAL record, including internal page headers if it spans pages.
+/// Concatenating the bytes will yield a complete, well-formed WAL, which can be chunked at segment
+/// boundaries if desired. Not optimized for performance.
+///
+/// The WAL format is version-dependant (see e.g. `XLOG_PAGE_MAGIC`), so make sure to import this
+/// for the appropriate Postgres version (e.g. `postgres_ffi::v17::wal_generator::WalGenerator`).
+///
+/// A WAL is split into 16 MB segments. Each segment is split into 8 KB pages, with headers.
+/// Records are arbitrary length, 8-byte aligned, and may span pages. The layout is e.g.:
+///
+/// |        Segment 1         |        Segment 2         |        Segment 3         |
+/// | Page 1 | Page 2 | Page 3 | Page 4 | Page 5 | Page 6 | Page 7 | Page 8 | Page 9 |
+/// | R1 |   R2  |R3|  R4  | R5  |  R6  |                 R7            | R8  |
+///
+/// TODO: support generating actual tables and rows.
+#[derive(Default)]
+pub struct WalGenerator {
+    /// Current LSN to append the next record at.
+    ///
+    /// Callers can modify this (and prev_lsn) to restart generation at a different LSN, but should
+    /// ensure that the LSN is on a valid record boundary (i.e. we can't start appending in the
+    /// middle of an existing record or header, or beyond the end of the existing WAL).
+    pub lsn: Lsn,
+    /// The starting LSN of the previous record. Used in WAL record headers. The Safekeeper doesn't
+    /// care about this, unlike Postgres, but we include it for completeness.
+    pub prev_lsn: Lsn,
+}
+
+impl WalGenerator {
+    // For now, hardcode the message payload.
+    // TODO: support specifying the payload size.
+    const PREFIX: &CStr = c"prefix";
+    const MESSAGE: &[u8] = b"message";
+
+    // Hardcode the sys, timeline, and DB IDs. We can make them configurable if we care about them.
+    const SYS_ID: u64 = 0;
+    const TIMELINE_ID: u32 = 1;
+    const DB_ID: u32 = 0;
+
+    /// Creates a new WAL generator, which emits logical message records (noops).
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Encodes a logical message (basically a noop), with the given prefix and message.
+    pub(crate) fn encode_logical_message(prefix: &CStr, message: &[u8]) -> Bytes {
+        let prefix = prefix.to_bytes_with_nul();
+        let header = XlLogicalMessage {
+            db_id: Self::DB_ID,
+            transactional: 0,
+            prefix_size: prefix.len() as u64,
+            message_size: message.len() as u64,
+        };
+        [&header.encode(), prefix, message].concat().into()
+    }
+
+    /// Encode a WAL record with the given payload data (e.g. a logical message).
+    pub(crate) fn encode_record(data: Bytes, rmid: u8, info: u8, prev_lsn: Lsn) -> Bytes {
+        // Prefix data with block ID and length.
+        let data_header = Bytes::from(match data.len() {
+            0 => vec![],
+            1..=255 => vec![XLR_BLOCK_ID_DATA_SHORT, data.len() as u8],
+            256.. => {
+                let len_bytes = (data.len() as u32).to_le_bytes();
+                [&[XLR_BLOCK_ID_DATA_LONG], len_bytes.as_slice()].concat()
+            }
+        });
+
+        // Construct the WAL record header.
+        let mut header = XLogRecord {
+            xl_tot_len: (XLOG_SIZE_OF_XLOG_RECORD + data_header.len() + data.len()) as u32,
+            xl_xid: 0,
+            xl_prev: prev_lsn.into(),
+            xl_info: info,
+            xl_rmid: rmid,
+            __bindgen_padding_0: [0; 2],
+            xl_crc: 0, // see below
+        };
+
+        // Compute the CRC checksum for the data, and the header up to the CRC field.
+        let mut crc = 0;
+        crc = crc32c_append(crc, &data_header);
+        crc = crc32c_append(crc, &data);
+        crc = crc32c_append(crc, &header.encode().unwrap()[0..XLOG_RECORD_CRC_OFFS]);
+        header.xl_crc = crc;
+
+        // Encode the final header and record.
+        let header = header.encode().unwrap();
+
+        [header, data_header, data].concat().into()
+    }
+
+    /// Injects page headers on 8KB page boundaries. Takes the current LSN position where the record
+    /// is to be appended.
+    fn encode_pages(record: Bytes, mut lsn: Lsn) -> Bytes {
+        // Fast path: record fits in current page, and the page already has a header.
+        if lsn.remaining_in_block() as usize >= record.len() && lsn.block_offset() > 0 {
+            return record;
+        }
+
+        let mut pages = BytesMut::new();
+        let mut remaining = record.clone(); // Bytes::clone() is cheap
+        while !remaining.is_empty() {
+            // At new page boundary, inject page header.
+            if lsn.block_offset() == 0 {
+                let mut page_header = XLogPageHeaderData {
+                    xlp_magic: XLOG_PAGE_MAGIC as u16,
+                    xlp_info: XLP_BKP_REMOVABLE,
+                    xlp_tli: Self::TIMELINE_ID,
+                    xlp_pageaddr: lsn.0,
+                    xlp_rem_len: 0,
+                    __bindgen_padding_0: [0; 4],
+                };
+                // If the record was split across page boundaries, mark as continuation.
+                if remaining.len() < record.len() {
+                    page_header.xlp_rem_len = remaining.len() as u32;
+                    page_header.xlp_info |= XLP_FIRST_IS_CONTRECORD;
+                }
+                // At start of segment, use a long page header.
+                let page_header = if lsn.segment_offset(WAL_SEGMENT_SIZE) == 0 {
+                    page_header.xlp_info |= XLP_LONG_HEADER;
+                    XLogLongPageHeaderData {
+                        std: page_header,
+                        xlp_sysid: Self::SYS_ID,
+                        xlp_seg_size: WAL_SEGMENT_SIZE as u32,
+                        xlp_xlog_blcksz: XLOG_BLCKSZ as u32,
+                    }
+                    .encode()
+                    .unwrap()
+                } else {
+                    page_header.encode().unwrap()
+                };
+                pages.extend_from_slice(&page_header);
+                lsn += page_header.len() as u64;
+            }
+
+            // Append the record up to the next page boundary, if any.
+            let page_free = lsn.remaining_in_block() as usize;
+            let chunk = remaining.split_to(std::cmp::min(page_free, remaining.len()));
+            pages.extend_from_slice(&chunk);
+            lsn += chunk.len() as u64;
+        }
+        pages.freeze()
+    }
+
+    /// Appends zero bytes to `record` (already including any injected page headers) so that
+    /// the LSN following it, counted from the starting `lsn`, lands on an 8-byte boundary.
+    fn pad_record(record: Bytes, mut lsn: Lsn) -> Bytes {
+        // Advance to the LSN just past the record, which is the position to be aligned.
+        lsn += record.len() as u64;
+        match lsn.calc_padding(8u64) as usize {
+            0 => record,
+            pad_len => [record, Bytes::from(vec![0; pad_len])].concat().into(),
+        }
+    }
+
+    /// Encodes `data` as a record at the current LSN, then moves `prev_lsn` and `lsn` past it.
+    pub fn generate_record(&mut self, data: Bytes, rmid: u8, info: u8) -> Bytes {
+        let start_lsn = self.lsn;
+        let encoded = Self::encode_record(data, rmid, info, self.prev_lsn);
+        let padded = Self::pad_record(Self::encode_pages(encoded, start_lsn), start_lsn);
+        self.prev_lsn = start_lsn;
+        self.lsn += padded.len() as u64;
+        padded
+    }
+
+    /// Generates a logical message at the current LSN. Can be used to construct arbitrary messages.
+    pub fn generate_logical_message(&mut self, prefix: &CStr, message: &[u8]) -> Bytes {
+        let data = Self::encode_logical_message(prefix, message);
+        self.generate_record(data, RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE)
+    }
+}
+
+/// Generate WAL records as an iterator.
+impl Iterator for WalGenerator {
+    type Item = (Lsn, Bytes);
+
+    fn next(&mut self) -> Option<Self::Item> {
+        let lsn = self.lsn;
+        let record = self.generate_logical_message(Self::PREFIX, Self::MESSAGE);
+        Some((lsn, record))
+    }
+}
--- a/libs/postgres_ffi/src/walrecord.rs
+++ b/libs/postgres_ffi/src/walrecord.rs
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -7,15 +7,14 @@
 // have been named the same as the corresponding PostgreSQL functions instead.
 //

-use crc32c::crc32c_append;
-
 use super::super::waldecoder::WalStreamDecoder;
 use super::bindings::{
    CheckPoint, ControlFileData, DBState_DB_SHUTDOWNED, FullTransactionId, TimeLineID, TimestampTz,
    XLogLongPageHeaderData, XLogPageHeaderData, XLogRecPtr, XLogRecord, XLogSegNo, XLOG_PAGE_MAGIC,
 };
+use super::wal_generator::WalGenerator;
 use super::PG_MAJORVERSION;
-use crate::pg_constants;
+use crate::pg_constants::{self, RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE};
 use crate::PG_TLI;
 use crate::{uint32, uint64, Oid};
 use crate::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
@@ -26,7 +25,7 @@ use bytes::{Buf, Bytes};
 use log::*;

 use serde::Serialize;
-use std::ffi::OsStr;
+use std::ffi::{CString, OsStr};
 use std::fs::File;
 use std::io::prelude::*;
 use std::io::ErrorKind;
@@ -39,6 +38,7 @@ use utils::bin_ser::SerializeError;
 use utils::lsn::Lsn;

 pub const XLOG_FNAME_LEN: usize = 24;
+pub const XLP_BKP_REMOVABLE: u16 = 0x0004;
 pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001;
 pub const XLP_REM_LEN_OFFS: usize = 2 + 2 + 4 + 8;
 pub const XLOG_RECORD_CRC_OFFS: usize = 4 + 4 + 8 + 1 + 1 + 2;
@@ -489,64 +489,16 @@ impl XlLogicalMessage {
 /// Create new WAL record for non-transactional logical message.
 /// Used for creating artificial WAL for tests, as LogicalMessage
 /// record is basically no-op.
-///
-/// NOTE: This leaves the xl_prev field zero. The safekeeper and
-/// pageserver tolerate that, but PostgreSQL does not.
-pub fn encode_logical_message(prefix: &str, message: &str) -> Vec<u8> {
-    let mut prefix_bytes: Vec<u8> = Vec::with_capacity(prefix.len() + 1);
-    prefix_bytes.write_all(prefix.as_bytes()).unwrap();
-    prefix_bytes.push(0);
-
-    let message_bytes = message.as_bytes();
-
-    let logical_message = XlLogicalMessage {
-        db_id: 0,
-        transactional: 0,
-        prefix_size: prefix_bytes.len() as u64,
-        message_size: message_bytes.len() as u64,
-    };
-
-    let mainrdata = logical_message.encode();
-    let mainrdata_len: usize = mainrdata.len() + prefix_bytes.len() + message_bytes.len();
-    // only short mainrdata is supported for now
-    assert!(mainrdata_len <= 255);
-    let mainrdata_len = mainrdata_len as u8;
-
-    let mut data: Vec<u8> = vec![pg_constants::XLR_BLOCK_ID_DATA_SHORT, mainrdata_len];
-    data.extend_from_slice(&mainrdata);
-    data.extend_from_slice(&prefix_bytes);
-    data.extend_from_slice(message_bytes);
-
-    let total_len = XLOG_SIZE_OF_XLOG_RECORD + data.len();
-
-    let mut header = XLogRecord {
-        xl_tot_len: total_len as u32,
-        xl_xid: 0,
-        xl_prev: 0,
-        xl_info: 0,
-        xl_rmid: 21,
-        __bindgen_padding_0: [0u8; 2usize],
-        xl_crc: 0, // crc will be calculated later
-    };
-
-    let header_bytes = header.encode().expect("failed to encode header");
-    let crc = crc32c_append(0, &data);
-    let crc = crc32c_append(crc, &header_bytes[0..XLOG_RECORD_CRC_OFFS]);
-    header.xl_crc = crc;
-
-    let mut wal: Vec<u8> = Vec::new();
-    wal.extend_from_slice(&header.encode().expect("failed to encode header"));
-    wal.extend_from_slice(&data);
-
-    // WAL start position must be aligned at 8 bytes,
-    // this will add padding for the next WAL record.
-    const PADDING: usize = 8;
-    let padding_rem = wal.len() % PADDING;
-    if padding_rem != 0 {
-        wal.resize(wal.len() + PADDING - padding_rem, 0);
-    }
-
-    wal
+pub fn encode_logical_message(prefix: &str, message: &str) -> Bytes {
+    // The prefix may come from untrusted input: drop embedded NUL bytes so that building the
+    // C string cannot fail.
+    let sanitized: String = prefix.chars().filter(|&c| c != '\0').collect();
+    let prefix = CString::new(sanitized).expect("no NULs");
+    let data = WalGenerator::encode_logical_message(&prefix, message.as_bytes());
+    // Lsn(0) is passed as the previous-record LSN.
+    WalGenerator::encode_record(data, RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE, Lsn(0))
 }

 #[cfg(test)]
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -357,22 +357,20 @@ impl RemoteStorage for LocalFs {
                .list_recursive(prefix)
                .await
                .map_err(DownloadError::Other)?;
-            let objects = keys
-                .into_iter()
-                .filter_map(|k| {
-                    let path = k.with_base(&self.storage_root);
-                    if path.is_dir() {
-                        None
-                    } else {
-                        Some(ListingObject {
-                            key: k.clone(),
-                            // LocalFs is just for testing, so just specify a dummy time
-                            last_modified: SystemTime::now(),
-                            size: 0,
-                        })
-                    }
-                })
-                .collect();
+            let mut objects = Vec::with_capacity(keys.len());
+            for key in keys {
+                let path = key.with_base(&self.storage_root);
+                let metadata = file_metadata(&path).await?;
+                if metadata.is_dir() {
+                    continue;
+                }
+                objects.push(ListingObject {
+                    key: key.clone(),
+                    last_modified: metadata.modified()?,
+                    size: metadata.len(),
+                });
+            }
+            let objects = objects;

            if let ListingMode::NoDelimiter = mode {
                result.keys = objects;
@@ -410,9 +408,8 @@ impl RemoteStorage for LocalFs {
                    } else {
                        result.keys.push(ListingObject {
                            key: RemotePath::from_string(&relative_key).unwrap(),
-                            // LocalFs is just for testing
-                            last_modified: SystemTime::now(),
-                            size: 0,
+                            last_modified: object.last_modified,
+                            size: object.size,
                        });
                    }
                }
--- a/libs/utils/src/lsn.rs
+++ b/libs/utils/src/lsn.rs
@@ -12,7 +12,7 @@ use crate::seqwait::MonotonicCounter;
 pub const XLOG_BLCKSZ: u32 = 8192;

 /// A Postgres LSN (Log Sequence Number), also known as an XLogRecPtr
-#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash)]
+#[derive(Clone, Copy, Default, Eq, Ord, PartialEq, PartialOrd, Hash)]
 pub struct Lsn(pub u64);

 impl Serialize for Lsn {
--- a/libs/wal_decoder/Cargo.toml
+++ b/libs/wal_decoder/Cargo.toml
@@ -0,0 +1,18 @@
+[package]
+name = "wal_decoder"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+[features]
+testing = []
+
+[dependencies]
+anyhow.workspace = true
+bytes.workspace = true
+pageserver_api.workspace = true
+postgres_ffi.workspace = true
+serde.workspace = true
+tracing.workspace = true
+utils.workspace = true
+workspace_hack = { version = "0.1", path = "../../workspace_hack" }
--- a/libs/wal_decoder/src/decoder.rs
+++ b/libs/wal_decoder/src/decoder.rs
@@ -0,0 +1 @@
+
--- a/libs/wal_decoder/src/lib.rs
+++ b/libs/wal_decoder/src/lib.rs
@@ -0,0 +1,2 @@
+pub mod decoder;
+pub mod models;
--- a/libs/wal_decoder/src/models.rs
+++ b/libs/wal_decoder/src/models.rs
@@ -0,0 +1,167 @@
+//! This module houses types which represent decoded PG WAL records
+//! ready for the pageserver to interpret. They are derived from the original
+//! WAL records, so that each struct corresponds closely to one WAL record of
+//! a specific kind. They contain the same information as the original WAL records,
+//! just decoded into structs and fields for easier access.
+//!
+//! The ingestion code uses these structs to help with parsing the WAL records,
+//! and it splits them into a stream of modifications to the key-value pairs that
+//! are ultimately stored in delta layers.  See also the split-out counterparts in
+//! [`postgres_ffi::walrecord`].
+//!
+//! The pipeline which processes WAL records is not super obvious, so let's follow
+//! the flow of an example XACT_COMMIT Postgres record:
+//!
+//! (Postgres XACT_COMMIT record)
+//! |
+//! |--> pageserver::walingest::WalIngest::decode_xact_record
+//!      |
+//!      |--> ([`XactRecord::Commit`])
+//!           |
+//!           |--> pageserver::walingest::WalIngest::ingest_xact_record
+//!                |
+//!                |--> (NeonWalRecord::ClogSetCommitted)
+//!                     |
+//!                     |--> write to KV store within the pageserver
+
+use bytes::Bytes;
+use pageserver_api::reltag::{RelTag, SlruKind};
+use postgres_ffi::walrecord::{
+    XlMultiXactCreate, XlMultiXactTruncate, XlRelmapUpdate, XlReploriginDrop, XlReploriginSet,
+    XlSmgrTruncate, XlXactParsedRecord,
+};
+use postgres_ffi::{Oid, TransactionId};
+use utils::lsn::Lsn;
+
+/// Decoded heapam WAL record. Only one variant is defined: a request to clear
+/// visibility-map bits.
+pub enum HeapamRecord {
+    ClearVmBits(ClearVmBits),
+}
+
+/// Payload of the `ClearVmBits` variants of [`HeapamRecord`] and [`NeonrmgrRecord`].
+pub struct ClearVmBits {
+    // NOTE(review): presumably the heap block numbers whose visibility-map bits are cleared;
+    // either may be absent — confirm against the decoder.
+    pub new_heap_blkno: Option<u32>,
+    pub old_heap_blkno: Option<u32>,
+    // NOTE(review): presumably identifies the visibility-map relation — confirm.
+    pub vm_rel: RelTag,
+    pub flags: u8,
+}
+
+/// Decoded neonrmgr WAL record. Carries the same [`ClearVmBits`] payload as [`HeapamRecord`].
+pub enum NeonrmgrRecord {
+    ClearVmBits(ClearVmBits),
+}
+
+/// Decoded smgr WAL record: either a relation creation or a truncation.
+pub enum SmgrRecord {
+    Create(SmgrCreate),
+    /// Wraps the `postgres_ffi` truncate record as-is.
+    Truncate(XlSmgrTruncate),
+}
+
+/// Payload of [`SmgrRecord::Create`].
+pub struct SmgrCreate {
+    pub rel: RelTag,
+}
+
+/// Decoded dbase WAL record: a database create or drop.
+pub enum DbaseRecord {
+    Create(DbaseCreate),
+    Drop(DbaseDrop),
+}
+
+/// Payload of [`DbaseRecord::Create`].
+// NOTE(review): the `src_*` fields presumably name the database the new one is created from — confirm.
+pub struct DbaseCreate {
+    pub db_id: Oid,
+    pub tablespace_id: Oid,
+    pub src_db_id: Oid,
+    pub src_tablespace_id: Oid,
+}
+
+/// Payload of [`DbaseRecord::Drop`]: one database id and a list of tablespace ids.
+pub struct DbaseDrop {
+    pub db_id: Oid,
+    pub tablespace_ids: Vec<Oid>,
+}
+
+/// Decoded clog WAL record: zeroing of a page or a truncation.
+pub enum ClogRecord {
+    ZeroPage(ClogZeroPage),
+    Truncate(ClogTruncate),
+}
+
+/// Payload of [`ClogRecord::ZeroPage`].
+// NOTE(review): `rpageno` is presumably the page number relative to segment `segno` — confirm.
+pub struct ClogZeroPage {
+    pub segno: u32,
+    pub rpageno: u32,
+}
+
+/// Payload of [`ClogRecord::Truncate`].
+pub struct ClogTruncate {
+    pub pageno: u32,
+    pub oldest_xid: TransactionId,
+    pub oldest_xid_db: Oid,
+}
+
+/// Decoded transaction WAL record. Per the module docs, values of this type are
+/// produced by `WalIngest::decode_xact_record` and consumed by
+/// `WalIngest::ingest_xact_record`.
+///
+/// All variants except `Prepare` share the [`XactCommon`] payload.
+pub enum XactRecord {
+    Commit(XactCommon),
+    Abort(XactCommon),
+    CommitPrepared(XactCommon),
+    AbortPrepared(XactCommon),
+    Prepare(XactPrepare),
+}
+
+/// Payload shared by the commit and abort variants of [`XactRecord`].
+pub struct XactCommon {
+    /// The parsed record as provided by `postgres_ffi`.
+    pub parsed: XlXactParsedRecord,
+    pub origin_id: u16,
+    // Fields below are only used for logging
+    pub xl_xid: TransactionId,
+    pub lsn: Lsn,
+}
+
+/// Payload of [`XactRecord::Prepare`].
+// NOTE(review): `data` is presumably the raw record contents — confirm against the decoder.
+pub struct XactPrepare {
+    pub xl_xid: TransactionId,
+    pub data: Bytes,
+}
+
+/// Decoded multixact WAL record. `Create` and `Truncate` wrap the `postgres_ffi`
+/// record types directly; `ZeroPage` has its own payload.
+pub enum MultiXactRecord {
+    ZeroPage(MultiXactZeroPage),
+    Create(XlMultiXactCreate),
+    Truncate(XlMultiXactTruncate),
+}
+
+/// Payload of [`MultiXactRecord::ZeroPage`].
+// NOTE(review): `rpageno` is presumably the page number relative to segment `segno` of the
+// SLRU selected by `slru_kind` — confirm.
+pub struct MultiXactZeroPage {
+    pub slru_kind: SlruKind,
+    pub segno: u32,
+    pub rpageno: u32,
+}
+
+/// Decoded relmap WAL record. Only an update variant is defined.
+pub enum RelmapRecord {
+    Update(RelmapUpdate),
+}
+
+/// Payload of [`RelmapRecord::Update`]: the `postgres_ffi` update record plus a byte buffer.
+// NOTE(review): `buf` presumably holds the relmap contents that accompany `update` — confirm.
+pub struct RelmapUpdate {
+    pub update: XlRelmapUpdate,
+    pub buf: Bytes,
+}
+
+/// Decoded xlog WAL record. Only a raw, not further decoded, variant is defined.
+pub enum XlogRecord {
+    Raw(RawXlogRecord),
+}
+
+/// Payload of [`XlogRecord::Raw`]: an info byte, an LSN and a byte buffer.
+// NOTE(review): `buf` presumably holds the undecoded record body — confirm.
+pub struct RawXlogRecord {
+    pub info: u8,
+    pub lsn: Lsn,
+    pub buf: Bytes,
+}
+
+/// Decoded logical message WAL record.
+pub enum LogicalMessageRecord {
+    Put(PutLogicalMessage),
+    /// Only compiled in when the `testing` feature is enabled.
+    #[cfg(feature = "testing")]
+    Failpoint,
+}
+
+/// Payload of [`LogicalMessageRecord::Put`]: a path and the bytes associated with it.
+pub struct PutLogicalMessage {
+    pub path: String,
+    pub buf: Bytes,
+}
+
+/// Decoded standby WAL record. Only a running-transactions variant is defined.
+pub enum StandbyRecord {
+    RunningXacts(StandbyRunningXacts),
+}
+
+/// Payload of [`StandbyRecord::RunningXacts`].
+pub struct StandbyRunningXacts {
+    pub oldest_running_xid: TransactionId,
+}
+
+/// Decoded replorigin WAL record. Both variants wrap the corresponding
+/// `postgres_ffi` record type directly.
+pub enum ReploriginRecord {
+    Set(XlReploriginSet),
+    Drop(XlReploriginDrop),
+}
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -8,7 +8,7 @@ license.workspace = true
 default = []
 # Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro,
 # which adds some runtime cost to run tests on outage conditions
-testing = ["fail/failpoints", "pageserver_api/testing" ]
+testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing"]

 [dependencies]
 anyhow.workspace = true
@@ -83,6 +83,7 @@ enum-map.workspace = true
 enumset = { workspace = true, features = ["serde"]}
 strum.workspace = true
 strum_macros.workspace = true
+wal_decoder.workspace = true

 [target.'cfg(target_os = "linux")'.dependencies]
 procfs.workspace = true
--- a/pageserver/benches/bench_ingest.rs
+++ b/pageserver/benches/bench_ingest.rs
@@ -8,13 +8,12 @@ use pageserver::{
    context::{DownloadBehavior, RequestContext},
    l0_flush::{L0FlushConfig, L0FlushGlobalState},
    page_cache,
-    repository::Value,
    task_mgr::TaskKind,
    tenant::storage_layer::inmemory_layer::SerializedBatch,
    tenant::storage_layer::InMemoryLayer,
    virtual_file,
 };
-use pageserver_api::{key::Key, shard::TenantShardId};
+use pageserver_api::{key::Key, shard::TenantShardId, value::Value};
 use utils::{
    bin_ser::BeSer,
    id::{TenantId, TimelineId},
--- a/pageserver/benches/bench_layer_map.rs
+++ b/pageserver/benches/bench_layer_map.rs
@@ -1,9 +1,9 @@
 use criterion::measurement::WallTime;
 use pageserver::keyspace::{KeyPartitioning, KeySpace};
-use pageserver::repository::Key;
 use pageserver::tenant::layer_map::LayerMap;
 use pageserver::tenant::storage_layer::LayerName;
 use pageserver::tenant::storage_layer::PersistentLayerDesc;
+use pageserver_api::key::Key;
 use pageserver_api::shard::TenantShardId;
 use rand::prelude::{SeedableRng, SliceRandom, StdRng};
 use std::cmp::{max, min};
--- a/pageserver/benches/bench_walredo.rs
+++ b/pageserver/benches/bench_walredo.rs
@@ -60,7 +60,8 @@ use anyhow::Context;
 use bytes::{Buf, Bytes};
 use criterion::{BenchmarkId, Criterion};
 use once_cell::sync::Lazy;
-use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};
+use pageserver::{config::PageServerConf, walredo::PostgresRedoManager};
+use pageserver_api::record::NeonWalRecord;
 use pageserver_api::{key::Key, shard::TenantShardId};
 use std::{
    future::Future,
--- a/pageserver/ctl/src/draw_timeline_dir.rs
+++ b/pageserver/ctl/src/draw_timeline_dir.rs
@@ -51,7 +51,7 @@
 //!

 use anyhow::{Context, Result};
-use pageserver::repository::Key;
+use pageserver_api::key::Key;
 use std::cmp::Ordering;
 use std::io::{self, BufRead};
 use std::path::PathBuf;
--- a/pageserver/ctl/src/index_part.rs
+++ b/pageserver/ctl/src/index_part.rs
@@ -11,7 +11,7 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
    match cmd {
        IndexPartCmd::Dump { path } => {
            let bytes = tokio::fs::read(path).await.context("read file")?;
-            let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?;
+            let des: IndexPart = IndexPart::from_json_bytes(&bytes).context("deserialize")?;
            let output = serde_json::to_string_pretty(&des).context("serialize output")?;
            println!("{output}");
            Ok(())
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -2,7 +2,7 @@
 //!
 //! Currently it only analyzes holes, which are regions within the layer range that the layer contains no updates for. In the future it might do more analysis (maybe key quantiles?) but it should never return sensitive data.

-use anyhow::Result;
+use anyhow::{anyhow, Result};
 use camino::{Utf8Path, Utf8PathBuf};
 use pageserver::context::{DownloadBehavior, RequestContext};
 use pageserver::task_mgr::TaskKind;
@@ -11,15 +11,16 @@ use pageserver::virtual_file::api::IoMode;
 use std::cmp::Ordering;
 use std::collections::BinaryHeap;
 use std::ops::Range;
+use std::str::FromStr;
 use std::{fs, str};

 use pageserver::page_cache::{self, PAGE_SZ};
-use pageserver::repository::{Key, KEY_SIZE};
 use pageserver::tenant::block_io::FileBlockReader;
 use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection};
 use pageserver::tenant::storage_layer::delta_layer::{Summary, DELTA_KEY_SIZE};
-use pageserver::tenant::storage_layer::range_overlaps;
+use pageserver::tenant::storage_layer::{range_overlaps, LayerName};
 use pageserver::virtual_file::{self, VirtualFile};
+use pageserver_api::key::{Key, KEY_SIZE};

 use utils::{bin_ser::BeSer, lsn::Lsn};

@@ -74,35 +75,15 @@ impl LayerFile {
    }
 }

-pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
-    let split: Vec<&str> = name.split("__").collect();
-    if split.len() != 2 {
-        return None;
-    }
-    let keys: Vec<&str> = split[0].split('-').collect();
-    let lsn_and_opt_generation: Vec<&str> = split[1].split('v').collect();
-    let lsns: Vec<&str> = lsn_and_opt_generation[0].split('-').collect();
-    let the_lsns: [&str; 2];
+pub(crate) fn parse_filename(name: &str) -> anyhow::Result<LayerFile> {
+    let layer_name =
+        LayerName::from_str(name).map_err(|e| anyhow!("failed to parse layer name: {e}"))?;

-    /*
-     * Generations add a -vX-XXXXXX postfix, which causes issues when we try to
-     * parse 'vX' as an LSN.
-     */
-    let is_delta = if lsns.len() == 1 || lsns[1].is_empty() {
-        the_lsns = [lsns[0], lsns[0]];
-        false
-    } else {
-        the_lsns = [lsns[0], lsns[1]];
-        true
-    };
-
-    let key_range = Key::from_hex(keys[0]).unwrap()..Key::from_hex(keys[1]).unwrap();
-    let lsn_range = Lsn::from_hex(the_lsns[0]).unwrap()..Lsn::from_hex(the_lsns[1]).unwrap();
    let holes = Vec::new();
-    Some(LayerFile {
-        key_range,
-        lsn_range,
-        is_delta,
+    Ok(LayerFile {
+        key_range: layer_name.key_range().clone(),
+        lsn_range: layer_name.lsn_as_range(),
+        is_delta: layer_name.is_delta(),
        holes,
    })
 }
@@ -179,7 +160,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {

            for layer in fs::read_dir(timeline.path())? {
                let layer = layer?;
-                if let Some(mut layer_file) =
+                if let Ok(mut layer_file) =
                    parse_filename(&layer.file_name().into_string().unwrap())
                {
                    if layer_file.is_delta {
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -5,24 +5,12 @@ use camino::{Utf8Path, Utf8PathBuf};
 use clap::Subcommand;
 use pageserver::context::{DownloadBehavior, RequestContext};
 use pageserver::task_mgr::TaskKind;
-use pageserver::tenant::block_io::BlockCursor;
-use pageserver::tenant::disk_btree::DiskBtreeReader;
-use pageserver::tenant::storage_layer::delta_layer::{BlobRef, Summary};
 use pageserver::tenant::storage_layer::{delta_layer, image_layer};
 use pageserver::tenant::storage_layer::{DeltaLayer, ImageLayer};
 use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
 use pageserver::virtual_file::api::IoMode;
 use pageserver::{page_cache, virtual_file};
-use pageserver::{
-    repository::{Key, KEY_SIZE},
-    tenant::{
-        block_io::FileBlockReader, disk_btree::VisitDirection,
-        storage_layer::delta_layer::DELTA_KEY_SIZE,
-    },
-    virtual_file::VirtualFile,
-};
-use std::fs;
-use utils::bin_ser::BeSer;
+use std::fs::{self, File};
 use utils::id::{TenantId, TimelineId};

 use crate::layer_map_analyzer::parse_filename;
@@ -59,44 +47,30 @@ pub(crate) enum LayerCmd {
 }

 async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
-    let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
    virtual_file::init(
        10,
        virtual_file::api::IoEngineKind::StdFs,
        IoMode::preferred(),
    );
    page_cache::init(100);
-    let file = VirtualFile::open(path, ctx).await?;
-    let file_id = page_cache::next_file_id();
-    let block_reader = FileBlockReader::new(&file, file_id);
-    let summary_blk = block_reader.read_blk(0, ctx).await?;
-    let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
-    let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
-        actual_summary.index_start_blk,
-        actual_summary.index_root_blk,
-        &block_reader,
+    let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
+    let file = File::open(path)?;
+    let delta_layer = DeltaLayer::new_for_path(path, file)?;
+    delta_layer.dump(true, ctx).await?;
+    Ok(())
+}
+
+async fn read_image_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
+    virtual_file::init(
+        10,
+        virtual_file::api::IoEngineKind::StdFs,
+        IoMode::preferred(),
    );
-    // TODO(chi): dedup w/ `delta_layer.rs` by exposing the API.
-    let mut all = vec![];
-    tree_reader
-        .visit(
-            &[0u8; DELTA_KEY_SIZE],
-            VisitDirection::Forwards,
-            |key, value_offset| {
-                let curr = Key::from_slice(&key[..KEY_SIZE]);
-                all.push((curr, BlobRef(value_offset)));
-                true
-            },
-            ctx,
-        )
-        .await?;
-    let cursor = BlockCursor::new_fileblockreader(&block_reader);
-    for (k, v) in all {
-        let value = cursor.read_blob(v.pos(), ctx).await?;
-        println!("key:{} value_len:{}", k, value.len());
-        assert!(k.is_i128_representable(), "invalid key: ");
-    }
-    // TODO(chi): special handling for last key?
+    page_cache::init(100);
+    let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
+    let file = File::open(path)?;
+    let image_layer = ImageLayer::new_for_path(path, file)?;
+    image_layer.dump(true, ctx).await?;
    Ok(())
 }

@@ -133,8 +107,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
            let mut idx = 0;
            for layer in fs::read_dir(timeline_path)? {
                let layer = layer?;
-                if let Some(layer_file) = parse_filename(&layer.file_name().into_string().unwrap())
-                {
+                if let Ok(layer_file) = parse_filename(&layer.file_name().into_string().unwrap()) {
                    println!(
                        "[{:3}]  key:{}-{}\n       lsn:{}-{}\n       delta:{}",
                        idx,
@@ -163,8 +136,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
            let mut idx = 0;
            for layer in fs::read_dir(timeline_path)? {
                let layer = layer?;
-                if let Some(layer_file) = parse_filename(&layer.file_name().into_string().unwrap())
-                {
+                if let Ok(layer_file) = parse_filename(&layer.file_name().into_string().unwrap()) {
                    if *id == idx {
                        // TODO(chi): dedup code
                        println!(
@@ -180,7 +152,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
                        if layer_file.is_delta {
                            read_delta_file(layer.path(), &ctx).await?;
                        } else {
-                            anyhow::bail!("not supported yet :(");
+                            read_image_file(layer.path(), &ctx).await?;
                        }

                        break;
--- a/pageserver/pagebench/src/cmd/aux_files.rs
+++ b/pageserver/pagebench/src/cmd/aux_files.rs
@@ -1,4 +1,4 @@
-use pageserver_api::models::{AuxFilePolicy, TenantConfig, TenantConfigRequest};
+use pageserver_api::models::{TenantConfig, TenantConfigRequest};
 use pageserver_api::shard::TenantShardId;
 use utils::id::TenantTimelineId;
 use utils::lsn::Lsn;
@@ -66,10 +66,7 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
    mgmt_api_client
        .tenant_config(&TenantConfigRequest {
            tenant_id: timeline.tenant_id,
-            config: TenantConfig {
-                switch_aux_file_policy: Some(AuxFilePolicy::V2),
-                ..Default::default()
-            },
+            config: TenantConfig::default(),
        })
        .await?;

--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -59,6 +59,7 @@ pub async fn send_basebackup_tarball<'a, W>(
    req_lsn: Option<Lsn>,
    prev_lsn: Option<Lsn>,
    full_backup: bool,
+    replica: bool,
    ctx: &'a RequestContext,
 ) -> Result<(), BasebackupError>
 where
@@ -110,8 +111,8 @@ where
    };

    info!(
-        "taking basebackup lsn={}, prev_lsn={} (full_backup={})",
-        backup_lsn, prev_lsn, full_backup
+        "taking basebackup lsn={}, prev_lsn={} (full_backup={}, replica={})",
+        backup_lsn, prev_lsn, full_backup, replica
    );

    let basebackup = Basebackup {
@@ -120,6 +121,7 @@ where
        lsn: backup_lsn,
        prev_record_lsn: prev_lsn,
        full_backup,
+        replica,
        ctx,
    };
    basebackup
@@ -140,6 +142,7 @@ where
    lsn: Lsn,
    prev_record_lsn: Lsn,
    full_backup: bool,
+    replica: bool,
    ctx: &'a RequestContext,
 }

@@ -372,6 +375,10 @@ where

        for (path, content) in aux_files {
            if path.starts_with("pg_replslot") {
+                // Do not create logical replication slots on a standby: they are not used there, but they would prevent WAL truncation.
+                if self.replica {
+                    continue;
+                }
                let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN;
                let restart_lsn = Lsn(u64::from_le_bytes(
                    content[offs..offs + 8].try_into().unwrap(),
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -696,7 +696,7 @@ impl DeletionQueue {
 mod test {
    use camino::Utf8Path;
    use hex_literal::hex;
-    use pageserver_api::{shard::ShardIndex, upcall_api::ReAttachResponseTenant};
+    use pageserver_api::{key::Key, shard::ShardIndex, upcall_api::ReAttachResponseTenant};
    use std::{io::ErrorKind, time::Duration};
    use tracing::info;

@@ -705,7 +705,6 @@ mod test {

    use crate::{
        controller_upcall_client::RetryForeverError,
-        repository::Key,
        tenant::{harness::TenantHarness, storage_layer::DeltaLayerName},
    };

--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -597,6 +597,10 @@ paths:
        Create a timeline. Returns new timeline id on success.
        Recreating the same timeline will succeed if the parameters match the existing timeline.
        If no pg_version is specified, assume DEFAULT_PG_VERSION hardcoded in the pageserver.
+
+        To ensure durability, the caller must retry the creation until success.
+        Just because the timeline is visible via other endpoints does not mean it is durable.
+        Future versions may stop showing timelines that are not yet durable.
      requestBody:
        content:
          application/json:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -38,6 +38,7 @@ use pageserver_api::models::TenantShardSplitRequest;
 use pageserver_api::models::TenantShardSplitResponse;
 use pageserver_api::models::TenantSorting;
 use pageserver_api::models::TimelineArchivalConfigRequest;
+use pageserver_api::models::TimelineCreateRequestMode;
 use pageserver_api::models::TimelinesInfoAndOffloaded;
 use pageserver_api::models::TopTenantShardItem;
 use pageserver_api::models::TopTenantShardsRequest;
@@ -85,6 +86,7 @@ use crate::tenant::timeline::Timeline;
 use crate::tenant::GetTimelineError;
 use crate::tenant::OffloadedTimeline;
 use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
+use crate::DEFAULT_PG_VERSION;
 use crate::{disk_usage_eviction_task, tenant};
 use pageserver_api::models::{
    StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest,
@@ -486,6 +488,7 @@ fn build_timeline_offloaded_info(offloaded: &Arc<OffloadedTimeline>) -> Offloade
        timeline_id,
        ancestor_retain_lsn,
        ancestor_timeline_id,
+        archived_at,
        ..
    } = offloaded.as_ref();
    OffloadedTimelineInfo {
@@ -493,6 +496,7 @@ fn build_timeline_offloaded_info(offloaded: &Arc<OffloadedTimeline>) -> Offloade
        timeline_id,
        ancestor_retain_lsn,
        ancestor_timeline_id,
+        archived_at: archived_at.and_utc(),
    }
 }

@@ -545,6 +549,26 @@ async fn timeline_create_handler(
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

    let new_timeline_id = request_data.new_timeline_id;
+    // fill in the default pg_version if not provided & convert request into domain model
+    let params: tenant::CreateTimelineParams = match request_data.mode {
+        TimelineCreateRequestMode::Bootstrap {
+            existing_initdb_timeline_id,
+            pg_version,
+        } => tenant::CreateTimelineParams::Bootstrap(tenant::CreateTimelineParamsBootstrap {
+            new_timeline_id,
+            existing_initdb_timeline_id,
+            pg_version: pg_version.unwrap_or(DEFAULT_PG_VERSION),
+        }),
+        TimelineCreateRequestMode::Branch {
+            ancestor_timeline_id,
+            ancestor_start_lsn,
+            pg_version: _,
+        } => tenant::CreateTimelineParams::Branch(tenant::CreateTimelineParamsBranch {
+            new_timeline_id,
+            ancestor_timeline_id,
+            ancestor_start_lsn,
+        }),
+    };

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Error);

@@ -557,22 +581,12 @@ async fn timeline_create_handler(

        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;

-        if let Some(ancestor_id) = request_data.ancestor_timeline_id.as_ref() {
-            tracing::info!(%ancestor_id, "starting to branch");
-        } else {
-            tracing::info!("bootstrapping");
-        }
+        // Earlier versions of the code had pg_version and ancestor_lsn in the span.
+        // Keep providing that information, but through a log message, which doesn't require us to destructure.
+        tracing::info!(?params, "creating timeline");

        match tenant
-            .create_timeline(
-                new_timeline_id,
-                request_data.ancestor_timeline_id,
-                request_data.ancestor_start_lsn,
-                request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION),
-                request_data.existing_initdb_timeline_id,
-                state.broker_client.clone(),
-                &ctx,
-            )
+            .create_timeline(params, state.broker_client.clone(), &ctx)
            .await
        {
            Ok(new_timeline) => {
@@ -623,8 +637,6 @@ async fn timeline_create_handler(
        tenant_id = %tenant_shard_id.tenant_id,
        shard_id = %tenant_shard_id.shard_slug(),
        timeline_id = %new_timeline_id,
-        lsn=?request_data.ancestor_start_lsn,
-        pg_version=?request_data.pg_version
    ))
    .await
 }
@@ -1281,6 +1293,99 @@ async fn layer_map_info_handler(
    json_response(StatusCode::OK, layer_map_info)
 }

+/// HTTP handler that scans a single layer of a timeline and counts how many of its keys
+/// are disposable according to `is_key_disposable` of the timeline's shard identity.
+///
+/// Path parameters: `tenant_shard_id`, `timeline_id` and `layer_name`; all of them are
+/// recorded into the current tracing span.
+///
+/// The layer is looked up in the timeline's layer map, obtained through
+/// `download_and_keep_resident()`, and its keys are loaded with `load_keys`.
+/// On success the response is a JSON `ScanDisposableKeysResponse` holding
+/// `disposable_count` and `not_disposable_count`.
+///
+/// Errors:
+/// - parameter parsing, permission check and timeline lookup errors are propagated as-is,
+/// - `ApiError::NotFound` if the layer map has no layer for `layer_name`,
+/// - `ApiError::ShuttingDown` if the download reports shutdown/cancellation, or if
+///   cancellation is observed while scanning,
+/// - `ApiError::InternalServerError` for all other download errors and for `load_keys` errors.
+#[instrument(skip_all, fields(tenant_id, shard_id, timeline_id, layer_name))]
+async fn timeline_layer_scan_disposable_keys(
+    request: Request<Body>,
+    cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+    let layer_name: LayerName = parse_request_param(&request, "layer_name")?;
+
+    // The span fields are declared empty in `#[instrument]` above and filled in here,
+    // once the parameters have been parsed.
+    tracing::Span::current().record(
+        "tenant_id",
+        tracing::field::display(&tenant_shard_id.tenant_id),
+    );
+    tracing::Span::current().record(
+        "shard_id",
+        tracing::field::display(tenant_shard_id.shard_slug()),
+    );
+    tracing::Span::current().record("timeline_id", tracing::field::display(&timeline_id));
+    tracing::Span::current().record("layer_name", tracing::field::display(&layer_name));
+
+    let state = get_state(&request);
+
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+
+    // technically the timeline need not be active for this scan to complete
+    let timeline =
+        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+            .await?;
+
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
+
+    // NOTE(review): this read guard on the layer map is never dropped explicitly, so it
+    // stays alive until the handler returns, i.e. across the download and the scan below.
+    let guard = timeline.layers.read().await;
+    let Some(layer) = guard.try_get_from_key(&layer_name.clone().into()) else {
+        return Err(ApiError::NotFound(
+            anyhow::anyhow!("Layer {tenant_shard_id}/{timeline_id}/{layer_name} not found").into(),
+        ));
+    };
+
+    // Translate download errors: shutdown/cancellation => ShuttingDown, everything else => 500.
+    // The match lists every variant explicitly instead of using a wildcard arm.
+    let resident_layer = layer
+        .download_and_keep_resident()
+        .await
+        .map_err(|err| match err {
+            tenant::storage_layer::layer::DownloadError::TimelineShutdown
+            | tenant::storage_layer::layer::DownloadError::DownloadCancelled => {
+                ApiError::ShuttingDown
+            }
+            tenant::storage_layer::layer::DownloadError::ContextAndConfigReallyDeniesDownloads
+            | tenant::storage_layer::layer::DownloadError::DownloadRequired
+            | tenant::storage_layer::layer::DownloadError::NotFile(_)
+            | tenant::storage_layer::layer::DownloadError::DownloadFailed
+            | tenant::storage_layer::layer::DownloadError::PreStatFailed(_) => {
+                ApiError::InternalServerError(err.into())
+            }
+            #[cfg(test)]
+            tenant::storage_layer::layer::DownloadError::Failpoint(_) => {
+                ApiError::InternalServerError(err.into())
+            }
+        })?;
+
+    let keys = resident_layer
+        .load_keys(&ctx)
+        .await
+        .map_err(ApiError::InternalServerError)?;
+
+    let shard_identity = timeline.get_shard_identity();
+
+    let mut disposable_count = 0;
+    let mut not_disposable_count = 0;
+    // NOTE(review): `cancel` is already an owned token; this clone looks redundant.
+    let cancel = cancel.clone();
+    for (i, key) in keys.into_iter().enumerate() {
+        if shard_identity.is_key_disposable(&key) {
+            disposable_count += 1;
+            tracing::debug!(key = %key, key.dbg=?key, "disposable key");
+        } else {
+            not_disposable_count += 1;
+        }
+        // Poll for cancellation only once every 10000 keys (this includes the first key, i == 0).
+        #[allow(clippy::collapsible_if)]
+        if i % 10000 == 0 {
+            if cancel.is_cancelled() || timeline.cancel.is_cancelled() || timeline.is_stopping() {
+                return Err(ApiError::ShuttingDown);
+            }
+        }
+    }
+
+    json_response(
+        StatusCode::OK,
+        pageserver_api::models::ScanDisposableKeysResponse {
+            disposable_count,
+            not_disposable_count,
+        },
+    )
+}
+
 async fn layer_download_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
@@ -2127,13 +2232,13 @@ async fn getpage_at_lsn_handler(
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
    let state = get_state(&request);

-    struct Key(crate::repository::Key);
+    struct Key(pageserver_api::key::Key);

    impl std::str::FromStr for Key {
        type Err = anyhow::Error;

        fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
-            crate::repository::Key::from_hex(s).map(Key)
+            pageserver_api::key::Key::from_hex(s).map(Key)
        }
    }

@@ -3143,6 +3248,10 @@ pub fn make_router(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name",
            |r| api_handler(r, evict_timeline_layer_handler),
        )
+        .post(
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_name/scan_disposable_keys",
+            |r| testing_api_handler("timeline_layer_scan_disposable_keys", r, timeline_layer_scan_disposable_keys),
+        )
        .post(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/block_gc",
            |r| api_handler(r, timeline_gc_blocking_handler),
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -19,12 +19,11 @@ use crate::metrics::WAL_INGEST;
 use crate::pgdatadir_mapping::*;
 use crate::tenant::Timeline;
 use crate::walingest::WalIngest;
-use crate::walrecord::decode_wal_record;
-use crate::walrecord::DecodedWALRecord;
 use pageserver_api::reltag::{RelTag, SlruKind};
 use postgres_ffi::pg_constants;
 use postgres_ffi::relfile_utils::*;
 use postgres_ffi::waldecoder::WalStreamDecoder;
+use postgres_ffi::walrecord::{decode_wal_record, DecodedWALRecord};
 use postgres_ffi::ControlFileData;
 use postgres_ffi::DBState_DB_SHUTDOWNED;
 use postgres_ffi::Oid;
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -24,7 +24,6 @@ pub mod metrics;
 pub mod page_cache;
 pub mod page_service;
 pub mod pgdatadir_mapping;
-pub mod repository;
 pub mod span;
 pub(crate) mod statvfs;
 pub mod task_mgr;
@@ -32,7 +31,6 @@ pub mod tenant;
 pub mod utilization;
 pub mod virtual_file;
 pub mod walingest;
-pub mod walrecord;
 pub mod walredo;

 use camino::Utf8Path;
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -2092,6 +2092,7 @@ pub(crate) struct WalIngestMetrics {
    pub(crate) records_received: IntCounter,
    pub(crate) records_committed: IntCounter,
    pub(crate) records_filtered: IntCounter,
+    pub(crate) gap_blocks_zeroed_on_rel_extend: IntCounter,
 }

 pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
@@ -2115,6 +2116,11 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMet
        "Number of WAL records filtered out due to sharding"
    )
    .expect("failed to define a metric"),
+    gap_blocks_zeroed_on_rel_extend: register_int_counter!(
+        "pageserver_gap_blocks_zeroed_on_rel_extend",
+        "Total number of zero gap blocks written on relation extends"
+    )
+    .expect("failed to define a metric"),
 });

 pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
@@ -3034,13 +3040,111 @@ impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
 }

 pub mod tokio_epoll_uring {
-    use metrics::{register_int_counter, UIntGauge};
+    use std::{
+        collections::HashMap,
+        sync::{Arc, Mutex},
+    };
+
+    use metrics::{register_histogram, register_int_counter, Histogram, LocalHistogram, UIntGauge};
    use once_cell::sync::Lazy;

+    /// Process-wide storage for the tokio-epoll-uring thread local metrics.
+    ///
+    /// On first access this registers the shared
+    /// `pageserver_tokio_epoll_uring_slots_submission_queue_depth` histogram
+    /// and starts out with no registered observers.
+    pub(crate) static THREAD_LOCAL_METRICS_STORAGE: Lazy<ThreadLocalMetricsStorage> =
+        Lazy::new(|| ThreadLocalMetricsStorage {
+            observers: Mutex::new(HashMap::new()),
+            slots_submission_queue_depth: register_histogram!(
+                "pageserver_tokio_epoll_uring_slots_submission_queue_depth",
+                "The slots waiters queue depth of each tokio_epoll_uring system",
+                vec![1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
+            )
+            .expect("failed to define a metric"),
+        });
+
+    /// Holds the per-system metrics observers together with the shared histogram
+    /// that they are flushed into.
+    pub struct ThreadLocalMetricsStorage {
+        /// Thread local metrics observers, keyed by the `id` that was passed to
+        /// `register_system`.
+        observers: Mutex<HashMap<u64, Arc<ThreadLocalMetrics>>>,
+        /// A histogram shared between all thread local systems
+        /// for collecting slots submission queue depth.
+        slots_submission_queue_depth: Histogram,
+    }
+
+    /// Each thread-local [`tokio_epoll_uring::System`] gets one of these as its
+    /// [`tokio_epoll_uring::metrics::PerSystemMetrics`] generic.
+    ///
+    /// The System makes observations into [`Self`] and periodically, the collector
+    /// comes along and flushes [`Self`] into the shared storage [`THREAD_LOCAL_METRICS_STORAGE`].
+    ///
+    /// [`LocalHistogram`] is `!Sync`, so, we need to put it behind a [`Mutex`] to be able to
+    /// share it through an [`Arc`] (a [`Mutex`] would not help with a `!Send` type).
+    /// But except for the periodic flush, the lock is uncontended so there's no waiting
+    /// for cache coherence protocol to get an exclusive cache line.
+    pub struct ThreadLocalMetrics {
+        /// Local observer of thread local tokio-epoll-uring system's slots waiters queue depth.
+        slots_submission_queue_depth: Mutex<LocalHistogram>,
+    }
+
+    impl ThreadLocalMetricsStorage {
+        /// Creates the metrics observer for a new thread local system and remembers it under `id`.
+        ///
+        /// The returned handle is a clone of the `Arc` that the storage keeps for later flushing.
+        pub fn register_system(&self, id: u64) -> Arc<ThreadLocalMetrics> {
+            let local_histogram = self.slots_submission_queue_depth.local();
+            let observer = Arc::new(ThreadLocalMetrics::new(local_histogram));
+            self.observers
+                .lock()
+                .unwrap()
+                .insert(id, Arc::clone(&observer));
+            observer
+        }
+
+        /// Forgets the metrics observer that was registered under `id`.
+        /// This should be called before dropping a thread local system.
+        pub fn remove_system(&self, id: u64) {
+            self.observers.lock().unwrap().remove(&id);
+        }
+
+        /// Flushes every registered thread local observer to the shared storage.
+        pub fn flush_thread_local_metrics(&self) {
+            let observers = self.observers.lock().unwrap();
+            for observer in observers.values() {
+                observer.flush();
+            }
+        }
+    }
+
+    impl ThreadLocalMetrics {
+        /// Wraps the given local histogram in a [`Mutex`] so it can be used through `&self`.
+        pub fn new(slots_submission_queue_depth: LocalHistogram) -> Self {
+            let slots_submission_queue_depth = Mutex::new(slots_submission_queue_depth);
+            Self {
+                slots_submission_queue_depth,
+            }
+        }
+
+        /// Flushes the thread local metrics to shared aggregator.
+        pub fn flush(&self) {
+            // Exhaustive destructuring: a newly added field fails to compile here
+            // until it is handled as well.
+            let Self {
+                slots_submission_queue_depth: queue_depth_histogram,
+            } = self;
+            let histogram = queue_depth_histogram.lock().unwrap();
+            histogram.flush();
+        }
+    }
+
+    impl tokio_epoll_uring::metrics::PerSystemMetrics for ThreadLocalMetrics {
+        /// Records one queue depth sample in this system's local histogram.
+        fn observe_slots_submission_queue_depth(&self, queue_depth: u64) {
+            // Exhaustive destructuring, so that new fields are noticed here at compile time.
+            let Self {
+                slots_submission_queue_depth: local_histogram,
+            } = self;
+            let sample = queue_depth as f64;
+            local_histogram.lock().unwrap().observe(sample);
+        }
+    }
+
    pub struct Collector {
        descs: Vec<metrics::core::Desc>,
        systems_created: UIntGauge,
        systems_destroyed: UIntGauge,
+        thread_local_metrics_storage: &'static ThreadLocalMetricsStorage,
    }

    impl metrics::core::Collector for Collector {
@@ -3050,7 +3154,7 @@ pub mod tokio_epoll_uring {

        fn collect(&self) -> Vec<metrics::proto::MetricFamily> {
            let mut mfs = Vec::with_capacity(Self::NMETRICS);
-            let tokio_epoll_uring::metrics::Metrics {
+            let tokio_epoll_uring::metrics::GlobalMetrics {
                systems_created,
                systems_destroyed,
            } = tokio_epoll_uring::metrics::global();
@@ -3058,12 +3162,21 @@ pub mod tokio_epoll_uring {
            mfs.extend(self.systems_created.collect());
            self.systems_destroyed.set(systems_destroyed);
            mfs.extend(self.systems_destroyed.collect());
+
+            self.thread_local_metrics_storage
+                .flush_thread_local_metrics();
+
+            mfs.extend(
+                self.thread_local_metrics_storage
+                    .slots_submission_queue_depth
+                    .collect(),
+            );
            mfs
        }
    }

    impl Collector {
-        const NMETRICS: usize = 2;
+        const NMETRICS: usize = 3;

        #[allow(clippy::new_without_default)]
        pub fn new() -> Self {
@@ -3095,6 +3208,7 @@ pub mod tokio_epoll_uring {
                descs,
                systems_created,
                systems_destroyed,
+                thread_local_metrics_storage: &THREAD_LOCAL_METRICS_STORAGE,
            }
        }
    }
@@ -3454,6 +3568,7 @@ pub fn preinitialize_metrics() {
    Lazy::force(&RECONSTRUCT_TIME);
    Lazy::force(&BASEBACKUP_QUERY_TIME);
    Lazy::force(&COMPUTE_COMMANDS_COUNTERS);
+    Lazy::force(&tokio_epoll_uring::THREAD_LOCAL_METRICS_STORAGE);

    tenant_throttling::preinitialize_global_metrics();
 }
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -1080,6 +1080,7 @@ impl PageServerHandler {
        prev_lsn: Option<Lsn>,
        full_backup: bool,
        gzip: bool,
+        replica: bool,
        ctx: &RequestContext,
    ) -> Result<(), QueryError>
    where
@@ -1132,6 +1133,7 @@ impl PageServerHandler {
                lsn,
                prev_lsn,
                full_backup,
+                replica,
                ctx,
            )
            .await
@@ -1154,6 +1156,7 @@ impl PageServerHandler {
                    lsn,
                    prev_lsn,
                    full_backup,
+                    replica,
                    ctx,
                )
                .await
@@ -1170,6 +1173,7 @@ impl PageServerHandler {
                    lsn,
                    prev_lsn,
                    full_backup,
+                    replica,
                    ctx,
                )
                .await
@@ -1326,24 +1330,27 @@ where
                .for_command(ComputeCommandKind::Basebackup)
                .inc();

-            let (lsn, gzip) = match (params.get(2), params.get(3)) {
-                (None, _) => (None, false),
-                (Some(&"--gzip"), _) => (None, true),
-                (Some(lsn_str), gzip_str_opt) => {
-                    let lsn = Lsn::from_str(lsn_str)
-                        .with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?;
-                    let gzip = match gzip_str_opt {
-                        Some(&"--gzip") => true,
-                        None => false,
-                        Some(third_param) => {
+            let mut lsn = None;
+            let mut replica = false;
+            let mut gzip = false;
+            for param in &params[2..] {
+                if param.starts_with("--") {
+                    match *param {
+                        "--gzip" => gzip = true,
+                        "--replica" => replica = true,
+                        _ => {
                            return Err(QueryError::Other(anyhow::anyhow!(
-                                "Parameter in position 3 unknown {third_param}",
+                                "Unknown parameter {param}",
                            )))
                        }
-                    };
-                    (Some(lsn), gzip)
+                    }
+                } else {
+                    lsn = Some(
+                        Lsn::from_str(param)
+                            .with_context(|| format!("Failed to parse Lsn from {param}"))?,
+                    );
                }
-            };
+            }

            let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx);
            let res = async {
@@ -1355,6 +1362,7 @@ where
                    None,
                    false,
                    gzip,
+                    replica,
                    &ctx,
                )
                .await?;
@@ -1415,6 +1423,7 @@ where
                prev_lsn,
                true,
                false,
+                false,
                &ctx,
            )
            .await?;
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -7,14 +7,14 @@
 //! Clarify that)
 //!
 use super::tenant::{PageReconstructError, Timeline};
+use crate::aux_file;
 use crate::context::RequestContext;
 use crate::keyspace::{KeySpace, KeySpaceAccum};
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
-use crate::walrecord::NeonWalRecord;
-use crate::{aux_file, repository::*};
 use anyhow::{ensure, Context};
 use bytes::{Buf, Bytes, BytesMut};
 use enum_map::Enum;
+use pageserver_api::key::Key;
 use pageserver_api::key::{
    dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key,
    relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
@@ -22,7 +22,9 @@ use pageserver_api::key::{
    CompactKey, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
 };
 use pageserver_api::keyspace::SparseKeySpace;
+use pageserver_api::record::NeonWalRecord;
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
+use pageserver_api::value::Value;
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::BLCKSZ;
 use postgres_ffi::{Oid, RepOriginId, TimestampTz, TransactionId};
@@ -1506,35 +1508,42 @@ impl<'a> DatadirModification<'a> {
        Ok(())
    }

-    /// Drop a relation.
-    pub async fn put_rel_drop(&mut self, rel: RelTag, ctx: &RequestContext) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
+    /// Drop some relations
+    pub(crate) async fn put_rel_drops(
+        &mut self,
+        drop_relations: HashMap<(u32, u32), Vec<RelTag>>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        for ((spc_node, db_node), rel_tags) in drop_relations {
+            let dir_key = rel_dir_to_key(spc_node, db_node);
+            let buf = self.get(dir_key, ctx).await?;
+            let mut dir = RelDirectory::des(&buf)?;

-        // Remove it from the directory entry
-        let dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
-        let buf = self.get(dir_key, ctx).await?;
-        let mut dir = RelDirectory::des(&buf)?;
+            let mut dirty = false;
+            for rel_tag in rel_tags {
+                if dir.rels.remove(&(rel_tag.relnode, rel_tag.forknum)) {
+                    dirty = true;

-        self.pending_directory_entries
-            .push((DirectoryKind::Rel, dir.rels.len()));
+                    // update logical size
+                    let size_key = rel_size_to_key(rel_tag);
+                    let old_size = self.get(size_key, ctx).await?.get_u32_le();
+                    self.pending_nblocks -= old_size as i64;

-        if dir.rels.remove(&(rel.relnode, rel.forknum)) {
-            self.put(dir_key, Value::Image(Bytes::from(RelDirectory::ser(&dir)?)));
-        } else {
-            warn!("dropped rel {} did not exist in rel directory", rel);
+                    // Remove entry from relation size cache
+                    self.tline.remove_cached_rel_size(&rel_tag);
+
+                    // Delete size entry, as well as all blocks
+                    self.delete(rel_key_range(rel_tag));
+                }
+            }
+
+            if dirty {
+                self.put(dir_key, Value::Image(Bytes::from(RelDirectory::ser(&dir)?)));
+                self.pending_directory_entries
+                    .push((DirectoryKind::Rel, dir.rels.len()));
+            }
        }

-        // update logical size
-        let size_key = rel_size_to_key(rel);
-        let old_size = self.get(size_key, ctx).await?.get_u32_le();
-        self.pending_nblocks -= old_size as i64;
-
-        // Remove enty from relation size cache
-        self.tline.remove_cached_rel_size(&rel);
-
-        // Delete size entry, as well as all blocks
-        self.delete(rel_key_range(rel));
-
        Ok(())
    }

--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -9,7 +9,6 @@
 //! may lead to a data loss.
 //!
 pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf;
-use pageserver_api::models::AuxFilePolicy;
 use pageserver_api::models::CompactionAlgorithmSettings;
 use pageserver_api::models::EvictionPolicy;
 use pageserver_api::models::{self, ThrottleConfig};
@@ -341,10 +340,6 @@ pub struct TenantConfOpt {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub image_layer_creation_check_threshold: Option<u8>,

-    #[serde(skip_serializing_if = "Option::is_none")]
-    #[serde(default)]
-    pub switch_aux_file_policy: Option<AuxFilePolicy>,
-
    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(with = "humantime_serde")]
    #[serde(default)]
@@ -410,9 +405,6 @@ impl TenantConfOpt {
            image_layer_creation_check_threshold: self
                .image_layer_creation_check_threshold
                .unwrap_or(global_conf.image_layer_creation_check_threshold),
-            switch_aux_file_policy: self
-                .switch_aux_file_policy
-                .unwrap_or(global_conf.switch_aux_file_policy),
            lsn_lease_length: self
                .lsn_lease_length
                .unwrap_or(global_conf.lsn_lease_length),
@@ -470,7 +462,6 @@ impl From<TenantConfOpt> for models::TenantConfig {
            lazy_slru_download: value.lazy_slru_download,
            timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
            image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
-            switch_aux_file_policy: value.switch_aux_file_policy,
            lsn_lease_length: value.lsn_lease_length.map(humantime),
            lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime),
        }
--- a/pageserver/src/tenant/gc_result.rs
+++ b/pageserver/src/tenant/gc_result.rs
@@ -0,0 +1,57 @@
+use anyhow::Result;
+use serde::Serialize;
+use std::ops::AddAssign;
+use std::time::Duration;
+
+///
+/// Result of performing GC
+///
+/// Results can be accumulated with `+=` (see the `AddAssign` impl): the layer counters
+/// and `elapsed` are summed.
+///
+#[derive(Default, Serialize, Debug)]
+pub struct GcResult {
+    pub layers_total: u64,
+    pub layers_needed_by_cutoff: u64,
+    pub layers_needed_by_pitr: u64,
+    pub layers_needed_by_branches: u64,
+    pub layers_needed_by_leases: u64,
+    pub layers_not_updated: u64,
+    pub layers_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files.
+
+    /// Serialized as an integer number of milliseconds.
+    #[serde(serialize_with = "serialize_duration_as_millis")]
+    pub elapsed: Duration,
+
+    /// The layers which were garbage collected.
+    ///
+    /// Used in `/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc` to wait for the layers to be
+    /// dropped in tests.
+    ///
+    /// Only present with the `testing` feature and never serialized.
+    #[cfg(feature = "testing")]
+    #[serde(skip)]
+    pub(crate) doomed_layers: Vec<crate::tenant::storage_layer::Layer>,
+}
+
+// `serialize_with` helper for `GcResult::elapsed`: emits the `Duration` as an integer
+// number of milliseconds (a `u128`, as returned by `Duration::as_millis`)
+fn serialize_duration_as_millis<S>(d: &Duration, serializer: S) -> Result<S::Ok, S::Error>
+where
+    S: serde::Serializer,
+{
+    let millis: u128 = d.as_millis();
+    serializer.serialize_u128(millis)
+}
+
+impl AddAssign for GcResult {
+    /// Accumulates `rhs` into `self`: all layer counters and the elapsed time are summed;
+    /// with the `testing` feature the doomed layers of `rhs` are moved to the end of ours.
+    fn add_assign(&mut self, rhs: Self) {
+        // counters, in struct declaration order
+        self.layers_total += rhs.layers_total;
+        self.layers_needed_by_cutoff += rhs.layers_needed_by_cutoff;
+        self.layers_needed_by_pitr += rhs.layers_needed_by_pitr;
+        self.layers_needed_by_branches += rhs.layers_needed_by_branches;
+        self.layers_needed_by_leases += rhs.layers_needed_by_leases;
+        self.layers_not_updated += rhs.layers_not_updated;
+        self.layers_removed += rhs.layers_removed;
+
+        self.elapsed += rhs.elapsed;
+
+        #[cfg(feature = "testing")]
+        {
+            self.doomed_layers.extend(rhs.doomed_layers);
+        }
+    }
+}
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -48,9 +48,9 @@ mod layer_coverage;

 use crate::context::RequestContext;
 use crate::keyspace::KeyPartitioning;
-use crate::repository::Key;
 use crate::tenant::storage_layer::InMemoryLayer;
 use anyhow::Result;
+use pageserver_api::key::Key;
 use pageserver_api::keyspace::{KeySpace, KeySpaceAccum};
 use range_set_blaze::{CheckSortedDisjoint, RangeSetBlaze};
 use std::collections::{HashMap, VecDeque};
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -2811,7 +2811,7 @@ where
 }

 use {
-    crate::repository::GcResult, pageserver_api::models::TimelineGcRequest,
+    crate::tenant::gc_result::GcResult, pageserver_api::models::TimelineGcRequest,
    utils::http::error::ApiError,
 };

--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -180,6 +180,7 @@

 pub(crate) mod download;
 pub mod index;
+pub mod manifest;
 pub(crate) mod upload;

 use anyhow::Context;
@@ -189,9 +190,9 @@ use chrono::{NaiveDateTime, Utc};
 pub(crate) use download::download_initdb_tar_zst;
 use pageserver_api::models::TimelineArchivalState;
 use pageserver_api::shard::{ShardIndex, TenantShardId};
+use regex::Regex;
 use scopeguard::ScopeGuard;
 use tokio_util::sync::CancellationToken;
-pub(crate) use upload::upload_initdb_dir;
 use utils::backoff::{
    self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
 };
@@ -199,7 +200,7 @@ use utils::pausable_failpoint;

 use std::collections::{HashMap, VecDeque};
 use std::sync::atomic::{AtomicU32, Ordering};
-use std::sync::{Arc, Mutex};
+use std::sync::{Arc, Mutex, OnceLock};
 use std::time::Duration;

 use remote_storage::{
@@ -245,9 +246,11 @@ use super::upload_queue::{NotInitialized, SetDeletedFlagProgress};
 use super::Generation;

 pub(crate) use download::{
-    download_index_part, is_temp_download_file, list_remote_tenant_shards, list_remote_timelines,
+    download_index_part, download_tenant_manifest, is_temp_download_file,
+    list_remote_tenant_shards, list_remote_timelines,
 };
 pub(crate) use index::LayerFileMetadata;
+pub(crate) use upload::upload_initdb_dir;

 // Occasional network issues and such can cause remote operations to fail, and
 // that's expected. If a download fails, we log it at info-level, and retry.
@@ -295,6 +298,10 @@ pub enum WaitCompletionError {
    UploadQueueShutDownOrStopped,
 }

+/// Error returned by [`RemoteTimelineClient::archived_at_stopped_queue`] when calling
+/// `stopped_mut()` on the upload queue fails.
+#[derive(Debug, thiserror::Error)]
+#[error("Upload queue either in unexpected state or hasn't downloaded manifest yet")]
+pub struct UploadQueueNotReadyError;
+
 /// A client for accessing a timeline's data in remote storage.
 ///
 /// This takes care of managing the number of connections, and balancing them
@@ -468,6 +475,20 @@ impl RemoteTimelineClient {
            .ok()
    }

+    /// Reads the archival timestamp from the stopped upload queue.
+    ///
+    /// Returns `Ok(Some(timestamp))` if the timeline has been archived, `Ok(None)` if the timeline hasn't been archived.
+    ///
+    /// Return Err(_) if the remote index_part hasn't been downloaded yet, or the timeline hasn't been stopped yet.
+    pub(crate) fn archived_at_stopped_queue(
+        &self,
+    ) -> Result<Option<NaiveDateTime>, UploadQueueNotReadyError> {
+        let mut queue = self.upload_queue.lock().unwrap();
+        match queue.stopped_mut() {
+            Ok(stopped) => Ok(stopped.upload_queue_for_deletion.clean.0.archived_at),
+            Err(_) => Err(UploadQueueNotReadyError),
+        }
+    }
+
    fn update_remote_physical_size_gauge(&self, current_remote_index_part: Option<&IndexPart>) {
        let size: u64 = if let Some(current_remote_index_part) = current_remote_index_part {
            current_remote_index_part
@@ -1252,10 +1273,14 @@ impl RemoteTimelineClient {
        let fut = {
            let mut guard = self.upload_queue.lock().unwrap();
            let upload_queue = match &mut *guard {
-                UploadQueue::Stopped(_) => return,
+                UploadQueue::Stopped(_) => {
+                    scopeguard::ScopeGuard::into_inner(sg);
+                    return;
+                }
                UploadQueue::Uninitialized => {
                    // transition into Stopped state
                    self.stop_impl(&mut guard);
+                    scopeguard::ScopeGuard::into_inner(sg);
                    return;
                }
                UploadQueue::Initialized(ref mut init) => init,
@@ -2198,6 +2223,23 @@ pub fn remote_tenant_path(tenant_shard_id: &TenantShardId) -> RemotePath {
    RemotePath::from_string(&path).expect("Failed to construct path")
 }

+/// Remote path of the tenant manifest object of a tenant shard for the given generation:
+/// `tenants/{tenant_shard_id}/tenant-manifest{generation suffix}.json`
+pub fn remote_tenant_manifest_path(
+    tenant_shard_id: &TenantShardId,
+    generation: Generation,
+) -> RemotePath {
+    let suffix = generation.get_suffix();
+    let key = format!("tenants/{tenant_shard_id}/tenant-manifest{suffix}.json");
+    RemotePath::from_string(&key).expect("Failed to construct path")
+}
+
+/// Common prefix of the manifest objects of all generations in a tenant shard:
+/// `tenants/{tenant_shard_id}/tenant-manifest`
+pub fn remote_tenant_manifest_prefix(tenant_shard_id: &TenantShardId) -> RemotePath {
+    let prefix = format!("tenants/{tenant_shard_id}/tenant-manifest");
+    RemotePath::from_string(&prefix).expect("Failed to construct path")
+}
+
 pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath {
    let path = format!("tenants/{tenant_shard_id}/{TIMELINES_SEGMENT_NAME}");
    RemotePath::from_string(&path).expect("Failed to construct path")
@@ -2292,6 +2334,15 @@ pub fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
    }
 }

+/// Given the key of a tenant manifest, parse out the generation number
+///
+/// Returns `None` if the path does not contain `tenant-manifest-<8 lowercase hex digits>.json`
+/// after at least one leading character, or if `Generation::parse_suffix` rejects the
+/// captured hex digits.
+pub(crate) fn parse_remote_tenant_manifest_path(path: RemotePath) -> Option<Generation> {
+    static RE: OnceLock<Regex> = OnceLock::new();
+    // `\.` matches only the literal dot of the extension; an unescaped `.` would accept
+    // any character in that position (e.g. `tenant-manifest-00000001xjson`).
+    let re = RE.get_or_init(|| Regex::new(r".+tenant-manifest-([0-9a-f]{8})\.json").unwrap());
+    re.captures(path.get_path().as_str())
+        .and_then(|c| c.get(1))
+        .and_then(|m| Generation::parse_suffix(m.as_str()))
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -20,7 +20,9 @@ use utils::backoff;

 use crate::config::PageServerConf;
 use crate::context::RequestContext;
-use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
+use crate::span::{
+    debug_assert_current_span_has_tenant_and_timeline_id, debug_assert_current_span_has_tenant_id,
+};
 use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
 use crate::tenant::storage_layer::LayerName;
 use crate::tenant::Generation;
@@ -34,9 +36,11 @@ use utils::id::{TenantId, TimelineId};
 use utils::pausable_failpoint;

 use super::index::{IndexPart, LayerFileMetadata};
+use super::manifest::TenantManifest;
 use super::{
-    parse_remote_index_path, remote_index_path, remote_initdb_archive_path,
-    remote_initdb_preserved_archive_path, remote_tenant_path, FAILED_DOWNLOAD_WARN_THRESHOLD,
+    parse_remote_index_path, parse_remote_tenant_manifest_path, remote_index_path,
+    remote_initdb_archive_path, remote_initdb_preserved_archive_path, remote_tenant_manifest_path,
+    remote_tenant_manifest_prefix, remote_tenant_path, FAILED_DOWNLOAD_WARN_THRESHOLD,
    FAILED_REMOTE_OP_RETRIES, INITDB_PATH,
 };

@@ -338,19 +342,15 @@ pub async fn list_remote_timelines(
    list_identifiers::<TimelineId>(storage, remote_path, cancel).await
 }

-async fn do_download_index_part(
+async fn do_download_remote_path_retry_forever(
    storage: &GenericRemoteStorage,
-    tenant_shard_id: &TenantShardId,
-    timeline_id: &TimelineId,
-    index_generation: Generation,
+    remote_path: &RemotePath,
    cancel: &CancellationToken,
-) -> Result<(IndexPart, Generation, SystemTime), DownloadError> {
-    let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation);
-
-    let (index_part_bytes, index_part_mtime) = download_retry_forever(
+) -> Result<(Vec<u8>, SystemTime), DownloadError> {
+    download_retry_forever(
        || async {
            let download = storage
-                .download(&remote_path, &DownloadOpts::default(), cancel)
+                .download(remote_path, &DownloadOpts::default(), cancel)
                .await?;

            let mut bytes = Vec::new();
@@ -365,7 +365,41 @@ async fn do_download_index_part(
        &format!("download {remote_path:?}"),
        cancel,
    )
-    .await?;
+    .await
+}
+
+/// Downloads and deserializes the tenant manifest stored under exactly `generation`.
+/// `_timeline_id` is unused; it is only there to match the `do_download` callback of `download_generation_object`.
+async fn do_download_tenant_manifest(
+    storage: &GenericRemoteStorage,
+    tenant_shard_id: &TenantShardId,
+    _timeline_id: Option<&TimelineId>,
+    generation: Generation,
+    cancel: &CancellationToken,
+) -> Result<(TenantManifest, Generation, SystemTime), DownloadError> {
+    let remote_path = remote_tenant_manifest_path(tenant_shard_id, generation);
+    let (bytes, mtime) =
+        do_download_remote_path_retry_forever(storage, &remote_path, cancel).await?;
+
+    let tenant_manifest = TenantManifest::from_json_bytes(&bytes)
+        .with_context(|| format!("deserialize tenant manifest file at {remote_path:?}"))
+        .map_err(DownloadError::Other)?;
+    Ok((tenant_manifest, generation, mtime))
+}
+
+async fn do_download_index_part(
+    storage: &GenericRemoteStorage,
+    tenant_shard_id: &TenantShardId,
+    timeline_id: Option<&TimelineId>,
+    index_generation: Generation,
+    cancel: &CancellationToken,
+) -> Result<(IndexPart, Generation, SystemTime), DownloadError> {
+    let timeline_id =
+        timeline_id.expect("A timeline ID is always provided when downloading an index");
+    let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation);
+
+    let (index_part_bytes, index_part_mtime) =
+        do_download_remote_path_retry_forever(storage, &remote_path, cancel).await?;

    let index_part: IndexPart = serde_json::from_slice(&index_part_bytes)
        .with_context(|| format!("deserialize index part file at {remote_path:?}"))
@@ -374,59 +408,79 @@ async fn do_download_index_part(
    Ok((index_part, index_generation, index_part_mtime))
 }

-/// index_part.json objects are suffixed with a generation number, so we cannot
-/// directly GET the latest index part without doing some probing.
+/// Metadata objects are "generationed", meaning that they include a generation suffix.  This
+/// function downloads the object with the highest generation <= `my_generation`.
 ///
-/// In this function we probe for the most recent index in a generation <= our current generation.
-/// See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md
+/// Data objects (layer files) also include a generation in their path, but there is no equivalent
+/// search process, because their reference from an index includes the generation.
+///
+/// An expensive object listing operation is only done if necessary: the typical fast path is to issue two
+/// GET operations, one to our own generation (stale attachment case), and one to the immediately preceding
+/// generation (normal case when migrating/restarting).  Only if both of these return 404 do we fall back
+/// to listing objects.
+///
+/// * `my_generation`: the value of `[crate::tenant::Tenant::generation]`
+/// * `what`: for logging, what object are we downloading
+/// * `prefix`: when listing objects, use this prefix (i.e. the part of the object path before the generation)
+/// * `do_download`: a GET of the object in a particular generation, which should **retry indefinitely** unless
+///                  `cancel` has fired.  This function does not do its own retries of GET operations, and relies
+///                  on the function passed in to do so.
+/// * `parse_path`: parse a fully qualified remote storage path to get the generation of the object.
+#[allow(clippy::too_many_arguments)]
 #[tracing::instrument(skip_all, fields(generation=?my_generation))]
-pub(crate) async fn download_index_part(
-    storage: &GenericRemoteStorage,
-    tenant_shard_id: &TenantShardId,
-    timeline_id: &TimelineId,
+pub(crate) async fn download_generation_object<'a, T, DF, DFF, PF>(
+    storage: &'a GenericRemoteStorage,
+    tenant_shard_id: &'a TenantShardId,
+    timeline_id: Option<&'a TimelineId>,
    my_generation: Generation,
-    cancel: &CancellationToken,
-) -> Result<(IndexPart, Generation, SystemTime), DownloadError> {
-    debug_assert_current_span_has_tenant_and_timeline_id();
+    what: &str,
+    prefix: RemotePath,
+    do_download: DF,
+    parse_path: PF,
+    cancel: &'a CancellationToken,
+) -> Result<(T, Generation, SystemTime), DownloadError>
+where
+    DF: Fn(
+        &'a GenericRemoteStorage,
+        &'a TenantShardId,
+        Option<&'a TimelineId>,
+        Generation,
+        &'a CancellationToken,
+    ) -> DFF,
+    DFF: Future<Output = Result<(T, Generation, SystemTime), DownloadError>>,
+    PF: Fn(RemotePath) -> Option<Generation>,
+    T: 'static,
+{
+    debug_assert_current_span_has_tenant_id();

    if my_generation.is_none() {
        // Operating without generations: just fetch the generation-less path
-        return do_download_index_part(
-            storage,
-            tenant_shard_id,
-            timeline_id,
-            my_generation,
-            cancel,
-        )
-        .await;
+        return do_download(storage, tenant_shard_id, timeline_id, my_generation, cancel).await;
    }

-    // Stale case: If we were intentionally attached in a stale generation, there may already be a remote
-    // index in our generation.
+    // Stale case: If we were intentionally attached in a stale generation, the remote object may already
+    // exist in our generation.
    //
    // This is an optimization to avoid doing the listing for the general case below.
-    let res =
-        do_download_index_part(storage, tenant_shard_id, timeline_id, my_generation, cancel).await;
+    let res = do_download(storage, tenant_shard_id, timeline_id, my_generation, cancel).await;
    match res {
-        Ok(index_part) => {
-            tracing::debug!(
-                "Found index_part from current generation (this is a stale attachment)"
-            );
-            return Ok(index_part);
+        Ok(decoded) => {
+            tracing::debug!("Found {what} from current generation (this is a stale attachment)");
+            return Ok(decoded);
        }
        Err(DownloadError::NotFound) => {}
        Err(e) => return Err(e),
    };

-    // Typical case: the previous generation of this tenant was running healthily, and had uploaded
-    // and index part.  We may safely start from this index without doing a listing, because:
+    // Typical case: the previous generation of this tenant was running healthily, and had uploaded the object
+    // we are seeking in that generation.  We may safely start from this object without doing a listing, because:
    //  - We checked for current generation case above
    //  - generations > my_generation are to be ignored
-    //  - any other indices that exist would have an older generation than `previous_gen`, and
-    //    we want to find the most recent index from a previous generation.
+    //  - any other objects that exist would have an older generation than `previous_gen`, and
+    //    we want to find the most recent object from a previous generation.
    //
    // This is an optimization to avoid doing the listing for the general case below.
-    let res = do_download_index_part(
+    let res = do_download(
        storage,
        tenant_shard_id,
        timeline_id,
@@ -435,14 +489,12 @@ pub(crate) async fn download_index_part(
    )
    .await;
    match res {
-        Ok(index_part) => {
-            tracing::debug!("Found index_part from previous generation");
-            return Ok(index_part);
+        Ok(decoded) => {
+            tracing::debug!("Found {what} from previous generation");
+            return Ok(decoded);
        }
        Err(DownloadError::NotFound) => {
-            tracing::debug!(
-                "No index_part found from previous generation, falling back to listing"
-            );
+            tracing::debug!("No {what} found from previous generation, falling back to listing");
        }
        Err(e) => {
            return Err(e);
@@ -452,12 +504,10 @@ pub(crate) async fn download_index_part(
    // General case/fallback: if there is no index at my_generation or prev_generation, then list all index_part.json
    // objects, and select the highest one with a generation <= my_generation.  Constructing the prefix is equivalent
    // to constructing a full index path with no generation, because the generation is a suffix.
-    let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none());
-
-    let indices = download_retry(
+    let paths = download_retry(
        || async {
            storage
-                .list(Some(&index_prefix), ListingMode::NoDelimiter, None, cancel)
+                .list(Some(&prefix), ListingMode::NoDelimiter, None, cancel)
                .await
        },
        "list index_part files",
@@ -468,22 +518,22 @@ pub(crate) async fn download_index_part(

    // General case logic for which index to use: the latest index whose generation
    // is <= our own.  See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md
-    let max_previous_generation = indices
+    let max_previous_generation = paths
        .into_iter()
-        .filter_map(|o| parse_remote_index_path(o.key))
+        .filter_map(|o| parse_path(o.key))
        .filter(|g| g <= &my_generation)
        .max();

    match max_previous_generation {
        Some(g) => {
-            tracing::debug!("Found index_part in generation {g:?}");
-            do_download_index_part(storage, tenant_shard_id, timeline_id, g, cancel).await
+            tracing::debug!("Found {what} in generation {g:?}");
+            do_download(storage, tenant_shard_id, timeline_id, g, cancel).await
        }
        None => {
            // Migration from legacy pre-generation state: we have a generation but no prior
            // attached pageservers did.  Try to load from a no-generation path.
-            tracing::debug!("No index_part.json* found");
-            do_download_index_part(
+            tracing::debug!("No {what}* found");
+            do_download(
                storage,
                tenant_shard_id,
                timeline_id,
@@ -495,6 +545,57 @@ pub(crate) async fn download_index_part(
    }
 }

+/// index_part.json objects are suffixed with a generation number, so we cannot
+/// directly GET the latest index part without doing some probing.
+///
+/// In this function we probe (via `download_generation_object`) for the most recent index in a
+/// generation <= `my_generation`. See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md
+pub(crate) async fn download_index_part(
+    storage: &GenericRemoteStorage,
+    tenant_shard_id: &TenantShardId,
+    timeline_id: &TimelineId,
+    my_generation: Generation,
+    cancel: &CancellationToken,
+) -> Result<(IndexPart, Generation, SystemTime), DownloadError> {
+    debug_assert_current_span_has_tenant_and_timeline_id();
+
+    let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none());
+    download_generation_object(
+        storage,
+        tenant_shard_id,
+        Some(timeline_id),
+        my_generation,
+        "index_part",
+        index_prefix,
+        do_download_index_part,
+        parse_remote_index_path,
+        cancel,
+    )
+    .await
+}
+
+/// Downloads the tenant manifest with the highest generation <= `my_generation`,
+/// using the same probing strategy as `download_index_part` (see `download_generation_object`).
+pub(crate) async fn download_tenant_manifest(
+    storage: &GenericRemoteStorage,
+    tenant_shard_id: &TenantShardId,
+    my_generation: Generation,
+    cancel: &CancellationToken,
+) -> Result<(TenantManifest, Generation, SystemTime), DownloadError> {
+    download_generation_object(
+        storage,
+        tenant_shard_id,
+        None,
+        my_generation,
+        "tenant-manifest",
+        remote_tenant_manifest_prefix(tenant_shard_id),
+        do_download_tenant_manifest,
+        parse_remote_tenant_manifest_path,
+        cancel,
+    )
+    .await
+}
+
 pub(crate) async fn download_initdb_tar_zst(
    conf: &'static PageServerConf,
    storage: &GenericRemoteStorage,
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -121,11 +121,11 @@ impl IndexPart {
        self.disk_consistent_lsn
    }

-    pub fn from_s3_bytes(bytes: &[u8]) -> Result<Self, serde_json::Error> {
+    pub fn from_json_bytes(bytes: &[u8]) -> Result<Self, serde_json::Error> {
        serde_json::from_slice::<IndexPart>(bytes)
    }

-    pub fn to_s3_bytes(&self) -> serde_json::Result<Vec<u8>> {
+    pub fn to_json_bytes(&self) -> serde_json::Result<Vec<u8>> {
        serde_json::to_vec(self)
    }

@@ -383,7 +383,7 @@ mod tests {
            last_aux_file_policy: None,
        };

-        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
+        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
        assert_eq!(part, expected);
    }

@@ -427,7 +427,7 @@ mod tests {
            last_aux_file_policy: None,
        };

-        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
+        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
        assert_eq!(part, expected);
    }

@@ -472,7 +472,7 @@ mod tests {
            last_aux_file_policy: None,
        };

-        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
+        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
        assert_eq!(part, expected);
    }

@@ -520,7 +520,7 @@ mod tests {
            last_aux_file_policy: None,
        };

-        let empty_layers_parsed = IndexPart::from_s3_bytes(empty_layers_json.as_bytes()).unwrap();
+        let empty_layers_parsed = IndexPart::from_json_bytes(empty_layers_json.as_bytes()).unwrap();

        assert_eq!(empty_layers_parsed, expected);
    }
@@ -563,7 +563,7 @@ mod tests {
            last_aux_file_policy: None,
        };

-        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
+        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
        assert_eq!(part, expected);
    }

@@ -609,7 +609,7 @@ mod tests {
            last_aux_file_policy: None,
        };

-        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
+        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
        assert_eq!(part, expected);
    }

@@ -660,7 +660,7 @@ mod tests {
            last_aux_file_policy: Some(AuxFilePolicy::V2),
        };

-        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
+        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
        assert_eq!(part, expected);
    }

@@ -716,7 +716,7 @@ mod tests {
            last_aux_file_policy: Default::default(),
        };

-        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
+        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
        assert_eq!(part, expected);
    }

@@ -773,7 +773,7 @@ mod tests {
            last_aux_file_policy: Default::default(),
        };

-        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
+        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
        assert_eq!(part, expected);
    }

@@ -835,7 +835,7 @@ mod tests {
            archived_at: None,
        };

-        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
+        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
        assert_eq!(part, expected);
    }

--- a/pageserver/src/tenant/remote_timeline_client/manifest.rs
+++ b/pageserver/src/tenant/remote_timeline_client/manifest.rs
@@ -0,0 +1,53 @@
+use chrono::NaiveDateTime;
+use serde::{Deserialize, Serialize};
+use utils::{id::TimelineId, lsn::Lsn};
+
+/// Tenant-shard scoped manifest, serialized as JSON (see `from_json_bytes` / `to_json_bytes`)
+#[derive(Clone, Serialize, Deserialize, PartialEq, Eq)]
+pub struct TenantManifest {
+    /// Debugging aid describing the version of this manifest.
+    /// Can also be used for distinguishing breaking changes later on.
+    pub version: usize,
+
+    /// The list of offloaded timelines together with enough information
+    /// to not have to actually load them.
+    ///
+    /// Note: the timelines mentioned in this list might be deleted, i.e.
+    /// we don't hold an invariant that the references aren't dangling.
+    /// Existence of `index_part.json` is the actual indicator of timeline existence.
+    pub offloaded_timelines: Vec<OffloadedTimelineManifest>,
+}
+
+/// The remote level representation of an offloaded timeline.
+///
+/// Very similar to [`pageserver_api::models::OffloadedTimelineInfo`],
+/// but the two data structures serve different needs: this one is a persistent disk format
+/// that must be backwards compatible, while the other is only for informative purposes.
+#[derive(Clone, Serialize, Deserialize, Copy, PartialEq, Eq)]
+pub struct OffloadedTimelineManifest {
+    pub timeline_id: TimelineId,
+    /// The timeline this one has been branched off from, or `None` if it has no parent
+    pub ancestor_timeline_id: Option<TimelineId>,
+    /// The branch LSN to retain at the ancestor, if any is to be retained
+    pub ancestor_retain_lsn: Option<Lsn>,
+    /// The time point when the timeline was archived
+    pub archived_at: NaiveDateTime,
+}
+
+pub const LATEST_TENANT_MANIFEST_VERSION: usize = 1;
+
+impl TenantManifest {
+    pub(crate) fn empty() -> Self {
+        Self {
+            version: LATEST_TENANT_MANIFEST_VERSION,
+            offloaded_timelines: vec![],
+        }
+    }
+    pub(crate) fn from_json_bytes(bytes: &[u8]) -> Result<Self, serde_json::Error> {
+        serde_json::from_slice::<Self>(bytes)
+    }
+
+    pub(crate) fn to_json_bytes(&self) -> serde_json::Result<Vec<u8>> {
+        serde_json::to_vec(self)
+    }
+}
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -13,9 +13,11 @@ use tokio_util::sync::CancellationToken;
 use utils::{backoff, pausable_failpoint};

 use super::index::IndexPart;
+use super::manifest::TenantManifest;
 use super::Generation;
 use crate::tenant::remote_timeline_client::{
    remote_index_path, remote_initdb_archive_path, remote_initdb_preserved_archive_path,
+    remote_tenant_manifest_path,
 };
 use remote_storage::{GenericRemoteStorage, RemotePath, TimeTravelError};
 use utils::id::{TenantId, TimelineId};
@@ -39,7 +41,7 @@ pub(crate) async fn upload_index_part<'a>(
    pausable_failpoint!("before-upload-index-pausable");

    // FIXME: this error comes too late
-    let serialized = index_part.to_s3_bytes()?;
+    let serialized = index_part.to_json_bytes()?;
    let serialized = Bytes::from(serialized);

    let index_part_size = serialized.len();
@@ -55,6 +57,37 @@ pub(crate) async fn upload_index_part<'a>(
        .await
        .with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'"))
 }
+/// Serializes the given tenant manifest to JSON and uploads it to remote storage,
+/// at the tenant manifest path derived from `tenant_shard_id` and `generation`.
+pub(crate) async fn upload_tenant_manifest(
+    storage: &GenericRemoteStorage,
+    tenant_shard_id: &TenantShardId,
+    generation: Generation,
+    tenant_manifest: &TenantManifest,
+    cancel: &CancellationToken,
+) -> anyhow::Result<()> {
+    tracing::trace!("uploading new tenant manifest");
+
+    fail_point!("before-upload-manifest", |_| {
+        bail!("failpoint before-upload-manifest")
+    });
+    pausable_failpoint!("before-upload-manifest-pausable");
+
+    // Serialize first: a serialization failure returns before any upload is attempted.
+    let payload = Bytes::from(tenant_manifest.to_json_bytes()?);
+    let payload_len = payload.len();
+    let destination = remote_tenant_manifest_path(tenant_shard_id, generation);
+
+    storage
+        .upload_storage_object(
+            futures::stream::once(futures::future::ready(Ok(payload))),
+            payload_len,
+            &destination,
+            cancel,
+        )
+        .await
+        .with_context(|| format!("upload tenant manifest for '{tenant_shard_id}'"))
+}

 /// Attempts to upload given layer files.
 /// No extra checks for overlapping files is made and any files that are already present remotely will be overwritten, if submitted during the upload.
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -187,6 +187,8 @@ pub(super) async fn gather_inputs(
    // but it is unlikely to cause any issues. In the worst case,
    // the calculation will error out.
    timelines.retain(|t| t.is_active());
+    // Also filter out archived timelines.
+    timelines.retain(|t| t.is_archived() != Some(true));

    // Build a map of branch points.
    let mut branchpoints: HashMap<TimelineId, HashSet<Lsn>> = HashMap::new();
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -1,5 +1,6 @@
 //! Common traits and structs for layers

+pub mod batch_split_writer;
 pub mod delta_layer;
 pub mod filter_iterator;
 pub mod image_layer;
@@ -8,14 +9,13 @@ pub(crate) mod layer;
 mod layer_desc;
 mod layer_name;
 pub mod merge_iterator;
-pub mod split_writer;

 use crate::context::{AccessStatsBehavior, RequestContext};
-use crate::repository::Value;
-use crate::walrecord::NeonWalRecord;
 use bytes::Bytes;
 use pageserver_api::key::Key;
 use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
+use pageserver_api::record::NeonWalRecord;
+use pageserver_api::value::Value;
 use std::cmp::{Ordering, Reverse};
 use std::collections::hash_map::Entry;
 use std::collections::{BinaryHeap, HashMap};
--- a/pageserver/src/tenant/storage_layer/batch_split_writer.rs
+++ b/pageserver/src/tenant/storage_layer/batch_split_writer.rs
@@ -5,48 +5,162 @@ use pageserver_api::key::{Key, KEY_SIZE};
 use utils::{id::TimelineId, lsn::Lsn, shard::TenantShardId};

 use crate::tenant::storage_layer::Layer;
-use crate::{config::PageServerConf, context::RequestContext, repository::Value, tenant::Timeline};
+use crate::{config::PageServerConf, context::RequestContext, tenant::Timeline};
+use pageserver_api::value::Value;

 use super::layer::S3_UPLOAD_LIMIT;
 use super::{
    DeltaLayerWriter, ImageLayerWriter, PersistentLayerDesc, PersistentLayerKey, ResidentLayer,
 };

-pub(crate) enum SplitWriterResult {
+pub(crate) enum BatchWriterResult {
    Produced(ResidentLayer),
    Discarded(PersistentLayerKey),
 }

 #[cfg(test)]
-impl SplitWriterResult {
+impl BatchWriterResult {
    fn into_resident_layer(self) -> ResidentLayer {
        match self {
-            SplitWriterResult::Produced(layer) => layer,
-            SplitWriterResult::Discarded(_) => panic!("unexpected discarded layer"),
+            BatchWriterResult::Produced(layer) => layer,
+            BatchWriterResult::Discarded(_) => panic!("unexpected discarded layer"),
        }
    }

    fn into_discarded_layer(self) -> PersistentLayerKey {
        match self {
-            SplitWriterResult::Produced(_) => panic!("unexpected produced layer"),
-            SplitWriterResult::Discarded(layer) => layer,
+            BatchWriterResult::Produced(_) => panic!("unexpected produced layer"),
+            BatchWriterResult::Discarded(layer) => layer,
        }
    }
 }

+enum LayerWriterWrapper {
+    Image(ImageLayerWriter),
+    Delta(DeltaLayerWriter),
+}
+
+/// A layer writer that takes unfinished layers and finishes them atomically.
+#[must_use]
+pub struct BatchLayerWriter {
+    generated_layer_writers: Vec<(LayerWriterWrapper, PersistentLayerKey)>,
+    conf: &'static PageServerConf,
+}
+
+impl BatchLayerWriter {
+    pub async fn new(conf: &'static PageServerConf) -> anyhow::Result<Self> {
+        Ok(Self {
+            generated_layer_writers: Vec::new(),
+            conf,
+        })
+    }
+
+    pub fn add_unfinished_image_writer(
+        &mut self,
+        writer: ImageLayerWriter,
+        key_range: Range<Key>,
+        lsn: Lsn,
+    ) {
+        self.generated_layer_writers.push((
+            LayerWriterWrapper::Image(writer),
+            PersistentLayerKey {
+                key_range,
+                lsn_range: PersistentLayerDesc::image_layer_lsn_range(lsn),
+                is_delta: false,
+            },
+        ));
+    }
+
+    pub fn add_unfinished_delta_writer(
+        &mut self,
+        writer: DeltaLayerWriter,
+        key_range: Range<Key>,
+        lsn_range: Range<Lsn>,
+    ) {
+        self.generated_layer_writers.push((
+            LayerWriterWrapper::Delta(writer),
+            PersistentLayerKey {
+                key_range,
+                lsn_range,
+                is_delta: true,
+            },
+        ));
+    }
+
+    pub(crate) async fn finish_with_discard_fn<D, F>(
+        self,
+        tline: &Arc<Timeline>,
+        ctx: &RequestContext,
+        discard_fn: D,
+    ) -> anyhow::Result<Vec<BatchWriterResult>>
+    where
+        D: Fn(&PersistentLayerKey) -> F,
+        F: Future<Output = bool>,
+    {
+        let Self {
+            generated_layer_writers,
+            ..
+        } = self;
+        let clean_up_layers = |generated_layers: Vec<BatchWriterResult>| {
+            for produced_layer in generated_layers {
+                if let BatchWriterResult::Produced(resident_layer) = produced_layer {
+                    let layer: Layer = resident_layer.into();
+                    layer.delete_on_drop();
+                }
+            }
+        };
+        // BEGIN: catch every error and do the recovery in the below section
+        let mut generated_layers: Vec<BatchWriterResult> = Vec::new();
+        for (inner, layer_key) in generated_layer_writers {
+            if discard_fn(&layer_key).await {
+                generated_layers.push(BatchWriterResult::Discarded(layer_key));
+            } else {
+                let res = match inner {
+                    LayerWriterWrapper::Delta(writer) => {
+                        writer.finish(layer_key.key_range.end, ctx).await
+                    }
+                    LayerWriterWrapper::Image(writer) => {
+                        writer
+                            .finish_with_end_key(layer_key.key_range.end, ctx)
+                            .await
+                    }
+                };
+                let layer = match res {
+                    Ok((desc, path)) => {
+                        match Layer::finish_creating(self.conf, tline, desc, &path) {
+                            Ok(layer) => layer,
+                            Err(e) => {
+                                tokio::fs::remove_file(&path).await.ok();
+                                clean_up_layers(generated_layers);
+                                return Err(e);
+                            }
+                        }
+                    }
+                    Err(e) => {
+                        // Image/DeltaLayerWriter::finish will clean up the temporary layer if anything goes wrong,
+                        // so we don't need to remove the layer we just failed to create by ourselves.
+                        clean_up_layers(generated_layers);
+                        return Err(e);
+                    }
+                };
+                generated_layers.push(BatchWriterResult::Produced(layer));
+            }
+        }
+        // END: catch every error and do the recovery in the above section
+        Ok(generated_layers)
+    }
+}
+
 /// An image writer that takes images and produces multiple image layers.
-///
-/// The interface does not guarantee atomicity (i.e., if the image layer generation
-/// fails, there might be leftover files to be cleaned up)
 #[must_use]
 pub struct SplitImageLayerWriter {
    inner: ImageLayerWriter,
    target_layer_size: u64,
-    generated_layer_writers: Vec<(ImageLayerWriter, PersistentLayerKey)>,
+    lsn: Lsn,
    conf: &'static PageServerConf,
    timeline_id: TimelineId,
    tenant_shard_id: TenantShardId,
-    lsn: Lsn,
+    batches: BatchLayerWriter,
    start_key: Key,
 }

@@ -71,10 +185,10 @@ impl SplitImageLayerWriter {
                ctx,
            )
            .await?,
-            generated_layer_writers: Vec::new(),
            conf,
            timeline_id,
            tenant_shard_id,
+            batches: BatchLayerWriter::new(conf).await?,
            lsn,
            start_key,
        })
@@ -102,16 +216,13 @@ impl SplitImageLayerWriter {
                ctx,
            )
            .await?;
-            let layer_key = PersistentLayerKey {
-                key_range: self.start_key..key,
-                lsn_range: PersistentLayerDesc::image_layer_lsn_range(self.lsn),
-                is_delta: false,
-            };
            let prev_image_writer = std::mem::replace(&mut self.inner, next_image_writer);
+            self.batches.add_unfinished_image_writer(
+                prev_image_writer,
+                self.start_key..key,
+                self.lsn,
+            );
            self.start_key = key;
-
-            self.generated_layer_writers
-                .push((prev_image_writer, layer_key));
        }
        self.inner.put_image(key, img, ctx).await
    }
@@ -122,64 +233,18 @@ impl SplitImageLayerWriter {
        ctx: &RequestContext,
        end_key: Key,
        discard_fn: D,
-    ) -> anyhow::Result<Vec<SplitWriterResult>>
+    ) -> anyhow::Result<Vec<BatchWriterResult>>
    where
        D: Fn(&PersistentLayerKey) -> F,
        F: Future<Output = bool>,
    {
        let Self {
-            mut generated_layer_writers,
-            inner,
-            ..
+            mut batches, inner, ..
        } = self;
        if inner.num_keys() != 0 {
-            let layer_key = PersistentLayerKey {
-                key_range: self.start_key..end_key,
-                lsn_range: PersistentLayerDesc::image_layer_lsn_range(self.lsn),
-                is_delta: false,
-            };
-            generated_layer_writers.push((inner, layer_key));
+            batches.add_unfinished_image_writer(inner, self.start_key..end_key, self.lsn);
        }
-        let clean_up_layers = |generated_layers: Vec<SplitWriterResult>| {
-            for produced_layer in generated_layers {
-                if let SplitWriterResult::Produced(image_layer) = produced_layer {
-                    let layer: Layer = image_layer.into();
-                    layer.delete_on_drop();
-                }
-            }
-        };
-        // BEGIN: catch every error and do the recovery in the below section
-        let mut generated_layers = Vec::new();
-        for (inner, layer_key) in generated_layer_writers {
-            if discard_fn(&layer_key).await {
-                generated_layers.push(SplitWriterResult::Discarded(layer_key));
-            } else {
-                let layer = match inner
-                    .finish_with_end_key(layer_key.key_range.end, ctx)
-                    .await
-                {
-                    Ok((desc, path)) => {
-                        match Layer::finish_creating(self.conf, tline, desc, &path) {
-                            Ok(layer) => layer,
-                            Err(e) => {
-                                tokio::fs::remove_file(&path).await.ok();
-                                clean_up_layers(generated_layers);
-                                return Err(e);
-                            }
-                        }
-                    }
-                    Err(e) => {
-                        // ImageLayerWriter::finish will clean up the temporary layer if anything goes wrong,
-                        // so we don't need to remove it by ourselves.
-                        clean_up_layers(generated_layers);
-                        return Err(e);
-                    }
-                };
-                generated_layers.push(SplitWriterResult::Produced(layer));
-            }
-        }
-        // END: catch every error and do the recovery in the above section
-        Ok(generated_layers)
+        batches.finish_with_discard_fn(tline, ctx, discard_fn).await
    }

    #[cfg(test)]
@@ -188,7 +253,7 @@ impl SplitImageLayerWriter {
        tline: &Arc<Timeline>,
        ctx: &RequestContext,
        end_key: Key,
-    ) -> anyhow::Result<Vec<SplitWriterResult>> {
+    ) -> anyhow::Result<Vec<BatchWriterResult>> {
        self.finish_with_discard_fn(tline, ctx, end_key, |_| async { false })
            .await
    }
@@ -196,9 +261,6 @@ impl SplitImageLayerWriter {

 /// A delta writer that takes key-lsn-values and produces multiple delta layers.
 ///
-/// The interface does not guarantee atomicity (i.e., if the delta layer generation fails,
-/// there might be leftover files to be cleaned up).
-///
 /// Note that if updates of a single key exceed the target size limit, all of the updates will be batched
 /// into a single file. This behavior might change in the future. For reference, the legacy compaction algorithm
 /// will split them into multiple files based on size.
@@ -206,12 +268,12 @@ impl SplitImageLayerWriter {
 pub struct SplitDeltaLayerWriter {
    inner: Option<(Key, DeltaLayerWriter)>,
    target_layer_size: u64,
-    generated_layers: Vec<SplitWriterResult>,
    conf: &'static PageServerConf,
    timeline_id: TimelineId,
    tenant_shard_id: TenantShardId,
    lsn_range: Range<Lsn>,
    last_key_written: Key,
+    batches: BatchLayerWriter,
 }

 impl SplitDeltaLayerWriter {
@@ -225,29 +287,22 @@ impl SplitDeltaLayerWriter {
        Ok(Self {
            target_layer_size,
            inner: None,
-            generated_layers: Vec::new(),
            conf,
            timeline_id,
            tenant_shard_id,
            lsn_range,
            last_key_written: Key::MIN,
+            batches: BatchLayerWriter::new(conf).await?,
        })
    }

-    /// Put value into the layer writer. In the case the writer decides to produce a layer, and the discard fn returns true, no layer will be written in the end.
-    pub async fn put_value_with_discard_fn<D, F>(
+    pub async fn put_value(
        &mut self,
        key: Key,
        lsn: Lsn,
        val: Value,
-        tline: &Arc<Timeline>,
        ctx: &RequestContext,
-        discard: D,
-    ) -> anyhow::Result<()>
-    where
-        D: FnOnce(&PersistentLayerKey) -> F,
-        F: Future<Output = bool>,
-    {
+    ) -> anyhow::Result<()> {
        // The current estimation is key size plus LSN size plus value size estimation. This is not an accurate
        // number, and therefore the final layer size could be a little bit larger or smaller than the target.
        //
@@ -286,29 +341,11 @@ impl SplitDeltaLayerWriter {
                .await?;
                let (start_key, prev_delta_writer) =
                    std::mem::replace(&mut self.inner, Some((key, next_delta_writer))).unwrap();
-                let layer_key = PersistentLayerKey {
-                    key_range: start_key..key,
-                    lsn_range: self.lsn_range.clone(),
-                    is_delta: true,
-                };
-                if discard(&layer_key).await {
-                    drop(prev_delta_writer);
-                    self.generated_layers
-                        .push(SplitWriterResult::Discarded(layer_key));
-                } else {
-                    // `finish` will remove the file if anything goes wrong, while we need to handle deleting temporary
-                    // files for `finish_creating`.
-                    let (desc, path) = prev_delta_writer.finish(key, ctx).await?;
-                    let delta_layer = match Layer::finish_creating(self.conf, tline, desc, &path) {
-                        Ok(layer) => layer,
-                        Err(e) => {
-                            tokio::fs::remove_file(&path).await.ok();
-                            return Err(e);
-                        }
-                    };
-                    self.generated_layers
-                        .push(SplitWriterResult::Produced(delta_layer));
-                }
+                self.batches.add_unfinished_delta_writer(
+                    prev_delta_writer,
+                    start_key..key,
+                    self.lsn_range.clone(),
+                );
            } else if inner.estimated_size() >= S3_UPLOAD_LIMIT {
                // We have to produce a very large file b/c a key is updated too often.
                anyhow::bail!(
@@ -323,61 +360,30 @@ impl SplitDeltaLayerWriter {
        inner.put_value(key, lsn, val, ctx).await
    }

-    pub async fn put_value(
-        &mut self,
-        key: Key,
-        lsn: Lsn,
-        val: Value,
-        tline: &Arc<Timeline>,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        self.put_value_with_discard_fn(key, lsn, val, tline, ctx, |_| async { false })
-            .await
-    }
-
    pub(crate) async fn finish_with_discard_fn<D, F>(
        self,
        tline: &Arc<Timeline>,
        ctx: &RequestContext,
-        discard: D,
-    ) -> anyhow::Result<Vec<SplitWriterResult>>
+        discard_fn: D,
+    ) -> anyhow::Result<Vec<BatchWriterResult>>
    where
-        D: FnOnce(&PersistentLayerKey) -> F,
+        D: Fn(&PersistentLayerKey) -> F,
        F: Future<Output = bool>,
    {
        let Self {
-            mut generated_layers,
-            inner,
-            ..
+            mut batches, inner, ..
        } = self;
-        let Some((start_key, inner)) = inner else {
-            return Ok(generated_layers);
-        };
-        if inner.num_keys() == 0 {
-            return Ok(generated_layers);
+        if let Some((start_key, writer)) = inner {
+            if writer.num_keys() != 0 {
+                let end_key = self.last_key_written.next();
+                batches.add_unfinished_delta_writer(
+                    writer,
+                    start_key..end_key,
+                    self.lsn_range.clone(),
+                );
+            }
        }
-        let end_key = self.last_key_written.next();
-        let layer_key = PersistentLayerKey {
-            key_range: start_key..end_key,
-            lsn_range: self.lsn_range.clone(),
-            is_delta: true,
-        };
-        if discard(&layer_key).await {
-            generated_layers.push(SplitWriterResult::Discarded(layer_key));
-        } else {
-            // `finish` will remove the file if anything goes wrong, while we need to handle deleting temporary
-            // files for `finish_creating`.
-            let (desc, path) = inner.finish(end_key, ctx).await?;
-            let delta_layer = match Layer::finish_creating(self.conf, tline, desc, &path) {
-                Ok(layer) => layer,
-                Err(e) => {
-                    tokio::fs::remove_file(&path).await.ok();
-                    return Err(e);
-                }
-            };
-            generated_layers.push(SplitWriterResult::Produced(delta_layer));
-        }
-        Ok(generated_layers)
+        batches.finish_with_discard_fn(tline, ctx, discard_fn).await
    }

    #[cfg(test)]
@@ -385,15 +391,10 @@ impl SplitDeltaLayerWriter {
        self,
        tline: &Arc<Timeline>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<Vec<SplitWriterResult>> {
+    ) -> anyhow::Result<Vec<BatchWriterResult>> {
        self.finish_with_discard_fn(tline, ctx, |_| async { false })
            .await
    }
-
-    /// This function will be deprecated with #8841.
-    pub(crate) fn take(self) -> anyhow::Result<(Vec<SplitWriterResult>, Option<DeltaLayerWriter>)> {
-        Ok((self.generated_layers, self.inner.map(|x| x.1)))
-    }
 }

 #[cfg(test)]
@@ -473,13 +474,7 @@ mod tests {
        assert_eq!(layers.len(), 1);

        delta_writer
-            .put_value(
-                get_key(0),
-                Lsn(0x18),
-                Value::Image(get_img(0)),
-                &tline,
-                &ctx,
-            )
+            .put_value(get_key(0), Lsn(0x18), Value::Image(get_img(0)), &ctx)
            .await
            .unwrap();
        let layers = delta_writer.finish(&tline, &ctx).await.unwrap();
@@ -551,14 +546,7 @@ mod tests {
                .await
                .unwrap();
            delta_writer
-                .put_value_with_discard_fn(
-                    get_key(i),
-                    Lsn(0x20),
-                    Value::Image(get_large_img()),
-                    &tline,
-                    &ctx,
-                    |_| async { discard },
-                )
+                .put_value(get_key(i), Lsn(0x20), Value::Image(get_large_img()), &ctx)
                .await
                .unwrap();
        }
@@ -664,23 +652,11 @@ mod tests {
        assert_eq!(layers.len(), 2);

        delta_writer
-            .put_value(
-                get_key(0),
-                Lsn(0x18),
-                Value::Image(get_img(0)),
-                &tline,
-                &ctx,
-            )
+            .put_value(get_key(0), Lsn(0x18), Value::Image(get_img(0)), &ctx)
            .await
            .unwrap();
        delta_writer
-            .put_value(
-                get_key(1),
-                Lsn(0x1A),
-                Value::Image(get_large_img()),
-                &tline,
-                &ctx,
-            )
+            .put_value(get_key(1), Lsn(0x1A), Value::Image(get_large_img()), &ctx)
            .await
            .unwrap();
        let layers = delta_writer.finish(&tline, &ctx).await.unwrap();
@@ -744,7 +720,6 @@ mod tests {
                    get_key(0),
                    Lsn(i as u64 * 16 + 0x10),
                    Value::Image(get_large_img()),
-                    &tline,
                    &ctx,
                )
                .await
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -30,7 +30,6 @@
 use crate::config::PageServerConf;
 use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
 use crate::page_cache::{self, FileId, PAGE_SZ};
-use crate::repository::{Key, Value, KEY_SIZE};
 use crate::tenant::blob_io::BlobWriter;
 use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader};
 use crate::tenant::disk_btree::{
@@ -46,7 +45,7 @@ use crate::tenant::PageReconstructError;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt};
 use crate::virtual_file::IoBufferMut;
 use crate::virtual_file::{self, MaybeFatalIo, VirtualFile};
-use crate::{walrecord, TEMP_FILE_SUFFIX};
+use crate::TEMP_FILE_SUFFIX;
 use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
 use anyhow::{anyhow, bail, ensure, Context, Result};
 use camino::{Utf8Path, Utf8PathBuf};
@@ -54,9 +53,11 @@ use futures::StreamExt;
 use itertools::Itertools;
 use pageserver_api::config::MaxVectoredReadBytes;
 use pageserver_api::key::DBDIR_KEY;
+use pageserver_api::key::{Key, KEY_SIZE};
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::ImageCompressionAlgorithm;
 use pageserver_api::shard::TenantShardId;
+use pageserver_api::value::Value;
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
 use std::collections::VecDeque;
@@ -269,7 +270,7 @@ impl AsLayerDesc for DeltaLayer {
 }

 impl DeltaLayer {
-    pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
+    pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
        self.desc.dump();

        if !verbose {
@@ -1084,7 +1085,7 @@ impl DeltaLayerInner {
        }
    }

-    pub(super) async fn load_keys<'a>(
+    pub(crate) async fn index_entries<'a>(
        &'a self,
        ctx: &RequestContext,
    ) -> Result<Vec<DeltaEntry<'a>>> {
@@ -1293,7 +1294,7 @@ impl DeltaLayerInner {
                    // is it an image or will_init walrecord?
                    // FIXME: this could be handled by threading the BlobRef to the
                    // VectoredReadBuilder
-                    let will_init = crate::repository::ValueBytes::will_init(&data)
+                    let will_init = pageserver_api::value::ValueBytes::will_init(&data)
                        .inspect_err(|_e| {
                            #[cfg(feature = "testing")]
                            tracing::error!(data=?utils::Hex(&data), err=?_e, %key, %lsn, "failed to parse will_init out of serialized value");
@@ -1346,7 +1347,7 @@ impl DeltaLayerInner {

        tree_reader.dump().await?;

-        let keys = self.load_keys(ctx).await?;
+        let keys = self.index_entries(ctx).await?;

        async fn dump_blob(val: &ValueRef<'_>, ctx: &RequestContext) -> anyhow::Result<String> {
            let buf = val.load_raw(ctx).await?;
@@ -1356,7 +1357,7 @@ impl DeltaLayerInner {
                    format!(" img {} bytes", img.len())
                }
                Value::WalRecord(rec) => {
-                    let wal_desc = walrecord::describe_wal_record(&rec)?;
+                    let wal_desc = pageserver_api::record::describe_wal_record(&rec)?;
                    format!(
                        " rec {} bytes will_init: {} {}",
                        buf.len(),
@@ -1437,7 +1438,7 @@ impl DeltaLayerInner {
        offset
    }

-    pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> DeltaLayerIterator<'a> {
+    pub fn iter<'a>(&'a self, ctx: &'a RequestContext) -> DeltaLayerIterator<'a> {
        let block_reader = FileBlockReader::new(&self.file, self.file_id);
        let tree_reader =
            DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader);
@@ -1453,6 +1454,16 @@ impl DeltaLayerInner {
            ),
        }
    }
+
+    /// NB: not super efficient, but not terrible either. Should prob be an iterator.
+    //
+    // We're reusing the index traversal logic in index_entries; would be nice to
+    // factor that out.
+    pub(crate) async fn load_keys(&self, ctx: &RequestContext) -> anyhow::Result<Vec<Key>> {
+        self.index_entries(ctx)
+            .await
+            .map(|entries| entries.into_iter().map(|entry| entry.key).collect())
+    }
 }

 /// A set of data associated with a delta layer key and its value
@@ -1600,7 +1611,6 @@ pub(crate) mod test {
    use rand::RngCore;

    use super::*;
-    use crate::repository::Value;
    use crate::tenant::harness::TIMELINE_ID;
    use crate::tenant::storage_layer::{Layer, ResidentLayer};
    use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner;
@@ -1612,6 +1622,7 @@ pub(crate) mod test {
        DEFAULT_PG_VERSION,
    };
    use bytes::Bytes;
+    use pageserver_api::value::Value;

    /// Construct an index for a fictional delta layer and and then
    /// traverse in order to plan vectored reads for a query. Finally,
@@ -1964,8 +1975,8 @@ pub(crate) mod test {

    #[tokio::test]
    async fn copy_delta_prefix_smoke() {
-        use crate::walrecord::NeonWalRecord;
        use bytes::Bytes;
+        use pageserver_api::record::NeonWalRecord;

        let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke")
            .await
@@ -2188,6 +2199,7 @@ pub(crate) mod test {
        (k1, l1).cmp(&(k2, l2))
    }

+    #[cfg(feature = "testing")]
    pub(crate) fn sort_delta_value(
        (k1, l1, v1): &(Key, Lsn, Value),
        (k2, l2, v2): &(Key, Lsn, Value),
--- a/pageserver/src/tenant/storage_layer/filter_iterator.rs
+++ b/pageserver/src/tenant/storage_layer/filter_iterator.rs
@@ -7,7 +7,7 @@ use pageserver_api::{
 };
 use utils::lsn::Lsn;

-use crate::repository::Value;
+use pageserver_api::value::Value;

 use super::merge_iterator::MergeIterator;

@@ -121,8 +121,8 @@ mod tests {

    #[tokio::test]
    async fn filter_keyspace_iterator() {
-        use crate::repository::Value;
        use bytes::Bytes;
+        use pageserver_api::value::Value;

        let harness = TenantHarness::create("filter_iterator_filter_keyspace_iterator")
            .await
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -28,7 +28,6 @@
 use crate::config::PageServerConf;
 use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
 use crate::page_cache::{self, FileId, PAGE_SZ};
-use crate::repository::{Key, Value, KEY_SIZE};
 use crate::tenant::blob_io::BlobWriter;
 use crate::tenant::block_io::{BlockBuf, FileBlockReader};
 use crate::tenant::disk_btree::{
@@ -51,8 +50,10 @@ use hex;
 use itertools::Itertools;
 use pageserver_api::config::MaxVectoredReadBytes;
 use pageserver_api::key::DBDIR_KEY;
+use pageserver_api::key::{Key, KEY_SIZE};
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::shard::{ShardIdentity, TenantShardId};
+use pageserver_api::value::Value;
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
 use std::collections::VecDeque;
@@ -230,7 +231,7 @@ impl AsLayerDesc for ImageLayer {
 }

 impl ImageLayer {
-    pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
+    pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
        self.desc.dump();

        if !verbose {
@@ -673,6 +674,21 @@ impl ImageLayerInner {
            ),
        }
    }
+
+    /// NB: not super efficient, but not terrible either. Should prob be an iterator.
+    //
+    // We're reusing the index traversal logic in plan_reads; would be nice to
+    // factor that out.
+    pub(crate) async fn load_keys(&self, ctx: &RequestContext) -> anyhow::Result<Vec<Key>> {
+        let plan = self
+            .plan_reads(KeySpace::single(self.key_range.clone()), None, ctx)
+            .await?;
+        Ok(plan
+            .into_iter()
+            .flat_map(|read| read.blobs_at)
+            .map(|(_, blob_meta)| blob_meta.key)
+            .collect())
+    }
 }

 /// A builder object for constructing a new image layer.
@@ -1009,7 +1025,7 @@ impl ImageLayerWriter {
        self.inner.take().unwrap().finish(ctx, None).await
    }

-    /// Finish writing the image layer with an end key, used in [`super::split_writer::SplitImageLayerWriter`]. The end key determines the end of the image layer's covered range and is exclusive.
+    /// Finish writing the image layer with an end key, used in [`super::batch_split_writer::SplitImageLayerWriter`]. The end key determines the end of the image layer's covered range and is exclusive.
    pub(super) async fn finish_with_end_key(
        mut self,
        end_key: Key,
@@ -1110,6 +1126,7 @@ mod test {
    use pageserver_api::{
        key::Key,
        shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize},
+        value::Value,
    };
    use utils::{
        generation::Generation,
@@ -1119,7 +1136,6 @@ mod test {

    use crate::{
        context::RequestContext,
-        repository::Value,
        tenant::{
            config::TenantConf,
            harness::{TenantHarness, TIMELINE_ID},
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -7,7 +7,6 @@
 use crate::assert_u64_eq_usize::{u64_to_usize, U64IsUsize, UsizeIsU64};
 use crate::config::PageServerConf;
 use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
-use crate::repository::{Key, Value};
 use crate::tenant::ephemeral_file::EphemeralFile;
 use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::PageReconstructError;
@@ -16,9 +15,11 @@ use crate::{l0_flush, page_cache};
 use anyhow::{anyhow, Context, Result};
 use camino::Utf8PathBuf;
 use pageserver_api::key::CompactKey;
+use pageserver_api::key::Key;
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::InMemoryLayerInfo;
 use pageserver_api::shard::TenantShardId;
+use pageserver_api::value::Value;
 use std::collections::{BTreeMap, HashMap};
 use std::sync::{Arc, OnceLock};
 use std::time::Instant;
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -19,7 +19,7 @@ use crate::task_mgr::TaskKind;
 use crate::tenant::timeline::{CompactionError, GetVectoredError};
 use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline};

-use super::delta_layer::{self, DeltaEntry};
+use super::delta_layer::{self};
 use super::image_layer::{self};
 use super::{
    AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName,
@@ -1841,23 +1841,22 @@ impl ResidentLayer {
    pub(crate) async fn load_keys<'a>(
        &'a self,
        ctx: &RequestContext,
-    ) -> anyhow::Result<Vec<DeltaEntry<'a>>> {
+    ) -> anyhow::Result<Vec<pageserver_api::key::Key>> {
        use LayerKind::*;

        let owner = &self.owner.0;
-        match self.downloaded.get(owner, ctx).await? {
-            Delta(ref d) => {
-                // this is valid because the DownloadedLayer::kind is a OnceCell, not a
-                // Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
-                // while it's being held.
-                self.owner.record_access(ctx);
+        let inner = self.downloaded.get(owner, ctx).await?;

-                delta_layer::DeltaLayerInner::load_keys(d, ctx)
-                    .await
-                    .with_context(|| format!("Layer index is corrupted for {self}"))
-            }
-            Image(_) => anyhow::bail!(format!("cannot load_keys on a image layer {self}")),
-        }
+        // this is valid because the DownloadedLayer::kind is a OnceCell, not a
+        // Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
+        // while it's being held.
+        self.owner.record_access(ctx);
+
+        let res = match inner {
+            Delta(ref d) => delta_layer::DeltaLayerInner::load_keys(d, ctx).await,
+            Image(ref i) => image_layer::ImageLayerInner::load_keys(i, ctx).await,
+        };
+        res.with_context(|| format!("Layer index is corrupted for {self}"))
    }

    /// Read all they keys in this layer which match the ShardIdentity, and write them all to
--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -760,8 +760,8 @@ async fn evict_and_wait_does_not_wait_for_download() {
 /// Also checks that the same does not happen on a non-evicted layer (regression test).
 #[tokio::test(start_paused = true)]
 async fn eviction_cancellation_on_drop() {
-    use crate::repository::Value;
    use bytes::Bytes;
+    use pageserver_api::value::Value;

    // this is the runtime on which Layer spawns the blocking tasks on
    let handle = tokio::runtime::Handle::current();
@@ -782,7 +782,7 @@ async fn eviction_cancellation_on_drop() {
        let mut writer = timeline.writer().await;
        writer
            .put(
-                crate::repository::Key::from_i128(5),
+                pageserver_api::key::Key::from_i128(5),
                Lsn(0x20),
                &Value::Image(Bytes::from_static(b"this does not matter either")),
                &ctx,
--- a/pageserver/src/tenant/storage_layer/layer_desc.rs
+++ b/pageserver/src/tenant/storage_layer/layer_desc.rs
@@ -3,7 +3,7 @@ use pageserver_api::shard::TenantShardId;
 use std::ops::Range;
 use utils::{id::TimelineId, lsn::Lsn};

-use crate::repository::Key;
+use pageserver_api::key::Key;

 use super::{DeltaLayerName, ImageLayerName, LayerName};

@@ -57,6 +57,34 @@ impl std::fmt::Display for PersistentLayerKey {
    }
 }

+impl From<ImageLayerName> for PersistentLayerKey {
+    fn from(image_layer_name: ImageLayerName) -> Self {
+        Self {
+            key_range: image_layer_name.key_range,
+            lsn_range: PersistentLayerDesc::image_layer_lsn_range(image_layer_name.lsn),
+            is_delta: false,
+        }
+    }
+}
+
+impl From<DeltaLayerName> for PersistentLayerKey {
+    fn from(delta_layer_name: DeltaLayerName) -> Self {
+        Self {
+            key_range: delta_layer_name.key_range,
+            lsn_range: delta_layer_name.lsn_range,
+            is_delta: true,
+        }
+    }
+}
+
+impl From<LayerName> for PersistentLayerKey {
+    fn from(layer_name: LayerName) -> Self {
+        match layer_name {
+            LayerName::Image(i) => i.into(),
+            LayerName::Delta(d) => d.into(),
+        }
+    }
+}
 impl PersistentLayerDesc {
    pub fn key(&self) -> PersistentLayerKey {
        PersistentLayerKey {
--- a/pageserver/src/tenant/storage_layer/layer_name.rs
+++ b/pageserver/src/tenant/storage_layer/layer_name.rs
@@ -1,14 +1,12 @@
 //!
 //! Helper functions for dealing with filenames of the image and delta layer files.
 //!
-use crate::repository::Key;
-use std::borrow::Cow;
+use pageserver_api::key::Key;
 use std::cmp::Ordering;
 use std::fmt;
 use std::ops::Range;
 use std::str::FromStr;

-use regex::Regex;
 use utils::lsn::Lsn;

 use super::PersistentLayerDesc;
@@ -60,32 +58,31 @@ impl Ord for DeltaLayerName {
 /// Represents the region of the LSN-Key space covered by a DeltaLayer
 ///
 /// ```text
-///    <key start>-<key end>__<LSN start>-<LSN end>
+///    <key start>-<key end>__<LSN start>-<LSN end>-<generation>
 /// ```
 impl DeltaLayerName {
    /// Parse the part of a delta layer's file name that represents the LayerName. Returns None
    /// if the filename does not match the expected pattern.
    pub fn parse_str(fname: &str) -> Option<Self> {
-        let mut parts = fname.split("__");
-        let mut key_parts = parts.next()?.split('-');
-        let mut lsn_parts = parts.next()?.split('-');
-
-        let key_start_str = key_parts.next()?;
-        let key_end_str = key_parts.next()?;
-        let lsn_start_str = lsn_parts.next()?;
-        let lsn_end_str = lsn_parts.next()?;
-
-        if parts.next().is_some() || key_parts.next().is_some() || key_parts.next().is_some() {
-            return None;
-        }
-
-        if key_start_str.len() != 36
-            || key_end_str.len() != 36
-            || lsn_start_str.len() != 16
-            || lsn_end_str.len() != 16
+        let (key_parts, lsn_generation_parts) = fname.split_once("__")?;
+        let (key_start_str, key_end_str) = key_parts.split_once('-')?;
+        let (lsn_start_str, lsn_end_generation_parts) = lsn_generation_parts.split_once('-')?;
+        let lsn_end_str = if let Some((lsn_end_str, maybe_generation)) =
+            lsn_end_generation_parts.split_once('-')
        {
-            return None;
-        }
+            if maybe_generation.starts_with("v") {
+                // vY-XXXXXXXX
+                lsn_end_str
+            } else if maybe_generation.len() == 8 {
+                // XXXXXXXX
+                lsn_end_str
+            } else {
+                // no idea what this is
+                return None;
+            }
+        } else {
+            lsn_end_generation_parts
+        };

        let key_start = Key::from_hex(key_start_str).ok()?;
        let key_end = Key::from_hex(key_end_str).ok()?;
@@ -173,25 +170,29 @@ impl ImageLayerName {
 /// Represents the part of the Key-LSN space covered by an ImageLayer
 ///
 /// ```text
-///    <key start>-<key end>__<LSN>
+///    <key start>-<key end>__<LSN>-<generation>
 /// ```
 impl ImageLayerName {
    /// Parse a string as then LayerName part of an image layer file name. Returns None if the
    /// filename does not match the expected pattern.
    pub fn parse_str(fname: &str) -> Option<Self> {
-        let mut parts = fname.split("__");
-        let mut key_parts = parts.next()?.split('-');
-
-        let key_start_str = key_parts.next()?;
-        let key_end_str = key_parts.next()?;
-        let lsn_str = parts.next()?;
-        if parts.next().is_some() || key_parts.next().is_some() {
-            return None;
-        }
-
-        if key_start_str.len() != 36 || key_end_str.len() != 36 || lsn_str.len() != 16 {
-            return None;
-        }
+        let (key_parts, lsn_generation_parts) = fname.split_once("__")?;
+        let (key_start_str, key_end_str) = key_parts.split_once('-')?;
+        let lsn_str =
+            if let Some((lsn_str, maybe_generation)) = lsn_generation_parts.split_once('-') {
+                if maybe_generation.starts_with("v") {
+                    // vY-XXXXXXXX
+                    lsn_str
+                } else if maybe_generation.len() == 8 {
+                    // XXXXXXXX
+                    lsn_str
+                } else {
+                    // likely a delta layer
+                    return None;
+                }
+            } else {
+                lsn_generation_parts
+            };

        let key_start = Key::from_hex(key_start_str).ok()?;
        let key_end = Key::from_hex(key_end_str).ok()?;
@@ -258,6 +259,14 @@ impl LayerName {
        }
    }

+    /// Gets the LSN range encoded in the layer name.
+    pub fn lsn_as_range(&self) -> Range<Lsn> {
+        match &self {
+            LayerName::Image(layer) => layer.lsn_as_range(),
+            LayerName::Delta(layer) => layer.lsn_range.clone(),
+        }
+    }
+
    pub fn is_delta(&self) -> bool {
        matches!(self, LayerName::Delta(_))
    }
@@ -290,18 +299,8 @@ impl FromStr for LayerName {
    /// Self. When loading a physical layer filename, we drop any extra information
    /// not needed to build Self.
    fn from_str(value: &str) -> Result<Self, Self::Err> {
-        let gen_suffix_regex = Regex::new("^(?<base>.+)(?<gen>-v1-[0-9a-f]{8})$").unwrap();
-        let file_name: Cow<str> = match gen_suffix_regex.captures(value) {
-            Some(captures) => captures
-                .name("base")
-                .expect("Non-optional group")
-                .as_str()
-                .into(),
-            None => value.into(),
-        };
-
-        let delta = DeltaLayerName::parse_str(&file_name);
-        let image = ImageLayerName::parse_str(&file_name);
+        let delta = DeltaLayerName::parse_str(value);
+        let image = ImageLayerName::parse_str(value);
        let ok = match (delta, image) {
            (None, None) => {
                return Err(format!(
@@ -367,11 +366,14 @@ mod test {
            lsn: Lsn::from_hex("00000000014FED58").unwrap(),
        });
        let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-v1-00000001").unwrap();
-        assert_eq!(parsed, expected,);
+        assert_eq!(parsed, expected);
+
+        let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-00000001").unwrap();
+        assert_eq!(parsed, expected);

        // Omitting generation suffix is valid
        let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58").unwrap();
-        assert_eq!(parsed, expected,);
+        assert_eq!(parsed, expected);
    }

    #[test]
@@ -385,6 +387,9 @@ mod test {
        let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-v1-00000001").unwrap();
        assert_eq!(parsed, expected);

+        let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-00000001").unwrap();
+        assert_eq!(parsed, expected);
+
        // Omitting generation suffix is valid
        let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481").unwrap();
        assert_eq!(parsed, expected);
--- a/pageserver/src/tenant/storage_layer/merge_iterator.rs
+++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs
@@ -7,7 +7,8 @@ use anyhow::bail;
 use pageserver_api::key::Key;
 use utils::lsn::Lsn;

-use crate::{context::RequestContext, repository::Value};
+use crate::context::RequestContext;
+use pageserver_api::value::Value;

 use super::{
    delta_layer::{DeltaLayerInner, DeltaLayerIterator},
@@ -291,12 +292,16 @@ mod tests {
    use crate::{
        tenant::{
            harness::{TenantHarness, TIMELINE_ID},
-            storage_layer::delta_layer::test::{produce_delta_layer, sort_delta, sort_delta_value},
+            storage_layer::delta_layer::test::{produce_delta_layer, sort_delta},
        },
-        walrecord::NeonWalRecord,
        DEFAULT_PG_VERSION,
    };

+    #[cfg(feature = "testing")]
+    use crate::tenant::storage_layer::delta_layer::test::sort_delta_value;
+    #[cfg(feature = "testing")]
+    use pageserver_api::record::NeonWalRecord;
+
    async fn assert_merge_iter_equal(
        merge_iter: &mut MergeIterator<'_>,
        expect: &[(Key, Lsn, Value)],
@@ -319,8 +324,8 @@ mod tests {

    #[tokio::test]
    async fn merge_in_between() {
-        use crate::repository::Value;
        use bytes::Bytes;
+        use pageserver_api::value::Value;

        let harness = TenantHarness::create("merge_iterator_merge_in_between")
            .await
@@ -384,8 +389,8 @@ mod tests {

    #[tokio::test]
    async fn delta_merge() {
-        use crate::repository::Value;
        use bytes::Bytes;
+        use pageserver_api::value::Value;

        let harness = TenantHarness::create("merge_iterator_delta_merge")
            .await
@@ -458,10 +463,11 @@ mod tests {
        // TODO: test layers are loaded only when needed, reducing num of active iterators in k-merge
    }

+    #[cfg(feature = "testing")]
    #[tokio::test]
    async fn delta_image_mixed_merge() {
-        use crate::repository::Value;
        use bytes::Bytes;
+        use pageserver_api::value::Value;

        let harness = TenantHarness::create("merge_iterator_delta_image_mixed_merge")
            .await
@@ -586,5 +592,6 @@ mod tests {
        is_send(merge_iter);
    }

+    #[cfg(feature = "testing")]
    fn is_send(_: impl Send) {}
 }
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -125,11 +125,12 @@ use utils::{
    simple_rcu::{Rcu, RcuReadGuard},
 };

-use crate::repository::GcResult;
-use crate::repository::{Key, Value};
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
+use crate::tenant::gc_result::GcResult;
 use crate::ZERO_PAGE;
+use pageserver_api::key::Key;
+use pageserver_api::value::Value;

 use self::delete::DeleteTimelineFlow;
 pub(super) use self::eviction_task::EvictionTaskTenantState;
@@ -371,7 +372,7 @@ pub struct Timeline {

    /// Prevent two tasks from deleting the timeline at the same time. If held, the
    /// timeline is being deleted. If 'true', the timeline has already been deleted.
-    pub delete_progress: Arc<tokio::sync::Mutex<DeleteTimelineFlow>>,
+    pub delete_progress: TimelineDeleteProgress,

    eviction_task_timeline_state: tokio::sync::Mutex<EvictionTaskTimelineState>,

@@ -424,8 +425,13 @@ pub struct Timeline {
    pub(crate) handles: handle::PerTimelineState<crate::page_service::TenantManagerTypes>,

    pub(crate) attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,
+
+    /// Cf. [`crate::tenant::CreateTimelineIdempotency`].
+    pub(crate) create_idempotency: crate::tenant::CreateTimelineIdempotency,
 }

+pub type TimelineDeleteProgress = Arc<tokio::sync::Mutex<DeleteTimelineFlow>>;
+
 pub struct WalReceiverInfo {
    pub wal_source_connconf: PgConnectionConfig,
    pub last_received_msg_lsn: Lsn,
@@ -2134,6 +2140,7 @@ impl Timeline {
        pg_version: u32,
        state: TimelineState,
        attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,
+        create_idempotency: crate::tenant::CreateTimelineIdempotency,
        cancel: CancellationToken,
    ) -> Arc<Self> {
        let disk_consistent_lsn = metadata.disk_consistent_lsn();
@@ -2250,7 +2257,7 @@ impl Timeline {
                eviction_task_timeline_state: tokio::sync::Mutex::new(
                    EvictionTaskTimelineState::default(),
                ),
-                delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTimelineFlow::default())),
+                delete_progress: TimelineDeleteProgress::default(),

                cancel,
                gate: Gate::default(),
@@ -2272,6 +2279,8 @@ impl Timeline {
                handles: Default::default(),

                attach_wal_lag_cooldown,
+
+                create_idempotency,
            };

            result.repartition_threshold =
@@ -2402,7 +2411,7 @@ impl Timeline {
    pub(super) async fn load_layer_map(
        &self,
        disk_consistent_lsn: Lsn,
-        index_part: Option<IndexPart>,
+        index_part: IndexPart,
    ) -> anyhow::Result<()> {
        use init::{Decision::*, Discovered, DismissedLayer};
        use LayerName::*;
@@ -2466,8 +2475,7 @@ impl Timeline {
                    );
                }

-                let decided =
-                    init::reconcile(discovered_layers, index_part.as_ref(), disk_consistent_lsn);
+                let decided = init::reconcile(discovered_layers, &index_part, disk_consistent_lsn);

                let mut loaded_layers = Vec::new();
                let mut needs_cleanup = Vec::new();
@@ -5815,17 +5823,15 @@ fn is_send() {
 #[cfg(test)]
 mod tests {
    use pageserver_api::key::Key;
+    use pageserver_api::value::Value;
    use utils::{id::TimelineId, lsn::Lsn};

-    use crate::{
-        repository::Value,
-        tenant::{
-            harness::{test_img, TenantHarness},
-            layer_map::LayerMap,
-            storage_layer::{Layer, LayerName},
-            timeline::{DeltaLayerTestDesc, EvictionError},
-            Timeline,
-        },
+    use crate::tenant::{
+        harness::{test_img, TenantHarness},
+        layer_map::LayerMap,
+        storage_layer::{Layer, LayerName},
+        timeline::{DeltaLayerTestDesc, EvictionError},
+        Timeline,
    };

    #[tokio::test]
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -32,11 +32,11 @@ use crate::page_cache;
 use crate::statvfs::Statvfs;
 use crate::tenant::checks::check_valid_layermap;
 use crate::tenant::remote_timeline_client::WaitCompletionError;
+use crate::tenant::storage_layer::batch_split_writer::{
+    BatchWriterResult, SplitDeltaLayerWriter, SplitImageLayerWriter,
+};
 use crate::tenant::storage_layer::filter_iterator::FilterIterator;
 use crate::tenant::storage_layer::merge_iterator::MergeIterator;
-use crate::tenant::storage_layer::split_writer::{
-    SplitDeltaLayerWriter, SplitImageLayerWriter, SplitWriterResult,
-};
 use crate::tenant::storage_layer::{
    AsLayerDesc, PersistentLayerDesc, PersistentLayerKey, ValueReconstructState,
 };
@@ -49,9 +49,10 @@ use pageserver_api::config::tenant_conf_defaults::{
    DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD,
 };

-use crate::keyspace::KeySpace;
-use crate::repository::{Key, Value};
-use crate::walrecord::NeonWalRecord;
+use pageserver_api::key::Key;
+use pageserver_api::keyspace::KeySpace;
+use pageserver_api::record::NeonWalRecord;
+use pageserver_api::value::Value;

 use utils::lsn::Lsn;

@@ -121,18 +122,12 @@ impl KeyHistoryRetention {
    async fn pipe_to(
        self,
        key: Key,
-        tline: &Arc<Timeline>,
        delta_writer: &mut SplitDeltaLayerWriter,
        mut image_writer: Option<&mut SplitImageLayerWriter>,
        stat: &mut CompactionStatistics,
-        dry_run: bool,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        let mut first_batch = true;
-        let discard = |key: &PersistentLayerKey| {
-            let key = key.clone();
-            async move { Self::discard_key(&key, tline, dry_run).await }
-        };
        for (cutoff_lsn, KeyLogAtLsn(logs)) in self.below_horizon {
            if first_batch {
                if logs.len() == 1 && logs[0].1.is_image() {
@@ -144,40 +139,27 @@ impl KeyHistoryRetention {
                        image_writer.put_image(key, img.clone(), ctx).await?;
                    } else {
                        delta_writer
-                            .put_value_with_discard_fn(
-                                key,
-                                cutoff_lsn,
-                                Value::Image(img.clone()),
-                                tline,
-                                ctx,
-                                discard,
-                            )
+                            .put_value(key, cutoff_lsn, Value::Image(img.clone()), ctx)
                            .await?;
                    }
                } else {
                    for (lsn, val) in logs {
                        stat.produce_key(&val);
-                        delta_writer
-                            .put_value_with_discard_fn(key, lsn, val, tline, ctx, discard)
-                            .await?;
+                        delta_writer.put_value(key, lsn, val, ctx).await?;
                    }
                }
                first_batch = false;
            } else {
                for (lsn, val) in logs {
                    stat.produce_key(&val);
-                    delta_writer
-                        .put_value_with_discard_fn(key, lsn, val, tline, ctx, discard)
-                        .await?;
+                    delta_writer.put_value(key, lsn, val, ctx).await?;
                }
            }
        }
        let KeyLogAtLsn(above_horizon_logs) = self.above_horizon;
        for (lsn, val) in above_horizon_logs {
            stat.produce_key(&val);
-            delta_writer
-                .put_value_with_discard_fn(key, lsn, val, tline, ctx, discard)
-                .await?;
+            delta_writer.put_value(key, lsn, val, ctx).await?;
        }
        Ok(())
    }
@@ -853,7 +835,12 @@ impl Timeline {
                if self.cancel.is_cancelled() {
                    return Err(CompactionError::ShuttingDown);
                }
-                all_keys.extend(l.load_keys(ctx).await.map_err(CompactionError::Other)?);
+                let delta = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?;
+                let keys = delta
+                    .index_entries(ctx)
+                    .await
+                    .map_err(CompactionError::Other)?;
+                all_keys.extend(keys);
            }
            // The current stdlib sorting implementation is designed in a way where it is
            // particularly fast where the slice is made up of sorted sub-ranges.
@@ -1729,20 +1716,32 @@ impl Timeline {
        Ok(())
    }

-    /// An experimental compaction building block that combines compaction with garbage collection.
-    ///
-    /// The current implementation picks all delta + image layers that are below or intersecting with
-    /// the GC horizon without considering retain_lsns. Then, it does a full compaction over all these delta
-    /// layers and image layers, which generates image layers on the gc horizon, drop deltas below gc horizon,
-    /// and create delta layers with all deltas >= gc horizon.
    pub(crate) async fn compact_with_gc(
        self: &Arc<Self>,
        cancel: &CancellationToken,
        flags: EnumSet<CompactFlags>,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
-        use std::collections::BTreeSet;
+        self.partial_compact_with_gc(None, cancel, flags, ctx).await
+    }

+    /// An experimental compaction building block that combines compaction with garbage collection.
+    ///
+    /// The current implementation picks all delta + image layers that are below or intersecting with
+    /// the GC horizon without considering retain_lsns. Then, it does a full compaction over all these delta
+    /// layers and image layers, which generates image layers on the gc horizon, drop deltas below gc horizon,
+    /// and create delta layers with all deltas >= gc horizon.
+    ///
+    /// If `compaction_key_range` is set, it will only compact the keys within the range, aka partial compaction. This functionality
+    /// is not complete yet, and if it is set, only image layers will be generated.
+    ///
+    pub(crate) async fn partial_compact_with_gc(
+        self: &Arc<Self>,
+        compaction_key_range: Option<Range<Key>>,
+        cancel: &CancellationToken,
+        flags: EnumSet<CompactFlags>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
        // Block other compaction/GC tasks from running for now. GC-compaction could run along
        // with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc.
        // Note that we already acquired the compaction lock when the outer `compact` function gets called.
@@ -1763,8 +1762,13 @@ impl Timeline {
        .await?;

        let dry_run = flags.contains(CompactFlags::DryRun);
+        let partial_compaction = compaction_key_range.is_some();

-        info!("running enhanced gc bottom-most compaction, dry_run={dry_run}");
+        if let Some(ref compaction_key_range) = compaction_key_range {
+            info!("running enhanced gc bottom-most compaction, dry_run={dry_run}, compaction_key_range={}..{}", compaction_key_range.start, compaction_key_range.end);
+        } else {
+            info!("running enhanced gc bottom-most compaction, dry_run={dry_run}");
+        }

        scopeguard::defer! {
            info!("done enhanced gc bottom-most compaction");
@@ -1776,7 +1780,7 @@ impl Timeline {
        // The layer selection has the following properties:
        // 1. If a layer is in the selection, all layers below it are in the selection.
        // 2. Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection.
-        let (layer_selection, gc_cutoff, retain_lsns_below_horizon) = {
+        let (layer_selection, gc_cutoff, retain_lsns_below_horizon) = if !partial_compaction {
            let guard = self.layers.read().await;
            let layers = guard.layer_map()?;
            let gc_info = self.gc_info.read().unwrap();
@@ -1792,7 +1796,7 @@ impl Timeline {
                    retain_lsns_below_horizon.push(*lsn);
                }
            }
-            let mut selected_layers = Vec::new();
+            let mut selected_layers: Vec<Layer> = Vec::new();
            drop(gc_info);
            // Pick all the layers intersect or below the gc_cutoff, get the largest LSN in the selected layers.
            let Some(max_layer_lsn) = layers
@@ -1817,8 +1821,52 @@ impl Timeline {
            }
            retain_lsns_below_horizon.sort();
            (selected_layers, gc_cutoff, retain_lsns_below_horizon)
+        } else {
+            // In case of partial compaction, we currently only support generating image layers, and therefore,
+            // we pick all layers that end at or below the lowest retain_lsn (i.e. do not intersect it) and overlap with the compaction key range.
+            let guard = self.layers.read().await;
+            let layers = guard.layer_map()?;
+            let gc_info = self.gc_info.read().unwrap();
+            let mut min_lsn = gc_info.cutoffs.select_min();
+            for (lsn, _, _) in &gc_info.retain_lsns {
+                if lsn < &min_lsn {
+                    min_lsn = *lsn;
+                }
+            }
+            for lsn in gc_info.leases.keys() {
+                if lsn < &min_lsn {
+                    min_lsn = *lsn;
+                }
+            }
+            let mut selected_layers = Vec::new();
+            drop(gc_info);
+            // |-------| |-------| |-------|
+            // | Delta | | Delta | | Delta | -- min_lsn could be intersecting with the layers
+            // |-------| |-------| |-------| <- we want to pick all the layers below min_lsn, so that
+            // | Delta | | Delta | | Delta |    ...we can remove them after compaction
+            // |-------| |-------| |-------|
+            // Pick all the layers that end at or below min_lsn and overlap with the compaction key range.
+            let Some(compaction_key_range) = compaction_key_range.as_ref() else {
+                unreachable!()
+            };
+            for desc in layers.iter_historic_layers() {
+                if desc.get_lsn_range().end <= min_lsn
+                    && overlaps_with(&desc.key_range, compaction_key_range)
+                {
+                    selected_layers.push(guard.get_from_desc(&desc));
+                }
+            }
+            if selected_layers.is_empty() {
+                info!("no layers to compact with gc");
+                return Ok(());
+            }
+            (selected_layers, min_lsn, Vec::new())
        };
        let lowest_retain_lsn = if self.ancestor_timeline.is_some() {
+            if partial_compaction {
+                warn!("partial compaction cannot run on child branches (for now)");
+                return Ok(());
+            }
            Lsn(self.ancestor_lsn.0 + 1)
        } else {
            let res = retain_lsns_below_horizon
@@ -1846,23 +1894,18 @@ impl Timeline {

        self.check_compaction_space(&layer_selection).await?;

-        // Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs.
-        // Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point.
-        let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?)
+        // Generate statistics for the compaction
        for layer in &layer_selection {
            let desc = layer.layer_desc();
            if desc.is_delta() {
-                // ignore single-key layer files
-                if desc.key_range.start.next() != desc.key_range.end {
-                    let lsn_range = &desc.lsn_range;
-                    lsn_split_point.insert(lsn_range.start);
-                    lsn_split_point.insert(lsn_range.end);
-                }
                stat.visit_delta_layer(desc.file_size());
            } else {
                stat.visit_image_layer(desc.file_size());
            }
        }
+
+        // Step 1: construct a k-merge iterator over all layers.
+        // Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point.
        let layer_names: Vec<crate::tenant::storage_layer::LayerName> = layer_selection
            .iter()
            .map(|layer| layer.layer_desc().layer_name())
@@ -1913,7 +1956,10 @@ impl Timeline {
                    self.conf,
                    self.timeline_id,
                    self.tenant_shard_id,
-                    Key::MIN,
+                    compaction_key_range
+                        .as_ref()
+                        .map(|x| x.start)
+                        .unwrap_or(Key::MIN),
                    lowest_retain_lsn,
                    self.get_compaction_target_size(),
                    ctx,
@@ -1974,59 +2020,71 @@ impl Timeline {
            } else {
                let last_key = last_key.as_mut().unwrap();
                stat.on_unique_key_visited();
-                let retention = self
-                    .generate_key_retention(
-                        *last_key,
-                        &accumulated_values,
-                        gc_cutoff,
-                        &retain_lsns_below_horizon,
-                        COMPACTION_DELTA_THRESHOLD,
-                        get_ancestor_image(self, *last_key, ctx).await?,
-                    )
-                    .await?;
-                // Put the image into the image layer. Currently we have a single big layer for the compaction.
-                retention
-                    .pipe_to(
-                        *last_key,
-                        self,
-                        &mut delta_layer_writer,
-                        image_layer_writer.as_mut(),
-                        &mut stat,
-                        dry_run,
-                        ctx,
-                    )
-                    .await?;
+                let skip_adding_key = if let Some(ref compaction_key_range) = compaction_key_range {
+                    !compaction_key_range.contains(last_key)
+                } else {
+                    false
+                };
+                if !skip_adding_key {
+                    let retention = self
+                        .generate_key_retention(
+                            *last_key,
+                            &accumulated_values,
+                            gc_cutoff,
+                            &retain_lsns_below_horizon,
+                            COMPACTION_DELTA_THRESHOLD,
+                            get_ancestor_image(self, *last_key, ctx).await?,
+                        )
+                        .await?;
+                    // Put the image into the image layer. Currently we have a single big layer for the compaction.
+                    retention
+                        .pipe_to(
+                            *last_key,
+                            &mut delta_layer_writer,
+                            image_layer_writer.as_mut(),
+                            &mut stat,
+                            ctx,
+                        )
+                        .await?;
+                }
                accumulated_values.clear();
                *last_key = key;
                accumulated_values.push((key, lsn, val));
            }
        }

+        // TODO: move the below part to the loop body
        let last_key = last_key.expect("no keys produced during compaction");
-        // TODO: move this part to the loop body
        stat.on_unique_key_visited();
-        let retention = self
-            .generate_key_retention(
-                last_key,
-                &accumulated_values,
-                gc_cutoff,
-                &retain_lsns_below_horizon,
-                COMPACTION_DELTA_THRESHOLD,
-                get_ancestor_image(self, last_key, ctx).await?,
-            )
-            .await?;
-        // Put the image into the image layer. Currently we have a single big layer for the compaction.
-        retention
-            .pipe_to(
-                last_key,
-                self,
-                &mut delta_layer_writer,
-                image_layer_writer.as_mut(),
-                &mut stat,
-                dry_run,
-                ctx,
-            )
-            .await?;
+
+        let skip_adding_key = if let Some(ref compaction_key_range) = compaction_key_range {
+            !compaction_key_range.contains(&last_key)
+        } else {
+            false
+        };
+        if !skip_adding_key {
+            let retention = self
+                .generate_key_retention(
+                    last_key,
+                    &accumulated_values,
+                    gc_cutoff,
+                    &retain_lsns_below_horizon,
+                    COMPACTION_DELTA_THRESHOLD,
+                    get_ancestor_image(self, last_key, ctx).await?,
+                )
+                .await?;
+            // Put the image into the image layer. Currently we have a single big layer for the compaction.
+            retention
+                .pipe_to(
+                    last_key,
+                    &mut delta_layer_writer,
+                    image_layer_writer.as_mut(),
+                    &mut stat,
+                    ctx,
+                )
+                .await?;
+        }
+        // end: move the above part to the loop body

        let discard = |key: &PersistentLayerKey| {
            let key = key.clone();
@@ -2035,8 +2093,12 @@ impl Timeline {

        let produced_image_layers = if let Some(writer) = image_layer_writer {
            if !dry_run {
+                let end_key = compaction_key_range
+                    .as_ref()
+                    .map(|x| x.end)
+                    .unwrap_or(Key::MAX);
                writer
-                    .finish_with_discard_fn(self, ctx, Key::MAX, discard)
+                    .finish_with_discard_fn(self, ctx, end_key, discard)
                    .await?
            } else {
                drop(writer);
@@ -2051,22 +2113,25 @@ impl Timeline {
                .finish_with_discard_fn(self, ctx, discard)
                .await?
        } else {
-            let (layers, _) = delta_layer_writer.take()?;
-            assert!(layers.is_empty(), "delta layers produced in dry run mode?");
+            drop(delta_layer_writer);
            Vec::new()
        };

+        if partial_compaction && !produced_delta_layers.is_empty() {
+            bail!("implementation error: partial compaction should not be producing delta layers (for now)");
+        }
+
        let mut compact_to = Vec::new();
        let mut keep_layers = HashSet::new();
        let produced_delta_layers_len = produced_delta_layers.len();
        let produced_image_layers_len = produced_image_layers.len();
        for action in produced_delta_layers {
            match action {
-                SplitWriterResult::Produced(layer) => {
+                BatchWriterResult::Produced(layer) => {
                    stat.produce_delta_layer(layer.layer_desc().file_size());
                    compact_to.push(layer);
                }
-                SplitWriterResult::Discarded(l) => {
+                BatchWriterResult::Discarded(l) => {
                    keep_layers.insert(l);
                    stat.discard_delta_layer();
                }
@@ -2074,11 +2139,11 @@ impl Timeline {
        }
        for action in produced_image_layers {
            match action {
-                SplitWriterResult::Produced(layer) => {
+                BatchWriterResult::Produced(layer) => {
                    stat.produce_image_layer(layer.layer_desc().file_size());
                    compact_to.push(layer);
                }
-                SplitWriterResult::Discarded(l) => {
+                BatchWriterResult::Discarded(l) => {
                    keep_layers.insert(l);
                    stat.discard_image_layer();
                }
@@ -2086,6 +2151,28 @@ impl Timeline {
        }
        let mut layer_selection = layer_selection;
        layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key()));
+        if let Some(ref compaction_key_range) = compaction_key_range {
+            // Partial compaction might select more data than it processes, e.g., if
+            // the compaction_key_range only partially overlaps:
+            //
+            //         [---compaction_key_range---]
+            //   [---A----][----B----][----C----][----D----]
+            //
+            // A,B,C,D are all in the `layer_selection`. The created image layers contain
+            // whatever is needed from B, C, and from `----]` of A, and from  `[--` of D.
+            //
+            // In contrast, `[--A-` and `--D----]` have not been processed, so, we must
+            // keep that data.
+            //
+            // The solution for now is to keep A and D completely.
+            // (layer_selection is what we'll remove from the layer map, so we keep in it
+            //  only the layers that are fully covered by compaction_key_range).
+            layer_selection.retain(|x| {
+                let key_range = &x.layer_desc().key_range;
+                key_range.start >= compaction_key_range.start
+                    && key_range.end <= compaction_key_range.end
+            });
+        }

        info!(
            "gc-compaction statistics: {}",
@@ -2167,7 +2254,7 @@ struct ResidentDeltaLayer(ResidentLayer);
 struct ResidentImageLayer(ResidentLayer);

 impl CompactionJobExecutor for TimelineAdaptor {
-    type Key = crate::repository::Key;
+    type Key = pageserver_api::key::Key;

    type Layer = OwnArc<PersistentLayerDesc>;
    type DeltaLayer = ResidentDeltaLayer;
@@ -2462,7 +2549,7 @@ impl CompactionDeltaLayer<TimelineAdaptor> for ResidentDeltaLayer {
    type DeltaEntry<'a> = DeltaEntry<'a>;

    async fn load_keys<'a>(&self, ctx: &RequestContext) -> anyhow::Result<Vec<DeltaEntry<'_>>> {
-        self.0.load_keys(ctx).await
+        self.0.get_as_delta(ctx).await?.index_entries(ctx).await
    }
 }

--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -6,7 +6,7 @@ use std::{
 use anyhow::Context;
 use pageserver_api::{models::TimelineState, shard::TenantShardId};
 use tokio::sync::OwnedMutexGuard;
-use tracing::{error, info, instrument, Instrument};
+use tracing::{error, info, info_span, instrument, Instrument};
 use utils::{crashsafe, fs_ext, id::TimelineId, pausable_failpoint};

 use crate::{
@@ -15,7 +15,8 @@ use crate::{
    tenant::{
        metadata::TimelineMetadata,
        remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient},
-        CreateTimelineCause, DeleteTimelineError, Tenant, TimelineOrOffloaded,
+        CreateTimelineCause, DeleteTimelineError, MaybeDeletedIndexPart, Tenant,
+        TimelineOrOffloaded,
    },
 };

@@ -25,12 +26,9 @@ use super::{Timeline, TimelineResources};
 /// during attach or pageserver restart.
 /// See comment in persist_index_part_with_deleted_flag.
 async fn set_deleted_in_remote_index(
-    timeline: &TimelineOrOffloaded,
+    remote_client: &Arc<RemoteTimelineClient>,
 ) -> Result<(), DeleteTimelineError> {
-    let res = timeline
-        .remote_client()
-        .persist_index_part_with_deleted_flag()
-        .await;
+    let res = remote_client.persist_index_part_with_deleted_flag().await;
    match res {
        // If we (now, or already) marked it successfully as deleted, we can proceed
        Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (),
@@ -129,12 +127,10 @@ pub(super) async fn delete_local_timeline_directory(
 }

 /// Removes remote layers and an index file after them.
-async fn delete_remote_layers_and_index(timeline: &TimelineOrOffloaded) -> anyhow::Result<()> {
-    timeline
-        .remote_client()
-        .delete_all()
-        .await
-        .context("delete_all")
+async fn delete_remote_layers_and_index(
+    remote_client: &Arc<RemoteTimelineClient>,
+) -> anyhow::Result<()> {
+    remote_client.delete_all().await.context("delete_all")
 }

 /// It is important that this gets called when DeletionGuard is being held.
@@ -235,7 +231,33 @@ impl DeleteTimelineFlow {
            ))?
        });

-        set_deleted_in_remote_index(&timeline).await?;
+        let remote_client = match timeline.maybe_remote_client() {
+            Some(remote_client) => remote_client,
+            None => {
+                let remote_client = tenant
+                    .build_timeline_client(timeline.timeline_id(), tenant.remote_storage.clone());
+                let result = remote_client
+                    .download_index_file(&tenant.cancel)
+                    .instrument(info_span!("download_index_file"))
+                    .await
+                    .map_err(|e| DeleteTimelineError::Other(anyhow::anyhow!("error: {:?}", e)))?;
+                let index_part = match result {
+                    MaybeDeletedIndexPart::Deleted(p) => {
+                        tracing::info!("Timeline already set as deleted in remote index");
+                        p
+                    }
+                    MaybeDeletedIndexPart::IndexPart(p) => p,
+                };
+                let remote_client = Arc::new(remote_client);
+
+                remote_client
+                    .init_upload_queue(&index_part)
+                    .map_err(DeleteTimelineError::Other)?;
+                remote_client.shutdown().await;
+                remote_client
+            }
+        };
+        set_deleted_in_remote_index(&remote_client).await?;

        fail::fail_point!("timeline-delete-before-schedule", |_| {
            Err(anyhow::anyhow!(
@@ -243,7 +265,13 @@ impl DeleteTimelineFlow {
            ))?
        });

-        Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
+        Self::schedule_background(
+            guard,
+            tenant.conf,
+            Arc::clone(tenant),
+            timeline,
+            remote_client,
+        );

        Ok(())
    }
@@ -283,6 +311,7 @@ impl DeleteTimelineFlow {
                // Important. We dont pass ancestor above because it can be missing.
                // Thus we need to skip the validation here.
                CreateTimelineCause::Delete,
+                crate::tenant::CreateTimelineIdempotency::FailWithConflict, // doesn't matter what we put here
            )
            .context("create_timeline_struct")?;

@@ -301,8 +330,9 @@ impl DeleteTimelineFlow {

        guard.mark_in_progress()?;

+        let remote_client = timeline.remote_client.clone();
        let timeline = TimelineOrOffloaded::Timeline(timeline);
-        Self::schedule_background(guard, tenant.conf, tenant, timeline);
+        Self::schedule_background(guard, tenant.conf, tenant, timeline, remote_client);

        Ok(())
    }
@@ -380,6 +410,7 @@ impl DeleteTimelineFlow {
        conf: &'static PageServerConf,
        tenant: Arc<Tenant>,
        timeline: TimelineOrOffloaded,
+        remote_client: Arc<RemoteTimelineClient>,
    ) {
        let tenant_shard_id = timeline.tenant_shard_id();
        let timeline_id = timeline.timeline_id();
@@ -391,7 +422,7 @@ impl DeleteTimelineFlow {
            Some(timeline_id),
            "timeline_delete",
            async move {
-                if let Err(err) = Self::background(guard, conf, &tenant, &timeline).await {
+                if let Err(err) = Self::background(guard, conf, &tenant, &timeline, remote_client).await {
                    error!("Error: {err:#}");
                    if let TimelineOrOffloaded::Timeline(timeline) = timeline {
                        timeline.set_broken(format!("{err:#}"))
@@ -408,6 +439,7 @@ impl DeleteTimelineFlow {
        conf: &PageServerConf,
        tenant: &Tenant,
        timeline: &TimelineOrOffloaded,
+        remote_client: Arc<RemoteTimelineClient>,
    ) -> Result<(), DeleteTimelineError> {
        // Offloaded timelines have no local state
        // TODO: once we persist offloaded information, delete the timeline from there, too
@@ -415,12 +447,22 @@ impl DeleteTimelineFlow {
            delete_local_timeline_directory(conf, tenant.tenant_shard_id, timeline).await?;
        }

-        delete_remote_layers_and_index(timeline).await?;
+        delete_remote_layers_and_index(&remote_client).await?;

        pausable_failpoint!("in_progress_delete");

        remove_maybe_offloaded_timeline_from_tenant(tenant, timeline, &guard).await?;

+        // This is susceptible to race conditions, i.e. we won't continue deletions if there is a crash
+        // between the deletion of the index-part.json and reaching of this code.
+        // So indeed, the tenant manifest might refer to an offloaded timeline which has already been deleted.
+        // However, we handle this case in tenant loading code so the next time we attach, the issue is
+        // resolved.
+        tenant
+            .store_tenant_manifest()
+            .await
+            .map_err(|e| DeleteTimelineError::Other(anyhow::anyhow!(e)))?;
+
        *guard = Self::Finished;

        Ok(())
--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -29,6 +29,9 @@ pub(crate) enum Error {
    #[error("shutting down, please retry later")]
    ShuttingDown,

+    #[error("archived: {}", .0)]
+    Archived(TimelineId),
+
    #[error(transparent)]
    NotFound(crate::tenant::GetTimelineError),

@@ -79,8 +82,9 @@ impl From<Error> for ApiError {
    fn from(value: Error) -> Self {
        match value {
            Error::NoAncestor => ApiError::Conflict(value.to_string()),
-            Error::TooManyAncestors => ApiError::BadRequest(anyhow::anyhow!("{}", value)),
+            Error::TooManyAncestors => ApiError::BadRequest(anyhow::anyhow!("{value}")),
            Error::ShuttingDown => ApiError::ShuttingDown,
+            Error::Archived(_) => ApiError::BadRequest(anyhow::anyhow!("{value}")),
            Error::OtherTimelineDetachOngoing(_) | Error::FailedToReparentAll => {
                ApiError::ResourceUnavailable(value.to_string().into())
            }
@@ -201,12 +205,18 @@ pub(super) async fn prepare(
        }));
    };

+    if detached.is_archived() != Some(false) {
+        return Err(Archived(detached.timeline_id));
+    }
+
    if !ancestor_lsn.is_valid() {
        // rare case, probably wouldn't even load
        tracing::error!("ancestor is set, but ancestor_lsn is invalid, this timeline needs fixing");
        return Err(NoAncestor);
    }

+    check_no_archived_children_of_ancestor(tenant, detached, &ancestor, ancestor_lsn)?;
+
    if ancestor.ancestor_timeline.is_some() {
        // non-technical requirement; we could flatten N ancestors just as easily but we chose
        // not to, at least initially
@@ -950,3 +960,36 @@ where
        }
    })
 }
+
+/// Refuses a detach that would have to reparent archived or offloaded timelines.
+///
+/// Fails with [`Error::Archived`] naming the first offending timeline found.
+fn check_no_archived_children_of_ancestor(
+    tenant: &Tenant,
+    detached: &Arc<Timeline>,
+    ancestor: &Arc<Timeline>,
+    ancestor_lsn: Lsn,
+) -> Result<(), Error> {
+    let timelines = tenant.timelines.lock().unwrap();
+    let timelines_offloaded = tenant.timelines_offloaded.lock().unwrap();
+    // Loaded timelines: only the reparentable ones are inspected.
+    for candidate in reparentable_timelines(timelines.values(), detached, ancestor, ancestor_lsn) {
+        if let Some(true) = candidate.is_archived() {
+            return Err(Error::Archived(candidate.timeline_id));
+        }
+    }
+    for offloaded in timelines_offloaded.values() {
+        let is_child = offloaded.ancestor_timeline_id == Some(ancestor.timeline_id);
+        // A child with no recorded `ancestor_retain_lsn` blocks the detach, even if it
+        // actually branched off after the branchpoint of the detached timeline: as per
+        // the current design, the ancestor_lsn of flattened timelines is not recorded.
+        // That is a bit unfortunate, but flattening is not supported at the time of
+        // writing anyway. Maybe the data model can evolve in the future.
+        // Children known to retain an LSN later than `ancestor_lsn` are let through.
+        let later = matches!(offloaded.ancestor_retain_lsn, Some(lsn) if lsn > ancestor_lsn);
+        if is_child && !later {
+            return Err(Error::Archived(offloaded.timeline_id));
+        }
+    }
+    Ok(())
+}
--- a/pageserver/src/tenant/timeline/init.rs
+++ b/pageserver/src/tenant/timeline/init.rs
@@ -125,19 +125,9 @@ pub(super) enum DismissedLayer {
 /// Merges local discoveries and remote [`IndexPart`] to a collection of decisions.
 pub(super) fn reconcile(
    local_layers: Vec<(LayerName, LocalLayerFileMetadata)>,
-    index_part: Option<&IndexPart>,
+    index_part: &IndexPart,
    disk_consistent_lsn: Lsn,
 ) -> Vec<(LayerName, Result<Decision, DismissedLayer>)> {
-    let Some(index_part) = index_part else {
-        // If we have no remote metadata, no local layer files are considered valid to load
-        return local_layers
-            .into_iter()
-            .map(|(layer_name, local_metadata)| {
-                (layer_name, Err(DismissedLayer::LocalOnly(local_metadata)))
-            })
-            .collect();
-    };
-
    let mut result = Vec::new();

    let mut remote_layers = HashMap::new();
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -45,13 +45,16 @@ impl LayerManager {
    pub(crate) fn get_from_key(&self, key: &PersistentLayerKey) -> Layer {
        // The assumption for the `expect()` is that all code maintains the following invariant:
        // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
-        self.layers()
-            .get(key)
+        self.try_get_from_key(key)
            .with_context(|| format!("get layer from key: {key}"))
            .expect("not found")
            .clone()
    }

+    pub(crate) fn try_get_from_key(&self, key: &PersistentLayerKey) -> Option<&Layer> {
+        self.layers().get(key) // plain lookup: a missing key yields `None`, no panic here
+    }
+
    pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer {
        self.get_from_key(&desc.key())
    }
--- a/pageserver/src/tenant/timeline/offload.rs
+++ b/pageserver/src/tenant/timeline/offload.rs
@@ -1,17 +1,17 @@
 use std::sync::Arc;

+use super::delete::{delete_local_timeline_directory, DeleteTimelineFlow, DeletionGuard};
+use super::Timeline;
+use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::{OffloadedTimeline, Tenant, TimelineOrOffloaded};

-use super::{
-    delete::{delete_local_timeline_directory, DeleteTimelineFlow, DeletionGuard},
-    Timeline,
-};
-
 pub(crate) async fn offload_timeline(
    tenant: &Tenant,
    timeline: &Arc<Timeline>,
 ) -> anyhow::Result<()> {
+    debug_assert_current_span_has_tenant_and_timeline_id();
    tracing::info!("offloading archived timeline");
+
    let (timeline, guard) = DeleteTimelineFlow::prepare(tenant, timeline.timeline_id)?;

    let TimelineOrOffloaded::Timeline(timeline) = timeline else {
@@ -19,14 +19,28 @@ pub(crate) async fn offload_timeline(
        return Ok(());
    };

+    let is_archived = timeline.is_archived();
+    match is_archived {
+        Some(true) => (),
+        Some(false) => {
+            tracing::warn!(?is_archived, "tried offloading a non-archived timeline");
+            anyhow::bail!("timeline isn't archived");
+        }
+        None => {
+            tracing::warn!(
+                ?is_archived,
+                "tried offloading a timeline where manifest is not yet available"
+            );
+            anyhow::bail!("timeline manifest hasn't been loaded yet");
+        }
+    }
+
    // Now that the Timeline is in Stopping state, request all the related tasks to shut down.
    timeline.shutdown(super::ShutdownMode::Hard).await;

    // TODO extend guard mechanism above with method
    // to make deletions possible while offloading is in progress

-    // TODO mark timeline as offloaded in S3
-
    let conf = &tenant.conf;
    delete_local_timeline_directory(conf, tenant.tenant_shard_id, &timeline).await?;

@@ -36,10 +50,24 @@ pub(crate) async fn offload_timeline(
        let mut offloaded_timelines = tenant.timelines_offloaded.lock().unwrap();
        offloaded_timelines.insert(
            timeline.timeline_id,
-            Arc::new(OffloadedTimeline::from_timeline(&timeline)),
+            Arc::new(
+                OffloadedTimeline::from_timeline(&timeline)
+                    .expect("we checked above that timeline was ready"),
+            ),
        );
    }

+    // Last step: mark timeline as offloaded in S3
+    // TODO: maybe move this step above, right above deletion of the local timeline directory,
+    // then there is no potential race condition where we partially offload a timeline, and
+    // at the next restart attach it again.
+    // For that to happen, we'd need to make the manifest reflect our *intended* state,
+    // not our actual state of offloaded timelines.
+    tenant
+        .store_tenant_manifest()
+        .await
+        .map_err(|e| anyhow::anyhow!(e))?;
+
    Ok(())
 }

--- a/pageserver/src/tenant/timeline/uninit.rs
+++ b/pageserver/src/tenant/timeline/uninit.rs
@@ -5,7 +5,11 @@ use camino::Utf8PathBuf;
 use tracing::{error, info, info_span};
 use utils::{fs_ext, id::TimelineId, lsn::Lsn};

-use crate::{context::RequestContext, import_datadir, tenant::Tenant};
+use crate::{
+    context::RequestContext,
+    import_datadir,
+    tenant::{CreateTimelineIdempotency, Tenant, TimelineOrOffloaded},
+};

 use super::Timeline;

@@ -165,13 +169,17 @@ pub(crate) struct TimelineCreateGuard<'t> {
    owning_tenant: &'t Tenant,
    timeline_id: TimelineId,
    pub(crate) timeline_path: Utf8PathBuf,
+    pub(crate) idempotency: CreateTimelineIdempotency,
 }

 /// Errors when acquiring exclusive access to a timeline ID for creation
 #[derive(thiserror::Error, Debug)]
 pub(crate) enum TimelineExclusionError {
    #[error("Already exists")]
-    AlreadyExists(Arc<Timeline>),
+    AlreadyExists {
+        existing: TimelineOrOffloaded,
+        arg: CreateTimelineIdempotency,
+    },
    #[error("Already creating")]
    AlreadyCreating,

@@ -185,27 +193,42 @@ impl<'t> TimelineCreateGuard<'t> {
        owning_tenant: &'t Tenant,
        timeline_id: TimelineId,
        timeline_path: Utf8PathBuf,
+        idempotency: CreateTimelineIdempotency,
+        allow_offloaded: bool,
    ) -> Result<Self, TimelineExclusionError> {
        // Lock order: this is the only place we take both locks.  During drop() we only
        // lock creating_timelines
        let timelines = owning_tenant.timelines.lock().unwrap();
+        let timelines_offloaded = owning_tenant.timelines_offloaded.lock().unwrap();
        let mut creating_timelines: std::sync::MutexGuard<
            '_,
            std::collections::HashSet<TimelineId>,
        > = owning_tenant.timelines_creating.lock().unwrap();

        if let Some(existing) = timelines.get(&timeline_id) {
-            Err(TimelineExclusionError::AlreadyExists(existing.clone()))
-        } else if creating_timelines.contains(&timeline_id) {
-            Err(TimelineExclusionError::AlreadyCreating)
-        } else {
-            creating_timelines.insert(timeline_id);
-            Ok(Self {
-                owning_tenant,
-                timeline_id,
-                timeline_path,
-            })
+            return Err(TimelineExclusionError::AlreadyExists {
+                existing: TimelineOrOffloaded::Timeline(existing.clone()),
+                arg: idempotency,
+            });
        }
+        if !allow_offloaded {
+            if let Some(existing) = timelines_offloaded.get(&timeline_id) {
+                return Err(TimelineExclusionError::AlreadyExists {
+                    existing: TimelineOrOffloaded::Offloaded(existing.clone()),
+                    arg: idempotency,
+                });
+            }
+        }
+        if creating_timelines.contains(&timeline_id) {
+            return Err(TimelineExclusionError::AlreadyCreating);
+        }
+        creating_timelines.insert(timeline_id);
+        Ok(Self {
+            owning_tenant,
+            timeline_id,
+            timeline_path,
+            idempotency,
+        })
    }
 }

--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -31,11 +31,11 @@ use crate::{
    task_mgr::{TaskKind, WALRECEIVER_RUNTIME},
    tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo},
    walingest::WalIngest,
-    walrecord::{decode_wal_record, DecodedWALRecord},
 };
 use postgres_backend::is_expected_io_error;
 use postgres_connection::PgConnectionConfig;
 use postgres_ffi::waldecoder::WalStreamDecoder;
+use postgres_ffi::walrecord::{decode_wal_record, DecodedWALRecord};
 use utils::{id::NodeId, lsn::Lsn};
 use utils::{pageserver_feedback::PageserverFeedback, sync::gate::GateError};

--- a/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs
+++ b/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs
@@ -16,18 +16,24 @@ use tokio_epoll_uring::{System, SystemHandle};

 use crate::virtual_file::on_fatal_io_error;

-use crate::metrics::tokio_epoll_uring as metrics;
+use crate::metrics::tokio_epoll_uring::{self as metrics, THREAD_LOCAL_METRICS_STORAGE};

 #[derive(Clone)]
 struct ThreadLocalState(Arc<ThreadLocalStateInner>);

 struct ThreadLocalStateInner {
-    cell: tokio::sync::OnceCell<SystemHandle>,
+    cell: tokio::sync::OnceCell<SystemHandle<metrics::ThreadLocalMetrics>>,
    launch_attempts: AtomicU32,
    /// populated through fetch_add from [`THREAD_LOCAL_STATE_ID`]
    thread_local_state_id: u64,
 }

+impl Drop for ThreadLocalStateInner {
+    fn drop(&mut self) {
+        THREAD_LOCAL_METRICS_STORAGE.remove_system(self.thread_local_state_id);
+    }
+}
+
 impl ThreadLocalState {
    pub fn new() -> Self {
        Self(Arc::new(ThreadLocalStateInner {
@@ -71,7 +77,8 @@ pub async fn thread_local_system() -> Handle {
                        &fake_cancel,
                    )
                    .await;
-                    let res = System::launch()
+                    let per_system_metrics = metrics::THREAD_LOCAL_METRICS_STORAGE.register_system(inner.thread_local_state_id);
+                    let res = System::launch_with_metrics(per_system_metrics)
                    // this might move us to another executor thread => loop outside the get_or_try_init, not inside it
                    .await;
                    match res {
@@ -86,6 +93,7 @@ pub async fn thread_local_system() -> Handle {
                                emit_launch_failure_process_stats();
                            });
                            metrics::THREAD_LOCAL_LAUNCH_FAILURES.inc();
+                            metrics::THREAD_LOCAL_METRICS_STORAGE.remove_system(inner.thread_local_state_id);
                            Err(())
                        }
                        // abort the process instead of panicking because pageserver usually becomes half-broken if we panic somewhere.
@@ -115,7 +123,7 @@ fn emit_launch_failure_process_stats() {
    // number of threads
    // rss / system memory usage generally

-    let tokio_epoll_uring::metrics::Metrics {
+    let tokio_epoll_uring::metrics::GlobalMetrics {
        systems_created,
        systems_destroyed,
    } = tokio_epoll_uring::metrics::global();
@@ -182,7 +190,7 @@ fn emit_launch_failure_process_stats() {
 pub struct Handle(ThreadLocalState);

 impl std::ops::Deref for Handle {
-    type Target = SystemHandle;
+    type Target = SystemHandle<metrics::ThreadLocalMetrics>;

    fn deref(&self) -> &Self::Target {
        self.0
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -29,11 +29,11 @@ use crate::metrics::{
    WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM,
    WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_TIME,
 };
-use crate::repository::Key;
-use crate::walrecord::NeonWalRecord;
 use anyhow::Context;
 use bytes::{Bytes, BytesMut};
+use pageserver_api::key::Key;
 use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus};
+use pageserver_api::record::NeonWalRecord;
 use pageserver_api::shard::TenantShardId;
 use std::future::Future;
 use std::sync::Arc;
@@ -548,9 +548,10 @@ impl PostgresRedoManager {
 #[cfg(test)]
 mod tests {
    use super::PostgresRedoManager;
-    use crate::repository::Key;
-    use crate::{config::PageServerConf, walrecord::NeonWalRecord};
+    use crate::config::PageServerConf;
    use bytes::Bytes;
+    use pageserver_api::key::Key;
+    use pageserver_api::record::NeonWalRecord;
    use pageserver_api::shard::TenantShardId;
    use std::str::FromStr;
    use tracing::Instrument;
--- a/pageserver/src/walredo/apply_neon.rs
+++ b/pageserver/src/walredo/apply_neon.rs
@@ -1,8 +1,8 @@
-use crate::walrecord::NeonWalRecord;
 use anyhow::Context;
 use byteorder::{ByteOrder, LittleEndian};
 use bytes::BytesMut;
 use pageserver_api::key::Key;
+use pageserver_api::record::NeonWalRecord;
 use pageserver_api::reltag::SlruKind;
 use postgres_ffi::pg_constants;
 use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM;
@@ -238,7 +238,7 @@ pub(crate) fn apply_in_neon(
            // No-op: this record will never be created in aux v2.
            warn!("AuxFile record should not be created in aux v2");
        }
-        #[cfg(test)]
+        #[cfg(feature = "testing")]
        NeonWalRecord::Test {
            append,
            clear,
--- a/pageserver/src/walredo/process.rs
+++ b/pageserver/src/walredo/process.rs
@@ -8,10 +8,10 @@ use crate::{
    metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
    page_cache::PAGE_SZ,
    span::debug_assert_current_span_has_tenant_id,
-    walrecord::NeonWalRecord,
 };
 use anyhow::Context;
 use bytes::Bytes;
+use pageserver_api::record::NeonWalRecord;
 use pageserver_api::{reltag::RelTag, shard::TenantShardId};
 use postgres_ffi::BLCKSZ;
 #[cfg(feature = "testing")]
--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -8,6 +8,7 @@ OBJS = \
 	file_cache.o \
 	hll.o \
 	libpagestore.o \
+	logical_replication_monitor.o \
 	neon.o \
 	neon_pgversioncompat.o \
 	neon_perf_counters.o \
@@ -15,6 +16,7 @@ OBJS = \
 	neon_walreader.o \
 	pagestore_smgr.o \
 	relsize_cache.o \
+	unstable_extensions.o \
 	walproposer.o \
 	walproposer_pg.o \
 	control_plane_connector.o \
@@ -54,7 +56,7 @@ walproposer-lib: libwalproposer.a;

 .PHONY: libwalproposer.a
 libwalproposer.a: $(WALPROP_OBJS)
-	rm -f $@
+	$(RM) $@
 	$(AR) $(AROPT) $@ $^

 # needs vars:
--- a/pgxn/neon/control_plane_connector.c
+++ b/pgxn/neon/control_plane_connector.c
@@ -18,6 +18,7 @@
 *
 *-------------------------------------------------------------------------
 */
+
 #include "postgres.h"

 #include <curl/curl.h>
@@ -508,6 +509,8 @@ NeonXactCallback(XactEvent event, void *arg)
 static bool
 RoleIsNeonSuperuser(const char *role_name)
 {
+	Assert(role_name);
+
 	return strcmp(role_name, "neon_superuser") == 0;
 }

@@ -670,7 +673,7 @@ HandleCreateRole(CreateRoleStmt *stmt)
 static void
 HandleAlterRole(AlterRoleStmt *stmt)
 {
-	const char *role_name = stmt->role->rolename;
+	char	   *role_name;
 	DefElem    *dpass;
 	ListCell   *option;
 	bool		found = false;
@@ -678,6 +681,7 @@ HandleAlterRole(AlterRoleStmt *stmt)

 	InitRoleTableIfNeeded();

+	role_name = get_rolespec_name(stmt->role);
 	if (RoleIsNeonSuperuser(role_name) && !superuser())
 		elog(ERROR, "can't ALTER neon_superuser");

@@ -689,9 +693,13 @@ HandleAlterRole(AlterRoleStmt *stmt)
 		if (strcmp(defel->defname, "password") == 0)
 			dpass = defel;
 	}
+
 	/* We only care about updates to the password */
 	if (!dpass)
+	{
+		pfree(role_name);
 		return;
+	}

 	entry = hash_search(CurrentDdlTable->role_table,
 						role_name,
@@ -704,6 +712,8 @@ HandleAlterRole(AlterRoleStmt *stmt)
 	else
 		entry->password = NULL;
 	entry->type = Op_Set;
+
+	pfree(role_name);
 }

 static void
@@ -767,7 +777,7 @@ HandleDropRole(DropRoleStmt *stmt)
 		entry->type = Op_Delete;
 		entry->password = NULL;
 		if (!found)
-			memset(entry->old_name, 0, sizeof(entry));
+			memset(entry->old_name, 0, sizeof(entry->old_name));
 	}
 }

--- a/pgxn/neon/logical_replication_monitor.c
+++ b/pgxn/neon/logical_replication_monitor.c
@@ -0,0 +1,253 @@
+#include <limits.h>
+#include <string.h>
+#include <dirent.h>
+#include <signal.h>
+
+#include "postgres.h"
+
+#include "miscadmin.h"
+#include "postmaster/bgworker.h"
+#include "postmaster/interrupt.h"
+#include "replication/slot.h"
+#include "storage/fd.h"
+#include "storage/procsignal.h"
+#include "tcop/tcopprot.h"
+#include "utils/guc.h"
+#include "utils/wait_event.h"
+
+#include "logical_replication_monitor.h"
+
+#define LS_MONITOR_CHECK_INTERVAL 10000 /* ms */
+
+static int	logical_replication_max_snap_files = 300;
+
+PGDLLEXPORT void LogicalSlotsMonitorMain(Datum main_arg);
+
+static int
+LsnDescComparator(const void *a, const void *b)
+{
+	const XLogRecPtr left = *((const XLogRecPtr *) a);
+	const XLogRecPtr right = *((const XLogRecPtr *) b);
+
+	/* qsort() comparator for descending order: the larger LSN sorts first */
+	if (left > right)
+		return -1;
+	if (left < right)
+		return 1;
+	return 0;
+}
+
+/*
+ * Look at .snap files and calculate the minimum allowed restart_lsn of a slot
+ * so that the next gc leaves no more than logical_replication_max_snap_files
+ * of them; slots with a lower restart_lsn should be dropped. 0 = drop nothing.
+ */
+static XLogRecPtr
+get_num_snap_files_lsn_threshold(void)
+{
+	DIR		   *dirdesc;
+	struct dirent *de;
+	const char *snap_path = "pg_logical/snapshots/";
+	int			lsns_allocated = 1024;
+	int			lsns_num = 0;
+	XLogRecPtr *lsns;
+	XLogRecPtr	cutoff;
+
+	/* -1 disables the limit; 0 is treated the same, it cannot index lsns[] */
+	if (logical_replication_max_snap_files <= 0)
+		return 0;
+
+	/* find all .snap files and get their lsns */
+	lsns = palloc(sizeof(XLogRecPtr) * lsns_allocated);
+	dirdesc = AllocateDir(snap_path);
+	while ((de = ReadDir(dirdesc, snap_path)) != NULL)
+	{
+		XLogRecPtr	lsn;
+		uint32		hi;
+		uint32		lo;
+
+		if (strcmp(de->d_name, ".") == 0 ||
+			strcmp(de->d_name, "..") == 0)
+			continue;
+
+		if (sscanf(de->d_name, "%X-%X.snap", &hi, &lo) != 2)
+		{
+			ereport(LOG,
+					(errmsg("could not parse file name as .snap file \"%s\"", de->d_name)));
+			continue;
+		}
+
+		lsn = ((uint64) hi) << 32 | lo;
+		elog(DEBUG5, "found snap file %X/%X", LSN_FORMAT_ARGS(lsn));
+		if (lsns_allocated == lsns_num)
+		{
+			lsns_allocated *= 2;
+			lsns = repalloc(lsns, sizeof(XLogRecPtr) * lsns_allocated);
+		}
+		lsns[lsns_num++] = lsn;
+	}
+	/* sort by lsn desc */
+	qsort(lsns, lsns_num, sizeof(XLogRecPtr), LsnDescComparator);
+	/* fewer files than the limit: nothing has to be dropped */
+	if (logical_replication_max_snap_files > lsns_num)
+		cutoff = 0;
+	/* otherwise the cutoff is the lsn of the limit-th newest .snap file */
+	else
+	{
+		cutoff = lsns[logical_replication_max_snap_files - 1];
+		elog(LOG, "ls_monitor: dropping logical slots with restart_lsn lower %X/%X, found %d .snap files, limit is %d",
+			 LSN_FORMAT_ARGS(cutoff), lsns_num, logical_replication_max_snap_files);
+	}
+	pfree(lsns);
+	FreeDir(dirdesc);
+	return cutoff;
+}
+
+/* Define the snap-file limit GUC and register the monitor background worker. */
+void
+InitLogicalReplicationMonitor(void)
+{
+	BackgroundWorker worker;
+
+	DefineCustomIntVariable(
+							"neon.logical_replication_max_snap_files",
+							"Maximum allowed logical replication .snap files. When exceeded, slots are dropped until the limit is met. -1 disables the limit.",
+							NULL,
+							&logical_replication_max_snap_files,
+							300, -1, INT_MAX,
+							PGC_SIGHUP, 0,
+							NULL, NULL, NULL);
+
+	memset(&worker, 0, sizeof(worker));
+	snprintf(worker.bgw_name, BGW_MAXLEN, "Logical replication monitor");
+	snprintf(worker.bgw_type, BGW_MAXLEN, "Logical replication monitor");
+	snprintf(worker.bgw_library_name, BGW_MAXLEN, "neon");
+	snprintf(worker.bgw_function_name, BGW_MAXLEN, "LogicalSlotsMonitorMain");
+	worker.bgw_flags = BGWORKER_SHMEM_ACCESS;
+	worker.bgw_start_time = BgWorkerStart_RecoveryFinished;
+	worker.bgw_restart_time = 5;
+	worker.bgw_main_arg = (Datum) 0;
+	worker.bgw_notify_pid = 0;
+
+	RegisterBackgroundWorker(&worker);
+}
+
+/*
+ * Unused logical replication slots pin WAL and prevent deletion of snapshots.
+ * WAL bloat is guarded by max_slot_wal_keep_size; this bgw removes slots which
+ * need too many .snap files.
+ */
+void
+LogicalSlotsMonitorMain(Datum main_arg)
+{
+	/* Establish signal handlers. */
+	pqsignal(SIGUSR1, procsignal_sigusr1_handler);
+	pqsignal(SIGHUP, SignalHandlerForConfigReload);
+	pqsignal(SIGTERM, die);
+
+	BackgroundWorkerUnblockSignals();
+
+	for (;;)
+	{
+		XLogRecPtr	cutoff_lsn;
+
+		/* In case of a SIGHUP, just reload the configuration. */
+		if (ConfigReloadPending)
+		{
+			ConfigReloadPending = false;
+			ProcessConfigFile(PGC_SIGHUP);
+		}
+
+		/*
+		 * If there are too many .snap files, drop the logical slots whose
+		 * restart_lsn is below the cutoff to prevent aux files bloat.
+		 */
+		cutoff_lsn = get_num_snap_files_lsn_threshold();
+		if (cutoff_lsn > 0)
+		{
+			for (int i = 0; i < max_replication_slots; i++)
+			{
+				char		slot_name[NAMEDATALEN];
+				ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
+				XLogRecPtr	restart_lsn;
+
+				/* inspect the slot while holding the control lock */
+				LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
+				/* Consider only logical replication slots */
+				if (!s->in_use || !SlotIsLogical(s))
+				{
+					LWLockRelease(ReplicationSlotControlLock);
+					continue;
+				}
+
+				/* do we need to drop it? */
+				SpinLockAcquire(&s->mutex);
+				restart_lsn = s->data.restart_lsn;
+				SpinLockRelease(&s->mutex);
+				if (restart_lsn >= cutoff_lsn)
+				{
+					LWLockRelease(ReplicationSlotControlLock);
+					continue;
+				}
+
+				strlcpy(slot_name, s->data.name.data, NAMEDATALEN);
+				elog(LOG, "ls_monitor: dropping slot %s with restart_lsn %X/%X below horizon %X/%X",
+					 slot_name, LSN_FORMAT_ARGS(restart_lsn), LSN_FORMAT_ARGS(cutoff_lsn));
+				LWLockRelease(ReplicationSlotControlLock);
+
+				/* now try to drop it, killing owner before if any */
+				for (;;)
+				{
+					pid_t		active_pid;
+
+					SpinLockAcquire(&s->mutex);
+					active_pid = s->active_pid;
+					SpinLockRelease(&s->mutex);
+
+					if (active_pid == 0)
+					{
+						/*
+						 * Slot is released, try to drop it. Though of course
+						 * it could have been reacquired, so drop can ERROR
+						 * out. Similarly it could have been dropped in the
+						 * meanwhile.
+						 *
+						 * In principle we could remove pg_try/pg_catch; an
+						 * ERROR would then restart the whole bgworker.
+						 */
+						ConditionVariableCancelSleep();
+						PG_TRY();
+						{
+							ReplicationSlotDrop(slot_name, true);
+							elog(LOG, "ls_monitor: slot %s dropped", slot_name);
+						}
+						PG_CATCH();
+						{
+							/* log ERROR and reset elog stack */
+							EmitErrorReport();
+							FlushErrorState();
+							elog(LOG, "ls_monitor: failed to drop slot %s", slot_name);
+						}
+						PG_END_TRY();
+						break;
+					}
+					else
+					{
+						/* kill the owner and wait for release */
+						elog(LOG, "ls_monitor: killing slot %s owner %d", slot_name, active_pid);
+						(void) kill(active_pid, SIGTERM);
+						/* We shouldn't get stuck, but to be safe add timeout. */
+						ConditionVariableTimedSleep(&s->active_cv, 1000, WAIT_EVENT_REPLICATION_SLOT_DROP);
+					}
+				}
+			}
+		}
+
+		(void) WaitLatch(MyLatch,
+						 WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | WL_TIMEOUT,
+						 LS_MONITOR_CHECK_INTERVAL,
+						 PG_WAIT_EXTENSION);
+		ResetLatch(MyLatch);
+		CHECK_FOR_INTERRUPTS();
+	}
+}
--- a/pgxn/neon/logical_replication_monitor.h
+++ b/pgxn/neon/logical_replication_monitor.h
@@ -0,0 +1,6 @@
+#ifndef __NEON_LOGICAL_REPLICATION_MONITOR_H__
+#define __NEON_LOGICAL_REPLICATION_MONITOR_H__
+
+void InitLogicalReplicationMonitor(void);
+
+#endif
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -14,32 +14,23 @@
 #include "miscadmin.h"
 #include "access/subtrans.h"
 #include "access/twophase.h"
-#include "access/xact.h"
 #include "access/xlog.h"
-#include "storage/buf_internals.h"
-#include "storage/bufmgr.h"
-#include "catalog/pg_type.h"
-#include "postmaster/bgworker.h"
-#include "postmaster/interrupt.h"
 #include "replication/logical.h"
 #include "replication/slot.h"
 #include "replication/walsender.h"
 #include "storage/proc.h"
-#include "storage/procsignal.h"
-#include "tcop/tcopprot.h"
 #include "funcapi.h"
 #include "access/htup_details.h"
 #include "utils/builtins.h"
 #include "utils/pg_lsn.h"
 #include "utils/guc.h"
 #include "utils/guc_tables.h"
-#include "utils/wait_event.h"

 #include "extension_server.h"
 #include "neon.h"
-#include "walproposer.h"
-#include "pagestore_client.h"
 #include "control_plane_connector.h"
+#include "logical_replication_monitor.h"
+#include "unstable_extensions.h"
 #include "walsender_hooks.h"
 #if PG_MAJORVERSION_NUM >= 16
 #include "storage/ipc.h"
@@ -48,7 +39,6 @@
 PG_MODULE_MAGIC;
 void		_PG_init(void);

-static int	logical_replication_max_snap_files = 300;

 static int  running_xacts_overflow_policy;

@@ -82,237 +72,6 @@ static const struct config_enum_entry running_xacts_overflow_policies[] = {
 	{NULL, 0, false}
 };

-static void
-InitLogicalReplicationMonitor(void)
-{
-	BackgroundWorker bgw;
-
-	DefineCustomIntVariable(
-							"neon.logical_replication_max_snap_files",
-							"Maximum allowed logical replication .snap files. When exceeded, slots are dropped until the limit is met. -1 disables the limit.",
-							NULL,
-							&logical_replication_max_snap_files,
-							300, -1, INT_MAX,
-							PGC_SIGHUP,
-							0,
-							NULL, NULL, NULL);
-
-	memset(&bgw, 0, sizeof(bgw));
-	bgw.bgw_flags = BGWORKER_SHMEM_ACCESS;
-	bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
-	snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon");
-	snprintf(bgw.bgw_function_name, BGW_MAXLEN, "LogicalSlotsMonitorMain");
-	snprintf(bgw.bgw_name, BGW_MAXLEN, "Logical replication monitor");
-	snprintf(bgw.bgw_type, BGW_MAXLEN, "Logical replication monitor");
-	bgw.bgw_restart_time = 5;
-	bgw.bgw_notify_pid = 0;
-	bgw.bgw_main_arg = (Datum) 0;
-
-	RegisterBackgroundWorker(&bgw);
-}
-
-static int
-LsnDescComparator(const void *a, const void *b)
-{
-	XLogRecPtr	lsn1 = *((const XLogRecPtr *) a);
-	XLogRecPtr	lsn2 = *((const XLogRecPtr *) b);
-
-	if (lsn1 < lsn2)
-		return 1;
-	else if (lsn1 == lsn2)
-		return 0;
-	else
-		return -1;
-}
-
-/*
- * Look at .snap files and calculate minimum allowed restart_lsn of slot so that
- * next gc would leave not more than logical_replication_max_snap_files; all
- * slots having lower restart_lsn should be dropped.
- */
-static XLogRecPtr
-get_num_snap_files_lsn_threshold(void)
-{
-	DIR		   *dirdesc;
-	struct dirent *de;
-	char	   *snap_path = "pg_logical/snapshots/";
-	int			lsns_allocated = 1024;
-	int			lsns_num = 0;
-	XLogRecPtr *lsns;
-	XLogRecPtr	cutoff;
-
-	if (logical_replication_max_snap_files < 0)
-		return 0;
-
-	lsns = palloc(sizeof(XLogRecPtr) * lsns_allocated);
-
-	/* find all .snap files and get their lsns */
-	dirdesc = AllocateDir(snap_path);
-	while ((de = ReadDir(dirdesc, snap_path)) != NULL)
-	{
-		XLogRecPtr	lsn;
-		uint32		hi;
-		uint32		lo;
-
-		if (strcmp(de->d_name, ".") == 0 ||
-			strcmp(de->d_name, "..") == 0)
-			continue;
-
-		if (sscanf(de->d_name, "%X-%X.snap", &hi, &lo) != 2)
-		{
-			ereport(LOG,
-					(errmsg("could not parse file name as .snap file \"%s\"", de->d_name)));
-			continue;
-		}
-
-		lsn = ((uint64) hi) << 32 | lo;
-		elog(DEBUG5, "found snap file %X/%X", LSN_FORMAT_ARGS(lsn));
-		if (lsns_allocated == lsns_num)
-		{
-			lsns_allocated *= 2;
-			lsns = repalloc(lsns, sizeof(XLogRecPtr) * lsns_allocated);
-		}
-		lsns[lsns_num++] = lsn;
-	}
-	/* sort by lsn desc */
-	qsort(lsns, lsns_num, sizeof(XLogRecPtr), LsnDescComparator);
-	/* and take cutoff at logical_replication_max_snap_files */
-	if (logical_replication_max_snap_files > lsns_num)
-		cutoff = 0;
-	/* have less files than cutoff */
-	else
-	{
-		cutoff = lsns[logical_replication_max_snap_files - 1];
-		elog(LOG, "ls_monitor: dropping logical slots with restart_lsn lower %X/%X, found %d .snap files, limit is %d",
-			 LSN_FORMAT_ARGS(cutoff), lsns_num, logical_replication_max_snap_files);
-	}
-	pfree(lsns);
-	FreeDir(dirdesc);
-	return cutoff;
-}
-
-#define LS_MONITOR_CHECK_INTERVAL 10000 /* ms */
-
-/*
- * Unused logical replication slots pins WAL and prevents deletion of snapshots.
- * WAL bloat is guarded by max_slot_wal_keep_size; this bgw removes slots which
- * need too many .snap files.
- */
-PGDLLEXPORT void
-LogicalSlotsMonitorMain(Datum main_arg)
-{
-	/* Establish signal handlers. */
-	pqsignal(SIGUSR1, procsignal_sigusr1_handler);
-	pqsignal(SIGHUP, SignalHandlerForConfigReload);
-	pqsignal(SIGTERM, die);
-
-	BackgroundWorkerUnblockSignals();
-
-	for (;;)
-	{
-		XLogRecPtr	cutoff_lsn;
-
-		/* In case of a SIGHUP, just reload the configuration. */
-		if (ConfigReloadPending)
-		{
-			ConfigReloadPending = false;
-			ProcessConfigFile(PGC_SIGHUP);
-		}
-
-		/*
-		 * If there are too many .snap files, just drop all logical slots to
-		 * prevent aux files bloat.
-		 */
-		cutoff_lsn = get_num_snap_files_lsn_threshold();
-		if (cutoff_lsn > 0)
-		{
-			for (int i = 0; i < max_replication_slots; i++)
-			{
-				char		slot_name[NAMEDATALEN];
-				ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
-				XLogRecPtr	restart_lsn;
-
-				/* find the name */
-				LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
-				/* Consider only logical repliction slots */
-				if (!s->in_use || !SlotIsLogical(s))
-				{
-					LWLockRelease(ReplicationSlotControlLock);
-					continue;
-				}
-
-				/* do we need to drop it? */
-				SpinLockAcquire(&s->mutex);
-				restart_lsn = s->data.restart_lsn;
-				SpinLockRelease(&s->mutex);
-				if (restart_lsn >= cutoff_lsn)
-				{
-					LWLockRelease(ReplicationSlotControlLock);
-					continue;
-				}
-
-				strlcpy(slot_name, s->data.name.data, NAMEDATALEN);
-				elog(LOG, "ls_monitor: dropping slot %s with restart_lsn %X/%X below horizon %X/%X",
-					 slot_name, LSN_FORMAT_ARGS(restart_lsn), LSN_FORMAT_ARGS(cutoff_lsn));
-				LWLockRelease(ReplicationSlotControlLock);
-
-				/* now try to drop it, killing owner before if any */
-				for (;;)
-				{
-					pid_t		active_pid;
-
-					SpinLockAcquire(&s->mutex);
-					active_pid = s->active_pid;
-					SpinLockRelease(&s->mutex);
-
-					if (active_pid == 0)
-					{
-						/*
-						 * Slot is releasted, try to drop it. Though of course
-						 * it could have been reacquired, so drop can ERROR
-						 * out. Similarly it could have been dropped in the
-						 * meanwhile.
-						 *
-						 * In principle we could remove pg_try/pg_catch, that
-						 * would restart the whole bgworker.
-						 */
-						ConditionVariableCancelSleep();
-						PG_TRY();
-						{
-							ReplicationSlotDrop(slot_name, true);
-							elog(LOG, "ls_monitor: slot %s dropped", slot_name);
-						}
-						PG_CATCH();
-						{
-							/* log ERROR and reset elog stack */
-							EmitErrorReport();
-							FlushErrorState();
-							elog(LOG, "ls_monitor: failed to drop slot %s", slot_name);
-						}
-						PG_END_TRY();
-						break;
-					}
-					else
-					{
-						/* kill the owner and wait for release */
-						elog(LOG, "ls_monitor: killing slot %s owner %d", slot_name, active_pid);
-						(void) kill(active_pid, SIGTERM);
-						/* We shouldn't get stuck, but to be safe add timeout. */
-						ConditionVariableTimedSleep(&s->active_cv, 1000, WAIT_EVENT_REPLICATION_SLOT_DROP);
-					}
-				}
-			}
-		}
-
-		(void) WaitLatch(MyLatch,
-						 WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | WL_TIMEOUT,
-						 LS_MONITOR_CHECK_INTERVAL,
-						 PG_WAIT_EXTENSION);
-		ResetLatch(MyLatch);
-		CHECK_FOR_INTERRUPTS();
-	}
-}
-
 /*
 * XXX: These private to procarray.c, but we need them here.
 */
@@ -666,8 +425,8 @@ _PG_init(void)
 	LogicalFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
 	SlotFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;

+	InitUnstableExtensionsSupport();
 	InitLogicalReplicationMonitor();
-
 	InitControlPlaneConnector();

 	pg_init_extension_server();
--- a/Show More
+++ b/Show More
@@ -0,0 +1 @@
+SELECT neon.backpressure_throttling_time() AS throttled;