diff --git a/.dockerignore b/.dockerignore
index 1258532db8..c7a2f78e32 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -8,6 +8,7 @@
 !scripts/combine_control_files.py
 !scripts/ninstall.sh
 !vm-cgconfig.conf
+!docker-compose/run-tests.sh
 
 # Directories
 !.cargo/
@@ -20,7 +21,7 @@
 !patches/
 !pgxn/
 !proxy/
-!s3_scrubber/
+!storage_scrubber/
 !safekeeper/
 !storage_broker/
 !storage_controller/
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 1fc0fbb0b6..79a0a77638 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -299,21 +299,21 @@ jobs:
         uses: actions/cache@v4
         with:
           path: pg_install/v14
-          key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}-${{ hashFiles('Dockerfile.build-tools') }}
+          key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
 
       - name: Cache postgres v15 build
         id: cache_pg_15
         uses: actions/cache@v4
         with:
           path: pg_install/v15
-          key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}-${{ hashFiles('Dockerfile.build-tools') }}
+          key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
 
       - name: Cache postgres v16 build
         id: cache_pg_16
         uses: actions/cache@v4
         with:
           path: pg_install/v16
-          key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}-${{ hashFiles('Dockerfile.build-tools') }}
+          key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
 
       - name: Build postgres v14
         if: steps.cache_pg_14.outputs.cache-hit != 'true'
@@ -858,6 +858,26 @@ jobs:
           cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }},mode=max
           tags: |
             neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
+      
+      - name: Build neon extensions test image
+        if: matrix.version == 'v16'
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          build-args: |
+            GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
+            PG_VERSION=${{ matrix.version }}
+            BUILD_TAG=${{ needs.tag.outputs.build-tag }}
+            TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
+          provenance: false
+          push: true
+          pull: true
+          file: Dockerfile.compute-node
+          target: neon-pg-ext-test
+          cache-from: type=registry,ref=neondatabase/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }}
+          cache-to: type=registry,ref=neondatabase/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }},mode=max
+          tags: |
+            neondatabase/neon-test-extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}-${{ matrix.arch }}
 
       - name: Build compute-tools image
         # compute-tools are Postgres independent, so build it only once
@@ -902,6 +922,13 @@ jobs:
                                              neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \
                                              neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64
 
+      - name: Create multi-arch neon-test-extensions image
+        if: matrix.version == 'v16'
+        run: |
+          docker buildx imagetools create -t neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
+                                             neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \
+                                             neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64
+
       - name: Create multi-arch compute-tools image
         if: matrix.version == 'v16'
         run: |
@@ -1020,7 +1047,7 @@ jobs:
             exit 1
           fi
 
-      - name: Verify docker-compose example
+      - name: Verify docker-compose example and test extensions
         timeout-minutes: 20
         run: env TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh
 
diff --git a/Cargo.lock b/Cargo.lock
index dbbf330cf9..1c8a8b0c0f 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5109,54 +5109,6 @@ version = "1.0.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041"
 
-[[package]]
-name = "s3_scrubber"
-version = "0.1.0"
-dependencies = [
- "anyhow",
- "async-stream",
- "aws-config",
- "aws-sdk-s3",
- "aws-smithy-async",
- "bincode",
- "bytes",
- "camino",
- "chrono",
- "clap",
- "crc32c",
- "either",
- "futures",
- "futures-util",
- "hex",
- "histogram",
- "humantime",
- "itertools",
- "once_cell",
- "pageserver",
- "pageserver_api",
- "postgres_ffi",
- "rand 0.8.5",
- "remote_storage",
- "reqwest 0.12.4",
- "rustls 0.22.4",
- "rustls-native-certs 0.7.0",
- "serde",
- "serde_json",
- "serde_with",
- "thiserror",
- "tokio",
- "tokio-postgres",
- "tokio-postgres-rustls",
- "tokio-rustls 0.25.0",
- "tokio-stream",
- "tokio-util",
- "tracing",
- "tracing-appender",
- "tracing-subscriber",
- "utils",
- "workspace_hack",
-]
-
 [[package]]
 name = "safekeeper"
 version = "0.1.0"
@@ -5813,6 +5765,54 @@ dependencies = [
  "workspace_hack",
 ]
 
+[[package]]
+name = "storage_scrubber"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "async-stream",
+ "aws-config",
+ "aws-sdk-s3",
+ "aws-smithy-async",
+ "bincode",
+ "bytes",
+ "camino",
+ "chrono",
+ "clap",
+ "crc32c",
+ "either",
+ "futures",
+ "futures-util",
+ "hex",
+ "histogram",
+ "humantime",
+ "itertools",
+ "once_cell",
+ "pageserver",
+ "pageserver_api",
+ "postgres_ffi",
+ "rand 0.8.5",
+ "remote_storage",
+ "reqwest 0.12.4",
+ "rustls 0.22.4",
+ "rustls-native-certs 0.7.0",
+ "serde",
+ "serde_json",
+ "serde_with",
+ "thiserror",
+ "tokio",
+ "tokio-postgres",
+ "tokio-postgres-rustls",
+ "tokio-rustls 0.25.0",
+ "tokio-stream",
+ "tokio-util",
+ "tracing",
+ "tracing-appender",
+ "tracing-subscriber",
+ "utils",
+ "workspace_hack",
+]
+
 [[package]]
 name = "storcon_cli"
 version = "0.1.0"
@@ -5820,6 +5820,7 @@ dependencies = [
  "anyhow",
  "clap",
  "comfy-table",
+ "futures",
  "humantime",
  "hyper 0.14.26",
  "pageserver_api",
diff --git a/Cargo.toml b/Cargo.toml
index 58715db32b..dc89c2341b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -13,7 +13,7 @@ members = [
     "safekeeper",
     "storage_broker",
     "storage_controller",
-    "s3_scrubber",
+    "storage_scrubber",
     "workspace_hack",
     "trace",
     "libs/compute_api",
diff --git a/Dockerfile b/Dockerfile
index b4900d4a94..5f82df3e18 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -69,6 +69,8 @@ RUN set -e \
     && apt install -y \
         libreadline-dev \
         libseccomp-dev \
+        libicu67 \
+        openssl \
         ca-certificates \
     && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \
     && useradd -d /data neon \
diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools
index 91194eda1a..460b8c996d 100644
--- a/Dockerfile.build-tools
+++ b/Dockerfile.build-tools
@@ -112,35 +112,6 @@ RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JS
     && make install \
     && rm -rf ../lcov.tar.gz
 
-# Compile and install the static OpenSSL library
-ENV OPENSSL_VERSION=3.2.2
-ENV OPENSSL_PREFIX=/usr/local/openssl
-RUN wget -O /tmp/openssl-${OPENSSL_VERSION}.tar.gz https://www.openssl.org/source/openssl-${OPENSSL_VERSION}.tar.gz && \
-    echo "197149c18d9e9f292c43f0400acaba12e5f52cacfe050f3d199277ea738ec2e7 /tmp/openssl-${OPENSSL_VERSION}.tar.gz" | sha256sum --check && \
-    cd /tmp && \
-    tar xzvf /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \
-    rm /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \
-    cd /tmp/openssl-${OPENSSL_VERSION} && \
-    ./config --prefix=${OPENSSL_PREFIX}  -static --static no-shared -fPIC && \
-    make ${MAKE_ARGS} && \
-    make install && \
-    cd /tmp && \
-    rm -rf /tmp/openssl-${OPENSSL_VERSION}
-
-# Set the ICU version
-ENV ICU_VERSION=72.1
-ENV ICU_PREFIX=/usr/local/icu
-
-# Download and build static ICU
-RUN wget https://github.com/unicode-org/icu/releases/download/release-${ICU_VERSION//./-}/icu4c-${ICU_VERSION//./_}-src.tgz && \
-    tar -xzf icu4c-${ICU_VERSION//./_}-src.tgz && \
-    cd icu/source && \
-    ./configure --prefix=${ICU_PREFIX}  --enable-static --enable-shared=no CXXFLAGS="-fPIC" CFLAGS="-fPIC" && \
-    make && \
-    make install && \
-    cd ../.. && \
-    rm -rf icu icu4c-${ICU_VERSION//./_}-src.tgz
-
 # Switch to nonroot user
 USER nonroot:nonroot
 WORKDIR /home/nonroot
@@ -199,6 +170,3 @@ RUN whoami \
     && rustup --version --verbose \
     && rustc --version --verbose \
     && clang --version
-
-# Set following flag to check in Makefile if its running in Docker
-RUN touch /home/nonroot/.docker_build
diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index 90b8868b43..a86fdd0bc3 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -928,6 +928,69 @@ RUN rm -r /usr/local/pgsql/include
 # if they were to be used by other libraries.
 RUN rm /usr/local/pgsql/lib/lib*.a
 
+
+#########################################################################################
+#
+# Layer neon-pg-ext-test
+#
+#########################################################################################
+
+FROM neon-pg-ext-build AS neon-pg-ext-test
+ARG PG_VERSION
+RUN mkdir /ext-src
+
+#COPY --from=postgis-build /postgis.tar.gz /ext-src/
+#COPY --from=postgis-build /sfcgal/* /usr
+COPY --from=plv8-build /plv8.tar.gz /ext-src/
+COPY --from=h3-pg-build /h3-pg.tar.gz /ext-src/
+COPY --from=unit-pg-build /postgresql-unit.tar.gz /ext-src/
+COPY --from=vector-pg-build /pgvector.tar.gz /ext-src/
+COPY --from=vector-pg-build /pgvector.patch /ext-src/
+COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src
+#COPY --from=pg-jsonschema-pg-build /home/nonroot/pg_jsonschema.tar.gz /ext-src
+#COPY --from=pg-graphql-pg-build /home/nonroot/pg_graphql.tar.gz /ext-src
+#COPY --from=pg-tiktoken-pg-build /home/nonroot/pg_tiktoken.tar.gz /ext-src
+COPY --from=hypopg-pg-build /hypopg.tar.gz /ext-src
+COPY --from=pg-hashids-pg-build /pg_hashids.tar.gz /ext-src
+#COPY --from=rum-pg-build /rum.tar.gz /ext-src
+#COPY --from=pgtap-pg-build /pgtap.tar.gz /ext-src
+COPY --from=ip4r-pg-build /ip4r.tar.gz /ext-src
+COPY --from=prefix-pg-build /prefix.tar.gz /ext-src
+COPY --from=hll-pg-build /hll.tar.gz /ext-src
+COPY --from=plpgsql-check-pg-build /plpgsql_check.tar.gz /ext-src
+#COPY --from=timescaledb-pg-build /timescaledb.tar.gz /ext-src
+COPY --from=pg-hint-plan-pg-build /pg_hint_plan.tar.gz /ext-src
+COPY patches/pg_hintplan.patch /ext-src
+#COPY --from=kq-imcx-pg-build /kq_imcx.tar.gz /ext-src
+COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src
+COPY patches/pg_cron.patch /ext-src
+#COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src
+COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src
+COPY --from=pg-uuidv7-pg-build /pg_uuidv7.tar.gz /ext-src
+COPY --from=pg-roaringbitmap-pg-build /pg_roaringbitmap.tar.gz /ext-src
+COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src
+#COPY --from=pg-embedding-pg-build /home/nonroot/pg_embedding-src/ /ext-src
+#COPY --from=wal2json-pg-build /wal2json_2_5.tar.gz /ext-src
+COPY --from=pg-anon-pg-build /pg_anon.tar.gz /ext-src
+COPY patches/pg_anon.patch /ext-src
+COPY --from=pg-ivm-build /pg_ivm.tar.gz /ext-src
+COPY --from=pg-partman-build /pg_partman.tar.gz /ext-src
+RUN cd /ext-src/ && for f in *.tar.gz; \
+    do echo $f; dname=$(echo $f | sed 's/\.tar.*//')-src; \
+    rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \
+    || exit 1; rm -f $f; done
+RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch 
+# cmake is required for the h3 test
+RUN apt-get update && apt-get install -y cmake
+RUN patch -p1 < /ext-src/pg_hintplan.patch
+COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh
+RUN patch -p1 </ext-src/pg_anon.patch
+RUN patch -p1 </ext-src/pg_cron.patch
+ENV PATH=/usr/local/pgsql/bin:$PATH
+ENV PGHOST=compute
+ENV PGPORT=55433
+ENV PGUSER=cloud_admin
+ENV PGDATABASE=postgres
 #########################################################################################
 #
 # Final layer
diff --git a/Makefile b/Makefile
index b5f426344e..37bd19ba44 100644
--- a/Makefile
+++ b/Makefile
@@ -3,9 +3,6 @@ ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
 # Where to install Postgres, default is ./pg_install, maybe useful for package managers
 POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/pg_install/
 
-OPENSSL_PREFIX_DIR := /usr/local/openssl
-ICU_PREFIX_DIR := /usr/local/icu
-
 #
 # We differentiate between release / debug build types using the BUILD_TYPE
 # environment variable.
@@ -23,16 +20,6 @@ else
 	$(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
 endif
 
-ifeq ($(shell test -e /home/nonroot/.docker_build && echo -n yes),yes)
-	# Exclude static build openssl, icu for local build (MacOS, Linux)
-	# Only keep for build type release and debug
-	PG_CFLAGS += -I$(OPENSSL_PREFIX_DIR)/include
-	PG_CONFIGURE_OPTS += --with-icu
-	PG_CONFIGURE_OPTS += ICU_CFLAGS='-I/$(ICU_PREFIX_DIR)/include -DU_STATIC_IMPLEMENTATION'
-	PG_CONFIGURE_OPTS += ICU_LIBS='-L$(ICU_PREFIX_DIR)/lib -L$(ICU_PREFIX_DIR)/lib64 -licui18n -licuuc -licudata -lstdc++ -Wl,-Bdynamic -lm'
-	PG_CONFIGURE_OPTS += LDFLAGS='-L$(OPENSSL_PREFIX_DIR)/lib -L$(OPENSSL_PREFIX_DIR)/lib64 -L$(ICU_PREFIX_DIR)/lib -L$(ICU_PREFIX_DIR)/lib64 -Wl,-Bstatic -lssl -lcrypto -Wl,-Bdynamic -lrt -lm -ldl -lpthread'
-endif
-
 UNAME_S := $(shell uname -s)
 ifeq ($(UNAME_S),Linux)
 	# Seccomp BPF is only available for Linux
@@ -41,7 +28,7 @@ else ifeq ($(UNAME_S),Darwin)
 	ifndef DISABLE_HOMEBREW
 		# macOS with brew-installed openssl requires explicit paths
 		# It can be configured with OPENSSL_PREFIX variable
-		OPENSSL_PREFIX := $(shell brew --prefix openssl@3)
+		OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
 		PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
 		PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig
 		# macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure
@@ -137,6 +124,8 @@ postgres-%: postgres-configure-% \
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install
 	+@echo "Compiling amcheck $*"
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/amcheck install
+	+@echo "Compiling test_decoding $*"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/test_decoding install
 
 .PHONY: postgres-clean-%
 postgres-clean-%:
diff --git a/control_plane/storcon_cli/Cargo.toml b/control_plane/storcon_cli/Cargo.toml
index ed3462961f..f96f0084b2 100644
--- a/control_plane/storcon_cli/Cargo.toml
+++ b/control_plane/storcon_cli/Cargo.toml
@@ -9,6 +9,7 @@ license.workspace = true
 anyhow.workspace = true
 clap.workspace = true
 comfy-table.workspace = true
+futures.workspace = true
 humantime.workspace = true
 hyper.workspace = true
 pageserver_api.workspace = true
diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs
index 05c4acdf90..8c84911d33 100644
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -1,3 +1,4 @@
+use futures::StreamExt;
 use std::{collections::HashMap, str::FromStr, time::Duration};
 
 use clap::{Parser, Subcommand};
@@ -148,6 +149,22 @@ enum Command {
         #[arg(long)]
         threshold: humantime::Duration,
     },
+    // Drain a set of specified pageservers by moving the primary attachments to pageservers
+    // outside of the specified set.
+    Drain {
+        // Set of pageserver node ids to drain.
+        #[arg(long)]
+        nodes: Vec<NodeId>,
+        // Optional: migration concurrency (default is 8)
+        #[arg(long)]
+        concurrency: Option<usize>,
+        // Optional: maximum number of shards to migrate
+        #[arg(long)]
+        max_shards: Option<usize>,
+        // Optional: when set to true, nothing is migrated, but the plan is printed to stdout
+        #[arg(long)]
+        dry_run: Option<bool>,
+    },
 }
 
 #[derive(Parser)]
@@ -737,6 +754,197 @@ async fn main() -> anyhow::Result<()> {
                 })
                 .await?;
         }
+        Command::Drain {
+            nodes,
+            concurrency,
+            max_shards,
+            dry_run,
+        } => {
+            // Load the list of nodes, split them up into the drained and filled sets,
+            // and validate that draining is possible.
+            let node_descs = storcon_client
+                .dispatch::<(), Vec<NodeDescribeResponse>>(
+                    Method::GET,
+                    "control/v1/node".to_string(),
+                    None,
+                )
+                .await?;
+
+            let mut node_to_drain_descs = Vec::new();
+            let mut node_to_fill_descs = Vec::new();
+
+            for desc in node_descs {
+                let to_drain = nodes.iter().any(|id| *id == desc.id);
+                if to_drain {
+                    node_to_drain_descs.push(desc);
+                } else {
+                    node_to_fill_descs.push(desc);
+                }
+            }
+
+            if nodes.len() != node_to_drain_descs.len() {
+                anyhow::bail!("Drain requested for node which doesn't exist.")
+            }
+
+            let can_fill = node_to_fill_descs
+                .iter()
+                .filter(|desc| {
+                    matches!(desc.availability, NodeAvailabilityWrapper::Active)
+                        && matches!(
+                            desc.scheduling,
+                            NodeSchedulingPolicy::Active | NodeSchedulingPolicy::Filling
+                        )
+                })
+                .any(|_| true);
+
+            if !can_fill {
+                anyhow::bail!("There are no nodes to drain to")
+            }
+
+            // Set the node scheduling policy to draining for the nodes which
+            // we plan to drain.
+            for node_desc in node_to_drain_descs.iter() {
+                let req = NodeConfigureRequest {
+                    node_id: node_desc.id,
+                    availability: None,
+                    scheduling: Some(NodeSchedulingPolicy::Draining),
+                };
+
+                storcon_client
+                    .dispatch::<_, ()>(
+                        Method::PUT,
+                        format!("control/v1/node/{}/config", node_desc.id),
+                        Some(req),
+                    )
+                    .await?;
+            }
+
+            // Perform the drain: move each tenant shard scheduled on a node to
+            // be drained to a node which is being filled. A simple round robin
+            // strategy is used to pick the new node.
+            let tenants = storcon_client
+                .dispatch::<(), Vec<TenantDescribeResponse>>(
+                    Method::GET,
+                    "control/v1/tenant".to_string(),
+                    None,
+                )
+                .await?;
+
+            let mut selected_node_idx = 0;
+
+            struct DrainMove {
+                tenant_shard_id: TenantShardId,
+                from: NodeId,
+                to: NodeId,
+            }
+
+            let mut moves: Vec<DrainMove> = Vec::new();
+
+            let shards = tenants
+                .into_iter()
+                .flat_map(|tenant| tenant.shards.into_iter());
+            for shard in shards {
+                if let Some(max_shards) = max_shards {
+                    if moves.len() >= max_shards {
+                        println!(
+                            "Stop planning shard moves since the requested maximum was reached"
+                        );
+                        break;
+                    }
+                }
+
+                let should_migrate = {
+                    if let Some(attached_to) = shard.node_attached {
+                        node_to_drain_descs
+                            .iter()
+                            .map(|desc| desc.id)
+                            .any(|id| id == attached_to)
+                    } else {
+                        false
+                    }
+                };
+
+                if !should_migrate {
+                    continue;
+                }
+
+                moves.push(DrainMove {
+                    tenant_shard_id: shard.tenant_shard_id,
+                    from: shard
+                        .node_attached
+                        .expect("We only migrate attached tenant shards"),
+                    to: node_to_fill_descs[selected_node_idx].id,
+                });
+                selected_node_idx = (selected_node_idx + 1) % node_to_fill_descs.len();
+            }
+
+            let total_moves = moves.len();
+
+            if dry_run == Some(true) {
+                println!("Dryrun requested. Planned {total_moves} moves:");
+                for mv in &moves {
+                    println!("{}: {} -> {}", mv.tenant_shard_id, mv.from, mv.to)
+                }
+
+                return Ok(());
+            }
+
+            const DEFAULT_MIGRATE_CONCURRENCY: usize = 8;
+            let mut stream = futures::stream::iter(moves)
+                .map(|mv| {
+                    let client = Client::new(cli.api.clone(), cli.jwt.clone());
+                    async move {
+                        client
+                            .dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
+                                Method::PUT,
+                                format!("control/v1/tenant/{}/migrate", mv.tenant_shard_id),
+                                Some(TenantShardMigrateRequest {
+                                    tenant_shard_id: mv.tenant_shard_id,
+                                    node_id: mv.to,
+                                }),
+                            )
+                            .await
+                            .map_err(|e| (mv.tenant_shard_id, mv.from, mv.to, e))
+                    }
+                })
+                .buffered(concurrency.unwrap_or(DEFAULT_MIGRATE_CONCURRENCY));
+
+            let mut success = 0;
+            let mut failure = 0;
+
+            while let Some(res) = stream.next().await {
+                match res {
+                    Ok(_) => {
+                        success += 1;
+                    }
+                    Err((tenant_shard_id, from, to, error)) => {
+                        failure += 1;
+                        println!(
+                            "Failed to migrate {} from node {} to node {}: {}",
+                            tenant_shard_id, from, to, error
+                        );
+                    }
+                }
+
+                if (success + failure) % 20 == 0 {
+                    println!(
+                        "Processed {}/{} shards: {} succeeded, {} failed",
+                        success + failure,
+                        total_moves,
+                        success,
+                        failure
+                    );
+                }
+            }
+
+            println!(
+                "Processed {}/{} shards: {} succeeded, {} failed",
+                success + failure,
+                total_moves,
+                success,
+                failure
+            );
+        }
     }
 
     Ok(())
diff --git a/docker-compose/compute_wrapper/Dockerfile b/docker-compose/compute_wrapper/Dockerfile
index 974dcd7f03..8378f37b48 100644
--- a/docker-compose/compute_wrapper/Dockerfile
+++ b/docker-compose/compute_wrapper/Dockerfile
@@ -8,6 +8,11 @@ USER root
 RUN apt-get update &&       \
     apt-get install -y curl \
                        jq   \
+                       python3-pip \
                        netcat
+#Faker is required for the pg_anon test
+RUN pip3 install Faker
+#This is required for the pg_hintplan test
+RUN mkdir -p /ext-src/pg_hint_plan-src && chown postgres /ext-src/pg_hint_plan-src 
 
-USER postgres
+USER postgres
\ No newline at end of file
diff --git a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json
index ccf0a91b90..8e582e74e1 100644
--- a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json
+++ b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json
@@ -95,7 +95,7 @@
             },
             {
                 "name": "shared_preload_libraries",
-                "value": "neon",
+                "value": "neon,pg_cron,timescaledb,pg_stat_statements",
                 "vartype": "string"
             },
             {
@@ -127,6 +127,16 @@
                 "name": "max_replication_flush_lag",
                 "value": "10GB",
                 "vartype": "string"
+            },
+            {
+                "name": "cron.database",
+                "value": "postgres",
+                "vartype": "string"
+            },
+            {
+                "name": "session_preload_libraries",
+                "value": "anon",
+                "vartype": "string"
             }
         ]
     },
diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml
index 9777d1fdd2..3f097f2700 100644
--- a/docker-compose/docker-compose.yml
+++ b/docker-compose/docker-compose.yml
@@ -1,5 +1,3 @@
-version: '3'
-
 services:
   minio:
     restart: always
@@ -161,12 +159,12 @@ services:
       context: ./compute_wrapper/
       args:
         - REPOSITORY=${REPOSITORY:-neondatabase}
-        - COMPUTE_IMAGE=compute-node-v${PG_VERSION:-14}
+        - COMPUTE_IMAGE=compute-node-v${PG_VERSION:-16}
         - TAG=${TAG:-latest}
         - http_proxy=$http_proxy
         - https_proxy=$https_proxy
     environment:
-      - PG_VERSION=${PG_VERSION:-14}
+      - PG_VERSION=${PG_VERSION:-16}
       #- RUST_BACKTRACE=1
     # Mount the test files directly, for faster editing cycle.
     volumes:
@@ -194,3 +192,13 @@ services:
          done"
     depends_on:
       - compute
+
+  neon-test-extensions:
+    image: ${REPOSITORY:-neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-16}:${TAG:-latest}
+    entrypoint:
+      - "/bin/bash"
+      - "-c"
+    command:
+      - sleep 1800
+    depends_on:
+      - compute
diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh
index 062fc6fc92..71e73c2d0a 100755
--- a/docker-compose/docker_compose_test.sh
+++ b/docker-compose/docker_compose_test.sh
@@ -7,15 +7,21 @@
 # Implicitly accepts `REPOSITORY` and `TAG` env vars that are passed into the compose file
 # Their defaults point at DockerHub `neondatabase/neon:latest` image.`,
 # to verify custom image builds (e.g pre-published ones).
-
+#
+# A test script for postgres extensions
+# Currently supports only v16
+#
 set -eux -o pipefail
 
-SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
-COMPOSE_FILE=$SCRIPT_DIR/docker-compose.yml
-
+COMPOSE_FILE='docker-compose.yml'
+cd $(dirname $0)
+docker compose -f $COMPOSE_FILE 
 COMPUTE_CONTAINER_NAME=docker-compose-compute-1
-SQL="CREATE TABLE t(key int primary key, value text); insert into t values(1,1); select * from t;"
-PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -c '$SQL' postgres"
+TEST_CONTAINER_NAME=docker-compose-neon-test-extensions-1
+PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -d postgres"
+: ${http_proxy:=}
+: ${https_proxy:=}
+export http_proxy https_proxy
 
 cleanup() {
     echo "show container information"
@@ -25,34 +31,71 @@ cleanup() {
     docker compose -f $COMPOSE_FILE down
 }
 
-echo "clean up containers if exists"
-cleanup
-
 for pg_version in 14 15 16; do
-    echo "start containers (pg_version=$pg_version)."
-    PG_VERSION=$pg_version docker compose -f $COMPOSE_FILE up --build -d
+    echo "clean up containers if exists"
+    cleanup
+    PG_TEST_VERSION=$(($pg_version < 16 ? 16 : $pg_version))
+    PG_VERSION=$pg_version PG_TEST_VERSION=$PG_TEST_VERSION docker compose -f $COMPOSE_FILE up --build -d
 
     echo "wait until the compute is ready. timeout after 60s. "
     cnt=0
-    while sleep 1; do
+    while sleep 3; do
         # check timeout
-        cnt=`expr $cnt + 1`
+        cnt=`expr $cnt + 3`
         if [ $cnt -gt 60 ]; then
             echo "timeout before the compute is ready."
             cleanup
             exit 1
         fi
-
-        # check if the compute is ready
-        set +o pipefail
-        result=`docker compose -f $COMPOSE_FILE logs "compute_is_ready" | grep "accepting connections" | wc -l`
-        set -o pipefail
-        if [ $result -eq 1 ]; then
+        if docker compose -f $COMPOSE_FILE logs "compute_is_ready" | grep -q "accepting connections"; then
             echo "OK. The compute is ready to connect."
             echo "execute simple queries."
             docker exec $COMPUTE_CONTAINER_NAME /bin/bash -c "psql $PSQL_OPTION"
-            cleanup
             break
         fi
     done
+
+    if [ $pg_version -ge 16 ]
+    then
+        echo Enabling trust connection
+        docker exec $COMPUTE_CONTAINER_NAME bash -c "sed -i '\$d' /var/db/postgres/compute/pg_hba.conf && echo -e 'host\t all\t all\t all\t trust' >> /var/db/postgres/compute/pg_hba.conf && psql $PSQL_OPTION -c 'select pg_reload_conf()' "
+        echo Adding postgres role
+        docker exec $COMPUTE_CONTAINER_NAME psql $PSQL_OPTION -c "CREATE ROLE postgres SUPERUSER LOGIN"
+        # This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail
+        # It cannot be moved to Dockerfile now because the database directory is created after the start of the container
+        echo Adding dummy config
+        docker exec $COMPUTE_CONTAINER_NAME touch /var/db/postgres/compute/compute_ctl_temp_override.conf
+        # This block is required for the pg_anon extension test.
+        # The test assumes that it is running on the same host with the postgres engine.
+        # In our case it's not true, that's why we are copying files to the compute node
+        TMPDIR=$(mktemp -d)
+        docker cp $TEST_CONTAINER_NAME:/ext-src/pg_anon-src/data $TMPDIR/data
+        echo -e '1\t too \t many \t tabs' > $TMPDIR/data/bad.csv
+        docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/tmp/tmp_anon_alternate_data
+        rm -rf $TMPDIR
+        TMPDIR=$(mktemp -d)
+        # The following block does the same for the pg_hintplan test
+        docker cp $TEST_CONTAINER_NAME:/ext-src/pg_hint_plan-src/data $TMPDIR/data
+        docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/ext-src/pg_hint_plan-src/
+        rm -rf $TMPDIR
+        # We are running tests now
+        if docker exec -e SKIP=rum-src,timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \
+            $TEST_CONTAINER_NAME /run-tests.sh | tee testout.txt
+        then
+            cleanup
+        else
+            FAILED=$(tail -1 testout.txt)
+            for d in $FAILED
+            do
+                mkdir $d
+                docker cp $TEST_CONTAINER_NAME:/ext-src/$d/regression.diffs $d || true
+                docker cp $TEST_CONTAINER_NAME:/ext-src/$d/regression.out $d || true
+                cat $d/regression.out $d/regression.diffs || true
+            done
+        rm -rf $FAILED
+        cleanup
+        exit 1
+        fi
+    fi
+    cleanup
 done
diff --git a/docker-compose/run-tests.sh b/docker-compose/run-tests.sh
new file mode 100644
index 0000000000..c05fc159aa
--- /dev/null
+++ b/docker-compose/run-tests.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+set -x
+
+cd /ext-src
+FAILED=
+LIST=$((echo ${SKIP} | sed 's/,/\n/g'; ls -d *-src) | sort | uniq -u)
+for d in ${LIST}
+do
+       [ -d ${d} ] || continue
+    psql -c "select 1" >/dev/null || break
+       make -C ${d} installcheck || FAILED="${d} ${FAILED}"
+done
+[ -z "${FAILED}" ] && exit 0
+echo ${FAILED}
+exit 1
\ No newline at end of file
diff --git a/docs/docker.md b/docs/docker.md
index cbf68be3a7..ccd2afc27a 100644
--- a/docs/docker.md
+++ b/docs/docker.md
@@ -34,12 +34,12 @@ You can see a [docker compose](https://docs.docker.com/compose/) example to crea
 1. create containers
 
 You can specify version of neon cluster using following environment values.
-- PG_VERSION: postgres version for compute (default is 14)
+- PG_VERSION: postgres version for compute (default is 16)
 - TAG: the tag version of [docker image](https://registry.hub.docker.com/r/neondatabase/neon/tags) (default is latest), which is tagged in [CI test](/.github/workflows/build_and_test.yml)
 ```
 $ cd docker-compose/
 $ docker-compose down   # remove the containers if exists
-$ PG_VERSION=15 TAG=2937 docker-compose up --build -d  # You can specify the postgres and image version
+$ PG_VERSION=16 TAG=2937 docker-compose up --build -d  # You can specify the postgres and image version
 Creating network "dockercompose_default" with the default driver
 Creating docker-compose_storage_broker_1       ... done
 (...omit...)
diff --git a/pageserver/ctl/src/index_part.rs b/pageserver/ctl/src/index_part.rs
index a33cae6769..20018846f8 100644
--- a/pageserver/ctl/src/index_part.rs
+++ b/pageserver/ctl/src/index_part.rs
@@ -1,11 +1,6 @@
-use std::collections::HashMap;
-
 use anyhow::Context;
 use camino::Utf8PathBuf;
-use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
-use pageserver::tenant::storage_layer::LayerName;
-use pageserver::tenant::{metadata::TimelineMetadata, IndexPart};
-use utils::lsn::Lsn;
+use pageserver::tenant::IndexPart;
 
 #[derive(clap::Subcommand)]
 pub(crate) enum IndexPartCmd {
@@ -17,20 +12,7 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
         IndexPartCmd::Dump { path } => {
             let bytes = tokio::fs::read(path).await.context("read file")?;
             let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?;
-            #[derive(serde::Serialize)]
-            struct Output<'a> {
-                layer_metadata: &'a HashMap<LayerName, LayerFileMetadata>,
-                disk_consistent_lsn: Lsn,
-                timeline_metadata: &'a TimelineMetadata,
-            }
-
-            let output = Output {
-                layer_metadata: &des.layer_metadata,
-                disk_consistent_lsn: des.metadata.disk_consistent_lsn(),
-                timeline_metadata: &des.metadata,
-            };
-
-            let output = serde_json::to_string_pretty(&output).context("serialize output")?;
+            let output = serde_json::to_string_pretty(&des).context("serialize output")?;
             println!("{output}");
             Ok(())
         }
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 60cd5c9695..f9ed6d3071 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3395,6 +3395,12 @@ impl Tenant {
         let tenant_shard_id = raw_timeline.owning_tenant.tenant_shard_id;
         let unfinished_timeline = raw_timeline.raw_timeline()?;
 
+        // Flush the new layer files to disk, before we make the timeline as available to
+        // the outside world.
+        //
+        // Flush loop needs to be spawned in order to be able to flush.
+        unfinished_timeline.maybe_spawn_flush_loop();
+
         import_datadir::import_timeline_from_postgres_datadir(
             unfinished_timeline,
             &pgdata_path,
@@ -3406,12 +3412,6 @@ impl Tenant {
             format!("Failed to import pgdatadir for timeline {tenant_shard_id}/{timeline_id}")
         })?;
 
-        // Flush the new layer files to disk, before we make the timeline as available to
-        // the outside world.
-        //
-        // Flush loop needs to be spawned in order to be able to flush.
-        unfinished_timeline.maybe_spawn_flush_loop();
-
         fail::fail_point!("before-checkpoint-new-timeline", |_| {
             anyhow::bail!("failpoint before-checkpoint-new-timeline");
         });
@@ -4044,10 +4044,12 @@ mod tests {
     use crate::DEFAULT_PG_VERSION;
     use bytes::{Bytes, BytesMut};
     use hex_literal::hex;
+    use itertools::Itertools;
     use pageserver_api::key::{AUX_FILES_KEY, AUX_KEY_PREFIX, NON_INHERITED_RANGE};
     use pageserver_api::keyspace::KeySpace;
     use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings};
     use rand::{thread_rng, Rng};
+    use storage_layer::PersistentLayerKey;
     use tests::storage_layer::ValuesReconstructState;
     use tests::timeline::{GetVectoredError, ShutdownMode};
     use utils::bin_ser::BeSer;
@@ -6584,8 +6586,8 @@ mod tests {
     }
 
     #[tokio::test]
-    async fn test_metadata_tombstone_image_creation() -> anyhow::Result<()> {
-        let harness = TenantHarness::create("test_metadata_tombstone_image_creation")?;
+    async fn test_metadata_tombstone_image_creation() {
+        let harness = TenantHarness::create("test_metadata_tombstone_image_creation").unwrap();
         let (tenant, ctx) = harness.load().await;
 
         let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap();
@@ -6613,7 +6615,8 @@ mod tests {
                 vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])],
                 Lsn(0x30),
             )
-            .await?;
+            .await
+            .unwrap();
 
         let cancel = CancellationToken::new();
 
@@ -6628,23 +6631,24 @@ mod tests {
                 },
                 &ctx,
             )
-            .await?;
+            .await
+            .unwrap();
 
         // Image layers are created at last_record_lsn
         let images = tline
             .inspect_image_layers(Lsn(0x30), &ctx)
-            .await?
+            .await
+            .unwrap()
             .into_iter()
             .filter(|(k, _)| k.is_metadata_key())
             .collect::<Vec<_>>();
         assert_eq!(images.len(), 2); // the image layer should only contain two existing keys, tombstones should be removed.
-
-        Ok(())
     }
 
     #[tokio::test]
-    async fn test_metadata_tombstone_empty_image_creation() -> anyhow::Result<()> {
-        let harness = TenantHarness::create("test_metadata_tombstone_image_creation")?;
+    async fn test_metadata_tombstone_empty_image_creation() {
+        let harness =
+            TenantHarness::create("test_metadata_tombstone_empty_image_creation").unwrap();
         let (tenant, ctx) = harness.load().await;
 
         let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap();
@@ -6666,7 +6670,8 @@ mod tests {
                 vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])],
                 Lsn(0x30),
             )
-            .await?;
+            .await
+            .unwrap();
 
         let cancel = CancellationToken::new();
 
@@ -6681,16 +6686,174 @@ mod tests {
                 },
                 &ctx,
             )
-            .await?;
+            .await
+            .unwrap();
 
         // Image layers are created at last_record_lsn
         let images = tline
             .inspect_image_layers(Lsn(0x30), &ctx)
-            .await?
+            .await
+            .unwrap()
             .into_iter()
             .filter(|(k, _)| k.is_metadata_key())
             .collect::<Vec<_>>();
         assert_eq!(images.len(), 0); // the image layer should not contain tombstones, or it is not created
+    }
+
+    #[tokio::test]
+    async fn test_simple_bottom_most_compaction() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_simple_bottom_most_compaction")?;
+        let (tenant, ctx) = harness.load().await;
+
+        fn get_key(id: u32) -> Key {
+            // using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
+            let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
+            key.field6 = id;
+            key
+        }
+
+        // We create one bottom-most image layer, a delta layer D1 crossing the GC horizon, D2 below the horizon, and D3 above the horizon.
+        //
+        //  | D1 |                       | D3 |
+        // -|    |-- gc horizon -----------------
+        //  |    |                | D2 |
+        // --------- img layer ------------------
+        //
+        // What we should expact from this compaction is:
+        //  | Part of D1 |               | D3 |
+        // --------- img layer with D1+D2 at GC horizon------------------
+
+        // img layer at 0x10
+        let img_layer = (0..10)
+            .map(|id| (get_key(id), test_img(&format!("value {id}@0x10"))))
+            .collect_vec();
+
+        let delta1 = vec![
+            // TODO: we should test a real delta record here, which requires us to add a variant of NeonWalRecord for testing purpose.
+            (
+                get_key(1),
+                Lsn(0x20),
+                Value::Image(test_img("value 1@0x20")),
+            ),
+            (
+                get_key(2),
+                Lsn(0x30),
+                Value::Image(test_img("value 2@0x30")),
+            ),
+            (
+                get_key(3),
+                Lsn(0x40),
+                Value::Image(test_img("value 3@0x40")),
+            ),
+        ];
+        let delta2 = vec![
+            (
+                get_key(5),
+                Lsn(0x20),
+                Value::Image(test_img("value 5@0x20")),
+            ),
+            (
+                get_key(6),
+                Lsn(0x20),
+                Value::Image(test_img("value 6@0x20")),
+            ),
+        ];
+        let delta3 = vec![
+            (
+                get_key(8),
+                Lsn(0x40),
+                Value::Image(test_img("value 8@0x40")),
+            ),
+            (
+                get_key(9),
+                Lsn(0x40),
+                Value::Image(test_img("value 9@0x40")),
+            ),
+        ];
+
+        let tline = tenant
+            .create_test_timeline_with_layers(
+                TIMELINE_ID,
+                Lsn(0x10),
+                DEFAULT_PG_VERSION,
+                &ctx,
+                vec![delta1, delta2, delta3], // delta layers
+                vec![(Lsn(0x10), img_layer)], // image layers
+                Lsn(0x50),
+            )
+            .await?;
+        {
+            // Update GC info
+            let mut guard = tline.gc_info.write().unwrap();
+            guard.cutoffs.pitr = Lsn(0x30);
+            guard.cutoffs.horizon = Lsn(0x30);
+        }
+
+        let cancel = CancellationToken::new();
+        tline.compact_with_gc(&cancel, &ctx).await.unwrap();
+
+        // Check if the image layer at the GC horizon contains exactly what we want
+        let image_at_gc_horizon = tline
+            .inspect_image_layers(Lsn(0x30), &ctx)
+            .await
+            .unwrap()
+            .into_iter()
+            .filter(|(k, _)| k.is_metadata_key())
+            .collect::<Vec<_>>();
+
+        assert_eq!(image_at_gc_horizon.len(), 10);
+        let expected_lsn = [0x10, 0x20, 0x30, 0x10, 0x10, 0x20, 0x20, 0x10, 0x10, 0x10];
+        for idx in 0..10 {
+            assert_eq!(
+                image_at_gc_horizon[idx],
+                (
+                    get_key(idx as u32),
+                    test_img(&format!("value {idx}@{:#x}", expected_lsn[idx]))
+                )
+            );
+        }
+
+        // Check if old layers are removed / new layers have the expected LSN
+        let mut all_layers = tline.inspect_historic_layers().await.unwrap();
+        all_layers.sort_by(|k1, k2| {
+            (
+                k1.is_delta,
+                k1.key_range.start,
+                k1.key_range.end,
+                k1.lsn_range.start,
+                k1.lsn_range.end,
+            )
+                .cmp(&(
+                    k2.is_delta,
+                    k2.key_range.start,
+                    k2.key_range.end,
+                    k2.lsn_range.start,
+                    k2.lsn_range.end,
+                ))
+        });
+        assert_eq!(
+            all_layers,
+            vec![
+                // Image layer at GC horizon
+                PersistentLayerKey {
+                    key_range: Key::MIN..get_key(10),
+                    lsn_range: Lsn(0x30)..Lsn(0x31),
+                    is_delta: false
+                },
+                // The delta layer that is cut in the middle
+                PersistentLayerKey {
+                    key_range: Key::MIN..get_key(9),
+                    lsn_range: Lsn(0x30)..Lsn(0x41),
+                    is_delta: true
+                },
+                // The delta layer we created and should not be picked for the compaction
+                PersistentLayerKey {
+                    key_range: get_key(8)..get_key(10),
+                    lsn_range: Lsn(0x40)..Lsn(0x41),
+                    is_delta: true
+                }
+            ]
+        );
 
         Ok(())
     }
diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs
index c00672895a..6ba1bdef9b 100644
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -1,15 +1,23 @@
-//! Every image of a certain timeline from [`crate::tenant::Tenant`]
-//! has a metadata that needs to be stored persistently.
+//! Describes the legacy now hopefully no longer modified per-timeline metadata stored in
+//! `index_part.json` managed by [`remote_timeline_client`]. For many tenants and their timelines,
+//! this struct and it's original serialization format is still needed because they were written a
+//! long time ago.
 //!
-//! Later, the file gets used in [`remote_timeline_client`] as a part of
-//! external storage import and export operations.
+//! Instead of changing and adding versioning to this, just change [`IndexPart`] with soft json
+//! versioning.
 //!
-//! The module contains all structs and related helper methods related to timeline metadata.
+//! To clean up this module we need to migrate all index_part.json files to a later version.
+//! While doing this, we need to be mindful about s3 based recovery as well, so it might take
+//! however long we keep the old versions to be able to delete the old code. After that, we can
+//! remove everything else than [`TimelineMetadataBodyV2`], rename it as `TimelineMetadata` and
+//! move it to `index.rs`. Before doing all of this, we need to keep the structures for backwards
+//! compatibility.
 //!
 //! [`remote_timeline_client`]: super::remote_timeline_client
+//! [`IndexPart`]: super::remote_timeline_client::index::IndexPart
 
 use anyhow::ensure;
-use serde::{de::Error, Deserialize, Serialize, Serializer};
+use serde::{Deserialize, Serialize};
 use utils::bin_ser::SerializeError;
 use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn};
 
@@ -17,17 +25,37 @@ use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn};
 const METADATA_FORMAT_VERSION: u16 = 4;
 
 /// Previous supported format versions.
+///
+/// In practice, none of these should remain, all are [`METADATA_FORMAT_VERSION`], but confirming
+/// that requires a scrubber run which is yet to be done.
 const METADATA_OLD_FORMAT_VERSION: u16 = 3;
 
-/// We assume that a write of up to METADATA_MAX_SIZE bytes is atomic.
+/// When the file existed on disk we assumed that a write of up to METADATA_MAX_SIZE bytes is atomic.
 ///
 /// This is the same assumption that PostgreSQL makes with the control file,
+///
 /// see PG_CONTROL_MAX_SAFE_SIZE
 const METADATA_MAX_SIZE: usize = 512;
 
-/// Metadata stored on disk for each timeline
+/// Legacy metadata stored as a component of `index_part.json` per timeline.
 ///
-/// The fields correspond to the values we hold in memory, in Timeline.
+/// Do not make new changes to this type or the module. In production, we have two different kinds
+/// of serializations of this type: bincode and json. Bincode version reflects what used to be
+/// stored on disk in earlier versions and does internal crc32 checksumming.
+///
+/// This type should not implement `serde::Serialize` or `serde::Deserialize` because there would
+/// be a confusion whether you want the old version ([`TimelineMetadata::from_bytes`]) or the modern
+/// as-exists in `index_part.json` ([`self::modern_serde`]).
+///
+/// ```compile_fail
+/// #[derive(serde::Serialize)]
+/// struct DoNotDoThis(pageserver::tenant::metadata::TimelineMetadata);
+/// ```
+///
+/// ```compile_fail
+/// #[derive(serde::Deserialize)]
+/// struct NeitherDoThis(pageserver::tenant::metadata::TimelineMetadata);
+/// ```
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct TimelineMetadata {
     hdr: TimelineMetadataHeader,
@@ -40,6 +68,49 @@ struct TimelineMetadataHeader {
     size: u16,           // size of serialized metadata
     format_version: u16, // metadata format version (used for compatibility checks)
 }
+
+impl TryFrom<&TimelineMetadataBodyV2> for TimelineMetadataHeader {
+    type Error = Crc32CalculationFailed;
+
+    fn try_from(value: &TimelineMetadataBodyV2) -> Result<Self, Self::Error> {
+        #[derive(Default)]
+        struct Crc32Sink {
+            crc: u32,
+            count: usize,
+        }
+
+        impl std::io::Write for Crc32Sink {
+            fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
+                self.crc = crc32c::crc32c_append(self.crc, buf);
+                self.count += buf.len();
+                Ok(buf.len())
+            }
+
+            fn flush(&mut self) -> std::io::Result<()> {
+                Ok(())
+            }
+        }
+
+        // jump through hoops to calculate the crc32 so that TimelineMetadata::ne works
+        // across serialization versions
+        let mut sink = Crc32Sink::default();
+        <TimelineMetadataBodyV2 as utils::bin_ser::BeSer>::ser_into(value, &mut sink)
+            .map_err(Crc32CalculationFailed)?;
+
+        let size = METADATA_HDR_SIZE + sink.count;
+
+        Ok(TimelineMetadataHeader {
+            checksum: sink.crc,
+            size: size as u16,
+            format_version: METADATA_FORMAT_VERSION,
+        })
+    }
+}
+
+#[derive(thiserror::Error, Debug)]
+#[error("re-serializing for crc32 failed")]
+struct Crc32CalculationFailed(#[source] utils::bin_ser::SerializeError);
+
 const METADATA_HDR_SIZE: usize = std::mem::size_of::<TimelineMetadataHeader>();
 
 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -111,6 +182,12 @@ impl TimelineMetadata {
         }
     }
 
+    #[cfg(test)]
+    pub(crate) fn with_recalculated_checksum(mut self) -> anyhow::Result<Self> {
+        self.hdr = TimelineMetadataHeader::try_from(&self.body)?;
+        Ok(self)
+    }
+
     fn upgrade_timeline_metadata(metadata_bytes: &[u8]) -> anyhow::Result<Self> {
         let mut hdr = TimelineMetadataHeader::des(&metadata_bytes[0..METADATA_HDR_SIZE])?;
 
@@ -261,32 +338,8 @@ impl TimelineMetadata {
     }
 }
 
-impl<'de> Deserialize<'de> for TimelineMetadata {
-    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where
-        D: serde::Deserializer<'de>,
-    {
-        let bytes = Vec::<u8>::deserialize(deserializer)?;
-        Self::from_bytes(bytes.as_slice()).map_err(D::Error::custom)
-    }
-}
-
-impl Serialize for TimelineMetadata {
-    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: Serializer,
-    {
-        let bytes = self.to_bytes().map_err(serde::ser::Error::custom)?;
-        bytes.serialize(serializer)
-    }
-}
-
 pub(crate) mod modern_serde {
-    use crate::tenant::metadata::METADATA_FORMAT_VERSION;
-
-    use super::{
-        TimelineMetadata, TimelineMetadataBodyV2, TimelineMetadataHeader, METADATA_HDR_SIZE,
-    };
+    use super::{TimelineMetadata, TimelineMetadataBodyV2, TimelineMetadataHeader};
     use serde::{Deserialize, Serialize};
 
     pub(crate) fn deserialize<'de, D>(deserializer: D) -> Result<TimelineMetadata, D::Error>
@@ -322,71 +375,15 @@ pub(crate) mod modern_serde {
 
                 let de = serde::de::value::MapAccessDeserializer::new(map);
                 let body = TimelineMetadataBodyV2::deserialize(de)?;
+                let hdr = TimelineMetadataHeader::try_from(&body).map_err(A::Error::custom)?;
 
-                // jump through hoops to calculate the crc32 so that TimelineMetadata::ne works
-                // across serialization versions
-                let mut sink = Crc32Sink::default();
-                <TimelineMetadataBodyV2 as utils::bin_ser::BeSer>::ser_into(&body, &mut sink)
-                    .map_err(|e| A::Error::custom(Crc32CalculationFailed(e)))?;
-
-                let size = METADATA_HDR_SIZE + sink.count;
-
-                Ok(TimelineMetadata {
-                    hdr: TimelineMetadataHeader {
-                        checksum: sink.crc,
-                        size: size as u16,
-                        format_version: METADATA_FORMAT_VERSION,
-                    },
-                    body,
-                })
+                Ok(TimelineMetadata { hdr, body })
             }
         }
 
         deserializer.deserialize_any(Visitor)
     }
 
-    #[derive(Default)]
-    struct Crc32Sink {
-        crc: u32,
-        count: usize,
-    }
-
-    impl std::io::Write for Crc32Sink {
-        fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
-            self.crc = crc32c::crc32c_append(self.crc, buf);
-            self.count += buf.len();
-            Ok(buf.len())
-        }
-
-        fn flush(&mut self) -> std::io::Result<()> {
-            Ok(())
-        }
-    }
-
-    #[derive(thiserror::Error)]
-    #[error("re-serializing for crc32 failed")]
-    struct Crc32CalculationFailed<E>(#[source] E);
-
-    // this should be true for one release, after that we can change it to false
-    // remember to check the IndexPart::metadata field TODO comment as well
-    const LEGACY_BINCODED_BYTES: bool = true;
-
-    #[derive(serde::Serialize)]
-    #[serde(transparent)]
-    struct LegacyPaddedBytes<'a>(&'a TimelineMetadata);
-
-    struct JustTheBodyV2<'a>(&'a TimelineMetadata);
-
-    impl serde::Serialize for JustTheBodyV2<'_> {
-        fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-        where
-            S: serde::Serializer,
-        {
-            // header is not needed, upon reading we've upgraded all v1 to v2
-            self.0.body.serialize(serializer)
-        }
-    }
-
     pub(crate) fn serialize<S>(
         metadata: &TimelineMetadata,
         serializer: S,
@@ -394,25 +391,23 @@ pub(crate) mod modern_serde {
     where
         S: serde::Serializer,
     {
-        // we cannot use TimelineMetadata::serialize for now because it'll do
-        // TimelineMetadata::to_bytes
-        if LEGACY_BINCODED_BYTES {
-            LegacyPaddedBytes(metadata).serialize(serializer)
-        } else {
-            JustTheBodyV2(metadata).serialize(serializer)
-        }
+        // header is not needed, upon reading we've upgraded all v1 to v2
+        metadata.body.serialize(serializer)
     }
 
     #[test]
     fn deserializes_bytes_as_well_as_equivalent_body_v2() {
         #[derive(serde::Deserialize, serde::Serialize)]
-        struct Wrapper(#[serde(deserialize_with = "deserialize")] TimelineMetadata);
+        struct Wrapper(
+            #[serde(deserialize_with = "deserialize", serialize_with = "serialize")]
+            TimelineMetadata,
+        );
 
         let too_many_bytes = "[216,111,252,208,0,54,0,4,0,0,0,0,1,73,253,144,1,0,0,0,0,1,73,253,24,0,0,0,0,0,0,0,0,0,0,0,0,0,1,73,253,24,0,0,0,0,1,73,253,24,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]";
 
         let wrapper_from_bytes = serde_json::from_str::<Wrapper>(too_many_bytes).unwrap();
 
-        let serialized = serde_json::to_value(JustTheBodyV2(&wrapper_from_bytes.0)).unwrap();
+        let serialized = serde_json::to_value(&wrapper_from_bytes).unwrap();
 
         assert_eq!(
             serialized,
@@ -553,59 +548,6 @@ mod tests {
         );
     }
 
-    #[test]
-    fn test_metadata_bincode_serde() {
-        let original_metadata = TimelineMetadata::new(
-            Lsn(0x200),
-            Some(Lsn(0x100)),
-            Some(TIMELINE_ID),
-            Lsn(0),
-            Lsn(0),
-            Lsn(0),
-            // Any version will do here, so use the default
-            crate::DEFAULT_PG_VERSION,
-        );
-        let metadata_bytes = original_metadata
-            .to_bytes()
-            .expect("Cannot create bytes array from metadata");
-
-        let metadata_bincode_be_bytes = original_metadata
-            .ser()
-            .expect("Cannot serialize the metadata");
-
-        // 8 bytes for the length of the vector
-        assert_eq!(metadata_bincode_be_bytes.len(), 8 + metadata_bytes.len());
-
-        let expected_bincode_bytes = {
-            let mut temp = vec![];
-            let len_bytes = metadata_bytes.len().to_be_bytes();
-            temp.extend_from_slice(&len_bytes);
-            temp.extend_from_slice(&metadata_bytes);
-            temp
-        };
-        assert_eq!(metadata_bincode_be_bytes, expected_bincode_bytes);
-
-        let deserialized_metadata = TimelineMetadata::des(&metadata_bincode_be_bytes).unwrap();
-        // Deserialized metadata has the metadata header, which is different from the serialized one.
-        //   Reference: TimelineMetaData::to_bytes()
-        let expected_metadata = {
-            let mut temp_metadata = original_metadata;
-            let body_bytes = temp_metadata
-                .body
-                .ser()
-                .expect("Cannot serialize the metadata body");
-            let metadata_size = METADATA_HDR_SIZE + body_bytes.len();
-            let hdr = TimelineMetadataHeader {
-                size: metadata_size as u16,
-                format_version: METADATA_FORMAT_VERSION,
-                checksum: crc32c::crc32c(&body_bytes),
-            };
-            temp_metadata.hdr = hdr;
-            temp_metadata
-        };
-        assert_eq!(deserialized_metadata, expected_metadata);
-    }
-
     #[test]
     fn test_metadata_bincode_serde_ensure_roundtrip() {
         let original_metadata = TimelineMetadata::new(
@@ -619,8 +561,6 @@ mod tests {
             crate::DEFAULT_PG_VERSION,
         );
         let expected_bytes = vec![
-            /* bincode length encoding bytes */
-            0, 0, 0, 0, 0, 0, 2, 0, // 8 bytes for the length of the serialized vector
             /* TimelineMetadataHeader */
             4, 37, 101, 34, 0, 70, 0, 4, // checksum, size, format_version (4 + 2 + 2)
             /* TimelineMetadataBodyV2 */
@@ -650,7 +590,7 @@ mod tests {
             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
             0, 0, 0, 0, 0, 0, 0,
         ];
-        let metadata_ser_bytes = original_metadata.ser().unwrap();
+        let metadata_ser_bytes = original_metadata.to_bytes().unwrap();
         assert_eq!(metadata_ser_bytes, expected_bytes);
 
         let expected_metadata = {
@@ -668,7 +608,7 @@ mod tests {
             temp_metadata.hdr = hdr;
             temp_metadata
         };
-        let des_metadata = TimelineMetadata::des(&metadata_ser_bytes).unwrap();
+        let des_metadata = TimelineMetadata::from_bytes(&metadata_ser_bytes).unwrap();
         assert_eq!(des_metadata, expected_metadata);
     }
 }
diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs
index 7d2e9b9a91..6233a3477e 100644
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -38,14 +38,17 @@ pub struct IndexPart {
     /// that latest version stores.
     pub layer_metadata: HashMap<LayerName, LayerFileMetadata>,
 
-    // 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata.
-    // It's duplicated for convenience when reading the serialized structure, but is
-    // private because internally we would read from metadata instead.
+    /// Because of the trouble of eyeballing the legacy "metadata" field, we copied the
+    /// "disk_consistent_lsn" out. After version 7 this is no longer needed, but the name cannot be
+    /// reused.
     pub(super) disk_consistent_lsn: Lsn,
 
-    // TODO: later make this "rename" to "alias", rename field as "legacy_metadata"
+    // TODO: rename as "metadata" next week, keep the alias = "metadata_bytes", bump version Adding
+    // the "alias = metadata" was forgotten in #7693, so we have to use "rewrite = metadata_bytes"
+    // for backwards compatibility.
     #[serde(
         rename = "metadata_bytes",
+        alias = "metadata",
         with = "crate::tenant::metadata::modern_serde"
     )]
     pub metadata: TimelineMetadata,
@@ -76,10 +79,11 @@ impl IndexPart {
     /// - 4: timeline_layers is fully removed.
     /// - 5: lineage was added
     /// - 6: last_aux_file_policy is added.
-    const LATEST_VERSION: usize = 6;
+    /// - 7: metadata_bytes is no longer written, but still read
+    const LATEST_VERSION: usize = 7;
 
     // Versions we may see when reading from a bucket.
-    pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6];
+    pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7];
 
     pub const FILE_NAME: &'static str = "index_part.json";
 
@@ -95,7 +99,7 @@ impl IndexPart {
         }
     }
 
-    pub fn get_version(&self) -> usize {
+    pub fn version(&self) -> usize {
         self.version
     }
 
@@ -217,9 +221,9 @@ impl Lineage {
 
 #[cfg(test)]
 mod tests {
-    use std::str::FromStr;
-
     use super::*;
+    use std::str::FromStr;
+    use utils::id::TimelineId;
 
     #[test]
     fn v1_indexpart_is_parsed() {
@@ -338,8 +342,7 @@ mod tests {
             ]),
             disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
             metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
-            deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
-                "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()),
+            deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
             lineage: Lineage::default(),
             last_aux_file_policy: None,
         };
@@ -515,8 +518,7 @@ mod tests {
             ]),
             disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
             metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
-            deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
-                "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()),
+            deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
             lineage: Lineage {
                 reparenting_history_truncated: false,
                 reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
@@ -529,6 +531,60 @@ mod tests {
         assert_eq!(part, expected);
     }
 
+    #[test]
+    fn v7_indexpart_is_parsed() {
+        let example = r#"{
+            "version": 7,
+            "layer_metadata":{
+                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
+                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
+            },
+            "disk_consistent_lsn":"0/16960E8",
+            "metadata": {
+                "disk_consistent_lsn": "0/16960E8",
+                "prev_record_lsn": "0/1696070",
+                "ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e",
+                "ancestor_lsn": "0/0",
+                "latest_gc_cutoff_lsn": "0/1696070",
+                "initdb_lsn": "0/1696070",
+                "pg_version": 14
+            },
+            "deleted_at": "2023-07-31T09:00:00.123"
+        }"#;
+
+        let expected = IndexPart {
+            version: 7,
+            layer_metadata: HashMap::from([
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
+                    file_size: 25600000,
+                    generation: Generation::none(),
+                    shard: ShardIndex::unsharded()
+                }),
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
+                    file_size: 9007199254741001,
+                    generation: Generation::none(),
+                    shard: ShardIndex::unsharded()
+                })
+            ]),
+            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
+            metadata: TimelineMetadata::new(
+                Lsn::from_str("0/16960E8").unwrap(),
+                Some(Lsn::from_str("0/1696070").unwrap()),
+                Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()),
+                Lsn::INVALID,
+                Lsn::from_str("0/1696070").unwrap(),
+                Lsn::from_str("0/1696070").unwrap(),
+                14,
+            ).with_recalculated_checksum().unwrap(),
+            deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
+            lineage: Default::default(),
+            last_aux_file_policy: Default::default(),
+        };
+
+        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
+        assert_eq!(part, expected);
+    }
+
     fn parse_naive_datetime(s: &str) -> NaiveDateTime {
         chrono::NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S.%f").unwrap()
     }
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index 999e2e8679..eb7cf81643 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -929,6 +929,45 @@ impl DeltaLayerInner {
         Ok(())
     }
 
+    /// Load all key-values in the delta layer, should be replaced by an iterator-based interface in the future.
+    #[cfg(test)]
+    pub(super) async fn load_key_values(
+        &self,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<Vec<(Key, Lsn, Value)>> {
+        let block_reader = FileBlockReader::new(&self.file, self.file_id);
+        let index_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
+            self.index_start_blk,
+            self.index_root_blk,
+            block_reader,
+        );
+        let mut result = Vec::new();
+        let mut stream =
+            Box::pin(self.stream_index_forwards(&index_reader, &[0; DELTA_KEY_SIZE], ctx));
+        let block_reader = FileBlockReader::new(&self.file, self.file_id);
+        let cursor = block_reader.block_cursor();
+        let mut buf = Vec::new();
+        while let Some(item) = stream.next().await {
+            let (key, lsn, pos) = item?;
+            // TODO: dedup code with get_reconstruct_value
+            // TODO: ctx handling and sharding
+            cursor
+                .read_blob_into_buf(pos.pos(), &mut buf, ctx)
+                .await
+                .with_context(|| {
+                    format!("Failed to read blob from virtual file {}", self.file.path)
+                })?;
+            let val = Value::des(&buf).with_context(|| {
+                format!(
+                    "Failed to deserialize file blob from virtual file {}",
+                    self.file.path
+                )
+            })?;
+            result.push((key, lsn, val));
+        }
+        Ok(result)
+    }
+
     async fn plan_reads<Reader>(
         keyspace: &KeySpace,
         lsn_range: Range<Lsn>,
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index 285618b146..06e2f09384 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -485,6 +485,34 @@ impl ImageLayerInner {
         Ok(())
     }
 
+    /// Load all key-values in the delta layer, should be replaced by an iterator-based interface in the future.
+    #[cfg(test)]
+    pub(super) async fn load_key_values(
+        &self,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<Vec<(Key, Lsn, Value)>> {
+        let block_reader = FileBlockReader::new(&self.file, self.file_id);
+        let tree_reader =
+            DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader);
+        let mut result = Vec::new();
+        let mut stream = Box::pin(tree_reader.get_stream_from(&[0; KEY_SIZE], ctx));
+        let block_reader = FileBlockReader::new(&self.file, self.file_id);
+        let cursor = block_reader.block_cursor();
+        while let Some(item) = stream.next().await {
+            // TODO: dedup code with get_reconstruct_value
+            let (raw_key, offset) = item?;
+            let key = Key::from_slice(&raw_key[..KEY_SIZE]);
+            // TODO: ctx handling and sharding
+            let blob = cursor
+                .read_blob(offset, ctx)
+                .await
+                .with_context(|| format!("failed to read value from offset {}", offset))?;
+            let value = Bytes::from(blob);
+            result.push((key, self.lsn, Value::Image(value)));
+        }
+        Ok(result)
+    }
+
     /// Traverse the layer's index to build read operations on the overlap of the input keyspace
     /// and the keys in this layer.
     ///
diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
index 9553f83026..1ecc56ce99 100644
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -52,7 +52,7 @@ pub struct InMemoryLayer {
 
     /// Frozen layers have an exclusive end LSN.
     /// Writes are only allowed when this is `None`.
-    end_lsn: OnceLock<Lsn>,
+    pub(crate) end_lsn: OnceLock<Lsn>,
 
     /// Used for traversal path. Cached representation of the in-memory layer before frozen.
     local_path_str: Arc<str>,
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index 18f9ba4ef8..32acb3f0cd 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -388,6 +388,23 @@ impl Layer {
             })
     }
 
+    /// Get all key/values in the layer. Should be replaced with an iterator-based API in the future.
+    #[cfg(test)]
+    pub(crate) async fn load_key_values(
+        &self,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<Vec<(Key, Lsn, crate::repository::Value)>> {
+        let layer = self
+            .0
+            .get_or_maybe_download(true, Some(ctx))
+            .await
+            .map_err(|err| match err {
+                DownloadError::DownloadCancelled => GetVectoredError::Cancelled,
+                other => GetVectoredError::Other(anyhow::anyhow!(other)),
+            })?;
+        layer.load_key_values(&self.0, ctx).await
+    }
+
     /// Download the layer if evicted.
     ///
     /// Will not error when the layer is already downloaded.
@@ -1757,6 +1774,20 @@ impl DownloadedLayer {
         }
     }
 
+    #[cfg(test)]
+    async fn load_key_values(
+        &self,
+        owner: &Arc<LayerInner>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<Vec<(Key, Lsn, crate::repository::Value)>> {
+        use LayerKind::*;
+
+        match self.get(owner, ctx).await? {
+            Delta(d) => d.load_key_values(ctx).await,
+            Image(i) => i.load_key_values(ctx).await,
+        }
+    }
+
     async fn dump(&self, owner: &Arc<LayerInner>, ctx: &RequestContext) -> anyhow::Result<()> {
         use LayerKind::*;
         match self.get(owner, ctx).await? {
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 388d5b9d54..54a4ceeaf3 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -322,6 +322,8 @@ pub struct Timeline {
     /// Locked automatically by [`TimelineWriter`] and checkpointer.
     /// Must always be acquired before the layer map/individual layer lock
     /// to avoid deadlock.
+    ///
+    /// The state is cleared upon freezing.
     write_lock: tokio::sync::Mutex<Option<TimelineWriterState>>,
 
     /// Used to avoid multiple `flush_loop` tasks running
@@ -1569,7 +1571,15 @@ impl Timeline {
     // This exists to provide a non-span creating version of `freeze_and_flush` we can call without
     // polluting the span hierarchy.
     pub(crate) async fn freeze_and_flush0(&self) -> Result<(), FlushLayerError> {
-        let to_lsn = self.freeze_inmem_layer(false).await;
+        let to_lsn = {
+            // Freeze the current open in-memory layer. It will be written to disk on next
+            // iteration.
+            let mut g = self.write_lock.lock().await;
+
+            let to_lsn = self.get_last_record_lsn();
+            self.freeze_inmem_layer_at(to_lsn, &mut g).await;
+            to_lsn
+        };
         self.flush_frozen_layers_and_wait(to_lsn).await
     }
 
@@ -1578,7 +1588,7 @@ impl Timeline {
     // an ephemeral layer open forever when idle.  It also freezes layers if the global limit on
     // ephemeral layer bytes has been breached.
     pub(super) async fn maybe_freeze_ephemeral_layer(&self) {
-        let Ok(_write_guard) = self.write_lock.try_lock() else {
+        let Ok(mut write_guard) = self.write_lock.try_lock() else {
             // If the write lock is held, there is an active wal receiver: rolling open layers
             // is their responsibility while they hold this lock.
             return;
@@ -1655,24 +1665,35 @@ impl Timeline {
             self.last_freeze_at.load(),
             open_layer.get_opened_at(),
         ) {
-            match open_layer.info() {
+            let at_lsn = match open_layer.info() {
                 InMemoryLayerInfo::Frozen { lsn_start, lsn_end } => {
                     // We may reach this point if the layer was already frozen by not yet flushed: flushing
                     // happens asynchronously in the background.
                     tracing::debug!(
                         "Not freezing open layer, it's already frozen ({lsn_start}..{lsn_end})"
                     );
+                    None
                 }
                 InMemoryLayerInfo::Open { .. } => {
                     // Upgrade to a write lock and freeze the layer
                     drop(layers_guard);
                     let mut layers_guard = self.layers.write().await;
-                    layers_guard
-                        .try_freeze_in_memory_layer(current_lsn, &self.last_freeze_at)
+                    let froze = layers_guard
+                        .try_freeze_in_memory_layer(
+                            current_lsn,
+                            &self.last_freeze_at,
+                            &mut write_guard,
+                        )
                         .await;
+                    Some(current_lsn).filter(|_| froze)
+                }
+            };
+            if let Some(lsn) = at_lsn {
+                let res: Result<u64, _> = self.flush_frozen_layers(lsn);
+                if let Err(e) = res {
+                    tracing::info!("failed to flush frozen layer after background freeze: {e:#}");
                 }
             }
-            self.flush_frozen_layers();
         }
     }
 
@@ -2036,11 +2057,11 @@ impl Timeline {
             true
         } else if distance > 0 && opened_at.elapsed() >= self.get_checkpoint_timeout() {
             info!(
-                    "Will roll layer at {} with layer size {} due to time since first write to the layer ({:?})",
-                    projected_lsn,
-                    layer_size,
-                    opened_at.elapsed()
-                );
+                "Will roll layer at {} with layer size {} due to time since first write to the layer ({:?})",
+                projected_lsn,
+                layer_size,
+                opened_at.elapsed()
+            );
 
             true
         } else {
@@ -2381,7 +2402,7 @@ impl Timeline {
                 let background_ctx = RequestContext::todo_child(TaskKind::LayerFlushTask, DownloadBehavior::Error);
                 self_clone.flush_loop(layer_flush_start_rx, &background_ctx).await;
                 let mut flush_loop_state = self_clone.flush_loop_state.lock().unwrap();
-                assert!(matches!(*flush_loop_state, FlushLoopState::Running{ ..}));
+                assert!(matches!(*flush_loop_state, FlushLoopState::Running{..}));
                 *flush_loop_state  = FlushLoopState::Exited;
                 Ok(())
             }
@@ -3644,28 +3665,21 @@ impl Timeline {
         self.last_record_lsn.advance(new_lsn);
     }
 
-    /// Whether there was a layer to freeze or not, return the value of get_last_record_lsn
-    /// before we attempted the freeze: this guarantees that ingested data is frozen up to this lsn (inclusive).
-    async fn freeze_inmem_layer(&self, write_lock_held: bool) -> Lsn {
-        // Freeze the current open in-memory layer. It will be written to disk on next
-        // iteration.
-
-        let _write_guard = if write_lock_held {
-            None
-        } else {
-            Some(self.write_lock.lock().await)
+    async fn freeze_inmem_layer_at(
+        &self,
+        at: Lsn,
+        write_lock: &mut tokio::sync::MutexGuard<'_, Option<TimelineWriterState>>,
+    ) {
+        let frozen = {
+            let mut guard = self.layers.write().await;
+            guard
+                .try_freeze_in_memory_layer(at, &self.last_freeze_at, write_lock)
+                .await
         };
-
-        let to_lsn = self.get_last_record_lsn();
-        self.freeze_inmem_layer_at(to_lsn).await;
-        to_lsn
-    }
-
-    async fn freeze_inmem_layer_at(&self, at: Lsn) {
-        let mut guard = self.layers.write().await;
-        guard
-            .try_freeze_in_memory_layer(at, &self.last_freeze_at)
-            .await;
+        if frozen {
+            let now = Instant::now();
+            *(self.last_freeze_ts.write().unwrap()) = now;
+        }
     }
 
     /// Layer flusher task's main loop.
@@ -3759,18 +3773,14 @@ impl Timeline {
         }
     }
 
-    /// Request the flush loop to write out all frozen layers up to `to_lsn` as Delta L0 files to disk.
-    /// The caller is responsible for the freezing, e.g., [`Self::freeze_inmem_layer`].
+    /// Request the flush loop to write out all frozen layers up to `at_lsn` as Delta L0 files to disk.
+    /// The caller is responsible for the freezing, e.g., [`Self::freeze_inmem_layer_at`].
     ///
-    /// `last_record_lsn` may be higher than the highest LSN of a frozen layer: if this is the case,
-    /// it means no data will be written between the top of the highest frozen layer and to_lsn,
-    /// e.g. because this tenant shard has ingested up to to_lsn and not written any data locally for that part of the WAL.
-    async fn flush_frozen_layers_and_wait(
-        &self,
-        last_record_lsn: Lsn,
-    ) -> Result<(), FlushLayerError> {
-        let mut rx = self.layer_flush_done_tx.subscribe();
-
+    /// `at_lsn` may be higher than the highest LSN of a frozen layer: if this is the
+    /// case, it means no data will be written between the top of the highest frozen layer and
+    /// to_lsn, e.g. because this tenant shard has ingested up to to_lsn and not written any data
+    /// locally for that part of the WAL.
+    fn flush_frozen_layers(&self, at_lsn: Lsn) -> Result<u64, FlushLayerError> {
         // Increment the flush cycle counter and wake up the flush task.
         // Remember the new value, so that when we listen for the flush
         // to finish, we know when the flush that we initiated has
@@ -3785,13 +3795,18 @@ impl Timeline {
         self.layer_flush_start_tx.send_modify(|(counter, lsn)| {
             my_flush_request = *counter + 1;
             *counter = my_flush_request;
-            *lsn = std::cmp::max(last_record_lsn, *lsn);
+            *lsn = std::cmp::max(at_lsn, *lsn);
         });
 
+        Ok(my_flush_request)
+    }
+
+    async fn wait_flush_completion(&self, request: u64) -> Result<(), FlushLayerError> {
+        let mut rx = self.layer_flush_done_tx.subscribe();
         loop {
             {
                 let (last_result_counter, last_result) = &*rx.borrow();
-                if *last_result_counter >= my_flush_request {
+                if *last_result_counter >= request {
                     if let Err(err) = last_result {
                         // We already logged the original error in
                         // flush_loop. We cannot propagate it to the caller
@@ -3818,12 +3833,9 @@ impl Timeline {
         }
     }
 
-    fn flush_frozen_layers(&self) {
-        self.layer_flush_start_tx.send_modify(|(counter, lsn)| {
-            *counter += 1;
-
-            *lsn = std::cmp::max(*lsn, Lsn(self.last_freeze_at.load().0 - 1));
-        });
+    async fn flush_frozen_layers_and_wait(&self, at_lsn: Lsn) -> Result<(), FlushLayerError> {
+        let token = self.flush_frozen_layers(at_lsn)?;
+        self.wait_flush_completion(token).await
     }
 
     /// Flush one frozen in-memory layer to disk, as a new delta layer.
@@ -5537,10 +5549,26 @@ impl Timeline {
         all_data.sort();
         Ok(all_data)
     }
+
+    /// Get all historic layer descriptors in the layer map
+    #[cfg(test)]
+    pub(crate) async fn inspect_historic_layers(
+        self: &Arc<Timeline>,
+    ) -> anyhow::Result<Vec<super::storage_layer::PersistentLayerKey>> {
+        let mut layers = Vec::new();
+        let guard = self.layers.read().await;
+        for layer in guard.layer_map().iter_historic_layers() {
+            layers.push(layer.key());
+        }
+        Ok(layers)
+    }
 }
 
 type TraversalPathItem = (ValueReconstructResult, Lsn, TraversalId);
 
+/// Tracking writes ingestion does to a particular in-memory layer.
+///
+/// Cleared upon freezing a layer.
 struct TimelineWriterState {
     open_layer: Arc<InMemoryLayer>,
     current_size: u64,
@@ -5581,12 +5609,6 @@ impl Deref for TimelineWriter<'_> {
     }
 }
 
-impl Drop for TimelineWriter<'_> {
-    fn drop(&mut self) {
-        self.write_guard.take();
-    }
-}
-
 #[derive(PartialEq)]
 enum OpenLayerAction {
     Roll,
@@ -5669,16 +5691,15 @@ impl<'a> TimelineWriter<'a> {
     }
 
     async fn roll_layer(&mut self, freeze_at: Lsn) -> anyhow::Result<()> {
-        assert!(self.write_guard.is_some());
-
-        self.tl.freeze_inmem_layer_at(freeze_at).await;
-
-        let now = Instant::now();
-        *(self.last_freeze_ts.write().unwrap()) = now;
-
-        self.tl.flush_frozen_layers();
-
         let current_size = self.write_guard.as_ref().unwrap().current_size;
+
+        // self.write_guard will be taken by the freezing
+        self.tl
+            .freeze_inmem_layer_at(freeze_at, &mut self.write_guard)
+            .await;
+
+        self.tl.flush_frozen_layers(freeze_at)?;
+
         if current_size >= self.get_checkpoint_distance() * 2 {
             warn!("Flushed oversized open layer with size {}", current_size)
         }
@@ -5692,9 +5713,27 @@ impl<'a> TimelineWriter<'a> {
             return OpenLayerAction::Open;
         };
 
+        #[cfg(feature = "testing")]
+        if state.cached_last_freeze_at < self.tl.last_freeze_at.load() {
+            // this check and assertion are not really needed because
+            // LayerManager::try_freeze_in_memory_layer will always clear out the
+            // TimelineWriterState if something is frozen. however, we can advance last_freeze_at when there
+            // is no TimelineWriterState.
+            assert!(
+                state.open_layer.end_lsn.get().is_some(),
+                "our open_layer must be outdated"
+            );
+
+            // this would be a memory leak waiting to happen because the in-memory layer always has
+            // an index
+            panic!("BUG: TimelineWriterState held on to frozen in-memory layer.");
+        }
+
         if state.prev_lsn == Some(lsn) {
-            // Rolling mid LSN is not supported by downstream code.
+            // Rolling mid LSN is not supported by [downstream code].
             // Hence, only roll at LSN boundaries.
+            //
+            // [downstream code]: https://github.com/neondatabase/neon/pull/7993#discussion_r1633345422
             return OpenLayerAction::None;
         }
 
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index d8de6aee7c..8a95029f33 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -952,6 +952,178 @@ impl Timeline {
         adaptor.flush_updates().await?;
         Ok(())
     }
+
+    /// An experimental compaction building block that combines compaction with garbage collection.
+    ///
+    /// The current implementation picks all delta + image layers that are below or intersecting with
+    /// the GC horizon without considering retain_lsns. Then, it does a full compaction over all these delta
+    /// layers and image layers, which generates image layers on the gc horizon, drop deltas below gc horizon,
+    /// and create delta layers with all deltas >= gc horizon.
+    #[cfg(test)]
+    pub(crate) async fn compact_with_gc(
+        self: &Arc<Self>,
+        _cancel: &CancellationToken,
+        ctx: &RequestContext,
+    ) -> Result<(), CompactionError> {
+        use crate::tenant::storage_layer::ValueReconstructState;
+        // Step 0: pick all delta layers + image layers below/intersect with the GC horizon.
+        // The layer selection has the following properties:
+        // 1. If a layer is in the selection, all layers below it are in the selection.
+        // 2. Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection.
+        let (layer_selection, gc_cutoff) = {
+            let guard = self.layers.read().await;
+            let layers = guard.layer_map();
+            let gc_info = self.gc_info.read().unwrap();
+            let gc_cutoff = Lsn::min(gc_info.cutoffs.horizon, gc_info.cutoffs.pitr);
+            let mut selected_layers = Vec::new();
+            // TODO: consider retain_lsns
+            drop(gc_info);
+            for desc in layers.iter_historic_layers() {
+                if desc.get_lsn_range().start <= gc_cutoff {
+                    selected_layers.push(guard.get_from_desc(&desc));
+                }
+            }
+            (selected_layers, gc_cutoff)
+        };
+        // Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs.
+        let mut all_key_values = Vec::new();
+        for layer in &layer_selection {
+            all_key_values.extend(layer.load_key_values(ctx).await?);
+        }
+        // Key small to large, LSN low to high, if the same LSN has both image and delta due to the merge of delta layers and
+        // image layers, make image appear later than delta.
+        struct ValueWrapper<'a>(&'a crate::repository::Value);
+        impl Ord for ValueWrapper<'_> {
+            fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+                use crate::repository::Value;
+                use std::cmp::Ordering;
+                match (self.0, other.0) {
+                    (Value::Image(_), Value::WalRecord(_)) => Ordering::Greater,
+                    (Value::WalRecord(_), Value::Image(_)) => Ordering::Less,
+                    _ => Ordering::Equal,
+                }
+            }
+        }
+        impl PartialOrd for ValueWrapper<'_> {
+            fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+                Some(self.cmp(other))
+            }
+        }
+        impl PartialEq for ValueWrapper<'_> {
+            fn eq(&self, other: &Self) -> bool {
+                self.cmp(other) == std::cmp::Ordering::Equal
+            }
+        }
+        impl Eq for ValueWrapper<'_> {}
+        all_key_values.sort_by(|(k1, l1, v1), (k2, l2, v2)| {
+            (k1, l1, ValueWrapper(v1)).cmp(&(k2, l2, ValueWrapper(v2)))
+        });
+        let max_lsn = all_key_values
+            .iter()
+            .map(|(_, lsn, _)| lsn)
+            .max()
+            .copied()
+            .unwrap()
+            + 1;
+        // Step 2: Produce images+deltas. TODO: ensure newly-produced delta does not overlap with other deltas.
+        // Data of the same key.
+        let mut accumulated_values = Vec::new();
+        let mut last_key = all_key_values.first().unwrap().0; // TODO: assert all_key_values not empty
+
+        /// Take a list of images and deltas, produce an image at the GC horizon, and a list of deltas above the GC horizon.
+        async fn flush_accumulated_states(
+            tline: &Arc<Timeline>,
+            key: Key,
+            accumulated_values: &[&(Key, Lsn, crate::repository::Value)],
+            horizon: Lsn,
+        ) -> anyhow::Result<(Vec<(Key, Lsn, crate::repository::Value)>, bytes::Bytes)> {
+            let mut base_image = None;
+            let mut keys_above_horizon = Vec::new();
+            let mut delta_above_base_image = Vec::new();
+            // We have a list of deltas/images. We want to create image layers while collect garbages.
+            for (key, lsn, val) in accumulated_values.iter().rev() {
+                if *lsn > horizon {
+                    keys_above_horizon.push((*key, *lsn, val.clone())); // TODO: ensure one LSN corresponds to either delta or image instead of both
+                } else if *lsn <= horizon {
+                    match val {
+                        crate::repository::Value::Image(image) => {
+                            if lsn <= &horizon {
+                                base_image = Some((*lsn, image.clone()));
+                                break;
+                            }
+                        }
+                        crate::repository::Value::WalRecord(wal) => {
+                            delta_above_base_image.push((*lsn, wal.clone()));
+                        }
+                    }
+                }
+            }
+            delta_above_base_image.reverse();
+            keys_above_horizon.reverse();
+            let state = ValueReconstructState {
+                img: base_image,
+                records: delta_above_base_image,
+            };
+            let img = tline.reconstruct_value(key, horizon, state).await?;
+            Ok((keys_above_horizon, img))
+        }
+
+        let mut delta_layer_writer = DeltaLayerWriter::new(
+            self.conf,
+            self.timeline_id,
+            self.tenant_shard_id,
+            all_key_values.first().unwrap().0,
+            gc_cutoff..max_lsn, // TODO: off by one?
+            ctx,
+        )
+        .await?;
+        let mut image_layer_writer = ImageLayerWriter::new(
+            self.conf,
+            self.timeline_id,
+            self.tenant_shard_id,
+            &(all_key_values.first().unwrap().0..all_key_values.last().unwrap().0.next()),
+            gc_cutoff,
+            ctx,
+        )
+        .await?;
+
+        for item @ (key, _, _) in &all_key_values {
+            if &last_key == key {
+                accumulated_values.push(item);
+            } else {
+                let (deltas, image) =
+                    flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff)
+                        .await?;
+                image_layer_writer.put_image(last_key, image, ctx).await?;
+                for (key, lsn, val) in deltas {
+                    delta_layer_writer.put_value(key, lsn, val, ctx).await?;
+                }
+                accumulated_values.clear();
+                accumulated_values.push(item);
+                last_key = *key;
+            }
+        }
+        let (deltas, image) =
+            flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff).await?;
+        image_layer_writer.put_image(last_key, image, ctx).await?;
+        for (key, lsn, val) in deltas {
+            delta_layer_writer.put_value(key, lsn, val, ctx).await?;
+        }
+        accumulated_values.clear();
+        // TODO: split layers
+        let delta_layer = delta_layer_writer.finish(last_key, self, ctx).await?;
+        let image_layer = image_layer_writer.finish(self, ctx).await?;
+        // Step 3: Place back to the layer map.
+        {
+            let mut guard = self.layers.write().await;
+            guard.finish_gc_compaction(
+                &layer_selection,
+                &[delta_layer.clone(), image_layer.clone()],
+                &self.metrics,
+            )
+        };
+        Ok(())
+    }
 }
 
 struct TimelineAdaptor {
diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs
index 0e82dedecb..550a9a567a 100644
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -21,6 +21,8 @@ use crate::{
     },
 };
 
+use super::TimelineWriterState;
+
 /// Provides semantic APIs to manipulate the layer map.
 #[derive(Default)]
 pub(crate) struct LayerManager {
@@ -120,18 +122,20 @@ impl LayerManager {
         Ok(layer)
     }
 
-    /// Called from `freeze_inmem_layer`, returns true if successfully frozen.
-    pub(crate) async fn try_freeze_in_memory_layer(
+    /// Tries to freeze an open layer and also manages clearing the TimelineWriterState.
+    ///
+    /// Returns true if anything was frozen.
+    pub(super) async fn try_freeze_in_memory_layer(
         &mut self,
         lsn: Lsn,
         last_freeze_at: &AtomicLsn,
-    ) {
+        write_lock: &mut tokio::sync::MutexGuard<'_, Option<TimelineWriterState>>,
+    ) -> bool {
         let Lsn(last_record_lsn) = lsn;
         let end_lsn = Lsn(last_record_lsn + 1);
 
-        if let Some(open_layer) = &self.layer_map.open_layer {
+        let froze = if let Some(open_layer) = &self.layer_map.open_layer {
             let open_layer_rc = Arc::clone(open_layer);
-            // Does this layer need freezing?
             open_layer.freeze(end_lsn).await;
 
             // The layer is no longer open, update the layer map to reflect this.
@@ -139,11 +143,25 @@ impl LayerManager {
             self.layer_map.frozen_layers.push_back(open_layer_rc);
             self.layer_map.open_layer = None;
             self.layer_map.next_open_layer_at = Some(end_lsn);
-        }
+
+            true
+        } else {
+            false
+        };
 
         // Even if there was no layer to freeze, advance last_freeze_at to last_record_lsn+1: this
         // accounts for regions in the LSN range where we might have ingested no data due to sharding.
         last_freeze_at.store(end_lsn);
+
+        // the writer state must no longer have a reference to the frozen layer
+        let taken = write_lock.take();
+        assert_eq!(
+            froze,
+            taken.is_some(),
+            "should only had frozen a layer when TimelineWriterState existed"
+        );
+
+        froze
     }
 
     /// Add image layers to the layer map, called from `create_image_layers`.
@@ -208,6 +226,18 @@ impl LayerManager {
         updates.flush();
     }
 
+    /// Called when a GC-compaction is completed.
+    #[cfg(test)]
+    pub(crate) fn finish_gc_compaction(
+        &mut self,
+        compact_from: &[Layer],
+        compact_to: &[ResidentLayer],
+        metrics: &TimelineMetrics,
+    ) {
+        // We can simply reuse compact l0 logic. Use a different function name to indicate a different type of layer map modification.
+        self.finish_compact_l0(compact_from, compact_to, metrics)
+    }
+
     /// Called when compaction is completed.
     pub(crate) fn rewrite_layers(
         &mut self,
diff --git a/patches/pg_anon.patch b/patches/pg_anon.patch
new file mode 100644
index 0000000000..15dfd3c5a0
--- /dev/null
+++ b/patches/pg_anon.patch
@@ -0,0 +1,223 @@
+commit 7dd414ee75f2875cffb1d6ba474df1f135a6fc6f
+Author: Alexey Masterov <alexeymasterov@neon.tech>
+Date:   Fri May 31 06:34:26 2024 +0000
+
+    These alternative expected files were added to consider the neon features
+
+diff --git a/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out b/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out
+new file mode 100644
+index 0000000..2539cfd
+--- /dev/null
++++ b/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out
+@@ -0,0 +1,101 @@
++BEGIN;
++CREATE EXTENSION anon CASCADE;
++NOTICE:  installing required extension "pgcrypto"
++SELECT anon.init();
++ init 
++------
++ t
++(1 row)
++
++CREATE ROLE mallory_the_masked_user;
++SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS 'MASKED';
++CREATE TABLE t1(i INT);
++ALTER TABLE t1 ADD COLUMN t TEXT;
++SECURITY LABEL FOR anon ON COLUMN t1.t
++IS 'MASKED WITH VALUE NULL';
++INSERT INTO t1 VALUES (1,'test');
++--
++-- We're checking the owner's permissions
++--
++-- see
++-- https://postgresql-anonymizer.readthedocs.io/en/latest/SECURITY/#permissions
++--
++SET ROLE mallory_the_masked_user;
++SELECT anon.pseudo_first_name(0) IS NOT NULL;
++ ?column? 
++----------
++ t
++(1 row)
++
++-- SHOULD FAIL
++DO $$
++BEGIN
++  PERFORM anon.init();
++  EXCEPTION WHEN insufficient_privilege
++  THEN RAISE NOTICE 'insufficient_privilege';
++END$$;
++NOTICE:  insufficient_privilege
++-- SHOULD FAIL
++DO $$
++BEGIN
++  PERFORM anon.anonymize_table('t1');
++  EXCEPTION WHEN insufficient_privilege
++  THEN RAISE NOTICE 'insufficient_privilege';
++END$$;
++NOTICE:  insufficient_privilege
++-- SHOULD FAIL
++SAVEPOINT fail_start_engine;
++SELECT anon.start_dynamic_masking();
++ERROR:  Only supersusers can start the dynamic masking engine.
++CONTEXT:  PL/pgSQL function anon.start_dynamic_masking(boolean) line 18 at RAISE
++ROLLBACK TO fail_start_engine;
++RESET ROLE;
++SELECT anon.start_dynamic_masking();
++ start_dynamic_masking 
++-----------------------
++ t
++(1 row)
++
++SET ROLE mallory_the_masked_user;
++SELECT * FROM mask.t1;
++ i | t 
++---+---
++ 1 | 
++(1 row)
++
++-- SHOULD FAIL
++DO $$
++BEGIN
++  SELECT * FROM public.t1;
++  EXCEPTION WHEN insufficient_privilege
++  THEN RAISE NOTICE 'insufficient_privilege';
++END$$;
++NOTICE:  insufficient_privilege
++-- SHOULD FAIL
++SAVEPOINT fail_stop_engine;
++SELECT anon.stop_dynamic_masking();
++ERROR:  Only supersusers can stop the dynamic masking engine.
++CONTEXT:  PL/pgSQL function anon.stop_dynamic_masking() line 18 at RAISE
++ROLLBACK TO fail_stop_engine;
++RESET ROLE;
++SELECT anon.stop_dynamic_masking();
++NOTICE:  The previous priviledges of 'mallory_the_masked_user' are not restored. You need to grant them manually.
++ stop_dynamic_masking 
++----------------------
++ t
++(1 row)
++
++SET ROLE mallory_the_masked_user;
++SELECT COUNT(*)=1 FROM anon.pg_masking_rules;
++ ?column? 
++----------
++ t
++(1 row)
++
++-- SHOULD FAIL
++SAVEPOINT fail_seclabel_on_role;
++SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS NULL;
++ERROR:  permission denied
++DETAIL:  The current user must have the CREATEROLE attribute.
++ROLLBACK TO fail_seclabel_on_role;
++ROLLBACK;
+diff --git a/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out b/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out
+new file mode 100644
+index 0000000..8b090fe
+--- /dev/null
++++ b/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out
+@@ -0,0 +1,104 @@
++BEGIN;
++CREATE EXTENSION anon CASCADE;
++NOTICE:  installing required extension "pgcrypto"
++SELECT anon.init();
++ init 
++------
++ t
++(1 row)
++
++CREATE ROLE oscar_the_owner;
++ALTER DATABASE :DBNAME OWNER TO oscar_the_owner;
++CREATE ROLE mallory_the_masked_user;
++SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS 'MASKED';
++--
++-- We're checking the owner's permissions
++--
++-- see
++-- https://postgresql-anonymizer.readthedocs.io/en/latest/SECURITY/#permissions
++--
++SET ROLE oscar_the_owner;
++SELECT anon.pseudo_first_name(0) IS NOT NULL;
++ ?column? 
++----------
++ t
++(1 row)
++
++-- SHOULD FAIL
++DO $$
++BEGIN
++  PERFORM anon.init();
++  EXCEPTION WHEN insufficient_privilege
++  THEN RAISE NOTICE 'insufficient_privilege';
++END$$;
++NOTICE:  insufficient_privilege
++CREATE TABLE t1(i INT);
++ALTER TABLE t1 ADD COLUMN t TEXT;
++SECURITY LABEL FOR anon ON COLUMN t1.t
++IS 'MASKED WITH VALUE NULL';
++INSERT INTO t1 VALUES (1,'test');
++SELECT anon.anonymize_table('t1');
++ anonymize_table 
++-----------------
++ t
++(1 row)
++
++SELECT * FROM t1;
++ i | t 
++---+---
++ 1 | 
++(1 row)
++
++UPDATE t1 SET t='test' WHERE i=1;
++-- SHOULD FAIL
++SAVEPOINT fail_start_engine;
++SELECT anon.start_dynamic_masking();
++ start_dynamic_masking 
++-----------------------
++ t
++(1 row)
++
++ROLLBACK TO fail_start_engine;
++RESET ROLE;
++SELECT anon.start_dynamic_masking();
++ start_dynamic_masking 
++-----------------------
++ t
++(1 row)
++
++SET ROLE oscar_the_owner;
++SELECT * FROM t1;
++ i |  t   
++---+------
++ 1 | test
++(1 row)
++
++--SELECT * FROM mask.t1;
++-- SHOULD FAIL
++SAVEPOINT fail_stop_engine;
++SELECT anon.stop_dynamic_masking();
++ERROR:  permission denied for schema mask
++CONTEXT:  SQL statement "DROP VIEW mask.t1;"
++PL/pgSQL function anon.mask_drop_view(oid) line 3 at EXECUTE
++SQL statement "SELECT anon.mask_drop_view(oid)
++  FROM pg_catalog.pg_class
++  WHERE relnamespace=quote_ident(pg_catalog.current_setting('anon.sourceschema'))::REGNAMESPACE
++  AND relkind IN ('r','p','f')"
++PL/pgSQL function anon.stop_dynamic_masking() line 22 at PERFORM
++ROLLBACK TO fail_stop_engine;
++RESET ROLE;
++SELECT anon.stop_dynamic_masking();
++NOTICE:  The previous priviledges of 'mallory_the_masked_user' are not restored. You need to grant them manually.
++ stop_dynamic_masking 
++----------------------
++ t
++(1 row)
++
++SET ROLE oscar_the_owner;
++-- SHOULD FAIL
++SAVEPOINT fail_seclabel_on_role;
++SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS NULL;
++ERROR:  permission denied
++DETAIL:  The current user must have the CREATEROLE attribute.
++ROLLBACK TO fail_seclabel_on_role;
++ROLLBACK;
diff --git a/patches/pg_cron.patch b/patches/pg_cron.patch
new file mode 100644
index 0000000000..c2b648c20c
--- /dev/null
+++ b/patches/pg_cron.patch
@@ -0,0 +1,19 @@
+commit b3ea51ee158f113f2f82d0b97c12c54343c9a695 (HEAD -> master)
+Author: Alexey Masterov <alexeymasterov@neon.tech>
+Date:   Fri Jun 7 19:23:42 2024 +0000
+
+    Disable REGRESS_OPTIONS causing initdb
+
+diff --git a/ext-src/pg_cron-src/Makefile b/ext-src/pg_cron-src/Makefile
+index 053314c..fbd5fb5 100644
+--- a/ext-src/pg_cron-src/Makefile
++++ b/ext-src/pg_cron-src/Makefile
+@@ -5,7 +5,7 @@ EXTENSION = pg_cron
+ DATA_built = $(EXTENSION)--1.0.sql
+ DATA = $(wildcard $(EXTENSION)--*--*.sql)
+ 
+-REGRESS_OPTS =--temp-config=./pg_cron.conf --temp-instance=./tmp_check
++#REGRESS_OPTS =--temp-config=./pg_cron.conf --temp-instance=./tmp_check
+ REGRESS = pg_cron-test 
+ 
+ # compilation configuration
diff --git a/patches/pg_hintplan.patch b/patches/pg_hintplan.patch
new file mode 100644
index 0000000000..61a5ecbb90
--- /dev/null
+++ b/patches/pg_hintplan.patch
@@ -0,0 +1,39 @@
+commit f7925d4d1406c0f0229e3c691c94b69e381899b1 (HEAD -> master)
+Author: Alexey Masterov <alexeymasterov@neon.tech>
+Date:   Thu Jun 6 08:02:42 2024 +0000
+
+    Patch expected files to consider Neon's log messages
+
+diff --git a/ext-src/pg_hint_plan-src/expected/ut-A.out b/ext-src/pg_hint_plan-src/expected/ut-A.out
+index da723b8..f8d0102 100644
+--- a/ext-src/pg_hint_plan-src/expected/ut-A.out
++++ b/ext-src/pg_hint_plan-src/expected/ut-A.out
+@@ -9,13 +9,16 @@ SET search_path TO public;
+ ----
+ -- No.A-1-1-3
+ CREATE EXTENSION pg_hint_plan;
++LOG:  Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan
+ -- No.A-1-2-3
+ DROP EXTENSION pg_hint_plan;
+ -- No.A-1-1-4
+ CREATE SCHEMA other_schema;
+ CREATE EXTENSION pg_hint_plan SCHEMA other_schema;
++LOG:  Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan
+ ERROR:  extension "pg_hint_plan" must be installed in schema "hint_plan"
+ CREATE EXTENSION pg_hint_plan;
++LOG:  Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan
+ DROP SCHEMA other_schema;
+ ----
+ ---- No. A-5-1 comment pattern
+diff --git a/ext-src/pg_hint_plan-src/expected/ut-fdw.out b/ext-src/pg_hint_plan-src/expected/ut-fdw.out
+index d372459..6282afe 100644
+--- a/ext-src/pg_hint_plan-src/expected/ut-fdw.out
++++ b/ext-src/pg_hint_plan-src/expected/ut-fdw.out
+@@ -7,6 +7,7 @@ SET pg_hint_plan.debug_print TO on;
+ SET client_min_messages TO LOG;
+ SET pg_hint_plan.enable_hint TO on;
+ CREATE EXTENSION file_fdw;
++LOG:  Sending request to compute_ctl: http://localhost:3080/extension_server/file_fdw
+ CREATE SERVER file_server FOREIGN DATA WRAPPER file_fdw;
+ CREATE USER MAPPING FOR PUBLIC SERVER file_server;
+ CREATE FOREIGN TABLE ft1 (id int, val int) SERVER file_server OPTIONS (format 'csv', filename :'filename');
diff --git a/pgxn/.dir-locals.el b/pgxn/.dir-locals.el
new file mode 100644
index 0000000000..ab6208b698
--- /dev/null
+++ b/pgxn/.dir-locals.el
@@ -0,0 +1,19 @@
+;; see also src/tools/editors/emacs.samples for more complete settings
+
+((c-mode . ((c-basic-offset . 4)
+            (c-file-style . "bsd")
+            (fill-column . 78)
+            (indent-tabs-mode . t)
+            (tab-width . 4)))
+ (nxml-mode . ((fill-column . 78)
+               (indent-tabs-mode . nil)))
+ (perl-mode . ((perl-indent-level . 4)
+               (perl-continued-statement-offset . 2)
+               (perl-continued-brace-offset . -2)
+               (perl-brace-offset . 0)
+               (perl-brace-imaginary-offset . 0)
+               (perl-label-offset . -2)
+               (indent-tabs-mode . t)
+               (tab-width . 4)))
+ (sgml-mode . ((fill-column . 78)
+               (indent-tabs-mode . nil))))
diff --git a/pgxn/.editorconfig b/pgxn/.editorconfig
new file mode 100644
index 0000000000..d69a3d1dc4
--- /dev/null
+++ b/pgxn/.editorconfig
@@ -0,0 +1,14 @@
+root = true
+
+[*.{c,h,l,y,pl,pm}]
+indent_style = tab
+indent_size = tab
+tab_width = 4
+
+[*.{sgml,xml}]
+indent_style = space
+indent_size = 1
+
+[*.xsl]
+indent_style = space
+indent_size = 2
diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c
index b69a3819c9..276d1542fe 100644
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -19,6 +19,7 @@
 #include "catalog/pg_type.h"
 #include "postmaster/bgworker.h"
 #include "postmaster/interrupt.h"
+#include "replication/logical.h"
 #include "replication/slot.h"
 #include "replication/walsender.h"
 #include "storage/procsignal.h"
@@ -280,6 +281,7 @@ _PG_init(void)
 	pg_init_libpagestore();
 	pg_init_walproposer();
         WalSender_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
+	LogicalFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
 
 	InitLogicalReplicationMonitor();
 
diff --git a/pgxn/neon/walsender_hooks.c b/pgxn/neon/walsender_hooks.c
index 93dce9de84..8f8d1dfc01 100644
--- a/pgxn/neon/walsender_hooks.c
+++ b/pgxn/neon/walsender_hooks.c
@@ -24,8 +24,12 @@
 #include "walproposer.h"
 
 static NeonWALReader *wal_reader = NULL;
+
+struct WalSnd;
+extern struct WalSnd *MyWalSnd;
 extern XLogRecPtr WalSndWaitForWal(XLogRecPtr loc);
 extern bool GetDonorShmem(XLogRecPtr *donor_lsn);
+extern XLogRecPtr GetXLogReplayRecPtr(TimeLineID *replayTLI);
 
 static XLogRecPtr
 NeonWALReadWaitForWAL(XLogRecPtr loc)
@@ -36,7 +40,28 @@ NeonWALReadWaitForWAL(XLogRecPtr loc)
 		CHECK_FOR_INTERRUPTS();
 	}
 
-	return WalSndWaitForWal(loc);
+	// Walsender sends keepalives and stuff, so better use its normal wait
+	if (MyWalSnd != NULL)
+		return WalSndWaitForWal(loc);
+
+	for (;;)
+	{
+		XLogRecPtr flush_ptr;
+		if (!RecoveryInProgress())
+#if PG_VERSION_NUM >= 150000
+			flush_ptr = GetFlushRecPtr(NULL);
+#else
+			flush_ptr = GetFlushRecPtr();
+#endif
+		else
+			flush_ptr = GetXLogReplayRecPtr(NULL);
+
+		if (loc <= flush_ptr)
+			return flush_ptr;
+
+		CHECK_FOR_INTERRUPTS();
+		pg_usleep(1000);
+	}
 }
 
 static int
diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs
index 3ff0d87988..4ab85509dc 100644
--- a/storage_controller/src/scheduler.rs
+++ b/storage_controller/src/scheduler.rs
@@ -29,6 +29,8 @@ pub enum MaySchedule {
 struct SchedulerNode {
     /// How many shards are currently scheduled on this node, via their [`crate::tenant_shard::IntentState`].
     shard_count: usize,
+    /// How many shards are currently attached on this node, via their [`crate::tenant_shard::IntentState`].
+    attached_shard_count: usize,
 
     /// Whether this node is currently elegible to have new shards scheduled (this is derived
     /// from a node's availability state and scheduling policy).
@@ -42,7 +44,9 @@ impl PartialEq for SchedulerNode {
             (MaySchedule::Yes(_), MaySchedule::Yes(_)) | (MaySchedule::No, MaySchedule::No)
         );
 
-        may_schedule_matches && self.shard_count == other.shard_count
+        may_schedule_matches
+            && self.shard_count == other.shard_count
+            && self.attached_shard_count == other.attached_shard_count
     }
 }
 
@@ -138,6 +142,15 @@ impl ScheduleContext {
     }
 }
 
+pub(crate) enum RefCountUpdate {
+    PromoteSecondary,
+    Attach,
+    Detach,
+    DemoteAttached,
+    AddSecondary,
+    RemoveSecondary,
+}
+
 impl Scheduler {
     pub(crate) fn new<'a>(nodes: impl Iterator<Item = &'a Node>) -> Self {
         let mut scheduler_nodes = HashMap::new();
@@ -146,6 +159,7 @@ impl Scheduler {
                 node.get_id(),
                 SchedulerNode {
                     shard_count: 0,
+                    attached_shard_count: 0,
                     may_schedule: node.may_schedule(),
                 },
             );
@@ -171,6 +185,7 @@ impl Scheduler {
                 node.get_id(),
                 SchedulerNode {
                     shard_count: 0,
+                    attached_shard_count: 0,
                     may_schedule: node.may_schedule(),
                 },
             );
@@ -179,7 +194,10 @@ impl Scheduler {
         for shard in shards {
             if let Some(node_id) = shard.intent.get_attached() {
                 match expect_nodes.get_mut(node_id) {
-                    Some(node) => node.shard_count += 1,
+                    Some(node) => {
+                        node.shard_count += 1;
+                        node.attached_shard_count += 1;
+                    }
                     None => anyhow::bail!(
                         "Tenant {} references nonexistent node {}",
                         shard.tenant_shard_id,
@@ -227,31 +245,42 @@ impl Scheduler {
         Ok(())
     }
 
-    /// Increment the reference count of a node.  This reference count is used to guide scheduling
-    /// decisions, not for memory management: it represents one tenant shard whose IntentState targets
-    /// this node.
+    /// Update the reference counts of a node. These reference counts are used to guide scheduling
+    /// decisions, not for memory management: they represent the number of tenant shard whose IntentState
+    /// targets this node and the number of tenants shars whose IntentState is attached to this
+    /// node.
     ///
     /// It is an error to call this for a node that is not known to the scheduler (i.e. passed into
     /// [`Self::new`] or [`Self::node_upsert`])
-    pub(crate) fn node_inc_ref(&mut self, node_id: NodeId) {
-        let Some(node) = self.nodes.get_mut(&node_id) else {
-            tracing::error!("Scheduler missing node {node_id}");
-            debug_assert!(false);
-            return;
-        };
-
-        node.shard_count += 1;
-    }
-
-    /// Decrement a node's reference count.  Inverse of [`Self::node_inc_ref`].
-    pub(crate) fn node_dec_ref(&mut self, node_id: NodeId) {
+    pub(crate) fn update_node_ref_counts(&mut self, node_id: NodeId, update: RefCountUpdate) {
         let Some(node) = self.nodes.get_mut(&node_id) else {
             debug_assert!(false);
             tracing::error!("Scheduler missing node {node_id}");
             return;
         };
 
-        node.shard_count -= 1;
+        match update {
+            RefCountUpdate::PromoteSecondary => {
+                node.attached_shard_count += 1;
+            }
+            RefCountUpdate::Attach => {
+                node.shard_count += 1;
+                node.attached_shard_count += 1;
+            }
+            RefCountUpdate::Detach => {
+                node.shard_count -= 1;
+                node.attached_shard_count -= 1;
+            }
+            RefCountUpdate::DemoteAttached => {
+                node.attached_shard_count -= 1;
+            }
+            RefCountUpdate::AddSecondary => {
+                node.shard_count += 1;
+            }
+            RefCountUpdate::RemoveSecondary => {
+                node.shard_count -= 1;
+            }
+        }
     }
 
     pub(crate) fn node_upsert(&mut self, node: &Node) {
@@ -263,6 +292,7 @@ impl Scheduler {
             Vacant(entry) => {
                 entry.insert(SchedulerNode {
                     shard_count: 0,
+                    attached_shard_count: 0,
                     may_schedule: node.may_schedule(),
                 });
             }
@@ -385,6 +415,11 @@ impl Scheduler {
     pub(crate) fn get_node_shard_count(&self, node_id: NodeId) -> usize {
         self.nodes.get(&node_id).unwrap().shard_count
     }
+
+    #[cfg(test)]
+    pub(crate) fn get_node_attached_shard_count(&self, node_id: NodeId) -> usize {
+        self.nodes.get(&node_id).unwrap().attached_shard_count
+    }
 }
 
 #[cfg(test)]
@@ -437,18 +472,28 @@ mod tests {
         let scheduled = scheduler.schedule_shard(&[], &context)?;
         t2_intent.set_attached(&mut scheduler, Some(scheduled));
 
-        assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);
-        assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1);
+        assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 1);
+        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 1);
+
+        assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 1);
+        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 1);
 
         let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers(), &context)?;
         t1_intent.push_secondary(&mut scheduler, scheduled);
 
-        assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);
-        assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 2);
+        assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 1);
+        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 1);
+
+        assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 2);
+        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 1);
 
         t1_intent.clear(&mut scheduler);
-        assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 0);
-        assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1);
+        assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 0);
+        assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 1);
+
+        let total_attached = scheduler.get_node_attached_shard_count(NodeId(1))
+            + scheduler.get_node_attached_shard_count(NodeId(2));
+        assert_eq!(total_attached, 1);
 
         if cfg!(debug_assertions) {
             // Dropping an IntentState without clearing it causes a panic in debug mode,
@@ -459,8 +504,12 @@ mod tests {
             assert!(result.is_err());
         } else {
             t2_intent.clear(&mut scheduler);
-            assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 0);
-            assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 0);
+
+            assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 0);
+            assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 0);
+
+            assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 0);
+            assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 0);
         }
 
         Ok(())
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 756dc10a2a..1e81b5c5a2 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -4312,7 +4312,7 @@ impl Service {
                         continue;
                     }
 
-                    if tenant_shard.intent.demote_attached(node_id) {
+                    if tenant_shard.intent.demote_attached(scheduler, node_id) {
                         tenant_shard.sequence = tenant_shard.sequence.next();
 
                         // TODO: populate a ScheduleContext including all shards in the same tenant_id (only matters
diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs
index dda17f9887..77bbf4c604 100644
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -8,7 +8,7 @@ use crate::{
     metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome},
     persistence::TenantShardPersistence,
     reconciler::ReconcileUnits,
-    scheduler::{AffinityScore, MaySchedule, ScheduleContext},
+    scheduler::{AffinityScore, MaySchedule, RefCountUpdate, ScheduleContext},
 };
 use pageserver_api::controller_api::{PlacementPolicy, ShardSchedulingPolicy};
 use pageserver_api::{
@@ -153,7 +153,7 @@ impl IntentState {
     }
     pub(crate) fn single(scheduler: &mut Scheduler, node_id: Option<NodeId>) -> Self {
         if let Some(node_id) = node_id {
-            scheduler.node_inc_ref(node_id);
+            scheduler.update_node_ref_counts(node_id, RefCountUpdate::Attach);
         }
         Self {
             attached: node_id,
@@ -164,10 +164,10 @@ impl IntentState {
     pub(crate) fn set_attached(&mut self, scheduler: &mut Scheduler, new_attached: Option<NodeId>) {
         if self.attached != new_attached {
             if let Some(old_attached) = self.attached.take() {
-                scheduler.node_dec_ref(old_attached);
+                scheduler.update_node_ref_counts(old_attached, RefCountUpdate::Detach);
             }
             if let Some(new_attached) = &new_attached {
-                scheduler.node_inc_ref(*new_attached);
+                scheduler.update_node_ref_counts(*new_attached, RefCountUpdate::Attach);
             }
             self.attached = new_attached;
         }
@@ -177,22 +177,27 @@ impl IntentState {
     /// secondary to attached while maintaining the scheduler's reference counts.
     pub(crate) fn promote_attached(
         &mut self,
-        _scheduler: &mut Scheduler,
+        scheduler: &mut Scheduler,
         promote_secondary: NodeId,
     ) {
         // If we call this with a node that isn't in secondary, it would cause incorrect
         // scheduler reference counting, since we assume the node is already referenced as a secondary.
         debug_assert!(self.secondary.contains(&promote_secondary));
 
-        // TODO: when scheduler starts tracking attached + secondary counts separately, we will
-        // need to call into it here.
         self.secondary.retain(|n| n != &promote_secondary);
+
+        let demoted = self.attached;
         self.attached = Some(promote_secondary);
+
+        scheduler.update_node_ref_counts(promote_secondary, RefCountUpdate::PromoteSecondary);
+        if let Some(demoted) = demoted {
+            scheduler.update_node_ref_counts(demoted, RefCountUpdate::DemoteAttached);
+        }
     }
 
     pub(crate) fn push_secondary(&mut self, scheduler: &mut Scheduler, new_secondary: NodeId) {
         debug_assert!(!self.secondary.contains(&new_secondary));
-        scheduler.node_inc_ref(new_secondary);
+        scheduler.update_node_ref_counts(new_secondary, RefCountUpdate::AddSecondary);
         self.secondary.push(new_secondary);
     }
 
@@ -200,27 +205,27 @@ impl IntentState {
     pub(crate) fn remove_secondary(&mut self, scheduler: &mut Scheduler, node_id: NodeId) {
         let index = self.secondary.iter().position(|n| *n == node_id);
         if let Some(index) = index {
-            scheduler.node_dec_ref(node_id);
+            scheduler.update_node_ref_counts(node_id, RefCountUpdate::RemoveSecondary);
             self.secondary.remove(index);
         }
     }
 
     pub(crate) fn clear_secondary(&mut self, scheduler: &mut Scheduler) {
         for secondary in self.secondary.drain(..) {
-            scheduler.node_dec_ref(secondary);
+            scheduler.update_node_ref_counts(secondary, RefCountUpdate::RemoveSecondary);
         }
     }
 
     /// Remove the last secondary node from the list of secondaries
     pub(crate) fn pop_secondary(&mut self, scheduler: &mut Scheduler) {
         if let Some(node_id) = self.secondary.pop() {
-            scheduler.node_dec_ref(node_id);
+            scheduler.update_node_ref_counts(node_id, RefCountUpdate::RemoveSecondary);
         }
     }
 
     pub(crate) fn clear(&mut self, scheduler: &mut Scheduler) {
         if let Some(old_attached) = self.attached.take() {
-            scheduler.node_dec_ref(old_attached);
+            scheduler.update_node_ref_counts(old_attached, RefCountUpdate::Detach);
         }
 
         self.clear_secondary(scheduler);
@@ -251,12 +256,11 @@ impl IntentState {
     /// forget the location on the offline node.
     ///
     /// Returns true if a change was made
-    pub(crate) fn demote_attached(&mut self, node_id: NodeId) -> bool {
+    pub(crate) fn demote_attached(&mut self, scheduler: &mut Scheduler, node_id: NodeId) -> bool {
         if self.attached == Some(node_id) {
-            // TODO: when scheduler starts tracking attached + secondary counts separately, we will
-            // need to call into it here.
             self.attached = None;
             self.secondary.push(node_id);
+            scheduler.update_node_ref_counts(node_id, RefCountUpdate::DemoteAttached);
             true
         } else {
             false
@@ -593,7 +597,7 @@ impl TenantShard {
             Secondary => {
                 if let Some(node_id) = self.intent.get_attached() {
                     // Populate secondary by demoting the attached node
-                    self.intent.demote_attached(*node_id);
+                    self.intent.demote_attached(scheduler, *node_id);
                     modified = true;
                 } else if self.intent.secondary.is_empty() {
                     // Populate secondary by scheduling a fresh node
@@ -783,7 +787,7 @@ impl TenantShard {
                 old_attached_node_id,
                 new_attached_node_id,
             }) => {
-                self.intent.demote_attached(old_attached_node_id);
+                self.intent.demote_attached(scheduler, old_attached_node_id);
                 self.intent
                     .promote_attached(scheduler, new_attached_node_id);
             }
@@ -1321,7 +1325,9 @@ pub(crate) mod tests {
         assert_ne!(attached_node_id, secondary_node_id);
 
         // Notifying the attached node is offline should demote it to a secondary
-        let changed = tenant_shard.intent.demote_attached(attached_node_id);
+        let changed = tenant_shard
+            .intent
+            .demote_attached(&mut scheduler, attached_node_id);
         assert!(changed);
         assert!(tenant_shard.intent.attached.is_none());
         assert_eq!(tenant_shard.intent.secondary.len(), 2);
@@ -1604,7 +1610,14 @@ pub(crate) mod tests {
 
         // We should see equal number of locations on the two nodes.
         assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 4);
+        // Scheduling does not consider the number of attachments picking the initial
+        // pageserver to attach to (hence the assertion that all primaries are on the
+        // same node)
+        // TODO: Tweak the scheduling to evenly distribute attachments for new shards.
+        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 4);
+
         assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 4);
+        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 0);
 
         // Add another two nodes: we should see the shards spread out when their optimize
         // methods are called
@@ -1613,9 +1626,16 @@ pub(crate) mod tests {
         optimize_til_idle(&nodes, &mut scheduler, &mut shards);
 
         assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 2);
+        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 1);
+
         assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 2);
+        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 1);
+
         assert_eq!(scheduler.get_node_shard_count(NodeId(3)), 2);
+        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(3)), 1);
+
         assert_eq!(scheduler.get_node_shard_count(NodeId(4)), 2);
+        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(4)), 1);
 
         for shard in shards.iter_mut() {
             shard.intent.clear(&mut scheduler);
diff --git a/s3_scrubber/Cargo.toml b/storage_scrubber/Cargo.toml
similarity index 98%
rename from s3_scrubber/Cargo.toml
rename to storage_scrubber/Cargo.toml
index 48b50ca21c..050be66483 100644
--- a/s3_scrubber/Cargo.toml
+++ b/storage_scrubber/Cargo.toml
@@ -1,5 +1,5 @@
 [package]
-name = "s3_scrubber"
+name = "storage_scrubber"
 version = "0.1.0"
 edition.workspace = true
 license.workspace = true
diff --git a/s3_scrubber/README.md b/storage_scrubber/README.md
similarity index 99%
rename from s3_scrubber/README.md
rename to storage_scrubber/README.md
index 8a96542ada..0930f343ec 100644
--- a/s3_scrubber/README.md
+++ b/storage_scrubber/README.md
@@ -1,4 +1,4 @@
-# Neon S3 scrubber
+# Neon Storage Scrubber
 
 This tool directly accesses the S3 buckets used by the Neon `pageserver`
 and `safekeeper`, and does housekeeping such as cleaning up objects for tenants & timelines that no longer exist.
diff --git a/s3_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs
similarity index 97%
rename from s3_scrubber/src/checks.rs
rename to storage_scrubber/src/checks.rs
index 44fb53696c..4eb8580e32 100644
--- a/s3_scrubber/src/checks.rs
+++ b/storage_scrubber/src/checks.rs
@@ -78,17 +78,16 @@ pub(crate) fn branch_cleanup_and_check_errors(
                     index_part_generation: _index_part_generation,
                     s3_layers: _s3_layers,
                 } => {
-                    if !IndexPart::KNOWN_VERSIONS.contains(&index_part.get_version()) {
-                        result.errors.push(format!(
-                            "index_part.json version: {}",
-                            index_part.get_version()
-                        ))
+                    if !IndexPart::KNOWN_VERSIONS.contains(&index_part.version()) {
+                        result
+                            .errors
+                            .push(format!("index_part.json version: {}", index_part.version()))
                     }
 
-                    if &index_part.get_version() != IndexPart::KNOWN_VERSIONS.last().unwrap() {
+                    if &index_part.version() != IndexPart::KNOWN_VERSIONS.last().unwrap() {
                         result.warnings.push(format!(
                             "index_part.json version is not latest: {}",
-                            index_part.get_version()
+                            index_part.version()
                         ))
                     }
 
diff --git a/s3_scrubber/src/cloud_admin_api.rs b/storage_scrubber/src/cloud_admin_api.rs
similarity index 100%
rename from s3_scrubber/src/cloud_admin_api.rs
rename to storage_scrubber/src/cloud_admin_api.rs
diff --git a/s3_scrubber/src/garbage.rs b/storage_scrubber/src/garbage.rs
similarity index 100%
rename from s3_scrubber/src/garbage.rs
rename to storage_scrubber/src/garbage.rs
diff --git a/s3_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs
similarity index 100%
rename from s3_scrubber/src/lib.rs
rename to storage_scrubber/src/lib.rs
diff --git a/s3_scrubber/src/main.rs b/storage_scrubber/src/main.rs
similarity index 96%
rename from s3_scrubber/src/main.rs
rename to storage_scrubber/src/main.rs
index ade8ef7d7a..222bd10ed2 100644
--- a/s3_scrubber/src/main.rs
+++ b/storage_scrubber/src/main.rs
@@ -1,11 +1,11 @@
 use anyhow::bail;
 use camino::Utf8PathBuf;
 use pageserver_api::shard::TenantShardId;
-use s3_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode};
-use s3_scrubber::pageserver_physical_gc::GcMode;
-use s3_scrubber::scan_pageserver_metadata::scan_metadata;
-use s3_scrubber::tenant_snapshot::SnapshotDownloader;
-use s3_scrubber::{
+use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode};
+use storage_scrubber::pageserver_physical_gc::GcMode;
+use storage_scrubber::scan_pageserver_metadata::scan_metadata;
+use storage_scrubber::tenant_snapshot::SnapshotDownloader;
+use storage_scrubber::{
     init_logging, pageserver_physical_gc::pageserver_physical_gc,
     scan_safekeeper_metadata::scan_safekeeper_metadata, BucketConfig, ConsoleConfig, NodeKind,
     TraversingDepth,
diff --git a/s3_scrubber/src/metadata_stream.rs b/storage_scrubber/src/metadata_stream.rs
similarity index 100%
rename from s3_scrubber/src/metadata_stream.rs
rename to storage_scrubber/src/metadata_stream.rs
diff --git a/s3_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs
similarity index 100%
rename from s3_scrubber/src/pageserver_physical_gc.rs
rename to storage_scrubber/src/pageserver_physical_gc.rs
diff --git a/s3_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs
similarity index 99%
rename from s3_scrubber/src/scan_pageserver_metadata.rs
rename to storage_scrubber/src/scan_pageserver_metadata.rs
index 6ff9783875..af74ffa4cd 100644
--- a/s3_scrubber/src/scan_pageserver_metadata.rs
+++ b/storage_scrubber/src/scan_pageserver_metadata.rs
@@ -125,7 +125,7 @@ impl MetadataSummary {
         {
             *self
                 .indices_by_version
-                .entry(index_part.get_version())
+                .entry(index_part.version())
                 .or_insert(0) += 1;
 
             if let Err(e) = self.update_histograms(index_part) {
diff --git a/s3_scrubber/src/scan_safekeeper_metadata.rs b/storage_scrubber/src/scan_safekeeper_metadata.rs
similarity index 100%
rename from s3_scrubber/src/scan_safekeeper_metadata.rs
rename to storage_scrubber/src/scan_safekeeper_metadata.rs
diff --git a/s3_scrubber/src/tenant_snapshot.rs b/storage_scrubber/src/tenant_snapshot.rs
similarity index 100%
rename from s3_scrubber/src/tenant_snapshot.rs
rename to storage_scrubber/src/tenant_snapshot.rs
diff --git a/test_runner/README.md b/test_runner/README.md
index fd68cfff79..7d95634ea8 100644
--- a/test_runner/README.md
+++ b/test_runner/README.md
@@ -285,6 +285,21 @@ def test_foobar(neon_env_builder: NeonEnvBuilder):
     ...
 ```
 
+The env includes a default tenant and timeline. Therefore, you do not need to create your own
+tenant/timeline for testing.
+
+```python
+def test_foobar2(neon_env_builder: NeonEnvBuilder):
+    env = neon_env_builder.init_start() # Start the environment
+    with env.endpoints.create_start("main") as endpoint:
+        # Start the compute endpoint
+    client = env.pageserver.http_client() # Get the pageserver client
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+    client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)
+```
+
 For more information about pytest fixtures, see https://docs.pytest.org/en/stable/fixture.html
 
 At the end of a test, all the nodes in the environment are automatically stopped, so you
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index a70f4228e4..469937c558 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -833,7 +833,7 @@ class NeonEnvBuilder:
     def enable_scrub_on_exit(self):
         """
         Call this if you would like the fixture to automatically run
-        s3_scrubber at the end of the test, as a bidirectional test
+        storage_scrubber at the end of the test, as a bidirectional test
         that the scrubber is working properly, and that the code within
         the test didn't produce any invalid remote state.
         """
@@ -948,7 +948,7 @@ class NeonEnvBuilder:
 
             if self.scrub_on_exit:
                 try:
-                    S3Scrubber(self).scan_metadata()
+                    StorageScrubber(self).scan_metadata()
                 except Exception as e:
                     log.error(f"Error during remote storage scrub: {e}")
                     cleanup_error = e
@@ -3391,7 +3391,7 @@ def static_proxy(
         yield proxy
 
 
-class Endpoint(PgProtocol):
+class Endpoint(PgProtocol, LogUtils):
     """An object representing a Postgres compute endpoint managed by the control plane."""
 
     def __init__(
@@ -3457,6 +3457,7 @@ class Endpoint(PgProtocol):
         )
         path = Path("endpoints") / self.endpoint_id / "pgdata"
         self.pgdata_dir = os.path.join(self.env.repo_dir, path)
+        self.logfile = self.endpoint_path() / "compute.log"
 
         config_lines = config_lines or []
 
@@ -3941,7 +3942,7 @@ class Safekeeper(LogUtils):
         wait_until(20, 0.5, paused)
 
 
-class S3Scrubber:
+class StorageScrubber:
     def __init__(self, env: NeonEnvBuilder, log_dir: Optional[Path] = None):
         self.env = env
         self.log_dir = log_dir or env.test_output_dir
@@ -3961,7 +3962,7 @@ class S3Scrubber:
         if s3_storage.endpoint is not None:
             env.update({"AWS_ENDPOINT_URL": s3_storage.endpoint})
 
-        base_args = [str(self.env.neon_binpath / "s3_scrubber")]
+        base_args = [str(self.env.neon_binpath / "storage_scrubber")]
         args = base_args + args
 
         (output_path, stdout, status_code) = subprocess_capture(
diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py
index 91435e8a1f..72384c138b 100644
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -313,7 +313,7 @@ def assert_prefix_empty(
             # https://neon-github-public-dev.s3.amazonaws.com/reports/pr-5322/6207777020/index.html#suites/3556ed71f2d69272a7014df6dcb02317/53b5c368b5a68865
             # this seems like a mock_s3 issue
             log.warning(
-                f"contrading ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}, assuming this means KeyCount=0"
+                f"contradicting ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}, assuming this means KeyCount=0"
             )
             keys = 0
         elif keys != 0 and len(objects) == 0:
diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py
index d06d2f3d93..7f54eb0b0a 100644
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -591,3 +591,20 @@ class PropagatingThread(threading.Thread):
         if self.exc:
             raise self.exc
         return self.ret
+
+
+def human_bytes(amt: float) -> str:
+    """
+    Render a bytes amount into nice IEC bytes string.
+    """
+
+    suffixes = ["", "Ki", "Mi", "Gi"]
+
+    last = suffixes[-1]
+
+    for name in suffixes:
+        if amt < 1024 or name == last:
+            return f"{int(round(amt))} {name}B"
+        amt = amt / 1024
+
+    raise RuntimeError("unreachable")
diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py
index 7ae2352c06..7722828c79 100644
--- a/test_runner/regress/test_disk_usage_eviction.py
+++ b/test_runner/regress/test_disk_usage_eviction.py
@@ -17,7 +17,7 @@ from fixtures.neon_fixtures import (
 from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.utils import wait_for_upload_queue_empty
 from fixtures.remote_storage import RemoteStorageKind
-from fixtures.utils import wait_until
+from fixtures.utils import human_bytes, wait_until
 
 GLOBAL_LRU_LOG_LINE = "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy"
 
@@ -218,19 +218,6 @@ def count_layers_per_tenant(
     return dict(ret)
 
 
-def human_bytes(amt: float) -> str:
-    suffixes = ["", "Ki", "Mi", "Gi"]
-
-    last = suffixes[-1]
-
-    for name in suffixes:
-        if amt < 1024 or name == last:
-            return f"{int(round(amt))} {name}B"
-        amt = amt / 1024
-
-    raise RuntimeError("unreachable")
-
-
 def _eviction_env(
     request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, num_pageservers: int
 ) -> EvictionEnv:
@@ -294,7 +281,7 @@ def pgbench_init_tenant(
             "gc_period": "0s",
             "compaction_period": "0s",
             "checkpoint_distance": f"{layer_size}",
-            "image_creation_threshold": "100",
+            "image_creation_threshold": "999999",
             "compaction_target_size": f"{layer_size}",
         }
     )
@@ -668,11 +655,10 @@ def test_fast_growing_tenant(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, or
         finish_tenant_creation(env, tenant_id, timeline_id, min_expected_layers)
 
     tenant_layers = count_layers_per_tenant(env.pageserver, map(lambda x: x[0], timelines))
-    (total_on_disk, _, _) = poor_mans_du(env, map(lambda x: x[0], timelines), env.pageserver, False)
+    (total_on_disk, _, _) = poor_mans_du(env, map(lambda x: x[0], timelines), env.pageserver, True)
 
-    # cut 10 percent
     response = env.pageserver.http_client().disk_usage_eviction_run(
-        {"evict_bytes": total_on_disk // 10, "eviction_order": order.config()}
+        {"evict_bytes": total_on_disk // 5, "eviction_order": order.config()}
     )
     log.info(f"{response}")
 
diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py
index cf7a1c56ee..1d1b2fb485 100644
--- a/test_runner/regress/test_hot_standby.py
+++ b/test_runner/regress/test_hot_standby.py
@@ -1,6 +1,5 @@
 import asyncio
 import os
-import re
 import threading
 import time
 from functools import partial
@@ -18,20 +17,6 @@ from fixtures.neon_fixtures import (
 from fixtures.utils import wait_until
 
 
-# Check for corrupted WAL messages which might otherwise go unnoticed if
-# reconnection fixes this.
-def scan_standby_log_for_errors(secondary):
-    log_path = secondary.endpoint_path() / "compute.log"
-    with log_path.open("r") as f:
-        markers = re.compile(
-            r"incorrect resource manager data|record with incorrect|invalid magic number|unexpected pageaddr"
-        )
-        for line in f:
-            if markers.search(line):
-                log.info(f"bad error in standby log: {line}")
-                raise AssertionError()
-
-
 def test_hot_standby(neon_simple_env: NeonEnv):
     env = neon_simple_env
 
@@ -91,7 +76,11 @@ def test_hot_standby(neon_simple_env: NeonEnv):
                         assert response is not None
                         assert response == responses[query]
 
-            scan_standby_log_for_errors(secondary)
+            # Check for corrupted WAL messages which might otherwise go unnoticed if
+            # reconnection fixes this.
+            assert not secondary.log_contains(
+                "incorrect resource manager data|record with incorrect|invalid magic number|unexpected pageaddr"
+            )
 
     # clean up
     if slow_down_send:
diff --git a/test_runner/regress/test_ingestion_layer_size.py b/test_runner/regress/test_ingestion_layer_size.py
new file mode 100644
index 0000000000..44c77b3410
--- /dev/null
+++ b/test_runner/regress/test_ingestion_layer_size.py
@@ -0,0 +1,151 @@
+from dataclasses import dataclass
+from typing import Iterable, List, Union
+
+import pytest
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_flush_lsn
+from fixtures.pageserver.http import HistoricLayerInfo, LayerMapInfo
+from fixtures.utils import human_bytes
+
+
+def test_ingesting_large_batches_of_images(neon_env_builder: NeonEnvBuilder, build_type: str):
+    """
+    Build a non-small GIN index which includes similarly batched up images in WAL stream as does pgvector
+    to show that we no longer create oversized layers.
+    """
+
+    if build_type == "debug":
+        pytest.skip("debug run is unnecessarily slow")
+
+    minimum_initdb_size = 20 * 1024**2
+    checkpoint_distance = 32 * 1024**2
+    minimum_good_layer_size = checkpoint_distance * 0.9
+    minimum_too_large_layer_size = 2 * checkpoint_distance
+
+    # index size: 99MiB
+    rows = 2_500_000
+
+    # bucket lower limits
+    buckets = [0, minimum_initdb_size, minimum_good_layer_size, minimum_too_large_layer_size]
+
+    assert (
+        minimum_initdb_size < minimum_good_layer_size
+    ), "keep checkpoint_distance higher than the initdb size (find it by experimenting)"
+
+    env = neon_env_builder.init_start(
+        initial_tenant_conf={
+            "checkpoint_distance": f"{checkpoint_distance}",
+            "compaction_target_size": f"{checkpoint_distance}",
+            # this test is primarly interested in L0 sizes but we'll compact after ingestion to ensure sizes are good even then
+            "compaction_period": "0s",
+            "gc_period": "0s",
+            "compaction_threshold": "255",
+            "image_creation_threshold": "99999",
+        }
+    )
+
+    # build a larger than 3*checkpoint_distance sized gin index.
+    # gin index building exhibits the same behaviour as the pgvector with the two phase build
+    with env.endpoints.create_start("main") as ep, ep.cursor() as cur:
+        cur.execute(
+            f"create table int_array_test as select array_agg(g) as int_array from generate_series(1, {rows}) g group by g / 10;"
+        )
+        cur.execute(
+            "create index int_array_test_gin_index on int_array_test using gin (int_array);"
+        )
+        cur.execute("select pg_table_size('int_array_test_gin_index')")
+        size = cur.fetchone()
+        assert size is not None
+        assert isinstance(size[0], int)
+        log.info(f"gin index size: {human_bytes(size[0])}")
+        assert (
+            size[0] > checkpoint_distance * 3
+        ), f"gin index is not large enough: {human_bytes(size[0])}"
+        wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline)
+
+    ps_http = env.pageserver.http_client()
+    ps_http.timeline_checkpoint(env.initial_tenant, env.initial_timeline)
+
+    infos = ps_http.layer_map_info(env.initial_tenant, env.initial_timeline)
+    assert len(infos.in_memory_layers) == 0, "should had flushed open layers"
+    post_ingest = histogram_historic_layers(infos, buckets)
+
+    # describe first, assert later for easier debugging
+    log.info("non-cumulative layer size distribution after ingestion:")
+    print_layer_size_histogram(post_ingest)
+
+    # since all we have are L0s, we should be getting nice L1s and images out of them now
+    ps_http.patch_tenant_config_client_side(
+        env.initial_tenant,
+        {
+            "compaction_threshold": 1,
+            "image_creation_threshold": 1,
+        },
+    )
+
+    ps_http.timeline_compact(env.initial_tenant, env.initial_timeline, True, True)
+
+    infos = ps_http.layer_map_info(env.initial_tenant, env.initial_timeline)
+    assert len(infos.in_memory_layers) == 0, "no new inmem layers expected"
+    post_compact = histogram_historic_layers(infos, buckets)
+
+    log.info("non-cumulative layer size distribution after compaction:")
+    print_layer_size_histogram(post_compact)
+
+    assert (
+        post_ingest.counts[3] == 0
+    ), f"there should be no layers larger than 2*checkpoint_distance ({human_bytes(2*checkpoint_distance)})"
+    assert post_ingest.counts[1] == 1, "expect one smaller layer for initdb"
+    assert (
+        post_ingest.counts[0] <= 1
+    ), "expect at most one tiny layer from shutting down the endpoint"
+
+    # just make sure we don't have trouble splitting the layers apart
+    assert post_compact.counts[3] == 0
+
+
+@dataclass
+class Histogram:
+    buckets: List[Union[int, float]]
+    counts: List[int]
+    sums: List[int]
+
+
+def histogram_historic_layers(
+    infos: LayerMapInfo, minimum_sizes: List[Union[int, float]]
+) -> Histogram:
+    def log_layer(layer: HistoricLayerInfo) -> HistoricLayerInfo:
+        log.info(
+            f"{layer.layer_file_name} {human_bytes(layer.layer_file_size)} ({layer.layer_file_size} bytes)"
+        )
+        return layer
+
+    layers = map(log_layer, infos.historic_layers)
+    sizes = (x.layer_file_size for x in layers)
+    return histogram(sizes, minimum_sizes)
+
+
+def histogram(sizes: Iterable[int], minimum_sizes: List[Union[int, float]]) -> Histogram:
+    assert all(minimum_sizes[i] < minimum_sizes[i + 1] for i in range(len(minimum_sizes) - 1))
+    buckets = list(enumerate(minimum_sizes))
+    counts = [0 for _ in buckets]
+    sums = [0 for _ in buckets]
+
+    for size in sizes:
+        found = False
+        for index, min_size in reversed(buckets):
+            if size >= min_size:
+                counts[index] += 1
+                sums[index] += size
+                found = True
+                break
+        assert found
+
+    return Histogram(minimum_sizes, counts, sums)
+
+
+def print_layer_size_histogram(h: Histogram):
+    for index, min_size in enumerate(h.buckets):
+        log.info(
+            f">= {human_bytes(min_size)}: {h.counts[index]} layers total {human_bytes(h.sums[index])}"
+        )
diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py
index 18e5111786..54d3b2d515 100644
--- a/test_runner/regress/test_layers_from_future.py
+++ b/test_runner/regress/test_layers_from_future.py
@@ -37,7 +37,8 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
     """
     neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
 
-    env = neon_env_builder.init_start()
+    env = neon_env_builder.init_configs()
+    env.start()
     env.pageserver.allowed_errors.extend(
         [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"]
     )
diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py
index a657d5a035..ca3c81d6e5 100644
--- a/test_runner/regress/test_logical_replication.py
+++ b/test_runner/regress/test_logical_replication.py
@@ -221,6 +221,35 @@ def test_obsolete_slot_drop(neon_simple_env: NeonEnv, vanilla_pg):
     wait_until(number_of_iterations=10, interval=2, func=partial(slot_removed, endpoint))
 
 
+def test_ondemand_wal_download_in_replication_slot_funcs(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.num_safekeepers = 3
+    env = neon_env_builder.init_start()
+
+    env.neon_cli.create_branch("init")
+    endpoint = env.endpoints.create_start("init")
+
+    with endpoint.connect().cursor() as cur:
+        cur.execute("create table wal_generator (id serial primary key, data text)")
+        cur.execute(
+            "SELECT * FROM pg_create_logical_replication_slot('slotty_mcslotface', 'test_decoding')"
+        )
+        cur.execute(
+            """
+INSERT INTO wal_generator (data)
+SELECT repeat('A', 1024) -- Generates a kilobyte of data per row
+FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of data
+"""
+        )
+
+    endpoint.stop_and_destroy()
+    endpoint = env.endpoints.create_start("init")
+
+    with endpoint.connect().cursor() as cur:
+        cur.execute(
+            "SELECT * FROM pg_logical_slot_peek_binary_changes('slotty_mcslotface', NULL, NULL, 'include-xids', '0')"
+        )
+
+
 # Tests that walsender correctly blocks until WAL is downloaded from safekeepers
 def test_lr_with_slow_safekeeper(neon_env_builder: NeonEnvBuilder, vanilla_pg):
     neon_env_builder.num_safekeepers = 3
@@ -247,6 +276,7 @@ FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of
     connstr = endpoint.connstr().replace("'", "''")
     vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub")
     logical_replication_sync(vanilla_pg, endpoint)
+
     vanilla_pg.stop()
 
     # Pause the safekeepers so that they can't send WAL (except to pageserver)
diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py
index 526ae14b87..5637f160cf 100644
--- a/test_runner/regress/test_migrations.py
+++ b/test_runner/regress/test_migrations.py
@@ -8,8 +8,6 @@ def test_migrations(neon_simple_env: NeonEnv):
     env.neon_cli.create_branch("test_migrations", "empty")
 
     endpoint = env.endpoints.create("test_migrations")
-    log_path = endpoint.endpoint_path() / "compute.log"
-
     endpoint.respec(skip_pg_catalog_updates=False)
     endpoint.start()
 
@@ -22,9 +20,7 @@ def test_migrations(neon_simple_env: NeonEnv):
         migration_id = cur.fetchall()
         assert migration_id[0][0] == num_migrations
 
-    with open(log_path, "r") as log_file:
-        logs = log_file.read()
-        assert f"INFO handle_migrations: Ran {num_migrations} migrations" in logs
+    endpoint.assert_log_contains(f"INFO handle_migrations: Ran {num_migrations} migrations")
 
     endpoint.stop()
     endpoint.start()
@@ -36,6 +32,4 @@ def test_migrations(neon_simple_env: NeonEnv):
         migration_id = cur.fetchall()
         assert migration_id[0][0] == num_migrations
 
-    with open(log_path, "r") as log_file:
-        logs = log_file.read()
-        assert "INFO handle_migrations: Ran 0 migrations" in logs
+    endpoint.assert_log_contains("INFO handle_migrations: Ran 0 migrations")
diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py
index 0235cf6d20..696af24e5c 100644
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -22,7 +22,7 @@ from fixtures.neon_fixtures import (
     NeonEnv,
     NeonEnvBuilder,
     PgBin,
-    S3Scrubber,
+    StorageScrubber,
     generate_uploads_and_deletions,
 )
 from fixtures.pageserver.common_types import parse_layer_file_name
@@ -215,7 +215,7 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
 
     # Having written a mixture of generation-aware and legacy index_part.json,
     # ensure the scrubber handles the situation as expected.
-    metadata_summary = S3Scrubber(neon_env_builder).scan_metadata()
+    metadata_summary = StorageScrubber(neon_env_builder).scan_metadata()
     assert metadata_summary["tenant_count"] == 1  # Scrubber should have seen our timeline
     assert metadata_summary["timeline_count"] == 1
     assert metadata_summary["timeline_shard_count"] == 1
diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py
index 5bfa9cce8c..2782d33e15 100644
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -7,7 +7,7 @@ from typing import Any, Dict, Optional
 import pytest
 from fixtures.common_types import TenantId, TimelineId
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver, S3Scrubber
+from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver, StorageScrubber
 from fixtures.pageserver.common_types import parse_layer_file_name
 from fixtures.pageserver.utils import (
     assert_prefix_empty,
@@ -214,7 +214,7 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int):
     # Having done a bunch of attach/detach cycles, we will have generated some index garbage: check
     # that the scrubber sees it and cleans it up.  We do this before the final attach+validate pass,
     # to also validate that the scrubber isn't breaking anything.
-    gc_summary = S3Scrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=1)
+    gc_summary = StorageScrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=1)
     assert gc_summary["remote_storage_errors"] == 0
     assert gc_summary["indices_deleted"] > 0
 
@@ -536,7 +536,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
     # Scrub the remote storage
     # ========================
     # This confirms that the scrubber isn't upset by the presence of the heatmap
-    S3Scrubber(neon_env_builder).scan_metadata()
+    StorageScrubber(neon_env_builder).scan_metadata()
 
     # Detach secondary and delete tenant
     # ===================================
@@ -563,6 +563,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
             )
         ),
     )
+    workload.stop()
 
 
 def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
diff --git a/test_runner/regress/test_s3_restore.py b/test_runner/regress/test_s3_restore.py
index 7fdabaaec7..6383d24c57 100644
--- a/test_runner/regress/test_s3_restore.py
+++ b/test_runner/regress/test_s3_restore.py
@@ -2,6 +2,7 @@ import time
 from datetime import datetime, timezone
 
 from fixtures.common_types import Lsn
+from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
     PgBin,
@@ -32,7 +33,12 @@ def test_tenant_s3_restore(
         assert remote_storage, "remote storage not configured"
         enable_remote_storage_versioning(remote_storage)
 
-    env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
+    # change it back after initdb, recovery doesn't work if the two
+    # index_part.json uploads happen at same second or too close to each other.
+    initial_tenant_conf = MANY_SMALL_LAYERS_TENANT_CONFIG
+    del initial_tenant_conf["checkpoint_distance"]
+
+    env = neon_env_builder.init_start(initial_tenant_conf)
     env.pageserver.allowed_errors.extend(
         [
             # The deletion queue will complain when it encounters simulated S3 errors
@@ -43,14 +49,16 @@ def test_tenant_s3_restore(
     )
 
     ps_http = env.pageserver.http_client()
-
     tenant_id = env.initial_tenant
 
+    # now lets create the small layers
+    ps_http.set_tenant_config(tenant_id, MANY_SMALL_LAYERS_TENANT_CONFIG)
+
     # Default tenant and the one we created
     assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1
 
     # create two timelines one being the parent of another, both with non-trivial data
-    parent = None
+    parent = "main"
     last_flush_lsns = []
 
     for timeline in ["first", "second"]:
@@ -64,6 +72,7 @@ def test_tenant_s3_restore(
             last_flush_lsns.append(last_flush_lsn)
         ps_http.timeline_checkpoint(tenant_id, timeline_id)
         wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn)
+        log.info(f"{timeline} timeline {timeline_id} {last_flush_lsn=}")
         parent = timeline
 
     # These sleeps are important because they fend off differences in clocks between us and S3
@@ -108,6 +117,9 @@ def test_tenant_s3_restore(
     ps_http.tenant_attach(tenant_id, generation=generation)
     env.pageserver.quiesce_tenants()
 
+    for tline in ps_http.timeline_list(env.initial_tenant):
+        log.info(f"timeline detail: {tline}")
+
     for i, timeline in enumerate(["first", "second"]):
         with env.endpoints.create_start(timeline, tenant_id=tenant_id) as endpoint:
             endpoint.safe_psql(f"SELECT * FROM created_{timeline};")
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index 390cbd242a..ae1ce3eb67 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -11,8 +11,8 @@ from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnv,
     NeonEnvBuilder,
-    S3Scrubber,
     StorageControllerApiException,
+    StorageScrubber,
     last_flush_lsn_upload,
     tenant_get_shards,
     wait_for_last_flush_lsn,
@@ -126,7 +126,7 @@ def test_sharding_smoke(
 
     # Check the scrubber isn't confused by sharded content, then disable
     # it during teardown because we'll have deleted by then
-    S3Scrubber(neon_env_builder).scan_metadata()
+    StorageScrubber(neon_env_builder).scan_metadata()
     neon_env_builder.scrub_on_exit = False
 
     env.storage_controller.pageserver_api().tenant_delete(tenant_id)
@@ -693,6 +693,9 @@ def test_sharding_ingest_layer_sizes(
         # small checkpointing and compaction targets to ensure we generate many upload operations
         "checkpoint_distance": f"{expect_layer_size}",
         "compaction_target_size": f"{expect_layer_size}",
+        # aim to reduce flakyness, we are not doing explicit checkpointing
+        "compaction_period": "0s",
+        "gc_period": "0s",
     }
     shard_count = 4
     neon_env_builder.num_pageservers = shard_count
@@ -708,6 +711,23 @@ def test_sharding_ingest_layer_sizes(
     tenant_id = env.initial_tenant
     timeline_id = env.initial_timeline
 
+    # ignore the initdb layer(s) for the purposes of the size comparison as a initdb image layer optimization
+    # will produce a lot more smaller layers.
+    initial_layers_per_shard = {}
+    log.info("initdb distribution (not asserted on):")
+    for shard in env.storage_controller.locate(tenant_id):
+        pageserver = env.get_pageserver(shard["node_id"])
+        shard_id = shard["shard_id"]
+        layers = (
+            env.get_pageserver(shard["node_id"]).http_client().layer_map_info(shard_id, timeline_id)
+        )
+        for layer in layers.historic_layers:
+            log.info(
+                f"layer[{pageserver.id}]: {layer.layer_file_name} (size {layer.layer_file_size})"
+            )
+
+        initial_layers_per_shard[shard_id] = set(layers.historic_layers)
+
     workload = Workload(env, tenant_id, timeline_id)
     workload.init()
     workload.write_rows(4096, upload=False)
@@ -729,7 +749,13 @@ def test_sharding_ingest_layer_sizes(
 
         historic_layers = sorted(layer_map.historic_layers, key=lambda layer: layer.lsn_start)
 
+        initial_layers = initial_layers_per_shard[shard_id]
+
         for layer in historic_layers:
+            if layer in initial_layers:
+                # ignore the initdb image layers for the size histogram
+                continue
+
             if layer.layer_file_size < expect_layer_size // 2:
                 classification = "Small"
                 small_layer_count += 1
@@ -759,7 +785,8 @@ def test_sharding_ingest_layer_sizes(
         pass
     else:
         # General case:
-        assert float(small_layer_count) / float(ok_layer_count) < 0.25
+        # old limit was 0.25 but pg14 is right at the limit with 7/28
+        assert float(small_layer_count) / float(ok_layer_count) < 0.3
 
     # Each shard may emit up to one huge layer, because initdb ingest doesn't respect checkpoint_distance.
     assert huge_layer_count <= shard_count
diff --git a/test_runner/regress/test_s3_scrubber.py b/test_runner/regress/test_storage_scrubber.py
similarity index 94%
rename from test_runner/regress/test_s3_scrubber.py
rename to test_runner/regress/test_storage_scrubber.py
index 6baba190f3..35ae61c380 100644
--- a/test_runner/regress/test_s3_scrubber.py
+++ b/test_runner/regress/test_storage_scrubber.py
@@ -6,7 +6,7 @@ import pytest
 from fixtures.common_types import TenantId, TenantShardId, TimelineId
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
-    S3Scrubber,
+    StorageScrubber,
 )
 from fixtures.remote_storage import S3Storage, s3_storage
 from fixtures.workload import Workload
@@ -60,7 +60,7 @@ def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count:
     output_path = neon_env_builder.test_output_dir / "snapshot"
     os.makedirs(output_path)
 
-    scrubber = S3Scrubber(neon_env_builder)
+    scrubber = StorageScrubber(neon_env_builder)
     scrubber.tenant_snapshot(tenant_id, output_path)
 
     assert len(os.listdir(output_path)) > 0
@@ -143,18 +143,18 @@ def test_scrubber_physical_gc(neon_env_builder: NeonEnvBuilder, shard_count: Opt
         workload.write_rows(1)
 
     # With a high min_age, the scrubber should decline to delete anything
-    gc_summary = S3Scrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=3600)
+    gc_summary = StorageScrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=3600)
     assert gc_summary["remote_storage_errors"] == 0
     assert gc_summary["indices_deleted"] == 0
 
     # If targeting a different tenant, the scrubber shouldn't do anything
-    gc_summary = S3Scrubber(neon_env_builder).pageserver_physical_gc(
+    gc_summary = StorageScrubber(neon_env_builder).pageserver_physical_gc(
         min_age_secs=1, tenant_ids=[TenantId.generate()]
     )
     assert gc_summary["remote_storage_errors"] == 0
     assert gc_summary["indices_deleted"] == 0
 
     #  With a low min_age, the scrubber should go ahead and clean up all but the latest 2 generations
-    gc_summary = S3Scrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=1)
+    gc_summary = StorageScrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=1)
     assert gc_summary["remote_storage_errors"] == 0
     assert gc_summary["indices_deleted"] == (expect_indices_per_shard - 2) * shard_count
diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py
index fa7cead1bd..fd3cc45c3f 100644
--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -10,7 +10,7 @@ from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
     PgBin,
-    S3Scrubber,
+    StorageScrubber,
     last_flush_lsn_upload,
     wait_for_last_flush_lsn,
 )
@@ -707,7 +707,7 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder)
 
     remote_storage_kind = RemoteStorageKind.MOCK_S3
     neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
-    scrubber = S3Scrubber(neon_env_builder)
+    scrubber = StorageScrubber(neon_env_builder)
     env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
 
     ps_http = env.pageserver.http_client()
diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py
index 89aa855352..6c762f3871 100644
--- a/test_runner/regress/test_vm_bits.py
+++ b/test_runner/regress/test_vm_bits.py
@@ -1,7 +1,9 @@
 import time
+from contextlib import closing
 
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, fork_at_current_lsn
+from fixtures.utils import query_scalar
 
 
 #
@@ -113,11 +115,90 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv):
     assert cur_new.fetchall() == []
 
 
-#
-# Test that the ALL_FROZEN VM bit is cleared correctly at a HEAP_LOCK
-# record.
-#
-def test_vm_bit_clear_on_heap_lock(neon_env_builder: NeonEnvBuilder):
+def test_vm_bit_clear_on_heap_lock_whitebox(neon_env_builder: NeonEnvBuilder):
+    """
+    Test that the ALL_FROZEN VM bit is cleared correctly at a HEAP_LOCK record.
+
+    This is a repro for the bug fixed in commit 66fa176cc8.
+    """
+    # Debugging https://github.com/neondatabase/neon/issues/6967
+    neon_env_builder.preserve_database_files = True
+    env = neon_env_builder.init_start()
+    endpoint = env.endpoints.create_start(
+        "main",
+        config_lines=[
+            # If auto-analyze runs at the same time that we run VACUUM FREEZE, it
+            # can hold a snasphot that prevent the tuples from being frozen.
+            "autovacuum=off",
+            "log_checkpoints=on",
+        ],
+    )
+
+    # Run the tests in a dedicated database, because the activity monitor
+    # periodically runs some queries on to the 'postgres' database. If that
+    # happens at the same time that we're trying to freeze, the activity
+    # monitor's queries can hold back the xmin horizon and prevent freezing.
+    with closing(endpoint.connect()) as pg_conn:
+        pg_conn.cursor().execute("CREATE DATABASE vmbitsdb")
+    pg_conn = endpoint.connect(dbname="vmbitsdb")
+    cur = pg_conn.cursor()
+
+    # Install extension containing function needed for test
+    cur.execute("CREATE EXTENSION neon_test_utils")
+    cur.execute("CREATE EXTENSION pageinspect")
+
+    # Create a test table and freeze it to set the all-frozen VM bit on all pages.
+    cur.execute("CREATE TABLE vmtest_lock (id integer PRIMARY KEY)")
+    cur.execute("BEGIN")
+    cur.execute("INSERT INTO vmtest_lock SELECT g FROM generate_series(1, 50000) g")
+    xid = int(query_scalar(cur, "SELECT txid_current()"))
+    cur.execute("COMMIT")
+    cur.execute("VACUUM (FREEZE, DISABLE_PAGE_SKIPPING true, VERBOSE) vmtest_lock")
+    for notice in pg_conn.notices:
+        log.info(f"{notice}")
+
+    # This test has been flaky in the past, because background activity like
+    # auto-analyze and compute_ctl's activity monitor queries have prevented the
+    # tuples from being frozen. Check that they were frozen.
+    relfrozenxid = int(
+        query_scalar(cur, "SELECT relfrozenxid FROM pg_class WHERE relname='vmtest_lock'")
+    )
+    assert (
+        relfrozenxid > xid
+    ), f"Inserted rows were not frozen. This can be caused by concurrent activity in the database. (XID {xid}, relfrozenxid {relfrozenxid}"
+
+    # Lock a row. This clears the all-frozen VM bit for that page.
+    cur.execute("BEGIN")
+    cur.execute("SELECT * FROM vmtest_lock WHERE id = 40000 FOR UPDATE")
+    cur.execute("COMMIT")
+
+    # The VM page in shared buffer cache, and the same page as reconstructed by
+    # the pageserver, should be equal. Except for the LSN: Clearing a bit in the
+    # VM doesn't bump the LSN in PostgreSQL, but the pageserver updates the LSN
+    # when it replays the VM-bit clearing record (since commit 387a36874c)
+    #
+    # This is a bit fragile, we've had lot of flakiness in this test before. For
+    # example, because all the VM bits were not set because concurrent
+    # autoanalyze prevented the VACUUM FREEZE from freezing the tuples. Or
+    # because autoavacuum kicked in and re-froze the page between the
+    # get_raw_page() and get_raw_page_at_lsn() calls. We disable autovacuum now,
+    # which should make this deterministic.
+    cur.execute("select get_raw_page( 'vmtest_lock', 'vm', 0 )")
+    vm_page_in_cache = (cur.fetchall()[0][0])[8:100].hex()
+    cur.execute(
+        "select get_raw_page_at_lsn( 'vmtest_lock', 'vm', 0, pg_current_wal_insert_lsn(), NULL )"
+    )
+    vm_page_at_pageserver = (cur.fetchall()[0][0])[8:100].hex()
+
+    assert vm_page_at_pageserver == vm_page_in_cache
+
+
+def test_vm_bit_clear_on_heap_lock_blackbox(neon_env_builder: NeonEnvBuilder):
+    """
+    The previous test is enough to verify the bug that was fixed in
+    commit 66fa176cc8. But for good measure, we also reproduce the
+    original problem that the missing VM page update caused.
+    """
     # Debugging https://github.com/neondatabase/neon/issues/6967
     neon_env_builder.preserve_database_files = True
 
@@ -133,9 +214,9 @@ def test_vm_bit_clear_on_heap_lock(neon_env_builder: NeonEnvBuilder):
     env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf)
 
     tenant_id = env.initial_tenant
-    timeline_id = env.neon_cli.create_branch("test_vm_bit_clear_on_heap_lock")
+    timeline_id = env.initial_timeline
     endpoint = env.endpoints.create_start(
-        "test_vm_bit_clear_on_heap_lock",
+        "main",
         config_lines=[
             "log_autovacuum_min_duration = 0",
             # Perform anti-wraparound vacuuming aggressively
@@ -149,12 +230,10 @@ def test_vm_bit_clear_on_heap_lock(neon_env_builder: NeonEnvBuilder):
 
     # Install extension containing function needed for test
     cur.execute("CREATE EXTENSION neon_test_utils")
-    cur.execute("CREATE EXTENSION pageinspect")
 
     # Create a test table and freeze it to set the all-frozen VM bit on all pages.
     cur.execute("CREATE TABLE vmtest_lock (id integer PRIMARY KEY)")
     cur.execute("INSERT INTO vmtest_lock SELECT g FROM generate_series(1, 50000) g")
-
     cur.execute("VACUUM (FREEZE, DISABLE_PAGE_SKIPPING true) vmtest_lock")
 
     # Lock a row. This clears the all-frozen VM bit for that page.
@@ -168,27 +247,6 @@ def test_vm_bit_clear_on_heap_lock(neon_env_builder: NeonEnvBuilder):
 
     cur.execute("COMMIT")
 
-    # The VM page in shared buffer cache, and the same page as reconstructed
-    # by the pageserver, should be equal.
-    #
-    # Ignore page header (24 bytes) of visibility map.
-    # If the dirty VM page is flushed from the cache for some reason,
-    # it gets WAL-logged, which changes the LSN on the page.
-    # Also in neon SMGR we can replace empty heap page with zero (uninitialized) heap page.
-    cur.execute("select get_raw_page( 'vmtest_lock', 'vm', 0 )")
-    vm_page_in_cache = (cur.fetchall()[0][0])[24:100].hex()
-    cur.execute(
-        "select get_raw_page_at_lsn( 'vmtest_lock', 'vm', 0, pg_current_wal_insert_lsn(), NULL )"
-    )
-    vm_page_at_pageserver = (cur.fetchall()[0][0])[24:100].hex()
-
-    assert vm_page_at_pageserver == vm_page_in_cache
-
-    # The above assert is enough to verify the bug that was fixed in
-    # commit 66fa176cc8. But for good measure, we also reproduce the
-    # original problem that the missing VM page update caused. The
-    # rest of the test does that.
-
     # Kill and restart postgres, to clear the buffer cache.
     #
     # NOTE: clear_buffer_cache() will not do, because it evicts the dirty pages
diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index 17e0f5ff4e..4c51945a61 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit 17e0f5ff4e1905691aa40e1e08f9b79b14c99652
+Subproject commit 4c51945a6167ca06c0169e7a4ca5a8e7ffa3faba
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index c2c3d40534..e22098d86d 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit c2c3d40534db97d83dd7e185d1971e707fa2f445
+Subproject commit e22098d86d6c40276b6bd75c29133a33fb283ab6
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index b228f20372..9837db1578 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit b228f20372ebcabfd7946647cb7adbd38bacb14a
+Subproject commit 9837db157837fcf43ef7348be0017d3a2238cd27
diff --git a/vendor/revisions.json b/vendor/revisions.json
index 5bf4e289ef..f945ea6d73 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,5 +1,5 @@
 {
-  "v16": ["16.3", "b228f20372ebcabfd7946647cb7adbd38bacb14a"],
-  "v15": ["15.7", "c2c3d40534db97d83dd7e185d1971e707fa2f445"],
-  "v14": ["14.12", "17e0f5ff4e1905691aa40e1e08f9b79b14c99652"]
+  "v16": ["16.3", "9837db157837fcf43ef7348be0017d3a2238cd27"],
+  "v15": ["15.7", "e22098d86d6c40276b6bd75c29133a33fb283ab6"],
+  "v14": ["14.12", "4c51945a6167ca06c0169e7a4ca5a8e7ffa3faba"]
 }