diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 01f5c3ede9..cd95a5b16d 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -728,30 +728,6 @@ jobs:
           tags: |
             neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }}
 
-      - name: Build compute-tools image
-        # compute-tools are Postgres independent, so build it only once
-        # We pick 16, because that builds on debian 11 with older glibc (and is
-        # thus compatible with newer glibc), rather than 17 on Debian 12, as
-        # that isn't guaranteed to be compatible with Debian 11
-        if: matrix.version.pg == 'v16'
-        uses: docker/build-push-action@v6
-        with:
-          target: compute-tools-image
-          context: .
-          build-args: |
-            GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
-            BUILD_TAG=${{ needs.tag.outputs.build-tag }}
-            TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }}
-            DEBIAN_VERSION=${{ matrix.version.debian }}
-          provenance: false
-          push: true
-          pull: true
-          file: compute/compute-node.Dockerfile
-          cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }}
-          cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-tools-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }}
-          tags: |
-            neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }}
-
   compute-node-image:
     needs: [ compute-node-image-arch, tag ]
     permissions:
@@ -794,14 +770,6 @@ jobs:
                                              neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \
                                              neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64
 
-      - name: Create multi-arch compute-tools image
-        if: matrix.version.pg == 'v16'
-        run: |
-          docker buildx imagetools create -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} \
-                                          -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \
-                                             neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \
-                                             neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64
-
       - name: Configure AWS credentials
         uses: aws-actions/configure-aws-credentials@v4
         with:
@@ -817,12 +785,6 @@ jobs:
           docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \
                                                                                 neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}
 
-      - name: Push multi-arch compute-tools image to ECR
-        if: matrix.version.pg == 'v16'
-        run: |
-          docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} \
-                                                                                neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}
-
   vm-compute-node-image:
     needs: [ check-permissions, tag, compute-node-image ]
     runs-on: [ self-hosted, large ]
@@ -1001,9 +963,6 @@ jobs:
             docker buildx imagetools create -t $repo/neon:latest \
                                                $repo/neon:${{ needs.tag.outputs.build-tag }}
 
-            docker buildx imagetools create -t $repo/compute-tools:latest \
-                                               $repo/compute-tools:${{ needs.tag.outputs.build-tag }}
-
             for version in ${VERSIONS}; do
               docker buildx imagetools create -t $repo/compute-node-${version}:latest \
                                                  $repo/compute-node-${version}:${{ needs.tag.outputs.build-tag }}
@@ -1032,7 +991,7 @@ jobs:
       - name: Copy all images to prod ECR
         if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
         run: |
-          for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16,v17}; do
+          for image in neon {vm-,}compute-node-{v14,v15,v16,v17}; do
             docker buildx imagetools create -t 093970136003.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} \
                                                369495373322.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }}
           done
@@ -1044,7 +1003,7 @@ jobs:
     with:
       client_id: ${{ vars.AZURE_DEV_CLIENT_ID }}
       image_tag: ${{ needs.tag.outputs.build-tag }}
-      images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17
+      images: neon vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17
       registry_name: ${{ vars.AZURE_DEV_REGISTRY_NAME }}
       subscription_id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }}
       tenant_id: ${{ vars.AZURE_TENANT_ID }}
@@ -1056,7 +1015,7 @@ jobs:
     with:
       client_id: ${{ vars.AZURE_PROD_CLIENT_ID }}
       image_tag: ${{ needs.tag.outputs.build-tag }}
-      images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17
+      images: neon vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17
       registry_name: ${{ vars.AZURE_PROD_REGISTRY_NAME }}
       subscription_id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }}
       tenant_id: ${{ vars.AZURE_TENANT_ID }}
diff --git a/Cargo.lock b/Cargo.lock
index f727741883..1e29f4fc08 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1605,6 +1605,32 @@ dependencies = [
  "typenum",
 ]
 
+[[package]]
+name = "curve25519-dalek"
+version = "4.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "97fb8b7c4503de7d6ae7b42ab72a5a59857b4c937ec27a3d4539dba95b5ab2be"
+dependencies = [
+ "cfg-if",
+ "cpufeatures",
+ "curve25519-dalek-derive",
+ "digest",
+ "fiat-crypto",
+ "rustc_version",
+ "subtle",
+]
+
+[[package]]
+name = "curve25519-dalek-derive"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f46882e17999c6cc590af592290432be3bce0428cb0d5f8b6715e4dc7b383eb3"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.90",
+]
+
 [[package]]
 name = "darling"
 version = "0.20.1"
@@ -1875,6 +1901,28 @@ dependencies = [
  "spki 0.7.3",
 ]
 
+[[package]]
+name = "ed25519"
+version = "2.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "115531babc129696a58c64a4fef0a8bf9e9698629fb97e9e40767d235cfbcd53"
+dependencies = [
+ "signature 2.2.0",
+]
+
+[[package]]
+name = "ed25519-dalek"
+version = "2.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4a3daa8e81a3963a60642bcc1f90a670680bd4a77535faa384e9d1c79d620871"
+dependencies = [
+ "curve25519-dalek",
+ "ed25519",
+ "rand_core 0.6.4",
+ "sha2",
+ "subtle",
+]
+
 [[package]]
 name = "either"
 version = "1.8.1"
@@ -2113,6 +2161,12 @@ dependencies = [
  "subtle",
 ]
 
+[[package]]
+name = "fiat-crypto"
+version = "0.2.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "28dea519a9695b9977216879a3ebfddf92f1c08c05d984f8996aecd6ecdc811d"
+
 [[package]]
 name = "filetime"
 version = "0.2.22"
@@ -3990,6 +4044,7 @@ dependencies = [
  "postgres_connection",
  "postgres_ffi",
  "postgres_initdb",
+ "pprof",
  "pq_proto",
  "procfs",
  "rand 0.8.5",
@@ -4745,6 +4800,7 @@ dependencies = [
  "consumption_metrics",
  "dashmap 5.5.0",
  "ecdsa 0.16.9",
+ "ed25519-dalek",
  "env_logger 0.10.2",
  "fallible-iterator",
  "flate2",
diff --git a/Dockerfile b/Dockerfile
index d3659f917a..2e4f8e5546 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -71,6 +71,7 @@ RUN set -e \
         ca-certificates \
 	# System postgres for use with client libraries (e.g. in storage controller)
         postgresql-15 \
+        openssl \
     && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \
     && useradd -d /data neon \
     && chown -R neon:neon /data
diff --git a/Makefile b/Makefile
index 9cffc74508..22ebfea7d5 100644
--- a/Makefile
+++ b/Makefile
@@ -3,7 +3,6 @@ ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
 # Where to install Postgres, default is ./pg_install, maybe useful for package managers
 POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/pg_install/
 
-OPENSSL_PREFIX_DIR := /usr/local/openssl
 ICU_PREFIX_DIR := /usr/local/icu
 
 #
@@ -26,11 +25,9 @@ endif
 ifeq ($(shell test -e /home/nonroot/.docker_build && echo -n yes),yes)
-	# Exclude static build openssl, icu for local build (MacOS, Linux)
+	# Exclude static build icu for local build (MacOS, Linux)
 	# Only keep for build type release and debug
-	PG_CFLAGS += -I$(OPENSSL_PREFIX_DIR)/include
 	PG_CONFIGURE_OPTS += --with-icu
 	PG_CONFIGURE_OPTS += ICU_CFLAGS='-I/$(ICU_PREFIX_DIR)/include -DU_STATIC_IMPLEMENTATION'
 	PG_CONFIGURE_OPTS += ICU_LIBS='-L$(ICU_PREFIX_DIR)/lib -L$(ICU_PREFIX_DIR)/lib64 -licui18n -licuuc -licudata -lstdc++ -Wl,-Bdynamic -lm'
-	PG_CONFIGURE_OPTS += LDFLAGS='-L$(OPENSSL_PREFIX_DIR)/lib -L$(OPENSSL_PREFIX_DIR)/lib64 -L$(ICU_PREFIX_DIR)/lib -L$(ICU_PREFIX_DIR)/lib64 -Wl,-Bstatic -lssl -lcrypto -Wl,-Bdynamic -lrt -lm -ldl -lpthread'
 endif
 
 UNAME_S := $(shell uname -s)
diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile
index 79210a2e1b..7a2ec9c43e 100644
--- a/build-tools.Dockerfile
+++ b/build-tools.Dockerfile
@@ -115,7 +115,7 @@ RUN set -e \
 
 # Keep the version the same as in compute/compute-node.Dockerfile and
 # test_runner/regress/test_compute_metrics.py.
-ENV SQL_EXPORTER_VERSION=0.16.0
+ENV SQL_EXPORTER_VERSION=0.17.0
 RUN curl -fsSL \
     "https://github.com/burningalchemist/sql_exporter/releases/download/${SQL_EXPORTER_VERSION}/sql_exporter-${SQL_EXPORTER_VERSION}.linux-$(case "$(uname -m)" in x86_64) echo amd64;; aarch64) echo arm64;; esac).tar.gz" \
     --output sql_exporter.tar.gz \
@@ -190,21 +190,6 @@ RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JS
     && make install \
     && rm -rf ../lcov.tar.gz
 
-# Compile and install the static OpenSSL library
-ENV OPENSSL_VERSION=1.1.1w
-ENV OPENSSL_PREFIX=/usr/local/openssl
-RUN wget -O /tmp/openssl-${OPENSSL_VERSION}.tar.gz https://www.openssl.org/source/openssl-${OPENSSL_VERSION}.tar.gz && \
-    echo "cf3098950cb4d853ad95c0841f1f9c6d3dc102dccfcacd521d93925208b76ac8 /tmp/openssl-${OPENSSL_VERSION}.tar.gz" | sha256sum --check && \
-    cd /tmp && \
-    tar xzvf /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \
-    rm /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \
-    cd /tmp/openssl-${OPENSSL_VERSION} && \
-    ./config --prefix=${OPENSSL_PREFIX}  -static --static no-shared -fPIC && \
-    make -j "$(nproc)" && \
-    make install && \
-    cd /tmp && \
-    rm -rf /tmp/openssl-${OPENSSL_VERSION}
-
 # Use the same version of libicu as the compute nodes so that
 # clusters created using inidb on pageserver can be used by computes.
 #
diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile
index 303daec240..299f4444a3 100644
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -104,16 +104,18 @@ RUN cd postgres && \
         esac; \
     done;
 
+# Set PATH for all the subsequent build steps
+ENV PATH="/usr/local/pgsql/bin:$PATH"
+
 #########################################################################################
 #
 # Layer "postgis-build"
 # Build PostGIS from the upstream PostGIS mirror.
 #
 #########################################################################################
-FROM build-deps AS postgis-build
+FROM pg-build AS postgis-build
 ARG DEBIAN_VERSION
 ARG PG_VERSION
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 RUN apt update && \
     apt install --no-install-recommends --no-install-suggests -y \
     gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \
@@ -151,8 +153,6 @@ RUN case "${DEBIAN_VERSION}" in \
     DESTDIR=/sfcgal ninja install -j $(getconf _NPROCESSORS_ONLN) && \
     ninja clean && cp -R /sfcgal/* /
 
-ENV PATH="/usr/local/pgsql/bin:$PATH"
-
 # Postgis 3.5.0 supports v17
 RUN case "${PG_VERSION}" in \
     "v17") \
@@ -170,7 +170,6 @@ RUN case "${PG_VERSION}" in \
     wget https://download.osgeo.org/postgis/source/postgis-${POSTGIS_VERSION}.tar.gz -O postgis.tar.gz && \
     echo "${POSTGIS_CHECKSUM} postgis.tar.gz" | sha256sum --check && \
     mkdir postgis-src && cd postgis-src && tar xzf ../postgis.tar.gz --strip-components=1 -C . && \
-    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
     ./autogen.sh && \
     ./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \
     make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -220,11 +219,7 @@ RUN case "${PG_VERSION}" in \
     cmake -GNinja -DCMAKE_BUILD_TYPE=Release .. && \
     ninja -j $(getconf _NPROCESSORS_ONLN) && \
     ninja -j $(getconf _NPROCESSORS_ONLN) install && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control && \
-    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
-    cp /usr/local/pgsql/share/extension/pgrouting.control /extensions/postgis && \
-    sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
-    comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/postgis.tar.zst -T -
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control
 
 #########################################################################################
 #
@@ -232,9 +227,8 @@ RUN case "${PG_VERSION}" in \
 # Build plv8
 #
 #########################################################################################
-FROM build-deps AS plv8-build
+FROM pg-build AS plv8-build
 ARG PG_VERSION
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 COPY compute/patches/plv8-3.1.10.patch /plv8-3.1.10.patch
 
@@ -269,7 +263,6 @@ RUN case "${PG_VERSION}" in \
     # generate and copy upgrade scripts
     mkdir -p upgrade && ./generate_upgrade.sh ${PLV8_TAG#v} && \
     cp upgrade/* /usr/local/pgsql/share/extension/ && \
-    export PATH="/usr/local/pgsql/bin:$PATH" && \
     make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
     rm -rf /plv8-* && \
     find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \
@@ -296,9 +289,8 @@ RUN case "${PG_VERSION}" in \
 # Build h3_pg
 #
 #########################################################################################
-FROM build-deps AS h3-pg-build
+FROM pg-build AS h3-pg-build
 ARG PG_VERSION
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 # not version-specific
 # last release v4.1.0 - Jan 18, 2023
@@ -319,7 +311,6 @@ RUN mkdir -p /h3/usr/ && \
 RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \
     echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \
     mkdir h3-pg-src && cd h3-pg-src && tar xzf ../h3-pg.tar.gz --strip-components=1 -C . && \
-    export PATH="/usr/local/pgsql/bin:$PATH" && \
     make -j $(getconf _NPROCESSORS_ONLN) && \
     make -j $(getconf _NPROCESSORS_ONLN) install && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \
@@ -331,17 +322,16 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3
 # compile unit extension
 #
 #########################################################################################
-FROM build-deps AS unit-pg-build
+FROM pg-build AS unit-pg-build
 ARG PG_VERSION
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 # not version-specific
 # last release 7.9 - Sep 15, 2024
 RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.9.tar.gz -O postgresql-unit.tar.gz && \
     echo "e46de6245dcc8b2c2ecf29873dbd43b2b346773f31dd5ce4b8315895a052b456 postgresql-unit.tar.gz" | sha256sum --check && \
     mkdir postgresql-unit-src && cd postgresql-unit-src && tar xzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \
-    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
-    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    make -j $(getconf _NPROCESSORS_ONLN) && \
+    make -j $(getconf _NPROCESSORS_ONLN) install && \
     # unit extension's "create extension" script relies on absolute install path to fill some reference tables.
     # We move the extension from '/usr/local/pgsql/' to '/usr/local/'  after it is build. So we need to adjust the path.
     # This one-liner removes pgsql/ part of the path.
@@ -355,9 +345,8 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.9.tar.gz -
 # compile pgvector extension
 #
 #########################################################################################
-FROM build-deps AS vector-pg-build
+FROM pg-build AS vector-pg-build
 ARG PG_VERSION
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 COPY compute/patches/pgvector.patch /pgvector.patch
 
@@ -371,8 +360,8 @@ RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.8.0.tar.gz -O
     echo "867a2c328d4928a5a9d6f052cd3bc78c7d60228a9b914ad32aa3db88e9de27b0 pgvector.tar.gz" | sha256sum --check && \
     mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \
     patch -p1 < /pgvector.patch && \
-    make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
-    make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" && \
+    make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control
 
 #########################################################################################
@@ -381,16 +370,15 @@ RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.8.0.tar.gz -O
 # compile pgjwt extension
 #
 #########################################################################################
-FROM build-deps AS pgjwt-pg-build
+FROM pg-build AS pgjwt-pg-build
 ARG PG_VERSION
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 # not version-specific
 # doesn't use releases, last commit f3d82fd - Mar 2, 2023
 RUN wget https://github.com/michelp/pgjwt/archive/f3d82fd30151e754e19ce5d6a06c71c20689ce3d.tar.gz -O pgjwt.tar.gz && \
     echo "dae8ed99eebb7593b43013f6532d772b12dfecd55548d2673f2dfd0163f6d2b9 pgjwt.tar.gz" | sha256sum --check && \
     mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . && \
-    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    make -j $(getconf _NPROCESSORS_ONLN) install && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgjwt.control
 
 #########################################################################################
@@ -399,17 +387,16 @@ RUN wget https://github.com/michelp/pgjwt/archive/f3d82fd30151e754e19ce5d6a06c71
 # compile hypopg extension
 #
 #########################################################################################
-FROM build-deps AS hypopg-pg-build
+FROM pg-build AS hypopg-pg-build
 ARG PG_VERSION
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 # HypoPG 1.4.1 supports v17
 # last release 1.4.1 - Apr 28, 2024
 RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.1.tar.gz -O hypopg.tar.gz && \
     echo "9afe6357fd389d8d33fad81703038ce520b09275ec00153c6c89282bcdedd6bc hypopg.tar.gz" | sha256sum --check && \
     mkdir hypopg-src && cd hypopg-src && tar xzf ../hypopg.tar.gz --strip-components=1 -C . && \
-    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
-    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    make -j $(getconf _NPROCESSORS_ONLN) && \
+    make -j $(getconf _NPROCESSORS_ONLN) install && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/hypopg.control
 
 #########################################################################################
@@ -418,17 +405,16 @@ RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.1.tar.gz -O hypo
 # compile pg_hashids extension
 #
 #########################################################################################
-FROM build-deps AS pg-hashids-pg-build
+FROM pg-build AS pg-hashids-pg-build
 ARG PG_VERSION
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 # not version-specific
 # last release v1.2.1 -Jan 12, 2018
 RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \
     echo "74576b992d9277c92196dd8d816baa2cc2d8046fe102f3dcd7f3c3febed6822a pg_hashids.tar.gz" | sha256sum --check && \
     mkdir pg_hashids-src && cd pg_hashids-src && tar xzf ../pg_hashids.tar.gz --strip-components=1 -C . && \
-    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
-    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
+    make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \
+    make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_hashids.control
 
 #########################################################################################
@@ -437,9 +423,8 @@ RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz
 # compile rum extension
 #
 #########################################################################################
-FROM build-deps AS rum-pg-build
+FROM pg-build AS rum-pg-build
 ARG PG_VERSION
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 COPY compute/patches/rum.patch /rum.patch
 
@@ -450,8 +435,8 @@ RUN wget https://github.com/postgrespro/rum/archive/cb1edffc57736cd2a4455f8d0fea
     echo "65e0a752e99f4c3226400c9b899f997049e93503db8bf5c8072efa136d32fd83 rum.tar.gz" | sha256sum --check && \
     mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . && \
     patch -p1 < /rum.patch && \
-    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
-    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
+    make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \
+    make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control
 
 #########################################################################################
@@ -460,17 +445,16 @@ RUN wget https://github.com/postgrespro/rum/archive/cb1edffc57736cd2a4455f8d0fea
 # compile pgTAP extension
 #
 #########################################################################################
-FROM build-deps AS pgtap-pg-build
+FROM pg-build AS pgtap-pg-build
 ARG PG_VERSION
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 # pgtap 1.3.3 supports v17
 # last release v1.3.3 - Apr 8, 2024
 RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.3.3.tar.gz -O pgtap.tar.gz && \
     echo "325ea79d0d2515bce96bce43f6823dcd3effbd6c54cb2a4d6c2384fffa3a14c7 pgtap.tar.gz" | sha256sum --check && \
     mkdir pgtap-src && cd pgtap-src && tar xzf ../pgtap.tar.gz --strip-components=1 -C . && \
-    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
-    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    make -j $(getconf _NPROCESSORS_ONLN) && \
+    make -j $(getconf _NPROCESSORS_ONLN) install && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgtap.control
 
 #########################################################################################
@@ -479,17 +463,16 @@ RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.3.3.tar.gz -O pgta
 # compile ip4r extension
 #
 #########################################################################################
-FROM build-deps AS ip4r-pg-build
+FROM pg-build AS ip4r-pg-build
 ARG PG_VERSION
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 # not version-specific
 # last release v2.4.2 - Jul 29, 2023
 RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \
     echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \
     mkdir ip4r-src && cd ip4r-src && tar xzf ../ip4r.tar.gz --strip-components=1 -C . && \
-    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
-    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    make -j $(getconf _NPROCESSORS_ONLN) && \
+    make -j $(getconf _NPROCESSORS_ONLN) install && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/ip4r.control
 
 #########################################################################################
@@ -498,17 +481,16 @@ RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O i
 # compile Prefix extension
 #
 #########################################################################################
-FROM build-deps AS prefix-pg-build
+FROM pg-build AS prefix-pg-build
 ARG PG_VERSION
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 # not version-specific
 # last release v1.2.10  - Jul 5, 2023
 RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \
     echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \
     mkdir prefix-src && cd prefix-src && tar xzf ../prefix.tar.gz --strip-components=1 -C . && \
-    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
-    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    make -j $(getconf _NPROCESSORS_ONLN) && \
+    make -j $(getconf _NPROCESSORS_ONLN) install && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/prefix.control
 
 #########################################################################################
@@ -517,17 +499,16 @@ RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O p
 # compile hll extension
 #
 #########################################################################################
-FROM build-deps AS hll-pg-build
+FROM pg-build AS hll-pg-build
 ARG PG_VERSION
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 # not version-specific
 # last release v2.18 - Aug 29, 2023
 RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \
     echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \
     mkdir hll-src && cd hll-src && tar xzf ../hll.tar.gz --strip-components=1 -C . && \
-    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
-    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    make -j $(getconf _NPROCESSORS_ONLN) && \
+    make -j $(getconf _NPROCESSORS_ONLN) install && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/hll.control
 
 #########################################################################################
@@ -536,17 +517,16 @@ RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar
 # compile plpgsql_check extension
 #
 #########################################################################################
-FROM build-deps AS plpgsql-check-pg-build
+FROM pg-build AS plpgsql-check-pg-build
 ARG PG_VERSION
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 # plpgsql_check v2.7.11 supports v17
 # last release v2.7.11 - Sep 16, 2024
 RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.7.11.tar.gz -O plpgsql_check.tar.gz && \
     echo "208933f8dbe8e0d2628eb3851e9f52e6892b8e280c63700c0f1ce7883625d172 plpgsql_check.tar.gz" | sha256sum --check && \
     mkdir plpgsql_check-src && cd plpgsql_check-src && tar xzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \
-    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
-    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
+    make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \
+    make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/plpgsql_check.control
 
 #########################################################################################
@@ -555,11 +535,8 @@ RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.7.11.tar.gz
 # compile timescaledb extension
 #
 #########################################################################################
-FROM build-deps AS timescaledb-pg-build
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
-
+FROM pg-build AS timescaledb-pg-build
 ARG PG_VERSION
-ENV PATH="/usr/local/pgsql/bin:$PATH"
 
 RUN case "${PG_VERSION}" in \
       "v14" | "v15") \
@@ -590,11 +567,8 @@ RUN case "${PG_VERSION}" in \
 # compile pg_hint_plan extension
 #
 #########################################################################################
-FROM build-deps AS pg-hint-plan-pg-build
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
-
+FROM pg-build AS pg-hint-plan-pg-build
 ARG PG_VERSION
-ENV PATH="/usr/local/pgsql/bin:$PATH"
 
 # version-specific, has separate releases for each version
 RUN case "${PG_VERSION}" in \
@@ -632,14 +606,12 @@ RUN case "${PG_VERSION}" in \
 # compile pg_cron extension
 #
 #########################################################################################
-FROM build-deps AS pg-cron-pg-build
+FROM pg-build AS pg-cron-pg-build
 ARG PG_VERSION
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 # This is an experimental extension that we do not support on prod yet.
 # !Do not remove!
 # We set it in shared_preload_libraries and computes will fail to start if library is not found.
-ENV PATH="/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.4.tar.gz -O pg_cron.tar.gz && \
     echo "52d1850ee7beb85a4cb7185731ef4e5a90d1de216709d8988324b0d02e76af61 pg_cron.tar.gz" | sha256sum --check && \
     mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . && \
@@ -653,9 +625,8 @@ RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.4.tar.gz -O
 # compile rdkit extension
 #
 #########################################################################################
-FROM build-deps AS rdkit-pg-build
+FROM pg-build AS rdkit-pg-build
 ARG PG_VERSION
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 RUN apt update && \
     apt install --no-install-recommends --no-install-suggests -y \
@@ -673,7 +644,13 @@ RUN apt update && \
 # Use new version only for v17
 # because Release_2024_09_1 has some backward incompatible changes
 # https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1
-ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
+
+# XXX: /usr/local/pgsql/bin is already in PATH, and that should be enough to find
+# pg_config. For some reason the rdkit cmake script doesn't work with just that,
+# however. By also adding /usr/local/pgsql, it works, which is weird because there
+# are no executables in that directory.
+ENV PATH="/usr/local/pgsql:$PATH"
+
 RUN case "${PG_VERSION}" in \
     "v17") \
         export RDKIT_VERSION=Release_2024_09_1 \
@@ -726,13 +703,11 @@ RUN case "${PG_VERSION}" in \
 # compile pg_uuidv7 extension
 #
 #########################################################################################
-FROM build-deps AS pg-uuidv7-pg-build
+FROM pg-build AS pg-uuidv7-pg-build
 ARG PG_VERSION
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 # not version-specific
 # last release v1.6.0 - Oct 9, 2024
-ENV PATH="/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.6.0.tar.gz -O pg_uuidv7.tar.gz && \
     echo "0fa6c710929d003f6ce276a7de7a864e9d1667b2d78be3dc2c07f2409eb55867 pg_uuidv7.tar.gz" | sha256sum --check && \
     mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \
@@ -746,13 +721,11 @@ RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.6.0.tar.gz
 # compile pg_roaringbitmap extension
 #
 #########################################################################################
-FROM build-deps AS pg-roaringbitmap-pg-build
+FROM pg-build AS pg-roaringbitmap-pg-build
 ARG PG_VERSION
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 # not version-specific
 # last release v0.5.4 - Jun 28, 2022
-ENV PATH="/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \
     echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \
     mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \
@@ -766,16 +739,14 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4
 # compile pg_semver extension
 #
 #########################################################################################
-FROM build-deps AS pg-semver-pg-build
+FROM pg-build AS pg-semver-pg-build
 ARG PG_VERSION
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 # Release 0.40.0 breaks backward compatibility with previous versions
 # see release note https://github.com/theory/pg-semver/releases/tag/v0.40.0
 # Use new version only for v17
 #
 # last release v0.40.0 - Jul 22, 2024
-ENV PATH="/usr/local/pgsql/bin/:$PATH"
 RUN case "${PG_VERSION}" in \
     "v17") \
         export SEMVER_VERSION=0.40.0 \
@@ -802,13 +773,11 @@ RUN case "${PG_VERSION}" in \
 # compile pg_embedding extension
 #
 #########################################################################################
-FROM build-deps AS pg-embedding-pg-build
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+FROM pg-build AS pg-embedding-pg-build
 
 # This is our extension, support stopped in favor of pgvector
 # TODO: deprecate it
 ARG PG_VERSION
-ENV PATH="/usr/local/pgsql/bin/:$PATH"
 RUN case "${PG_VERSION}" in \
       "v14" | "v15") \
         export PG_EMBEDDING_VERSION=0.3.5 \
@@ -829,26 +798,19 @@ RUN case "${PG_VERSION}" in \
 # compile anon extension
 #
 #########################################################################################
-FROM build-deps AS pg-anon-pg-build
+FROM pg-build AS pg-anon-pg-build
 ARG PG_VERSION
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 # This is an experimental extension, never got to real production.
 # !Do not remove! It can be present in shared_preload_libraries and compute will fail to start if library is not found.
-ENV PATH="/usr/local/pgsql/bin/:$PATH"
 RUN case "${PG_VERSION}" in "v17") \
     echo "postgresql_anonymizer does not yet support PG17" && exit 0;; \
     esac && \
     wget  https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \
     echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9  pg_anon.tar.gz" | sha256sum --check && \
     mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \
-    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
-    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \
-    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
-    mkdir -p /extensions/anon && cp /usr/local/pgsql/share/extension/anon.control /extensions/anon && \
-    sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
-    comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/anon.tar.zst -T -
+    make -j $(getconf _NPROCESSORS_ONLN) install && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control
 
 #########################################################################################
 #
@@ -856,9 +818,8 @@ RUN case "${PG_VERSION}" in "v17") \
 # This layer is used to build `pgrx` deps
 #
 #########################################################################################
-FROM build-deps AS rust-extensions-build
+FROM pg-build AS rust-extensions-build
 ARG PG_VERSION
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 RUN apt update && \
     apt install --no-install-recommends --no-install-suggests -y curl libclang-dev && \
@@ -866,7 +827,7 @@ RUN apt update && \
     useradd -ms /bin/bash nonroot -b /home
 
 ENV HOME=/home/nonroot
-ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH"
+ENV PATH="/home/nonroot/.cargo/bin:$PATH"
 USER nonroot
 WORKDIR /home/nonroot
 
@@ -893,9 +854,8 @@ USER root
 # and eventually get merged with `rust-extensions-build`
 #
 #########################################################################################
-FROM build-deps AS rust-extensions-build-pgrx12
+FROM pg-build AS rust-extensions-build-pgrx12
 ARG PG_VERSION
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 RUN apt update && \
     apt install --no-install-recommends --no-install-suggests -y curl libclang-dev && \
@@ -903,7 +863,7 @@ RUN apt update && \
     useradd -ms /bin/bash nonroot -b /home
 
 ENV HOME=/home/nonroot
-ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH"
+ENV PATH="/home/nonroot/.cargo/bin:$PATH"
 USER nonroot
 WORKDIR /home/nonroot
 
@@ -976,22 +936,9 @@ RUN apt update && apt install --no-install-recommends --no-install-suggests -y p
 
 FROM rust-extensions-build-pgrx12 AS pg-jsonschema-pg-build
 ARG PG_VERSION
-# version 0.3.3 supports v17
 # last release v0.3.3 - Oct 16, 2024
-#
-# there were no breaking changes
-# so we can use the same version for all postgres versions
-RUN case "${PG_VERSION}" in \
-    "v14" | "v15" | "v16" | "v17") \
-        export PG_JSONSCHEMA_VERSION=0.3.3 \
-        export PG_JSONSCHEMA_CHECKSUM=40c2cffab4187e0233cb8c3bde013be92218c282f95f4469c5282f6b30d64eac \
-    ;; \
-    *) \
-        echo "unexpected PostgreSQL version" && exit 1 \
-    ;; \
-    esac && \
-    wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v${PG_JSONSCHEMA_VERSION}.tar.gz -O pg_jsonschema.tar.gz && \
-    echo "${PG_JSONSCHEMA_CHECKSUM} pg_jsonschema.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.3.tar.gz -O pg_jsonschema.tar.gz && \
+    echo "40c2cffab4187e0233cb8c3bde013be92218c282f95f4469c5282f6b30d64eac pg_jsonschema.tar.gz" | sha256sum --check && \
     mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \
     # see commit 252b3685a27a0f4c31a0f91e983c6314838e89e8
     # `unsafe-postgres` feature allows to build pgx extensions
@@ -1012,22 +959,9 @@ RUN case "${PG_VERSION}" in \
 FROM rust-extensions-build-pgrx12 AS pg-graphql-pg-build
 ARG PG_VERSION
 
-# version 1.5.9 supports v17
 # last release v1.5.9 - Oct 16, 2024
-#
-# there were no breaking changes
-# so we can use the same version for all postgres versions
-RUN case "${PG_VERSION}" in \
-    "v14" | "v15" | "v16" | "v17") \
-        export PG_GRAPHQL_VERSION=1.5.9 \
-        export PG_GRAPHQL_CHECKSUM=cf768385a41278be1333472204fc0328118644ae443182cf52f7b9b23277e497 \
-    ;; \
-    *) \
-        echo "unexpected PostgreSQL version" && exit 1 \
-    ;; \
-    esac && \
-    wget https://github.com/supabase/pg_graphql/archive/refs/tags/v${PG_GRAPHQL_VERSION}.tar.gz -O pg_graphql.tar.gz && \
-    echo "${PG_GRAPHQL_CHECKSUM} pg_graphql.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.9.tar.gz -O pg_graphql.tar.gz && \
+    echo "cf768385a41278be1333472204fc0328118644ae443182cf52f7b9b23277e497 pg_graphql.tar.gz" | sha256sum --check && \
     mkdir pg_graphql-src && cd pg_graphql-src && tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . && \
     sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
     cargo pgrx install --release && \
@@ -1091,8 +1025,8 @@ ARG PG_VERSION
 # NOTE: local_proxy depends on the version of pg_session_jwt
 # Do not update without approve from proxy team
 # Make sure the version is reflected in proxy/src/serverless/local_conn_pool.rs
-RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.1.2-v17.tar.gz -O pg_session_jwt.tar.gz && \
-    echo "c8ecbed9cb8c6441bce5134a176002b043018adf9d05a08e457dda233090a86e pg_session_jwt.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.2.0.tar.gz -O pg_session_jwt.tar.gz && \
+    echo "5ace028e591f2e000ca10afa5b1ca62203ebff014c2907c0ec3b29c36f28a1bb pg_session_jwt.tar.gz" | sha256sum --check && \
     mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \
     sed -i 's/pgrx = "0.12.6"/pgrx = { version = "=0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
     cargo pgrx install --release
@@ -1104,13 +1038,11 @@ RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.1.2
 #
 #########################################################################################
 
-FROM build-deps AS wal2json-pg-build
+FROM pg-build AS wal2json-pg-build
 ARG PG_VERSION
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 # wal2json wal2json_2_6 supports v17
 # last release wal2json_2_6 - Apr 25, 2024
-ENV PATH="/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_6.tar.gz -O wal2json.tar.gz && \
     echo "18b4bdec28c74a8fc98a11c72de38378a760327ef8e5e42e975b0029eb96ba0d wal2json.tar.gz" | sha256sum --check && \
     mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json.tar.gz --strip-components=1 -C . && \
@@ -1123,13 +1055,11 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_6.tar.
 # compile pg_ivm extension
 #
 #########################################################################################
-FROM build-deps AS pg-ivm-build
+FROM pg-build AS pg-ivm-build
 ARG PG_VERSION
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 # pg_ivm v1.9 supports v17
 # last release v1.9 - Jul 31
-ENV PATH="/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.9.tar.gz -O pg_ivm.tar.gz && \
     echo "59e15722939f274650abf637f315dd723c87073496ca77236b044cb205270d8b pg_ivm.tar.gz" | sha256sum --check && \
     mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . && \
@@ -1143,13 +1073,11 @@ RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.9.tar.gz -O pg_iv
 # compile pg_partman extension
 #
 #########################################################################################
-FROM build-deps AS pg-partman-build
+FROM pg-build AS pg-partman-build
 ARG PG_VERSION
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
 # should support v17 https://github.com/pgpartman/pg_partman/discussions/693
 # last release 5.1.0  Apr 2, 2024
-ENV PATH="/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz -O pg_partman.tar.gz && \
     echo "3e3a27d7ff827295d5c55ef72f07a49062d6204b3cb0b9a048645d6db9f3cb9f pg_partman.tar.gz" | sha256sum --check && \
     mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . && \
@@ -1165,9 +1093,6 @@ RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz
 #########################################################################################
 FROM rust-extensions-build AS pg-mooncake-build
 ARG PG_VERSION
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
-
-ENV PATH="/usr/local/pgsql/bin/:$PATH"
 
 RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.0/pg_mooncake-0.1.0.tar.gz -O pg_mooncake.tar.gz && \
     echo "eafd059b77f541f11525eb8affcd66a176968cbd8fe7c0d436e733f2aa4da59f pg_mooncake.tar.gz" | sha256sum --check && \
@@ -1183,11 +1108,8 @@ RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.0/p
 #
 #########################################################################################
 
-FROM build-deps AS pg-repack-build
+FROM pg-build AS pg-repack-build
 ARG PG_VERSION
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
-
-ENV PATH="/usr/local/pgsql/bin/:$PATH"
 
 RUN wget https://github.com/reorg/pg_repack/archive/refs/tags/ver_1.5.2.tar.gz -O pg_repack.tar.gz && \
     echo '4516cad42251ed3ad53ff619733004db47d5755acac83f75924cd94d1c4fb681 pg_repack.tar.gz' | sha256sum --check && \
@@ -1258,20 +1180,6 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
     make -j $(getconf _NPROCESSORS_ONLN) \
         PG_CONFIG=/usr/local/pgsql/bin/pg_config \
         -C pgxn/neon_rmgr \
-        -s install && \
-    case "${PG_VERSION}" in \
-        "v14" | "v15") \
-        ;; \
-        "v16" | "v17") \
-            echo "Skipping HNSW for PostgreSQL ${PG_VERSION}" && exit 0 \
-        ;; \
-        *) \
-            echo "unexpected PostgreSQL version" && exit 1 \
-        ;; \
-        esac && \
-    make -j $(getconf _NPROCESSORS_ONLN) \
-        PG_CONFIG=/usr/local/pgsql/bin/pg_config \
-        -C pgxn/hnsw \
         -s install
 
 #########################################################################################
@@ -1288,17 +1196,6 @@ USER nonroot
 COPY --chown=nonroot . .
 RUN mold -run cargo build --locked --profile release-line-debug-size-lto --bin compute_ctl --bin fast_import --bin local_proxy
 
-#########################################################################################
-#
-# Final compute-tools image
-#
-#########################################################################################
-
-FROM debian:$DEBIAN_FLAVOR AS compute-tools-image
-
-COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
-COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/fast_import /usr/local/bin/fast_import
-
 #########################################################################################
 #
 # Layer "pgbouncer"
@@ -1335,11 +1232,11 @@ RUN set -e \
 #
 #########################################################################################
 
-FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.1 AS postgres-exporter
+FROM quay.io/prometheuscommunity/postgres-exporter:v0.16.0 AS postgres-exporter
 
 # Keep the version the same as in build-tools.Dockerfile and
 # test_runner/regress/test_compute_metrics.py.
-FROM burningalchemist/sql_exporter:0.16.0 AS sql-exporter
+FROM burningalchemist/sql_exporter:0.17.0 AS sql-exporter
 
 #########################################################################################
 #
diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs
index 793ec4cf10..f554362751 100644
--- a/compute_tools/src/bin/fast_import.rs
+++ b/compute_tools/src/bin/fast_import.rs
@@ -17,7 +17,7 @@
 //!
 //! # Local Testing
 //!
-//! - Comment out most of the pgxns in The Dockerfile.compute-tools to speed up the build.
+//! - Comment out most of the pgxns in compute-node.Dockerfile to speed up the build.
 //! - Build the image with the following command:
 //!
 //! ```bash
diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs
index 5b82acb3a5..2fe4cd5202 100644
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -483,7 +483,6 @@ impl LocalEnv {
             .iter()
             .find(|(mapped_tenant_id, _)| mapped_tenant_id == &tenant_id)
             .map(|&(_, timeline_id)| timeline_id)
-            .map(TimelineId::from)
     }
 
     pub fn timeline_name_mappings(&self) -> HashMap<TenantTimelineId, String> {
diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs
index 22d2420ed4..c41ff22d15 100644
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -822,10 +822,7 @@ impl StorageController {
         self.dispatch(
             Method::PUT,
             format!("control/v1/tenant/{tenant_shard_id}/migrate"),
-            Some(TenantShardMigrateRequest {
-                tenant_shard_id,
-                node_id,
-            }),
+            Some(TenantShardMigrateRequest { node_id }),
         )
         .await
     }
diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs
index 617b2cd1ba..9d133e4af1 100644
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -1,12 +1,16 @@
 use futures::StreamExt;
-use std::{str::FromStr, time::Duration};
+use std::{
+    collections::{HashMap, HashSet},
+    str::FromStr,
+    time::Duration,
+};
 
 use clap::{Parser, Subcommand};
 use pageserver_api::{
     controller_api::{
         AvailabilityZone, NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse,
-        SafekeeperDescribeResponse, ShardSchedulingPolicy, TenantCreateRequest,
-        TenantDescribeResponse, TenantPolicyRequest,
+        SafekeeperDescribeResponse, ShardSchedulingPolicy, ShardsPreferredAzsRequest,
+        TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest,
     },
     models::{
         EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
@@ -112,6 +116,13 @@ enum Command {
         #[arg(long)]
         node: NodeId,
     },
+    /// Migrate the secondary location for a tenant shard to a specific pageserver.
+    TenantShardMigrateSecondary {
+        #[arg(long)]
+        tenant_shard_id: TenantShardId,
+        #[arg(long)]
+        node: NodeId,
+    },
     /// Cancel any ongoing reconciliation for this shard
     TenantShardCancelReconcile {
         #[arg(long)]
@@ -146,6 +157,12 @@ enum Command {
         #[arg(long)]
         tenant_id: TenantId,
     },
+    TenantSetPreferredAz {
+        #[arg(long)]
+        tenant_id: TenantId,
+        #[arg(long)]
+        preferred_az: Option<String>,
+    },
     /// Uncleanly drop a tenant from the storage controller: this doesn't delete anything from pageservers. Appropriate
     /// if you e.g. used `tenant-warmup` by mistake on a tenant ID that doesn't really exist, or is in some other region.
     TenantDrop {
@@ -395,11 +412,12 @@ async fn main() -> anyhow::Result<()> {
             resp.sort_by(|a, b| a.listen_http_addr.cmp(&b.listen_http_addr));
 
             let mut table = comfy_table::Table::new();
-            table.set_header(["Id", "Hostname", "Scheduling", "Availability"]);
+            table.set_header(["Id", "Hostname", "AZ", "Scheduling", "Availability"]);
             for node in resp {
                 table.add_row([
                     format!("{}", node.id),
                     node.listen_http_addr,
+                    node.availability_zone_id,
                     format!("{:?}", node.scheduling),
                     format!("{:?}", node.availability),
                 ]);
@@ -472,6 +490,7 @@ async fn main() -> anyhow::Result<()> {
             let mut table = comfy_table::Table::new();
             table.set_header([
                 "TenantId",
+                "Preferred AZ",
                 "ShardCount",
                 "StripeSize",
                 "Placement",
@@ -481,6 +500,11 @@ async fn main() -> anyhow::Result<()> {
                 let shard_zero = tenant.shards.into_iter().next().unwrap();
                 table.add_row([
                     format!("{}", tenant.tenant_id),
+                    shard_zero
+                        .preferred_az_id
+                        .as_ref()
+                        .cloned()
+                        .unwrap_or("".to_string()),
                     format!("{}", shard_zero.tenant_shard_id.shard_count.literal()),
                     format!("{:?}", tenant.stripe_size),
                     format!("{:?}", tenant.policy),
@@ -540,10 +564,7 @@ async fn main() -> anyhow::Result<()> {
             tenant_shard_id,
             node,
         } => {
-            let req = TenantShardMigrateRequest {
-                tenant_shard_id,
-                node_id: node,
-            };
+            let req = TenantShardMigrateRequest { node_id: node };
 
             storcon_client
                 .dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
@@ -553,6 +574,20 @@ async fn main() -> anyhow::Result<()> {
                 )
                 .await?;
         }
+        Command::TenantShardMigrateSecondary {
+            tenant_shard_id,
+            node,
+        } => {
+            let req = TenantShardMigrateRequest { node_id: node };
+
+            storcon_client
+                .dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
+                    Method::PUT,
+                    format!("control/v1/tenant/{tenant_shard_id}/migrate_secondary"),
+                    Some(req),
+                )
+                .await?;
+        }
         Command::TenantShardCancelReconcile { tenant_shard_id } => {
             storcon_client
                 .dispatch::<(), ()>(
@@ -596,6 +631,19 @@ async fn main() -> anyhow::Result<()> {
                     None,
                 )
                 .await?;
+
+            let nodes = storcon_client
+                .dispatch::<(), Vec<NodeDescribeResponse>>(
+                    Method::GET,
+                    "control/v1/node".to_string(),
+                    None,
+                )
+                .await?;
+            let nodes = nodes
+                .into_iter()
+                .map(|n| (n.id, n))
+                .collect::<HashMap<_, _>>();
+
             println!("Tenant {tenant_id}");
             let mut table = comfy_table::Table::new();
             table.add_row(["Policy", &format!("{:?}", policy)]);
@@ -604,7 +652,14 @@ async fn main() -> anyhow::Result<()> {
             println!("{table}");
             println!("Shards:");
             let mut table = comfy_table::Table::new();
-            table.set_header(["Shard", "Attached", "Secondary", "Last error", "status"]);
+            table.set_header([
+                "Shard",
+                "Attached",
+                "Attached AZ",
+                "Secondary",
+                "Last error",
+                "status",
+            ]);
             for shard in shards {
                 let secondary = shard
                     .node_secondary
@@ -627,11 +682,18 @@ async fn main() -> anyhow::Result<()> {
                 }
                 let status = status_parts.join(",");
 
+                let attached_node = shard
+                    .node_attached
+                    .as_ref()
+                    .map(|id| nodes.get(id).expect("Shard references nonexistent node"));
+
                 table.add_row([
                     format!("{}", shard.tenant_shard_id),
-                    shard
-                        .node_attached
-                        .map(|n| format!("{}", n))
+                    attached_node
+                        .map(|n| format!("{} ({})", n.listen_http_addr, n.id))
+                        .unwrap_or(String::new()),
+                    attached_node
+                        .map(|n| n.availability_zone_id.clone())
                         .unwrap_or(String::new()),
                     secondary,
                     shard.last_error,
@@ -640,6 +702,66 @@ async fn main() -> anyhow::Result<()> {
             }
             println!("{table}");
         }
+        Command::TenantSetPreferredAz {
+            tenant_id,
+            preferred_az,
+        } => {
+            // First learn about the tenant's shards
+            let describe_response = storcon_client
+                .dispatch::<(), TenantDescribeResponse>(
+                    Method::GET,
+                    format!("control/v1/tenant/{tenant_id}"),
+                    None,
+                )
+                .await?;
+
+            // Learn about nodes to validate the AZ ID
+            let nodes = storcon_client
+                .dispatch::<(), Vec<NodeDescribeResponse>>(
+                    Method::GET,
+                    "control/v1/node".to_string(),
+                    None,
+                )
+                .await?;
+
+            if let Some(preferred_az) = &preferred_az {
+                let azs = nodes
+                    .into_iter()
+                    .map(|n| (n.availability_zone_id))
+                    .collect::<HashSet<_>>();
+                if !azs.contains(preferred_az) {
+                    anyhow::bail!(
+                        "AZ {} not found on any node: known AZs are: {:?}",
+                        preferred_az,
+                        azs
+                    );
+                }
+            } else {
+                // Make it obvious to the user that since they've omitted an AZ, we're clearing it
+                eprintln!("Clearing preferred AZ for tenant {}", tenant_id);
+            }
+
+            // Construct a request that modifies all the tenant's shards
+            let req = ShardsPreferredAzsRequest {
+                preferred_az_ids: describe_response
+                    .shards
+                    .into_iter()
+                    .map(|s| {
+                        (
+                            s.tenant_shard_id,
+                            preferred_az.clone().map(AvailabilityZone),
+                        )
+                    })
+                    .collect(),
+            };
+            storcon_client
+                .dispatch::<ShardsPreferredAzsRequest, ()>(
+                    Method::PUT,
+                    "control/v1/preferred_azs".to_string(),
+                    Some(req),
+                )
+                .await?;
+        }
         Command::TenantWarmup { tenant_id } => {
             let describe_response = storcon_client
                 .dispatch::<(), TenantDescribeResponse>(
@@ -915,10 +1037,7 @@ async fn main() -> anyhow::Result<()> {
                             .dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
                                 Method::PUT,
                                 format!("control/v1/tenant/{}/migrate", mv.tenant_shard_id),
-                                Some(TenantShardMigrateRequest {
-                                    tenant_shard_id: mv.tenant_shard_id,
-                                    node_id: mv.to,
-                                }),
+                                Some(TenantShardMigrateRequest { node_id: mv.to }),
                             )
                             .await
                             .map_err(|e| (mv.tenant_shard_id, mv.from, mv.to, e))
diff --git a/docs/docker.md b/docs/docker.md
index 0914a00082..ae74c2b2ab 100644
--- a/docs/docker.md
+++ b/docs/docker.md
@@ -7,15 +7,11 @@ Currently we build two main images:
 - [neondatabase/neon](https://hub.docker.com/repository/docker/neondatabase/neon) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
 - [neondatabase/compute-node-v16](https://hub.docker.com/repository/docker/neondatabase/compute-node-v16) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres). Similar images exist for v15 and v14. Built from [/compute-node/Dockerfile](/compute/compute-node.Dockerfile).
 
-And additional intermediate image:
-
-- [neondatabase/compute-tools](https://hub.docker.com/repository/docker/neondatabase/compute-tools) — compute node configuration management tools.
-
 ## Build pipeline
 
 We build all images after a successful `release` tests run and push automatically to Docker Hub with two parallel CI jobs
 
-1. `neondatabase/compute-tools` and `neondatabase/compute-node-v16` (and -v15 and -v14)
+1. `neondatabase/compute-node-v17` (and -v16, -v15, -v14)
 
 2. `neondatabase/neon`
 
diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs
index 7eb3547183..f3880cb766 100644
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -87,7 +87,7 @@ impl Display for AvailabilityZone {
 #[derive(Serialize, Deserialize)]
 pub struct ShardsPreferredAzsRequest {
     #[serde(flatten)]
-    pub preferred_az_ids: HashMap<TenantShardId, AvailabilityZone>,
+    pub preferred_az_ids: HashMap<TenantShardId, Option<AvailabilityZone>>,
 }
 
 #[derive(Serialize, Deserialize)]
@@ -144,6 +144,8 @@ pub struct NodeDescribeResponse {
     pub availability: NodeAvailabilityWrapper,
     pub scheduling: NodeSchedulingPolicy,
 
+    pub availability_zone_id: String,
+
     pub listen_http_addr: String,
     pub listen_http_port: u16,
 
@@ -179,7 +181,6 @@ pub struct TenantDescribeResponseShard {
 /// specifies some constraints, e.g. asking it to get off particular node(s)
 #[derive(Serialize, Deserialize, Debug)]
 pub struct TenantShardMigrateRequest {
-    pub tenant_shard_id: TenantShardId,
     pub node_id: NodeId,
 }
 
@@ -368,6 +369,16 @@ pub enum PlacementPolicy {
     Detached,
 }
 
+impl PlacementPolicy {
+    pub fn want_secondaries(&self) -> usize {
+        match self {
+            PlacementPolicy::Attached(secondary_count) => *secondary_count,
+            PlacementPolicy::Secondary => 1,
+            PlacementPolicy::Detached => 0,
+        }
+    }
+}
+
 #[derive(Serialize, Deserialize, Debug)]
 pub struct TenantShardMigrateResponse {}
 
diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs
index f0cd713c38..328dea5dec 100644
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -706,7 +706,7 @@ pub fn repl_origin_key_range() -> Range<Key> {
 /// Non inherited range for vectored get.
 pub const NON_INHERITED_RANGE: Range<Key> = AUX_FILES_KEY..AUX_FILES_KEY.next();
 /// Sparse keyspace range for vectored get. Missing key error will be ignored for this range.
-pub const NON_INHERITED_SPARSE_RANGE: Range<Key> = Key::metadata_key_range();
+pub const SPARSE_RANGE: Range<Key> = Key::metadata_key_range();
 
 impl Key {
     // AUX_FILES currently stores only data for logical replication (slots etc), and
@@ -714,7 +714,42 @@ impl Key {
     // switch (and generally it likely should be optional), so ignore these.
     #[inline(always)]
     pub fn is_inherited_key(self) -> bool {
-        !NON_INHERITED_RANGE.contains(&self) && !NON_INHERITED_SPARSE_RANGE.contains(&self)
+        if self.is_sparse() {
+            self.is_inherited_sparse_key()
+        } else {
+            !NON_INHERITED_RANGE.contains(&self)
+        }
+    }
+
+    #[inline(always)]
+    pub fn is_sparse(self) -> bool {
+        self.field1 >= METADATA_KEY_BEGIN_PREFIX && self.field1 < METADATA_KEY_END_PREFIX
+    }
+
+    /// Check if the key belongs to the inherited keyspace.
+    fn is_inherited_sparse_key(self) -> bool {
+        debug_assert!(self.is_sparse());
+        self.field1 == RELATION_SIZE_PREFIX
+    }
+
+    pub fn sparse_non_inherited_keyspace() -> Range<Key> {
+        // The two key prefixes are adjacent; if we add non-adjacent prefixes in the future, we should return a keyspace instead
+        debug_assert_eq!(AUX_KEY_PREFIX + 1, REPL_ORIGIN_KEY_PREFIX);
+        Key {
+            field1: AUX_KEY_PREFIX,
+            field2: 0,
+            field3: 0,
+            field4: 0,
+            field5: 0,
+            field6: 0,
+        }..Key {
+            field1: REPL_ORIGIN_KEY_PREFIX + 1,
+            field2: 0,
+            field3: 0,
+            field4: 0,
+            field5: 0,
+            field6: 0,
+        }
     }
 
     #[inline(always)]
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 227bc19d67..2e6949e6ce 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -272,6 +272,8 @@ pub struct CompactInfoResponse {
     pub compact_key_range: Option<CompactKeyRange>,
     pub compact_lsn_range: Option<CompactLsnRange>,
     pub sub_compaction: bool,
+    pub running: bool,
+    pub job_id: usize,
 }
 
 #[derive(Serialize, Deserialize, Clone)]
diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs
index 94714359a3..50b2c69d24 100644
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -44,7 +44,7 @@ pub struct ProtocolVersion(u32);
 
 impl ProtocolVersion {
     pub const fn new(major: u16, minor: u16) -> Self {
-        Self((major as u32) << 16 | minor as u32)
+        Self(((major as u32) << 16) | minor as u32)
     }
     pub const fn minor(self) -> u16 {
         self.0 as u16
diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs
index 49b1d9dc87..dae141bf77 100644
--- a/libs/remote_storage/src/config.rs
+++ b/libs/remote_storage/src/config.rs
@@ -43,6 +43,17 @@ impl RemoteStorageKind {
     }
 }
 
+impl RemoteStorageConfig {
+    /// Helper to fetch the configured concurrency limit.
+    pub fn concurrency_limit(&self) -> Option<usize> {
+        match &self.storage {
+            RemoteStorageKind::LocalFs { .. } => None,
+            RemoteStorageKind::AwsS3(c) => Some(c.concurrency_limit.into()),
+            RemoteStorageKind::AzureContainer(c) => Some(c.concurrency_limit.into()),
+        }
+    }
+}
+
 fn default_timeout() -> Duration {
     RemoteStorageConfig::DEFAULT_TIMEOUT
 }
diff --git a/libs/utils/src/generation.rs b/libs/utils/src/generation.rs
index 5970836033..44565ee6a2 100644
--- a/libs/utils/src/generation.rs
+++ b/libs/utils/src/generation.rs
@@ -112,9 +112,9 @@ impl Serialize for Generation {
             // We should never be asked to serialize a None. Structures
             // that include an optional generation should convert None to an
             // Option<Generation>::None
-            Err(serde::ser::Error::custom(
-                "Tried to serialize invalid generation ({self})",
-            ))
+            Err(serde::ser::Error::custom(format!(
+                "Tried to serialize invalid generation ({self:?})"
+            )))
         }
     }
 }
diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs
index 4b4aa88d6b..9f38373ca0 100644
--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -15,7 +15,7 @@ use once_cell::sync::Lazy;
 use regex::Regex;
 use routerify::ext::RequestExt;
 use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
-use tokio::sync::{mpsc, Mutex};
+use tokio::sync::{mpsc, Mutex, Notify};
 use tokio_stream::wrappers::ReceiverStream;
 use tokio_util::io::ReaderStream;
 use tracing::{debug, info, info_span, warn, Instrument};
@@ -350,33 +350,53 @@ pub async fn profile_cpu_handler(req: Request<Body>) -> Result<Response<Body>, A
     };
     let seconds = match parse_query_param(&req, "seconds")? {
         None => 5,
-        Some(seconds @ 1..=30) => seconds,
-        Some(_) => return Err(ApiError::BadRequest(anyhow!("duration must be 1-30 secs"))),
+        Some(seconds @ 1..=60) => seconds,
+        Some(_) => return Err(ApiError::BadRequest(anyhow!("duration must be 1-60 secs"))),
     };
     let frequency_hz = match parse_query_param(&req, "frequency")? {
         None => 99,
         Some(1001..) => return Err(ApiError::BadRequest(anyhow!("frequency must be <=1000 Hz"))),
         Some(frequency) => frequency,
     };
-
-    // Only allow one profiler at a time.
-    static PROFILE_LOCK: Lazy<Mutex<()>> = Lazy::new(|| Mutex::new(()));
-    let _lock = PROFILE_LOCK
-        .try_lock()
-        .map_err(|_| ApiError::Conflict("profiler already running".into()))?;
+    let force: bool = parse_query_param(&req, "force")?.unwrap_or_default();
 
     // Take the profile.
-    let report = tokio::task::spawn_blocking(move || {
+    static PROFILE_LOCK: Lazy<Mutex<()>> = Lazy::new(|| Mutex::new(()));
+    static PROFILE_CANCEL: Lazy<Notify> = Lazy::new(Notify::new);
+
+    let report = {
+        // Only allow one profiler at a time. If force is true, cancel a running profile (e.g. a
+        // Grafana continuous profile). We use a try_lock() loop when cancelling instead of waiting
+        // for a lock(), to avoid races where the notify isn't currently awaited.
+        let _lock = loop {
+            match PROFILE_LOCK.try_lock() {
+                Ok(lock) => break lock,
+                Err(_) if force => PROFILE_CANCEL.notify_waiters(),
+                Err(_) => {
+                    return Err(ApiError::Conflict(
+                        "profiler already running (use ?force=true to cancel it)".into(),
+                    ))
+                }
+            }
+            tokio::time::sleep(Duration::from_millis(1)).await; // don't busy-wait
+        };
+
         let guard = ProfilerGuardBuilder::default()
             .frequency(frequency_hz)
             .blocklist(&["libc", "libgcc", "pthread", "vdso"])
-            .build()?;
-        std::thread::sleep(Duration::from_secs(seconds));
-        guard.report().build()
-    })
-    .await
-    .map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
-    .map_err(|pprof_err| ApiError::InternalServerError(pprof_err.into()))?;
+            .build()
+            .map_err(|err| ApiError::InternalServerError(err.into()))?;
+
+        tokio::select! {
+            _ = tokio::time::sleep(Duration::from_secs(seconds)) => {},
+            _ = PROFILE_CANCEL.notified() => {},
+        };
+
+        guard
+            .report()
+            .build()
+            .map_err(|err| ApiError::InternalServerError(err.into()))?
+    };
 
     // Return the report in the requested format.
     match format {
diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs
index f188165600..c874fa30ff 100644
--- a/libs/utils/src/lsn.rs
+++ b/libs/utils/src/lsn.rs
@@ -260,7 +260,7 @@ impl FromStr for Lsn {
         {
             let left_num = u32::from_str_radix(left, 16).map_err(|_| LsnParseError)?;
             let right_num = u32::from_str_radix(right, 16).map_err(|_| LsnParseError)?;
-            Ok(Lsn((left_num as u64) << 32 | right_num as u64))
+            Ok(Lsn(((left_num as u64) << 32) | right_num as u64))
         } else {
             Err(LsnParseError)
         }
diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml
index 140b287ccc..8547746d94 100644
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -44,6 +44,7 @@ postgres_backend.workspace = true
 postgres-protocol.workspace = true
 postgres-types.workspace = true
 postgres_initdb.workspace = true
+pprof.workspace = true
 rand.workspace = true
 range-set-blaze = { version = "0.1.16", features = ["alloc"] }
 regex.workspace = true
@@ -108,3 +109,7 @@ harness = false
 [[bench]]
 name = "bench_ingest"
 harness = false
+
+[[bench]]
+name = "upload_queue"
+harness = false
diff --git a/pageserver/benches/upload_queue.rs b/pageserver/benches/upload_queue.rs
new file mode 100644
index 0000000000..ed5daa8ae1
--- /dev/null
+++ b/pageserver/benches/upload_queue.rs
@@ -0,0 +1,87 @@
+//! Upload queue benchmarks.
+
+use std::str::FromStr as _;
+use std::sync::atomic::AtomicU32;
+use std::sync::Arc;
+
+use criterion::{criterion_group, criterion_main, Bencher, Criterion};
+use pageserver::tenant::metadata::TimelineMetadata;
+use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
+use pageserver::tenant::storage_layer::LayerName;
+use pageserver::tenant::upload_queue::{Delete, UploadOp, UploadQueue, UploadTask};
+use pageserver::tenant::IndexPart;
+use pprof::criterion::{Output, PProfProfiler};
+use utils::generation::Generation;
+use utils::shard::{ShardCount, ShardIndex, ShardNumber};
+
+// Register benchmarks with Criterion.
+criterion_group!(
+    name = benches;
+    config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
+    targets = bench_upload_queue_next_ready,
+);
+criterion_main!(benches);
+
+/// Benchmarks the cost of UploadQueue::next_ready() with the given number of in-progress tasks
+/// (which is equivalent to tasks ahead of it in the queue). This has linear cost, and the upload
+/// queue as a whole is thus quadratic.
+///
+/// UploadOp::UploadLayer requires an entire tenant and timeline to construct, so we just test
+/// Delete and UploadMetadata instead. This is incidentally the most expensive case.
+fn bench_upload_queue_next_ready(c: &mut Criterion) {
+    let mut g = c.benchmark_group("upload_queue_next_ready");
+    for inprogress in [0, 1, 10, 100, 1_000, 10_000, 100_000, 1_000_000] {
+        g.bench_function(format!("inprogress={inprogress}"), |b| {
+            run_bench(b, inprogress).unwrap()
+        });
+    }
+
+    fn run_bench(b: &mut Bencher, inprogress: usize) -> anyhow::Result<()> {
+        // Construct two layers. layer0 is in the indexes, layer1 will be deleted.
+        let layer0 = LayerName::from_str("000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51").expect("invalid name");
+        let layer1 = LayerName::from_str("100000000000000000000000000000000001-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51").expect("invalid name");
+
+        let metadata = LayerFileMetadata {
+            shard: ShardIndex::new(ShardNumber(1), ShardCount(2)),
+            generation: Generation::Valid(1),
+            file_size: 0,
+        };
+
+        // Construct the (initial and uploaded) index with layer0.
+        let mut index = IndexPart::empty(TimelineMetadata::example());
+        index.layer_metadata.insert(layer0, metadata.clone());
+
+        // Construct the queue.
+        let mut queue = UploadQueue::Uninitialized;
+        let queue = queue.initialize_with_current_remote_index_part(&index, 0)?;
+
+        // Populate inprogress_tasks with a bunch of layer1 deletions.
+        let delete = UploadOp::Delete(Delete {
+            layers: vec![(layer1, metadata)],
+        });
+
+        for task_id in 0..(inprogress as u64) {
+            queue.inprogress_tasks.insert(
+                task_id,
+                Arc::new(UploadTask {
+                    task_id,
+                    retries: AtomicU32::new(0),
+                    op: delete.clone(),
+                    coalesced_ops: Vec::new(),
+                }),
+            );
+        }
+
+        // Benchmark index upload scheduling.
+        let index_upload = UploadOp::UploadMetadata {
+            uploaded: Box::new(index),
+        };
+
+        b.iter(|| {
+            queue.queued_operations.push_front(index_upload.clone());
+            assert!(queue.next_ready().is_some());
+        });
+
+        Ok(())
+    }
+}
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 567a69da3b..921c6a5092 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -53,10 +53,12 @@ project_build_tag!(BUILD_TAG);
 #[global_allocator]
 static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
 
-/// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20).
+/// Configure jemalloc to profile heap allocations by sampling stack traces every 2 MB (1 << 21).
+/// This adds roughly 3% overhead for allocations on average, which is acceptable considering
+/// performance-sensitive code will avoid allocations as far as possible anyway.
 #[allow(non_upper_case_globals)]
 #[export_name = "malloc_conf"]
-pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0";
+pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0";
 
 const PID_FILE_NAME: &str = "pageserver.pid";
 
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 60ef4c3702..94e0b101bd 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -97,8 +97,8 @@ use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
 use crate::DEFAULT_PG_VERSION;
 use crate::{disk_usage_eviction_task, tenant};
 use pageserver_api::models::{
-    CompactInfoResponse, StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest,
-    TimelineGcRequest, TimelineInfo,
+    StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest,
+    TimelineInfo,
 };
 use utils::{
     auth::SwappableJwtAuth,
@@ -2052,15 +2052,7 @@ async fn timeline_compact_info_handler(
         let tenant = state
             .tenant_manager
             .get_attached_tenant_shard(tenant_shard_id)?;
-        let res = tenant.get_scheduled_compaction_tasks(timeline_id);
-        let mut resp = Vec::new();
-        for item in res {
-            resp.push(CompactInfoResponse {
-                compact_key_range: item.compact_key_range,
-                compact_lsn_range: item.compact_lsn_range,
-                sub_compaction: item.sub_compaction,
-            });
-        }
+        let resp = tenant.get_scheduled_compaction_tasks(timeline_id);
         json_response(StatusCode::OK, resp)
     }
     .instrument(info_span!("timeline_compact_info", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 9b877fc368..3c4830e3cd 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -91,15 +91,6 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
     .expect("failed to define a metric")
 });
 
-pub(crate) static READ_NUM_LAYERS_VISITED: Lazy<Histogram> = Lazy::new(|| {
-    register_histogram!(
-        "pageserver_layers_visited_per_read_global",
-        "Number of layers visited to reconstruct one key",
-        vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
-    )
-    .expect("failed to define a metric")
-});
-
 pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy<Histogram> = Lazy::new(|| {
     register_histogram!(
         "pageserver_layers_visited_per_vectored_read_global",
@@ -3887,7 +3878,6 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) {
 
     // histograms
     [
-        &READ_NUM_LAYERS_VISITED,
         &VEC_READ_NUM_LAYERS_VISITED,
         &WAIT_LSN_TIME,
         &WAL_REDO_TIME,
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 5c337bb6bf..f6d758ad22 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -21,6 +21,7 @@ use enumset::EnumSet;
 use futures::stream::FuturesUnordered;
 use futures::StreamExt;
 use pageserver_api::models;
+use pageserver_api::models::CompactInfoResponse;
 use pageserver_api::models::LsnLease;
 use pageserver_api::models::TimelineArchivalState;
 use pageserver_api::models::TimelineState;
@@ -37,21 +38,17 @@ use remote_timeline_client::manifest::{
 };
 use remote_timeline_client::UploadQueueNotReadyError;
 use std::collections::BTreeMap;
-use std::collections::VecDeque;
 use std::fmt;
 use std::future::Future;
 use std::sync::atomic::AtomicBool;
 use std::sync::Weak;
 use std::time::SystemTime;
 use storage_broker::BrokerClientChannel;
-use timeline::compaction::GcCompactJob;
-use timeline::compaction::ScheduledCompactionTask;
+use timeline::compaction::GcCompactionQueue;
 use timeline::import_pgdata;
 use timeline::offload::offload_timeline;
 use timeline::offload::OffloadError;
-use timeline::CompactFlags;
 use timeline::CompactOptions;
-use timeline::CompactionError;
 use timeline::ShutdownMode;
 use tokio::io::BufReader;
 use tokio::sync::watch;
@@ -347,10 +344,8 @@ pub struct Tenant {
     /// Overhead of mutex is acceptable because compaction is done with a multi-second period.
     compaction_circuit_breaker: std::sync::Mutex<CircuitBreaker>,
 
-    /// Scheduled compaction tasks. Currently, this can only be populated by triggering
-    /// a manual gc-compaction from the manual compaction API.
-    scheduled_compaction_tasks:
-        std::sync::Mutex<HashMap<TimelineId, VecDeque<ScheduledCompactionTask>>>,
+    /// Scheduled gc-compaction tasks.
+    scheduled_compaction_tasks: std::sync::Mutex<HashMap<TimelineId, Arc<GcCompactionQueue>>>,
 
     /// If the tenant is in Activating state, notify this to encourage it
     /// to proceed to Active as soon as possible, rather than waiting for lazy
@@ -2999,104 +2994,18 @@ impl Tenant {
                 if has_pending_l0_compaction_task {
                     Some(true)
                 } else {
-                    let mut has_pending_scheduled_compaction_task;
-                    let next_scheduled_compaction_task = {
-                        let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
-                        if let Some(tline_pending_tasks) = guard.get_mut(timeline_id) {
-                            if !tline_pending_tasks.is_empty() {
-                                info!(
-                                    "{} tasks left in the compaction schedule queue",
-                                    tline_pending_tasks.len()
-                                );
-                            }
-                            let next_task = tline_pending_tasks.pop_front();
-                            has_pending_scheduled_compaction_task = !tline_pending_tasks.is_empty();
-                            next_task
-                        } else {
-                            has_pending_scheduled_compaction_task = false;
-                            None
-                        }
+                    let queue = {
+                        let guard = self.scheduled_compaction_tasks.lock().unwrap();
+                        guard.get(timeline_id).cloned()
                     };
-                    if let Some(mut next_scheduled_compaction_task) = next_scheduled_compaction_task
-                    {
-                        if !next_scheduled_compaction_task
-                            .options
-                            .flags
-                            .contains(CompactFlags::EnhancedGcBottomMostCompaction)
-                        {
-                            warn!("ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", next_scheduled_compaction_task.options);
-                        } else if next_scheduled_compaction_task.options.sub_compaction {
-                            info!("running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs");
-                            let jobs: Vec<GcCompactJob> = timeline
-                                .gc_compaction_split_jobs(
-                                    GcCompactJob::from_compact_options(
-                                        next_scheduled_compaction_task.options.clone(),
-                                    ),
-                                    next_scheduled_compaction_task
-                                        .options
-                                        .sub_compaction_max_job_size_mb,
-                                )
-                                .await
-                                .map_err(CompactionError::Other)?;
-                            if jobs.is_empty() {
-                                info!("no jobs to run, skipping scheduled compaction task");
-                            } else {
-                                has_pending_scheduled_compaction_task = true;
-                                let jobs_len = jobs.len();
-                                let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
-                                let tline_pending_tasks = guard.entry(*timeline_id).or_default();
-                                for (idx, job) in jobs.into_iter().enumerate() {
-                                    // Unfortunately we need to convert the `GcCompactJob` back to `CompactionOptions`
-                                    // until we do further refactors to allow directly call `compact_with_gc`.
-                                    let mut flags: EnumSet<CompactFlags> = EnumSet::default();
-                                    flags |= CompactFlags::EnhancedGcBottomMostCompaction;
-                                    if job.dry_run {
-                                        flags |= CompactFlags::DryRun;
-                                    }
-                                    let options = CompactOptions {
-                                        flags,
-                                        sub_compaction: false,
-                                        compact_key_range: Some(job.compact_key_range.into()),
-                                        compact_lsn_range: Some(job.compact_lsn_range.into()),
-                                        sub_compaction_max_job_size_mb: None,
-                                    };
-                                    tline_pending_tasks.push_back(if idx == jobs_len - 1 {
-                                        ScheduledCompactionTask {
-                                            options,
-                                            // The last job in the queue sends the signal and releases the gc guard
-                                            result_tx: next_scheduled_compaction_task
-                                                .result_tx
-                                                .take(),
-                                            gc_block: next_scheduled_compaction_task
-                                                .gc_block
-                                                .take(),
-                                        }
-                                    } else {
-                                        ScheduledCompactionTask {
-                                            options,
-                                            result_tx: None,
-                                            gc_block: None,
-                                        }
-                                    });
-                                }
-                                info!("scheduled enhanced gc bottom-most compaction with sub-compaction, split into {} jobs", jobs_len);
-                            }
-                        } else {
-                            let _ = timeline
-                                .compact_with_options(
-                                    cancel,
-                                    next_scheduled_compaction_task.options,
-                                    ctx,
-                                )
-                                .instrument(info_span!("scheduled_compact_timeline", %timeline_id))
-                                .await?;
-                            if let Some(tx) = next_scheduled_compaction_task.result_tx.take() {
-                                // TODO: we can send compaction statistics in the future
-                                tx.send(()).ok();
-                            }
-                        }
+                    if let Some(queue) = queue {
+                        let has_pending_tasks = queue
+                            .iteration(cancel, ctx, &self.gc_block, timeline)
+                            .await?;
+                        Some(has_pending_tasks)
+                    } else {
+                        Some(false)
                     }
-                    Some(has_pending_scheduled_compaction_task)
                 }
             } else {
                 None
@@ -3126,34 +3035,32 @@ impl Tenant {
     }
 
     /// Cancel scheduled compaction tasks
-    pub(crate) fn cancel_scheduled_compaction(
-        &self,
-        timeline_id: TimelineId,
-    ) -> Vec<ScheduledCompactionTask> {
+    pub(crate) fn cancel_scheduled_compaction(&self, timeline_id: TimelineId) {
         let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
-        if let Some(tline_pending_tasks) = guard.get_mut(&timeline_id) {
-            let current_tline_pending_tasks = std::mem::take(tline_pending_tasks);
-            current_tline_pending_tasks.into_iter().collect()
-        } else {
-            Vec::new()
+        if let Some(q) = guard.get_mut(&timeline_id) {
+            q.cancel_scheduled();
         }
     }
 
     pub(crate) fn get_scheduled_compaction_tasks(
         &self,
         timeline_id: TimelineId,
-    ) -> Vec<CompactOptions> {
-        use itertools::Itertools;
-        let guard = self.scheduled_compaction_tasks.lock().unwrap();
-        guard
-            .get(&timeline_id)
-            .map(|tline_pending_tasks| {
-                tline_pending_tasks
-                    .iter()
-                    .map(|x| x.options.clone())
-                    .collect_vec()
-            })
-            .unwrap_or_default()
+    ) -> Vec<CompactInfoResponse> {
+        let res = {
+            let guard = self.scheduled_compaction_tasks.lock().unwrap();
+            guard.get(&timeline_id).map(|q| q.remaining_jobs())
+        };
+        let Some((running, remaining)) = res else {
+            return Vec::new();
+        };
+        let mut result = Vec::new();
+        if let Some((id, running)) = running {
+            result.extend(running.into_compact_info_resp(id, true));
+        }
+        for (id, job) in remaining {
+            result.extend(job.into_compact_info_resp(id, false));
+        }
+        result
     }
 
     /// Schedule a compaction task for a timeline.
@@ -3162,20 +3069,12 @@ impl Tenant {
         timeline_id: TimelineId,
         options: CompactOptions,
     ) -> anyhow::Result<tokio::sync::oneshot::Receiver<()>> {
-        let gc_guard = match self.gc_block.start().await {
-            Ok(guard) => guard,
-            Err(e) => {
-                bail!("cannot run gc-compaction because gc is blocked: {}", e);
-            }
-        };
         let (tx, rx) = tokio::sync::oneshot::channel();
         let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
-        let tline_pending_tasks = guard.entry(timeline_id).or_default();
-        tline_pending_tasks.push_back(ScheduledCompactionTask {
-            options,
-            result_tx: Some(tx),
-            gc_block: Some(gc_guard),
-        });
+        let q = guard
+            .entry(timeline_id)
+            .or_insert_with(|| Arc::new(GcCompactionQueue::new()));
+        q.schedule_manual_compaction(options, Some(tx));
         Ok(rx)
     }
 
@@ -5791,7 +5690,7 @@ mod tests {
     use bytes::{Bytes, BytesMut};
     use hex_literal::hex;
     use itertools::Itertools;
-    use pageserver_api::key::{Key, AUX_KEY_PREFIX, NON_INHERITED_RANGE};
+    use pageserver_api::key::{Key, AUX_KEY_PREFIX, NON_INHERITED_RANGE, RELATION_SIZE_PREFIX};
     use pageserver_api::keyspace::KeySpace;
     use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings};
     use pageserver_api::value::Value;
@@ -7850,7 +7749,18 @@ mod tests {
         let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap();
         let base_key_child = Key::from_hex("620000000033333333444444445500000001").unwrap();
         let base_key_nonexist = Key::from_hex("620000000033333333444444445500000002").unwrap();
+        let base_key_overwrite = Key::from_hex("620000000033333333444444445500000003").unwrap();
+
+        let base_inherited_key = Key::from_hex("610000000033333333444444445500000000").unwrap();
+        let base_inherited_key_child =
+            Key::from_hex("610000000033333333444444445500000001").unwrap();
+        let base_inherited_key_nonexist =
+            Key::from_hex("610000000033333333444444445500000002").unwrap();
+        let base_inherited_key_overwrite =
+            Key::from_hex("610000000033333333444444445500000003").unwrap();
+
         assert_eq!(base_key.field1, AUX_KEY_PREFIX); // in case someone accidentally changed the prefix...
+        assert_eq!(base_inherited_key.field1, RELATION_SIZE_PREFIX);
 
         let tline = tenant
             .create_test_timeline_with_layers(
@@ -7859,7 +7769,18 @@ mod tests {
                 DEFAULT_PG_VERSION,
                 &ctx,
                 Vec::new(), // delta layers
-                vec![(Lsn(0x20), vec![(base_key, test_img("metadata key 1"))])], // image layers
+                vec![(
+                    Lsn(0x20),
+                    vec![
+                        (base_inherited_key, test_img("metadata inherited key 1")),
+                        (
+                            base_inherited_key_overwrite,
+                            test_img("metadata key overwrite 1a"),
+                        ),
+                        (base_key, test_img("metadata key 1")),
+                        (base_key_overwrite, test_img("metadata key overwrite 1b")),
+                    ],
+                )], // image layers
                 Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN
             )
             .await?;
@@ -7873,7 +7794,18 @@ mod tests {
                 Vec::new(), // delta layers
                 vec![(
                     Lsn(0x30),
-                    vec![(base_key_child, test_img("metadata key 2"))],
+                    vec![
+                        (
+                            base_inherited_key_child,
+                            test_img("metadata inherited key 2"),
+                        ),
+                        (
+                            base_inherited_key_overwrite,
+                            test_img("metadata key overwrite 2a"),
+                        ),
+                        (base_key_child, test_img("metadata key 2")),
+                        (base_key_overwrite, test_img("metadata key overwrite 2b")),
+                    ],
                 )], // image layers
                 Lsn(0x30),
             )
@@ -7895,6 +7827,26 @@ mod tests {
             get_vectored_impl_wrapper(&tline, base_key_nonexist, lsn, &ctx).await?,
             None
         );
+        assert_eq!(
+            get_vectored_impl_wrapper(&tline, base_key_overwrite, lsn, &ctx).await?,
+            Some(test_img("metadata key overwrite 1b"))
+        );
+        assert_eq!(
+            get_vectored_impl_wrapper(&tline, base_inherited_key, lsn, &ctx).await?,
+            Some(test_img("metadata inherited key 1"))
+        );
+        assert_eq!(
+            get_vectored_impl_wrapper(&tline, base_inherited_key_child, lsn, &ctx).await?,
+            None
+        );
+        assert_eq!(
+            get_vectored_impl_wrapper(&tline, base_inherited_key_nonexist, lsn, &ctx).await?,
+            None
+        );
+        assert_eq!(
+            get_vectored_impl_wrapper(&tline, base_inherited_key_overwrite, lsn, &ctx).await?,
+            Some(test_img("metadata key overwrite 1a"))
+        );
 
         // test vectored get on child timeline
         assert_eq!(
@@ -7909,6 +7861,82 @@ mod tests {
             get_vectored_impl_wrapper(&child, base_key_nonexist, lsn, &ctx).await?,
             None
         );
+        assert_eq!(
+            get_vectored_impl_wrapper(&child, base_inherited_key, lsn, &ctx).await?,
+            Some(test_img("metadata inherited key 1"))
+        );
+        assert_eq!(
+            get_vectored_impl_wrapper(&child, base_inherited_key_child, lsn, &ctx).await?,
+            Some(test_img("metadata inherited key 2"))
+        );
+        assert_eq!(
+            get_vectored_impl_wrapper(&child, base_inherited_key_nonexist, lsn, &ctx).await?,
+            None
+        );
+        assert_eq!(
+            get_vectored_impl_wrapper(&child, base_key_overwrite, lsn, &ctx).await?,
+            Some(test_img("metadata key overwrite 2b"))
+        );
+        assert_eq!(
+            get_vectored_impl_wrapper(&child, base_inherited_key_overwrite, lsn, &ctx).await?,
+            Some(test_img("metadata key overwrite 2a"))
+        );
+
+        // test vectored scan on parent timeline
+        let mut reconstruct_state = ValuesReconstructState::new();
+        let res = tline
+            .get_vectored_impl(
+                KeySpace::single(Key::metadata_key_range()),
+                lsn,
+                &mut reconstruct_state,
+                &ctx,
+            )
+            .await?;
+
+        assert_eq!(
+            res.into_iter()
+                .map(|(k, v)| (k, v.unwrap()))
+                .collect::<Vec<_>>(),
+            vec![
+                (base_inherited_key, test_img("metadata inherited key 1")),
+                (
+                    base_inherited_key_overwrite,
+                    test_img("metadata key overwrite 1a")
+                ),
+                (base_key, test_img("metadata key 1")),
+                (base_key_overwrite, test_img("metadata key overwrite 1b")),
+            ]
+        );
+
+        // test vectored scan on child timeline
+        let mut reconstruct_state = ValuesReconstructState::new();
+        let res = child
+            .get_vectored_impl(
+                KeySpace::single(Key::metadata_key_range()),
+                lsn,
+                &mut reconstruct_state,
+                &ctx,
+            )
+            .await?;
+
+        assert_eq!(
+            res.into_iter()
+                .map(|(k, v)| (k, v.unwrap()))
+                .collect::<Vec<_>>(),
+            vec![
+                (base_inherited_key, test_img("metadata inherited key 1")),
+                (
+                    base_inherited_key_child,
+                    test_img("metadata inherited key 2")
+                ),
+                (
+                    base_inherited_key_overwrite,
+                    test_img("metadata key overwrite 2a")
+                ),
+                (base_key_child, test_img("metadata key 2")),
+                (base_key_overwrite, test_img("metadata key overwrite 2b")),
+            ]
+        );
 
         Ok(())
     }
diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs
index d54dded778..edf2e6a3aa 100644
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -11,7 +11,7 @@
 pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf;
 use pageserver_api::models::CompactionAlgorithmSettings;
 use pageserver_api::models::EvictionPolicy;
-use pageserver_api::models::{self, TenantConfigPatch, ThrottleConfig};
+use pageserver_api::models::{self, TenantConfigPatch};
 use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
 use serde::de::IntoDeserializer;
 use serde::{Deserialize, Serialize};
@@ -597,7 +597,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
                 .map(humantime),
             heatmap_period: value.heatmap_period.map(humantime),
             lazy_slru_download: value.lazy_slru_download,
-            timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
+            timeline_get_throttle: value.timeline_get_throttle,
             image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
             lsn_lease_length: value.lsn_lease_length.map(humantime),
             lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime),
diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs
index c77342b144..bb9df020b5 100644
--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -84,17 +84,17 @@ impl Value {
 
     fn to_u64(self) -> u64 {
         let b = &self.0;
-        (b[0] as u64) << 32
-            | (b[1] as u64) << 24
-            | (b[2] as u64) << 16
-            | (b[3] as u64) << 8
+        ((b[0] as u64) << 32)
+            | ((b[1] as u64) << 24)
+            | ((b[2] as u64) << 16)
+            | ((b[3] as u64) << 8)
             | b[4] as u64
     }
 
     fn to_blknum(self) -> u32 {
         let b = &self.0;
         assert!(b[0] == 0x80);
-        (b[1] as u32) << 24 | (b[2] as u32) << 16 | (b[3] as u32) << 8 | b[4] as u32
+        ((b[1] as u32) << 24) | ((b[2] as u32) << 16) | ((b[3] as u32) << 8) | b[4] as u32
     }
 }
 
diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs
index 24440d4b35..d281eb305f 100644
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -320,7 +320,6 @@ impl TimelineMetadata {
 
     // Checksums make it awkward to build a valid instance by hand.  This helper
     // provides a TimelineMetadata with a valid checksum in its header.
-    #[cfg(test)]
     pub fn example() -> Self {
         let instance = Self::new(
             "0/16960E8".parse::<Lsn>().unwrap(),
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 813111245d..47c4a8637d 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -63,22 +63,18 @@
 //! The contract between client and its user is that the user is responsible of
 //! scheduling operations in an order that keeps the remote consistent as
 //! described above.
+//!
 //! From the user's perspective, the operations are executed sequentially.
 //! Internally, the client knows which operations can be performed in parallel,
 //! and which operations act like a "barrier" that require preceding operations
 //! to finish. The calling code just needs to call the schedule-functions in the
 //! correct order, and the client will parallelize the operations in a way that
-//! is safe.
-//!
-//! The caller should be careful with deletion, though. They should not delete
-//! local files that have been scheduled for upload but not yet finished uploading.
-//! Otherwise the upload will fail. To wait for an upload to finish, use
-//! the 'wait_completion' function (more on that later.)
+//! is safe. For more details, see `UploadOp::can_bypass`.
 //!
 //! All of this relies on the following invariants:
 //!
 //! - We rely on read-after write consistency in the remote storage.
-//! - Layer files are immutable
+//! - Layer files are immutable.
 //!
 //! NB: Pageserver assumes that it has exclusive write access to the tenant in remote
 //! storage. Different tenants can be attached to different pageservers, but if the
@@ -429,8 +425,16 @@ impl RemoteTimelineClient {
     /// an index file upload, i.e., it's not empty.
     /// The given `index_part` must be the one on the remote.
     pub fn init_upload_queue(&self, index_part: &IndexPart) -> anyhow::Result<()> {
+        // Set the maximum number of inprogress tasks to the remote storage concurrency. There's
+        // certainly no point in starting more upload tasks than this.
+        let inprogress_limit = self
+            .conf
+            .remote_storage_config
+            .as_ref()
+            .and_then(|r| r.concurrency_limit())
+            .unwrap_or(0);
         let mut upload_queue = self.upload_queue.lock().unwrap();
-        upload_queue.initialize_with_current_remote_index_part(index_part)?;
+        upload_queue.initialize_with_current_remote_index_part(index_part, inprogress_limit)?;
         self.update_remote_physical_size_gauge(Some(index_part));
         info!(
             "initialized upload queue from remote index with {} layer files",
@@ -445,8 +449,16 @@ impl RemoteTimelineClient {
         &self,
         local_metadata: &TimelineMetadata,
     ) -> anyhow::Result<()> {
+        // Set the maximum number of inprogress tasks to the remote storage concurrency. There's
+        // certainly no point in starting more upload tasks than this.
+        let inprogress_limit = self
+            .conf
+            .remote_storage_config
+            .as_ref()
+            .and_then(|r| r.concurrency_limit())
+            .unwrap_or(0);
         let mut upload_queue = self.upload_queue.lock().unwrap();
-        upload_queue.initialize_empty_remote(local_metadata)?;
+        upload_queue.initialize_empty_remote(local_metadata, inprogress_limit)?;
         self.update_remote_physical_size_gauge(None);
         info!("initialized upload queue as empty");
         Ok(())
@@ -462,9 +474,15 @@ impl RemoteTimelineClient {
         let deleted_at = index_part.deleted_at.ok_or(anyhow::anyhow!(
             "bug: it is responsibility of the caller to provide index part from MaybeDeletedIndexPart::Deleted"
         ))?;
+        let inprogress_limit = self
+            .conf
+            .remote_storage_config
+            .as_ref()
+            .and_then(|r| r.concurrency_limit())
+            .unwrap_or(0);
 
         let mut upload_queue = self.upload_queue.lock().unwrap();
-        upload_queue.initialize_with_current_remote_index_part(index_part)?;
+        upload_queue.initialize_with_current_remote_index_part(index_part, inprogress_limit)?;
         self.update_remote_physical_size_gauge(Some(index_part));
         self.stop_impl(&mut upload_queue);
 
@@ -1855,57 +1873,17 @@ impl RemoteTimelineClient {
         Ok(())
     }
 
-    ///
     /// Pick next tasks from the queue, and start as many of them as possible without violating
     /// the ordering constraints.
     ///
-    /// The caller needs to already hold the `upload_queue` lock.
+    /// TODO: consider limiting the number of in-progress tasks, beyond what remote_storage does.
+    /// This can launch an unbounded number of queued tasks. `UploadQueue::next_ready()` also has
+    /// worst-case quadratic cost in the number of tasks, and may struggle beyond 10,000 tasks.
     fn launch_queued_tasks(self: &Arc<Self>, upload_queue: &mut UploadQueueInitialized) {
-        while let Some(next_op) = upload_queue.queued_operations.front() {
-            // Can we run this task now?
-            let can_run_now = match next_op {
-                UploadOp::UploadLayer(..) => {
-                    // Can always be scheduled.
-                    true
-                }
-                UploadOp::UploadMetadata { .. } => {
-                    // These can only be performed after all the preceding operations
-                    // have finished.
-                    upload_queue.inprogress_tasks.is_empty()
-                }
-                UploadOp::Delete(..) => {
-                    // Wait for preceding uploads to finish. Concurrent deletions are OK, though.
-                    upload_queue.num_inprogress_deletions == upload_queue.inprogress_tasks.len()
-                }
+        while let Some((mut next_op, coalesced_ops)) = upload_queue.next_ready() {
+            debug!("starting op: {next_op}");
 
-                UploadOp::Barrier(_) | UploadOp::Shutdown => {
-                    upload_queue.inprogress_tasks.is_empty()
-                }
-            };
-
-            // If we cannot launch this task, don't look any further.
-            //
-            // In some cases, we could let some non-frontmost tasks to "jump the queue" and launch
-            // them now, but we don't try to do that currently.  For example, if the frontmost task
-            // is an index-file upload that cannot proceed until preceding uploads have finished, we
-            // could still start layer uploads that were scheduled later.
-            if !can_run_now {
-                break;
-            }
-
-            if let UploadOp::Shutdown = next_op {
-                // leave the op in the queue but do not start more tasks; it will be dropped when
-                // the stop is called.
-                upload_queue.shutdown_ready.close();
-                break;
-            }
-
-            // We can launch this task. Remove it from the queue first.
-            let mut next_op = upload_queue.queued_operations.pop_front().unwrap();
-
-            debug!("starting op: {}", next_op);
-
-            // Update the counters and prepare
+            // Prepare upload.
             match &mut next_op {
                 UploadOp::UploadLayer(layer, meta, mode) => {
                     if upload_queue
@@ -1916,18 +1894,14 @@ impl RemoteTimelineClient {
                     } else {
                         *mode = Some(OpType::MayReorder)
                     }
-                    upload_queue.num_inprogress_layer_uploads += 1;
-                }
-                UploadOp::UploadMetadata { .. } => {
-                    upload_queue.num_inprogress_metadata_uploads += 1;
                 }
+                UploadOp::UploadMetadata { .. } => {}
                 UploadOp::Delete(Delete { layers }) => {
                     for (name, meta) in layers {
                         upload_queue
                             .recently_deleted
                             .insert((name.clone(), meta.generation));
                     }
-                    upload_queue.num_inprogress_deletions += 1;
                 }
                 UploadOp::Barrier(sender) => {
                     sender.send_replace(());
@@ -1944,6 +1918,7 @@ impl RemoteTimelineClient {
             let task = Arc::new(UploadTask {
                 task_id: upload_task_id,
                 op: next_op,
+                coalesced_ops,
                 retries: AtomicU32::new(0),
             });
             upload_queue
@@ -2027,6 +2002,8 @@ impl RemoteTimelineClient {
 
             let upload_result: anyhow::Result<()> = match &task.op {
                 UploadOp::UploadLayer(ref layer, ref layer_metadata, mode) => {
+                    // TODO: check if this mechanism can be removed now that can_bypass() performs
+                    // conflict checks during scheduling.
                     if let Some(OpType::FlushDeletion) = mode {
                         if self.config.read().unwrap().block_deletions {
                             // Of course, this is not efficient... but usually the queue should be empty.
@@ -2249,13 +2226,8 @@ impl RemoteTimelineClient {
             upload_queue.inprogress_tasks.remove(&task.task_id);
 
             let lsn_update = match task.op {
-                UploadOp::UploadLayer(_, _, _) => {
-                    upload_queue.num_inprogress_layer_uploads -= 1;
-                    None
-                }
+                UploadOp::UploadLayer(_, _, _) => None,
                 UploadOp::UploadMetadata { ref uploaded } => {
-                    upload_queue.num_inprogress_metadata_uploads -= 1;
-
                     // the task id is reused as a monotonicity check for storing the "clean"
                     // IndexPart.
                     let last_updater = upload_queue.clean.1;
@@ -2289,10 +2261,7 @@ impl RemoteTimelineClient {
                         None
                     }
                 }
-                UploadOp::Delete(_) => {
-                    upload_queue.num_inprogress_deletions -= 1;
-                    None
-                }
+                UploadOp::Delete(_) => None,
                 UploadOp::Barrier(..) | UploadOp::Shutdown => unreachable!(),
             };
 
@@ -2317,6 +2286,9 @@ impl RemoteTimelineClient {
         }
 
         self.metric_end(&task.op);
+        for coalesced_op in &task.coalesced_ops {
+            self.metric_end(coalesced_op);
+        }
     }
 
     fn metric_impl(
@@ -2409,6 +2381,7 @@ impl RemoteTimelineClient {
                     // but for this use case it doesnt really makes sense to bring unsafe code only for this usage point.
                     // Deletion is not really perf sensitive so there shouldnt be any problems with cloning a fraction of it.
                     let upload_queue_for_deletion = UploadQueueInitialized {
+                        inprogress_limit: initialized.inprogress_limit,
                         task_counter: 0,
                         dirty: initialized.dirty.clone(),
                         clean: initialized.clean.clone(),
@@ -2416,9 +2389,6 @@ impl RemoteTimelineClient {
                         visible_remote_consistent_lsn: initialized
                             .visible_remote_consistent_lsn
                             .clone(),
-                        num_inprogress_layer_uploads: 0,
-                        num_inprogress_metadata_uploads: 0,
-                        num_inprogress_deletions: 0,
                         inprogress_tasks: HashMap::default(),
                         queued_operations: VecDeque::default(),
                         #[cfg(feature = "testing")]
@@ -2445,14 +2415,6 @@ impl RemoteTimelineClient {
                     }
                 };
 
-                // consistency check
-                assert_eq!(
-                    qi.num_inprogress_layer_uploads
-                        + qi.num_inprogress_metadata_uploads
-                        + qi.num_inprogress_deletions,
-                    qi.inprogress_tasks.len()
-                );
-
                 // We don't need to do anything here for in-progress tasks. They will finish
                 // on their own, decrement the unfinished-task counter themselves, and observe
                 // that the queue is Stopped.
@@ -2899,8 +2861,8 @@ mod tests {
             let mut guard = client.upload_queue.lock().unwrap();
             let upload_queue = guard.initialized_mut().unwrap();
             assert!(upload_queue.queued_operations.is_empty());
-            assert!(upload_queue.inprogress_tasks.len() == 2);
-            assert!(upload_queue.num_inprogress_layer_uploads == 2);
+            assert_eq!(upload_queue.inprogress_tasks.len(), 2);
+            assert_eq!(upload_queue.num_inprogress_layer_uploads(), 2);
 
             // also check that `latest_file_changes` was updated
             assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 2);
@@ -2970,8 +2932,8 @@ mod tests {
             // Deletion schedules upload of the index file, and the file deletion itself
             assert_eq!(upload_queue.queued_operations.len(), 2);
             assert_eq!(upload_queue.inprogress_tasks.len(), 1);
-            assert_eq!(upload_queue.num_inprogress_layer_uploads, 1);
-            assert_eq!(upload_queue.num_inprogress_deletions, 0);
+            assert_eq!(upload_queue.num_inprogress_layer_uploads(), 1);
+            assert_eq!(upload_queue.num_inprogress_deletions(), 0);
             assert_eq!(
                 upload_queue.latest_files_changes_since_metadata_upload_scheduled,
                 0
diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs
index 51f093cb87..244be5bbb7 100644
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -104,7 +104,7 @@ impl IndexPart {
 
     pub const FILE_NAME: &'static str = "index_part.json";
 
-    pub(crate) fn empty(metadata: TimelineMetadata) -> Self {
+    pub fn empty(metadata: TimelineMetadata) -> Self {
         IndexPart {
             version: Self::LATEST_VERSION,
             layer_metadata: Default::default(),
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index b8206fca5a..3913637ca0 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -12,7 +12,7 @@ pub mod merge_iterator;
 
 use crate::context::{AccessStatsBehavior, RequestContext};
 use bytes::Bytes;
-use pageserver_api::key::{Key, NON_INHERITED_SPARSE_RANGE};
+use pageserver_api::key::Key;
 use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
 use pageserver_api::record::NeonWalRecord;
 use pageserver_api::value::Value;
@@ -209,7 +209,7 @@ impl ValuesReconstructState {
             .keys
             .entry(*key)
             .or_insert(Ok(VectoredValueReconstructState::default()));
-        let is_sparse_key = NON_INHERITED_SPARSE_RANGE.contains(key);
+        let is_sparse_key = key.is_sparse();
         if let Ok(state) = state {
             let key_done = match state.situation {
                 ValueReconstructSituation::Complete => {
diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
index 71e53da20f..2b67f55a17 100644
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -112,8 +112,8 @@ const MAX_SUPPORTED_BLOB_LEN_BITS: usize = {
 ///
 /// Layout:
 /// - 1 bit: `will_init`
-/// - [`MAX_SUPPORTED_BLOB_LEN_BITS`]: `len`
-/// - [`MAX_SUPPORTED_POS_BITS`]: `pos`
+/// - [`MAX_SUPPORTED_BLOB_LEN_BITS`][]: `len`
+/// - [`MAX_SUPPORTED_POS_BITS`](IndexEntry::MAX_SUPPORTED_POS_BITS): `pos`
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub struct IndexEntry(u64);
 
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index 8933e8ceb1..2b06c88e8b 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -1812,7 +1812,7 @@ enum LayerKind {
 
 /// Guard for forcing a layer be resident while it exists.
 #[derive(Clone)]
-pub(crate) struct ResidentLayer {
+pub struct ResidentLayer {
     owner: Layer,
     downloaded: Arc<DownloadedLayer>,
 }
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index e222a624de..4aa6b7a05a 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -27,7 +27,7 @@ use pageserver_api::{
     config::tenant_conf_defaults::DEFAULT_COMPACTION_THRESHOLD,
     key::{
         KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE,
-        NON_INHERITED_SPARSE_RANGE,
+        SPARSE_RANGE,
     },
     keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning},
     models::{
@@ -3221,7 +3221,7 @@ impl Timeline {
             // We don't return a blanket [`GetVectoredError::MissingKey`] to avoid
             // stalling compaction.
             keyspace.remove_overlapping_with(&KeySpace {
-                ranges: vec![NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE],
+                ranges: vec![NON_INHERITED_RANGE, Key::sparse_non_inherited_keyspace()],
             });
 
             // Keyspace is fully retrieved
@@ -3242,7 +3242,11 @@ impl Timeline {
             // keys from `keyspace`, we expect there to be no overlap between it and the image covered key
             // space. If that's not the case, we had at least one key encounter a gap in the image layer
             // and stop the search as a result of that.
-            let removed = keyspace.remove_overlapping_with(&image_covered_keyspace);
+            let mut removed = keyspace.remove_overlapping_with(&image_covered_keyspace);
+            // Do not fire a missing-key error for sparse keys.
+            removed.remove_overlapping_with(&KeySpace {
+                ranges: vec![SPARSE_RANGE],
+            });
             if !removed.is_empty() {
                 break Some(removed);
             }
@@ -3257,6 +3261,21 @@ impl Timeline {
             timeline = &*timeline_owned;
         };
 
+        // Remove sparse keys from the missing keyspace so that they don't trigger a missing-key error.
+        let missing_keyspace = if let Some(missing_keyspace) = missing_keyspace {
+            let mut missing_keyspace = missing_keyspace;
+            missing_keyspace.remove_overlapping_with(&KeySpace {
+                ranges: vec![SPARSE_RANGE],
+            });
+            if missing_keyspace.is_empty() {
+                None
+            } else {
+                Some(missing_keyspace)
+            }
+        } else {
+            None
+        };
+
         if let Some(missing_keyspace) = missing_keyspace {
             return Err(GetVectoredError::MissingKey(MissingKeyError {
                 key: missing_keyspace.start().unwrap(), /* better if we can store the full keyspace */
@@ -3762,36 +3781,35 @@ impl Timeline {
                 return Err(FlushLayerError::Cancelled);
             }
 
-            let mut layers_to_upload = Vec::new();
-            layers_to_upload.extend(
-                self.create_image_layers(
-                    &rel_partition,
-                    self.initdb_lsn,
-                    ImageLayerCreationMode::Initial,
-                    ctx,
-                )
-                .await?,
-            );
+            // Ensure that we have a single call to `create_image_layers` with a combined dense keyspace,
+            // so that the key ranges don't overlap.
+            let mut partitions = KeyPartitioning::default();
+            partitions.parts.extend(rel_partition.parts);
             if !metadata_partition.parts.is_empty() {
                 assert_eq!(
                     metadata_partition.parts.len(),
                     1,
                     "currently sparse keyspace should only contain a single metadata keyspace"
                 );
-                layers_to_upload.extend(
-                    self.create_image_layers(
-                        // Safety: create_image_layers treat sparse keyspaces differently that it does not scan
-                        // every single key within the keyspace, and therefore, it's safe to force converting it
-                        // into a dense keyspace before calling this function.
-                        &metadata_partition.into_dense(),
-                        self.initdb_lsn,
-                        ImageLayerCreationMode::Initial,
-                        ctx,
-                    )
-                    .await?,
-                );
+                // Safety: `create_image_layers` treats sparse keyspaces differently in that it does not
+                // scan every single key within the keyspace; therefore, it is safe to force-convert it
+                // into a dense keyspace before calling this function.
+                partitions
+                    .parts
+                    .extend(metadata_partition.into_dense().parts);
             }
 
+            let mut layers_to_upload = Vec::new();
+            layers_to_upload.extend(
+                self.create_image_layers(
+                    &partitions,
+                    self.initdb_lsn,
+                    ImageLayerCreationMode::Initial,
+                    ctx,
+                )
+                .await?,
+            );
+
             (layers_to_upload, None)
         } else {
             // Normal case, write out a L0 delta layer file.
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 55cde8603e..05f8d476f9 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -4,7 +4,7 @@
 //!
 //! The old legacy algorithm is implemented directly in `timeline.rs`.
 
-use std::collections::{BinaryHeap, HashMap, HashSet};
+use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque};
 use std::ops::{Deref, Range};
 use std::sync::Arc;
 
@@ -16,10 +16,12 @@ use super::{
 
 use anyhow::{anyhow, bail, Context};
 use bytes::Bytes;
+use enumset::EnumSet;
 use fail::fail_point;
 use itertools::Itertools;
 use pageserver_api::key::KEY_SIZE;
 use pageserver_api::keyspace::ShardedRange;
+use pageserver_api::models::CompactInfoResponse;
 use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId};
 use serde::Serialize;
 use tokio_util::sync::CancellationToken;
@@ -30,6 +32,7 @@ use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}
 use crate::page_cache;
 use crate::statvfs::Statvfs;
 use crate::tenant::checks::check_valid_layermap;
+use crate::tenant::gc_block::GcBlock;
 use crate::tenant::remote_timeline_client::WaitCompletionError;
 use crate::tenant::storage_layer::batch_split_writer::{
     BatchWriterResult, SplitDeltaLayerWriter, SplitImageLayerWriter,
@@ -63,16 +66,284 @@ use super::CompactionError;
 /// Maximum number of deltas before generating an image layer in bottom-most compaction.
 const COMPACTION_DELTA_THRESHOLD: usize = 5;
 
-/// A scheduled compaction task.
-pub(crate) struct ScheduledCompactionTask {
-    /// It's unfortunate that we need to store a compact options struct here because the only outer
-    /// API we can call here is `compact_with_options` which does a few setup calls before starting the
-    /// actual compaction job... We should refactor this to store `GcCompactionJob` in the future.
-    pub options: CompactOptions,
-    /// The channel to send the compaction result. If this is a subcompaction, the last compaction job holds the sender.
-    pub result_tx: Option<tokio::sync::oneshot::Sender<()>>,
-    /// Hold the GC block. If this is a subcompaction, the last compaction job holds the gc block guard.
-    pub gc_block: Option<gc_block::Guard>,
+/// Identifier of a queued or running gc-compaction job.
+#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
+pub struct GcCompactionJobId(pub usize);
+
+impl std::fmt::Display for GcCompactionJobId {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.0)
+    }
+}
+
+/// An entry of the gc-compaction queue.
+#[derive(Debug, Clone)]
+pub enum GcCompactionQueueItem {
+    /// A job added by `GcCompactionQueue::schedule_manual_compaction`.
+    Manual(CompactOptions),
+    /// One of the jobs that a `Manual` item with `sub_compaction` set was split into.
+    SubCompactionJob(CompactOptions),
+    /// Currently unused; `GcCompactionQueue::iteration` treats it as unreachable.
+    #[allow(dead_code)]
+    UpdateL2Lsn(Lsn),
+    /// Queued behind a batch of sub-jobs; finishes the parent job with the given id.
+    Notify(GcCompactionJobId),
+}
+
+impl GcCompactionQueueItem {
+    /// Describes a `Manual` or `SubCompactionJob` item as a `CompactInfoResponse`; `None` otherwise.
+    pub fn into_compact_info_resp(
+        self,
+        id: GcCompactionJobId,
+        running: bool,
+    ) -> Option<CompactInfoResponse> {
+        let options = match self {
+            Self::Manual(options) | Self::SubCompactionJob(options) => options,
+            Self::UpdateL2Lsn(_) | Self::Notify(_) => return None,
+        };
+        Some(CompactInfoResponse {
+            compact_key_range: options.compact_key_range,
+            compact_lsn_range: options.compact_lsn_range,
+            sub_compaction: options.sub_compaction,
+            running,
+            job_id: id.0,
+        })
+    }
+}
+
+struct GcCompactionQueueInner {
+    running: Option<(GcCompactionJobId, GcCompactionQueueItem)>,
+    queued: VecDeque<(GcCompactionJobId, GcCompactionQueueItem)>,
+    notify: HashMap<GcCompactionJobId, tokio::sync::oneshot::Sender<()>>,
+    gc_guards: HashMap<GcCompactionJobId, gc_block::Guard>,
+    /// Despite the name, this is the id that `next_id` hands out next.
+    last_id: GcCompactionJobId,
+}
+
+impl GcCompactionQueueInner {
+    fn next_id(&mut self) -> GcCompactionJobId {
+        let following = GcCompactionJobId(self.last_id.0 + 1);
+        std::mem::replace(&mut self.last_id, following)
+    }
+}
+
+/// Queue of gc-compaction jobs, consumed one item at a time by `iteration`.
+pub struct GcCompactionQueue {
+    /// The queued items and the currently-running job, plus per-job notify channels and GC guards.
+    inner: std::sync::Mutex<GcCompactionQueueInner>,
+    /// Held for the duration of `iteration`, so that only one caller consumes the queue at a time.
+    consumer_lock: tokio::sync::Mutex<()>,
+}
+
+impl GcCompactionQueue {
+    pub fn new() -> Self {
+        Self {
+            consumer_lock: tokio::sync::Mutex::new(()),
+            inner: std::sync::Mutex::new(GcCompactionQueueInner {
+                last_id: GcCompactionJobId(0),
+                running: None,
+                queued: VecDeque::default(),
+                notify: HashMap::default(),
+                gc_guards: HashMap::default(),
+            }),
+        }
+    }
+
+    /// Drops every queued item, notify channel and GC guard, including those of a running job.
+    pub fn cancel_scheduled(&self) {
+        let mut inner = self.inner.lock().unwrap();
+        inner.queued.clear();
+        inner.notify.clear();
+        inner.gc_guards.clear();
+    }
+
+    /// Queues a manual compaction job at the back and returns its id; `notify`, if any, is stored under that id.
+    pub fn schedule_manual_compaction(
+        &self,
+        options: CompactOptions,
+        notify: Option<tokio::sync::oneshot::Sender<()>>,
+    ) -> GcCompactionJobId {
+        let mut inner = self.inner.lock().unwrap();
+        let id = inner.next_id();
+        let item = GcCompactionQueueItem::Manual(options);
+        inner.queued.push_back((id, item));
+        if let Some(tx) = notify {
+            inner.notify.insert(id, tx);
+        }
+        info!("scheduled compaction job id={}", id);
+        id
+    }
+
+    /// Trigger an auto compaction. Not implemented yet: this is currently a no-op.
+    #[allow(dead_code)]
+    pub fn trigger_auto_compaction(&self, _: &Arc<Timeline>) {}
+
+    /// Finishes job `id`: drops its GC guard, if one is held, and signals its notify channel, if
+    /// one was registered. A send error (receiver already gone) is ignored.
+    fn notify_and_unblock(&self, id: GcCompactionJobId) {
+        info!("compaction job id={} finished", id);
+        let mut inner = self.inner.lock().unwrap();
+        // Dropping the guard is what unblocks GC.
+        drop(inner.gc_guards.remove(&id));
+        if let Some(tx) = inner.notify.remove(&id) {
+            let _ = tx.send(());
+        }
+    }
+
+    /// Splits the manual job `id` into sub-compaction jobs and puts them at the front of the queue,
+    /// followed by a `Notify(id)` item. A GC guard is taken and stored under `id` before the jobs
+    /// are queued. If the split yields no jobs, `id` is finished right away instead.
+    async fn handle_sub_compaction(
+        &self,
+        id: GcCompactionJobId,
+        options: CompactOptions,
+        timeline: &Arc<Timeline>,
+        gc_block: &GcBlock,
+    ) -> Result<(), CompactionError> {
+        info!("running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs");
+        let jobs: Vec<GcCompactJob> = timeline
+            .gc_compaction_split_jobs(
+                GcCompactJob::from_compact_options(options.clone()),
+                options.sub_compaction_max_job_size_mb,
+            )
+            .await
+            .map_err(CompactionError::Other)?;
+        if jobs.is_empty() {
+            info!("no jobs to run, skipping scheduled compaction task");
+            self.notify_and_unblock(id);
+            return Ok(());
+        }
+        let gc_guard = gc_block.start().await.map_err(|e| {
+            CompactionError::Other(anyhow!(
+                "cannot run gc-compaction because gc is blocked: {}",
+                e
+            ))
+        })?;
+
+        let jobs_len = jobs.len();
+        // Unfortunately each `GcCompactJob` has to be converted back into `CompactOptions` until
+        // further refactoring allows calling `compact_with_gc` directly.
+        let mut pending_tasks: Vec<_> = jobs
+            .into_iter()
+            .map(|job| {
+                let mut flags: EnumSet<CompactFlags> = EnumSet::default();
+                flags |= CompactFlags::EnhancedGcBottomMostCompaction;
+                if job.dry_run {
+                    flags |= CompactFlags::DryRun;
+                }
+                GcCompactionQueueItem::SubCompactionJob(CompactOptions {
+                    flags,
+                    sub_compaction: false,
+                    compact_key_range: Some(job.compact_key_range.into()),
+                    compact_lsn_range: Some(job.compact_lsn_range.into()),
+                    sub_compaction_max_job_size_mb: None,
+                })
+            })
+            .collect();
+        pending_tasks.push(GcCompactionQueueItem::Notify(id));
+        {
+            let mut inner = self.inner.lock().unwrap();
+            inner.gc_guards.insert(id, gc_guard);
+            // Ids are assigned in execution order; pushing to the front in reverse keeps that order.
+            let tasks: Vec<_> = pending_tasks
+                .into_iter()
+                .map(|task| (inner.next_id(), task))
+                .collect();
+            for entry in tasks.into_iter().rev() {
+                inner.queued.push_front(entry);
+            }
+        }
+        info!("scheduled enhanced gc bottom-most compaction with sub-compaction, split into {} jobs", jobs_len);
+        Ok(())
+    }
+
+    /// Take a job from the queue and process it. Returns whether items remain queued afterwards,
+    /// including sub-jobs queued by the processed item itself. If processing fails, the item's GC
+    /// guard and notify channel are dropped, so GC is unblocked and the waiter sees a closed channel.
+    pub async fn iteration(
+        &self,
+        cancel: &CancellationToken,
+        ctx: &RequestContext,
+        gc_block: &GcBlock,
+        timeline: &Arc<Timeline>,
+    ) -> Result<bool, CompactionError> {
+        let _one_op_at_a_time_guard = self.consumer_lock.lock().await;
+        let (id, item) = {
+            let mut guard = self.inner.lock().unwrap();
+            let Some((id, item)) = guard.queued.pop_front() else {
+                return Ok(false);
+            };
+            guard.running = Some((id, item.clone()));
+            (id, item)
+        };
+        // Run the item inside an async block, so that `?` cannot skip the cleanup further down.
+        let res = async {
+            match item {
+                GcCompactionQueueItem::Manual(options) => {
+                    if !options
+                        .flags
+                        .contains(CompactFlags::EnhancedGcBottomMostCompaction)
+                    {
+                        warn!("ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", options);
+                    } else if options.sub_compaction {
+                        self.handle_sub_compaction(id, options, timeline, gc_block)
+                            .await?;
+                    } else {
+                        let gc_guard = gc_block.start().await.map_err(|e| {
+                            CompactionError::Other(anyhow!(
+                                "cannot run gc-compaction because gc is blocked: {}",
+                                e
+                            ))
+                        })?;
+                        self.inner.lock().unwrap().gc_guards.insert(id, gc_guard);
+                        let _ = timeline
+                            .compact_with_options(cancel, options, ctx)
+                            .instrument(info_span!("scheduled_compact_timeline", %timeline.timeline_id))
+                            .await?;
+                        self.notify_and_unblock(id);
+                    }
+                }
+                GcCompactionQueueItem::SubCompactionJob(options) => {
+                    let _ = timeline
+                        .compact_with_options(cancel, options, ctx)
+                        .instrument(info_span!("scheduled_compact_timeline", %timeline.timeline_id))
+                        .await?;
+                }
+                GcCompactionQueueItem::Notify(id) => self.notify_and_unblock(id),
+                GcCompactionQueueItem::UpdateL2Lsn(_) => unreachable!(),
+            }
+            Ok::<(), CompactionError>(())
+        }
+        .await;
+
+        let mut guard = self.inner.lock().unwrap();
+        guard.running = None;
+        if res.is_err() {
+            // A failed item never reaches `notify_and_unblock`; release what it registered.
+            guard.gc_guards.remove(&id);
+            guard.notify.remove(&id);
+        }
+        // Look at the queue only now: the item may have queued sub-jobs of its own.
+        let has_pending_tasks = !guard.queued.is_empty();
+        res.map(|()| has_pending_tasks)
+    }
+
+    #[allow(clippy::type_complexity)]
+    pub fn remaining_jobs(
+        &self,
+    ) -> (
+        Option<(GcCompactionJobId, GcCompactionQueueItem)>,
+        VecDeque<(GcCompactionJobId, GcCompactionQueueItem)>,
+    ) {
+        let inner = self.inner.lock().unwrap();
+        (inner.running.clone(), inner.queued.clone())
+    }
+
+    #[allow(dead_code)]
+    pub fn remaining_jobs_num(&self) -> usize {
+        let inner = self.inner.lock().unwrap();
+        inner.queued.len() + usize::from(inner.running.is_some())
+    }
 }
 
 /// A job description for the gc-compaction job. This structure describes the rectangle range that the job will
diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
index d74faa1af5..3a8796add8 100644
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -403,7 +403,7 @@ pub(super) async fn handle_walreceiver_connection(
                 // need to advance last record LSN on all shards. If we've not ingested the latest
                 // record, then set the LSN of the modification past it. This way all shards
                 // advance their last record LSN at the same time.
-                let needs_last_record_lsn_advance = match next_record_lsn.map(Lsn::from) {
+                let needs_last_record_lsn_advance = match next_record_lsn {
                     Some(lsn) if lsn > modification.get_lsn() => {
                         modification.set_lsn(lsn).unwrap();
                         true
diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs
index ef3aa759f3..d302205ffe 100644
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -1,28 +1,38 @@
+use std::collections::{HashMap, HashSet, VecDeque};
+use std::fmt::Debug;
+use std::sync::atomic::AtomicU32;
+use std::sync::Arc;
+
+use super::remote_timeline_client::is_same_remote_layer_path;
+use super::storage_layer::AsLayerDesc as _;
 use super::storage_layer::LayerName;
 use super::storage_layer::ResidentLayer;
 use crate::tenant::metadata::TimelineMetadata;
 use crate::tenant::remote_timeline_client::index::IndexPart;
 use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
-use std::collections::HashSet;
-use std::collections::{HashMap, VecDeque};
-use std::fmt::Debug;
+use utils::generation::Generation;
+use utils::lsn::{AtomicLsn, Lsn};
 
 use chrono::NaiveDateTime;
-use std::sync::Arc;
+use once_cell::sync::Lazy;
 use tracing::info;
-use utils::lsn::AtomicLsn;
 
-use std::sync::atomic::AtomicU32;
-use utils::lsn::Lsn;
+/// Kill switch for upload queue reordering in case it causes problems (env var set to `true`).
+/// TODO: remove this once we have confidence in it.
+static DISABLE_UPLOAD_QUEUE_REORDERING: Lazy<bool> =
+    Lazy::new(|| std::env::var("DISABLE_UPLOAD_QUEUE_REORDERING").as_deref() == Ok("true"));
 
-use utils::generation::Generation;
+/// Kill switch for index upload coalescing in case it causes problems (env var set to `true`).
+/// TODO: remove this once we have confidence in it.
+static DISABLE_UPLOAD_QUEUE_INDEX_COALESCING: Lazy<bool> =
+    Lazy::new(|| std::env::var("DISABLE_UPLOAD_QUEUE_INDEX_COALESCING").as_deref() == Ok("true"));
 
 // clippy warns that Uninitialized is much smaller than Initialized, which wastes
 // memory for Uninitialized variants. Doesn't matter in practice, there are not
 // that many upload queues in a running pageserver, and most of them are initialized
 // anyway.
 #[allow(clippy::large_enum_variant)]
-pub(super) enum UploadQueue {
+pub enum UploadQueue {
     Uninitialized,
     Initialized(UploadQueueInitialized),
     Stopped(UploadQueueStopped),
@@ -39,13 +49,16 @@ impl UploadQueue {
 }
 
 #[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)]
-pub(crate) enum OpType {
+pub enum OpType {
     MayReorder,
     FlushDeletion,
 }
 
 /// This keeps track of queued and in-progress tasks.
-pub(crate) struct UploadQueueInitialized {
+pub struct UploadQueueInitialized {
+    /// Maximum number of inprogress tasks to schedule. 0 is no limit.
+    pub(crate) inprogress_limit: usize,
+
     /// Counter to assign task IDs
     pub(crate) task_counter: u64,
 
@@ -70,21 +83,16 @@ pub(crate) struct UploadQueueInitialized {
     /// we skip validation)
     pub(crate) visible_remote_consistent_lsn: Arc<AtomicLsn>,
 
-    // Breakdown of different kinds of tasks currently in-progress
-    pub(crate) num_inprogress_layer_uploads: usize,
-    pub(crate) num_inprogress_metadata_uploads: usize,
-    pub(crate) num_inprogress_deletions: usize,
-
     /// Tasks that are currently in-progress. In-progress means that a tokio Task
     /// has been launched for it. An in-progress task can be busy uploading, but it can
     /// also be waiting on the `concurrency_limiter` Semaphore in S3Bucket, or it can
     /// be waiting for retry in `exponential_backoff`.
-    pub(crate) inprogress_tasks: HashMap<u64, Arc<UploadTask>>,
+    pub inprogress_tasks: HashMap<u64, Arc<UploadTask>>,
 
     /// Queued operations that have not been launched yet. They might depend on previous
     /// tasks to finish. For example, metadata upload cannot be performed before all
     /// preceding layer file uploads have completed.
-    pub(crate) queued_operations: VecDeque<UploadOp>,
+    pub queued_operations: VecDeque<UploadOp>,
 
     /// Files which have been unlinked but not yet had scheduled a deletion for. Only kept around
     /// for error logging.
@@ -122,6 +130,167 @@ impl UploadQueueInitialized {
         let lsn = self.clean.0.metadata.disk_consistent_lsn();
         self.clean.1.map(|_| lsn)
     }
+
+    /// Removes and returns the next operation that is ready to run, if any. To avoid head-of-line
+    /// blocking this isn't necessarily the first operation in the queue -- an operation can jump
+    /// the queue if it doesn't conflict with operations ahead of it.
+    ///
+    /// Also returns any operations that were coalesced into this one, e.g. multiple index uploads.
+    /// Those are the older operations, in queue order; the returned one is the newest.
+    ///
+    /// None may be returned even if the queue isn't empty, if no operations are ready yet. This
+    /// includes a ready shutdown operation, which stays queued and closes `shutdown_ready` instead.
+    ///
+    /// NB: this is quadratic, but queues are expected to be small, and bounded by inprogress_limit.
+    pub fn next_ready(&mut self) -> Option<(UploadOp, Vec<UploadOp>)> {
+        // If inprogress_tasks is already at limit, don't schedule anything more.
+        if self.inprogress_limit > 0 && self.inprogress_tasks.len() >= self.inprogress_limit {
+            return None;
+        }
+
+        for (i, candidate) in self.queued_operations.iter().enumerate() {
+            if !self.is_ready(i) {
+                // Nothing can bypass a barrier or shutdown, so give up if this is one. Likewise
+                // give up after the first operation if upload queue reordering is disabled.
+                let blocks_queue = matches!(candidate, UploadOp::Barrier(_) | UploadOp::Shutdown);
+                if blocks_queue || *DISABLE_UPLOAD_QUEUE_REORDERING {
+                    return None;
+                }
+                continue; // Otherwise, try the next candidate.
+            }
+
+            // Shutdown operations are left at the head of the queue, to prevent further
+            // operations from starting. Signal that we're ready to shut down.
+            if matches!(candidate, UploadOp::Shutdown) {
+                assert!(self.inprogress_tasks.is_empty(), "shutdown with tasks");
+                assert_eq!(i, 0, "shutdown not at head of queue");
+                self.shutdown_ready.close();
+                return None;
+            }
+
+            // The candidate is ready: take it out of the queue.
+            let mut op = self.queued_operations.remove(i).expect("i can't disappear");
+
+            // Coalesce any back-to-back index uploads by only uploading the newest one that's
+            // ready. This typically happens with layer/index/layer/index/... sequences, where
+            // the layers bypass the indexes, leaving the indexes queued.
+            //
+            // If other operations are interleaved between index uploads we don't try to coalesce
+            // them, since we may as well update the index concurrently with them. This keeps the
+            // index fresh and avoids starvation.
+            //
+            // NB: we assume that all uploaded indexes have the same remote path. This is true at
+            // the time of writing: the path only depends on the tenant, timeline and generation,
+            // all of which are static for a timeline instance. Otherwise, we must be careful not
+            // to coalesce different paths.
+            let mut coalesced_ops = Vec::new();
+            if matches!(op, UploadOp::UploadMetadata { .. }) {
+                // After each removal the next queued operation slides into position `i`, so the
+                // same position is examined again on every pass.
+                while let Some(UploadOp::UploadMetadata { .. }) = self.queued_operations.get(i) {
+                    if *DISABLE_UPLOAD_QUEUE_INDEX_COALESCING || !self.is_ready(i) {
+                        break;
+                    }
+                    // The older index is superseded by the newer one taken below.
+                    coalesced_ops.push(op);
+                    op = self.queued_operations.remove(i).expect("i can't disappear");
+                }
+            }
+
+            return Some((op, coalesced_ops));
+        }
+        // No queued operation is ready.
+        None
+    }
+
+    /// Reports whether the queued operation at `pos` is ready to be uploaded, i.e. whether it can
+    /// bypass every in-progress operation and every queued operation ahead of it. Operations are
+    /// allowed to skip the queue when it's safe to do so, to increase parallelism.
+    ///
+    /// Panics if `pos` is not a valid position in the queue.
+    fn is_ready(&self, pos: usize) -> bool {
+        let candidate = self.queued_operations.get(pos).expect("invalid position");
+        // In-progress operations come first, in random order, followed by the queued operations
+        // ahead of the candidate, front-to-back.
+        let ahead = self
+            .inprogress_tasks
+            .values()
+            .map(|task| &task.op)
+            .chain(self.queued_operations.iter().take(pos));
+
+        // Keep track of the active index ahead of each operation, starting from the clean index.
+        // This ensures that an upload doesn't skip the queue too far, such that it modifies a
+        // layer that's referenced by an active index. The random order of in-progress operations
+        // is okay, since at most one of them can be an index upload (enforced by can_bypass).
+        let mut active_index: &IndexPart = &self.clean.0;
+        for op in ahead {
+            if !candidate.can_bypass(op, active_index) {
+                return false;
+            }
+            if let UploadOp::UploadMetadata { ref uploaded } = op {
+                active_index = uploaded; // the active index for the operations after this one
+            }
+        }
+        true
+    }
+
+    /// Counts the in-progress tasks whose operation is a deletion.
+    #[cfg(test)]
+    pub(crate) fn num_inprogress_deletions(&self) -> usize {
+        self.inprogress_tasks
+            .values()
+            .filter(|task| matches!(task.op, UploadOp::Delete(_)))
+            .count()
+    }
+
+    /// Counts the in-progress tasks whose operation is a layer upload.
+    #[cfg(test)]
+    pub(crate) fn num_inprogress_layer_uploads(&self) -> usize {
+        self.inprogress_tasks
+            .values()
+            .filter(|task| matches!(task.op, UploadOp::UploadLayer(..)))
+            .count()
+    }
+
+    /// Test helper that moves every ready operation into inprogress_tasks, wrapping each in an
+    /// `UploadTask` with a fresh task ID, and returns those tasks.
+    ///
+    /// TODO: the corresponding production logic should be moved from RemoteTimelineClient into
+    /// UploadQueue, so we can use the same code path.
+    #[cfg(test)]
+    fn schedule_ready(&mut self) -> Vec<Arc<UploadTask>> {
+        // NB: schedule operations one by one, to handle conflicts with inprogress_tasks.
+        std::iter::from_fn(|| {
+            let (op, coalesced_ops) = self.next_ready()?;
+            self.task_counter += 1;
+            let task = Arc::new(UploadTask {
+                task_id: self.task_counter,
+                op,
+                coalesced_ops,
+                retries: 0.into(),
+            });
+            self.inprogress_tasks.insert(task.task_id, Arc::clone(&task));
+            Some(task)
+        })
+        .collect()
+    }
+
+    /// Test helper that completes a task: removes it from inprogress_tasks and, if it is an index
+    /// upload whose task ID exceeds that of the clean index (0 if unset), makes it the clean index.
+    /// TODO: the corresponding production logic should be moved from RemoteTimelineClient into
+    /// UploadQueue, so we can use the same code path.
+    #[cfg(test)]
+    fn complete(&mut self, task_id: u64) {
+        let Some(task) = self.inprogress_tasks.remove(&task_id) else {
+            return;
+        };
+        let UploadOp::UploadMetadata { ref uploaded } = task.op else {
+            return;
+        };
+        if task.task_id > self.clean.1.unwrap_or_default() {
+            self.clean = (*uploaded.clone(), Some(task.task_id));
+        }
+    }
 }
 
 #[derive(Clone, Copy)]
@@ -131,12 +300,12 @@ pub(super) enum SetDeletedFlagProgress {
     Successful(NaiveDateTime),
 }
 
-pub(super) struct UploadQueueStoppedDeletable {
+pub struct UploadQueueStoppedDeletable {
     pub(super) upload_queue_for_deletion: UploadQueueInitialized,
     pub(super) deleted_at: SetDeletedFlagProgress,
 }
 
-pub(super) enum UploadQueueStopped {
+pub enum UploadQueueStopped {
     Deletable(UploadQueueStoppedDeletable),
     Uninitialized,
 }
@@ -163,9 +332,10 @@ impl NotInitialized {
 }
 
 impl UploadQueue {
-    pub(crate) fn initialize_empty_remote(
+    pub fn initialize_empty_remote(
         &mut self,
         metadata: &TimelineMetadata,
+        inprogress_limit: usize,
     ) -> anyhow::Result<&mut UploadQueueInitialized> {
         match self {
             UploadQueue::Uninitialized => (),
@@ -179,15 +349,13 @@ impl UploadQueue {
         let index_part = IndexPart::empty(metadata.clone());
 
         let state = UploadQueueInitialized {
+            inprogress_limit,
             dirty: index_part.clone(),
             clean: (index_part, None),
             latest_files_changes_since_metadata_upload_scheduled: 0,
             visible_remote_consistent_lsn: Arc::new(AtomicLsn::new(0)),
             // what follows are boring default initializations
             task_counter: 0,
-            num_inprogress_layer_uploads: 0,
-            num_inprogress_metadata_uploads: 0,
-            num_inprogress_deletions: 0,
             inprogress_tasks: HashMap::new(),
             queued_operations: VecDeque::new(),
             #[cfg(feature = "testing")]
@@ -202,9 +370,10 @@ impl UploadQueue {
         Ok(self.initialized_mut().expect("we just set it"))
     }
 
-    pub(crate) fn initialize_with_current_remote_index_part(
+    pub fn initialize_with_current_remote_index_part(
         &mut self,
         index_part: &IndexPart,
+        inprogress_limit: usize,
     ) -> anyhow::Result<&mut UploadQueueInitialized> {
         match self {
             UploadQueue::Uninitialized => (),
@@ -219,6 +388,7 @@ impl UploadQueue {
         );
 
         let state = UploadQueueInitialized {
+            inprogress_limit,
             dirty: index_part.clone(),
             clean: (index_part.clone(), None),
             latest_files_changes_since_metadata_upload_scheduled: 0,
@@ -227,9 +397,6 @@ impl UploadQueue {
             ),
             // what follows are boring default initializations
             task_counter: 0,
-            num_inprogress_layer_uploads: 0,
-            num_inprogress_metadata_uploads: 0,
-            num_inprogress_deletions: 0,
             inprogress_tasks: HashMap::new(),
             queued_operations: VecDeque::new(),
             #[cfg(feature = "testing")]
@@ -244,9 +411,7 @@ impl UploadQueue {
         Ok(self.initialized_mut().expect("we just set it"))
     }
 
-    pub(crate) fn initialized_mut(
-        &mut self,
-    ) -> Result<&mut UploadQueueInitialized, NotInitialized> {
+    pub fn initialized_mut(&mut self) -> Result<&mut UploadQueueInitialized, NotInitialized> {
         use UploadQueue::*;
         match self {
             Uninitialized => Err(NotInitialized::Uninitialized),
@@ -276,23 +441,27 @@ impl UploadQueue {
 
 /// An in-progress upload or delete task.
 #[derive(Debug)]
-pub(crate) struct UploadTask {
+pub struct UploadTask {
     /// Unique ID of this task. Used as the key in `inprogress_tasks` above.
-    pub(crate) task_id: u64,
-    pub(crate) retries: AtomicU32,
-
-    pub(crate) op: UploadOp,
+    pub task_id: u64,
+    /// Number of task retries.
+    pub retries: AtomicU32,
+    /// The upload operation.
+    pub op: UploadOp,
+    /// Any upload operations that were coalesced into this operation. This typically happens with
+    /// back-to-back index uploads, see `UploadQueueInitialized::next_ready()`.
+    pub coalesced_ops: Vec<UploadOp>,
 }
 
 /// A deletion of some layers within the lifetime of a timeline.  This is not used
 /// for timeline deletion, which skips this queue and goes directly to DeletionQueue.
 #[derive(Debug, Clone)]
-pub(crate) struct Delete {
-    pub(crate) layers: Vec<(LayerName, LayerFileMetadata)>,
+pub struct Delete {
+    pub layers: Vec<(LayerName, LayerFileMetadata)>,
 }
 
-#[derive(Debug)]
-pub(crate) enum UploadOp {
+#[derive(Clone, Debug)]
+pub enum UploadOp {
     /// Upload a layer file. The last field indicates the last operation for thie file.
     UploadLayer(ResidentLayer, LayerFileMetadata, Option<OpType>),
 
@@ -338,3 +507,796 @@ impl std::fmt::Display for UploadOp {
         }
     }
 }
+
+impl UploadOp {
+    /// Returns true if `self` can bypass `other`, i.e. if the operations don't conflict. `index`
+    /// is the active index when `other` would be uploaded -- if we allow `self` to bypass `other`,
+    /// this would also be the active index when `self` is uploaded.
+    pub fn can_bypass(&self, other: &UploadOp, index: &IndexPart) -> bool {
+        match (self, other) {
+            // Barriers and shutdowns can't be bypassed, and they can't bypass anything themselves.
+            (UploadOp::Barrier(_), _) | (_, UploadOp::Barrier(_)) => false,
+            (UploadOp::Shutdown, _) | (_, UploadOp::Shutdown) => false,
+
+            // Uploads can bypass other uploads and deletes (and vice versa) unless they're for the same file.
+            (UploadOp::UploadLayer(a, ameta, _), UploadOp::UploadLayer(b, bmeta, _)) => {
+                let aname = &a.layer_desc().layer_name();
+                let bname = &b.layer_desc().layer_name();
+                !is_same_remote_layer_path(aname, ameta, bname, bmeta)
+            }
+            (UploadOp::UploadLayer(u, umeta, _), UploadOp::Delete(d))
+            | (UploadOp::Delete(d), UploadOp::UploadLayer(u, umeta, _)) => {
+                d.layers.iter().all(|(dname, dmeta)| {
+                    !is_same_remote_layer_path(&u.layer_desc().layer_name(), umeta, dname, dmeta)
+                })
+            }
+
+            // Deletes are idempotent and can always bypass each other.
+            (UploadOp::Delete(_), UploadOp::Delete(_)) => true,
+
+            // Uploads and deletes can bypass an index upload as long as neither the uploaded index
+            // nor the active index below it references the file. A layer can't be modified or
+            // deleted while referenced by an index.
+            //
+            // Similarly, index uploads can bypass uploads and deletes as long as neither the
+            // uploaded index nor the active index references the file (the latter would be
+            // incorrect use by the caller).
+            (UploadOp::UploadLayer(u, umeta, _), UploadOp::UploadMetadata { uploaded: i })
+            | (UploadOp::UploadMetadata { uploaded: i }, UploadOp::UploadLayer(u, umeta, _)) => {
+                let uname = u.layer_desc().layer_name();
+                !i.references(&uname, umeta) && !index.references(&uname, umeta)
+            }
+            (UploadOp::Delete(d), UploadOp::UploadMetadata { uploaded: i })
+            | (UploadOp::UploadMetadata { uploaded: i }, UploadOp::Delete(d)) => {
+                d.layers.iter().all(|(dname, dmeta)| {
+                    !i.references(dname, dmeta) && !index.references(dname, dmeta)
+                })
+            }
+
+            // Index uploads can never bypass each other. They can coalesce though, and
+            // `UploadQueueInitialized::next_ready()` currently does this when possible.
+            (UploadOp::UploadMetadata { .. }, UploadOp::UploadMetadata { .. }) => false,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::tenant::harness::{TenantHarness, TIMELINE_ID};
+    use crate::tenant::storage_layer::layer::local_layer_path;
+    use crate::tenant::storage_layer::Layer;
+    use crate::tenant::Timeline;
+    use crate::DEFAULT_PG_VERSION;
+    use itertools::Itertools as _;
+    use std::str::FromStr as _;
+    use utils::shard::{ShardCount, ShardIndex, ShardNumber};
+
+    /// Test helper which asserts that two operations are the same, in lieu of UploadOp PartialEq.
+    #[track_caller]
+    fn assert_same_op(a: &UploadOp, b: &UploadOp) {
+        use UploadOp::*;
+        match (a, b) {
+            (UploadLayer(a, ameta, atype), UploadLayer(b, bmeta, btype)) => {
+                assert_eq!(a.layer_desc().layer_name(), b.layer_desc().layer_name());
+                assert_eq!(ameta, bmeta);
+                assert_eq!(atype, btype);
+            }
+            (Delete(a), Delete(b)) => assert_eq!(a.layers, b.layers),
+            (UploadMetadata { uploaded: a }, UploadMetadata { uploaded: b }) => assert_eq!(a, b),
+            (Barrier(_), Barrier(_)) => {}
+            (Shutdown, Shutdown) => {}
+            (a, b) => panic!("{a:?} != {b:?}"),
+        }
+    }
+
+    /// Asserts two op sequences are the same. Loops (no closure) so #[track_caller] propagates.
+    #[track_caller]
+    fn assert_same_ops<'a>(
+        a: impl IntoIterator<Item = &'a UploadOp>,
+        b: impl IntoIterator<Item = &'a UploadOp>,
+    ) {
+        for (lhs, rhs) in a.into_iter().zip_eq(b) {
+            assert_same_op(lhs, rhs);
+        }
+    }
+
+    /// Test helper to construct a test timeline.
+    ///
+    /// TODO: it really shouldn't be necessary to construct an entire tenant and timeline just to
+    /// test the upload queue -- decouple ResidentLayer from Timeline.
+    ///
+    /// TODO: the upload queue uses TimelineMetadata::example() instead, because there's no way to
+    /// obtain a TimelineMetadata from a Timeline.
+    fn make_timeline() -> Arc<Timeline> {
+        // Grab the current test name from the current thread name.
+        // TODO: TenantHarness shouldn't take a &'static str, but just leak the test name for now.
+        let test_name = std::thread::current().name().unwrap().to_string();
+        let test_name = Box::leak(test_name.into_boxed_str());
+
+        let runtime = tokio::runtime::Builder::new_current_thread()
+            .enable_all()
+            .build()
+            .expect("failed to create runtime");
+
+        runtime
+            .block_on(async {
+                let harness = TenantHarness::create(test_name).await?;
+                let (tenant, ctx) = harness.load().await;
+                tenant
+                    .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
+                    .await
+            })
+            .expect("failed to create timeline")
+    }
+
+    /// Test helper to construct an (empty) resident layer.
+    fn make_layer(timeline: &Arc<Timeline>, name: &str) -> ResidentLayer {
+        make_layer_with_size(timeline, name, 0)
+    }
+
+    /// Test helper to construct a resident layer with the given size.
+    fn make_layer_with_size(timeline: &Arc<Timeline>, name: &str, size: usize) -> ResidentLayer {
+        let metadata = LayerFileMetadata {
+            generation: timeline.generation,
+            shard: timeline.get_shard_index(),
+            file_size: size as u64,
+        };
+        make_layer_with_metadata(timeline, name, metadata)
+    }
+
+    /// Test helper to construct a layer with the given metadata.
+    fn make_layer_with_metadata(
+        timeline: &Arc<Timeline>,
+        name: &str,
+        metadata: LayerFileMetadata,
+    ) -> ResidentLayer {
+        let name = LayerName::from_str(name).expect("invalid name");
+        let local_path = local_layer_path(
+            timeline.conf,
+            &timeline.tenant_shard_id,
+            &timeline.timeline_id,
+            &name,
+            &metadata.generation,
+        );
+        std::fs::write(&local_path, vec![0; metadata.file_size as usize])
+            .expect("failed to write file");
+        Layer::for_resident(timeline.conf, timeline, local_path, name, metadata)
+    }
+
+    /// Test helper returning a boxed copy of the index with the given layer added.
+    fn index_with(index: &IndexPart, layer: &ResidentLayer) -> Box<IndexPart> {
+        let mut extended = Box::new(index.clone());
+        let name = layer.layer_desc().layer_name();
+        let metadata = layer.metadata();
+        extended.layer_metadata.insert(name, metadata);
+        extended
+    }
+
+    /// Test helper returning a boxed copy of the index with the given layer removed.
+    fn index_without(index: &IndexPart, layer: &ResidentLayer) -> Box<IndexPart> {
+        let mut pruned = Box::new(index.clone());
+        let name = layer.layer_desc().layer_name();
+        // Whatever `remove` returns is not needed here.
+        pruned.layer_metadata.remove(&name);
+        pruned
+    }
+
+    /// Nothing can bypass a barrier, and it can't bypass inprogress tasks.
+    #[test]
+    fn schedule_barrier() -> anyhow::Result<()> {
+        let mut queue = UploadQueue::Uninitialized;
+        let queue = queue.initialize_empty_remote(&TimelineMetadata::example(), 0)?;
+        let tli = make_timeline();
+
+        let index = Box::new(queue.clean.0.clone()); // empty, doesn't matter
+        let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
+        let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
+        let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
+        let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
+        let (barrier, _) = tokio::sync::watch::channel(());
+
+        // Enqueue non-conflicting upload, delete, and index before and after a barrier.
+        let ops = [
+            UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None),
+            UploadOp::Delete(Delete {
+                layers: vec![(layer1.layer_desc().layer_name(), layer1.metadata())],
+            }),
+            UploadOp::UploadMetadata {
+                uploaded: index.clone(),
+            },
+            UploadOp::Barrier(barrier),
+            UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None),
+            UploadOp::Delete(Delete {
+                layers: vec![(layer3.layer_desc().layer_name(), layer3.metadata())],
+            }),
+            UploadOp::UploadMetadata {
+                uploaded: index.clone(),
+            },
+        ];
+
+        queue.queued_operations.extend(ops.clone());
+
+        // Schedule the initial operations ahead of the barrier.
+        let tasks = queue.schedule_ready();
+
+        assert_same_ops(tasks.iter().map(|t| &t.op), &ops[0..3]);
+        assert!(matches!(
+            queue.queued_operations.front(),
+            Some(&UploadOp::Barrier(_))
+        ));
+
+        // Complete the initial operations. The barrier isn't scheduled while they're pending.
+        for task in tasks {
+            assert!(queue.schedule_ready().is_empty());
+            queue.complete(task.task_id);
+        }
+
+        // Schedule the barrier. The later tasks won't schedule until it completes.
+        let tasks = queue.schedule_ready();
+
+        assert_eq!(tasks.len(), 1);
+        assert!(matches!(tasks[0].op, UploadOp::Barrier(_)));
+        assert_eq!(queue.queued_operations.len(), 3);
+
+        // Complete the barrier. The rest of the tasks schedule immediately.
+        queue.complete(tasks[0].task_id);
+
+        let tasks = queue.schedule_ready();
+        assert_same_ops(tasks.iter().map(|t| &t.op), &ops[4..]);
+        assert!(queue.queued_operations.is_empty());
+
+        Ok(())
+    }
+
+    /// Deletes can be scheduled in parallel, even if they're for the same file.
+    #[test]
+    fn schedule_delete_parallel() -> anyhow::Result<()> {
+        let mut queue = UploadQueue::Uninitialized;
+        let queue = queue.initialize_empty_remote(&TimelineMetadata::example(), 0)?;
+        let tli = make_timeline();
+
+        // Enqueue a bunch of deletes, some with conflicting names.
+        let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
+        let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
+        let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
+        let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
+
+        let ops = [
+            UploadOp::Delete(Delete {
+                layers: vec![(layer0.layer_desc().layer_name(), layer0.metadata())],
+            }),
+            UploadOp::Delete(Delete {
+                layers: vec![(layer1.layer_desc().layer_name(), layer1.metadata())],
+            }),
+            UploadOp::Delete(Delete {
+                layers: vec![
+                    (layer1.layer_desc().layer_name(), layer1.metadata()),
+                    (layer2.layer_desc().layer_name(), layer2.metadata()),
+                ],
+            }),
+            UploadOp::Delete(Delete {
+                layers: vec![(layer2.layer_desc().layer_name(), layer2.metadata())],
+            }),
+            UploadOp::Delete(Delete {
+                layers: vec![(layer3.layer_desc().layer_name(), layer3.metadata())],
+            }),
+        ];
+
+        queue.queued_operations.extend(ops.clone());
+
+        // Schedule all ready operations. Since deletes don't conflict, they're all scheduled.
+        let tasks = queue.schedule_ready();
+
+        assert_same_ops(tasks.iter().map(|t| &t.op), &ops);
+        assert!(queue.queued_operations.is_empty());
+
+        Ok(())
+    }
+
+    /// Conflicting uploads are serialized.
+    #[test]
+    fn schedule_upload_conflicts() -> anyhow::Result<()> {
+        let mut queue = UploadQueue::Uninitialized;
+        let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?;
+        let tli = make_timeline();
+
+        // Enqueue three versions of the same layer, with different file sizes.
+        let layer0a = make_layer_with_size(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", 1);
+        let layer0b = make_layer_with_size(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", 2);
+        let layer0c = make_layer_with_size(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", 3);
+
+        let ops = [
+            UploadOp::UploadLayer(layer0a.clone(), layer0a.metadata(), None),
+            UploadOp::UploadLayer(layer0b.clone(), layer0b.metadata(), None),
+            UploadOp::UploadLayer(layer0c.clone(), layer0c.metadata(), None),
+        ];
+
+        queue.queued_operations.extend(ops.clone());
+
+        // Only one version should be scheduled and uploaded at a time.
+        for op in ops {
+            let tasks = queue.schedule_ready();
+            assert_eq!(tasks.len(), 1);
+            assert_same_op(&tasks[0].op, &op);
+            queue.complete(tasks[0].task_id);
+        }
+        assert!(queue.schedule_ready().is_empty());
+        assert!(queue.queued_operations.is_empty());
+
+        Ok(())
+    }
+
+    /// Conflicting uploads and deletes are serialized.
+    #[test]
+    fn schedule_upload_delete_conflicts() -> anyhow::Result<()> {
+        let mut queue = UploadQueue::Uninitialized;
+        let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?;
+        let tli = make_timeline();
+
+        // Enqueue two layer uploads, with a delete of both layers in between them. These should be
+        // scheduled one at a time, since deletes can't bypass uploads and vice versa.
+        let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
+        let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
+
+        let ops = [
+            UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None),
+            UploadOp::Delete(Delete {
+                layers: vec![
+                    (layer0.layer_desc().layer_name(), layer0.metadata()),
+                    (layer1.layer_desc().layer_name(), layer1.metadata()),
+                ],
+            }),
+            UploadOp::UploadLayer(layer1.clone(), layer1.metadata(), None),
+        ];
+
+        queue.queued_operations.extend(ops.clone());
+
+        // Only one operation should be scheduled at a time, in queue order.
+        for op in ops {
+            let tasks = queue.schedule_ready();
+            assert_eq!(tasks.len(), 1);
+            assert_same_op(&tasks[0].op, &op);
+            queue.complete(tasks[0].task_id);
+        }
+        assert!(queue.schedule_ready().is_empty());
+        assert!(queue.queued_operations.is_empty());
+
+        Ok(())
+    }
+
+    /// Non-conflicting uploads and deletes can bypass the queue, avoiding the conflicting
+    /// delete/upload operations at the head of the queue.
+    #[test]
+    fn schedule_upload_delete_conflicts_bypass() -> anyhow::Result<()> {
+        let mut queue = UploadQueue::Uninitialized;
+        let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?;
+        let tli = make_timeline();
+
+        // Enqueue two layer uploads, with a delete of both layers in between them. These should be
+        // scheduled one at a time, since deletes can't bypass uploads and vice versa.
+        //
+        // Also enqueue non-conflicting uploads and deletes at the end. These can bypass the queue
+        // and run immediately.
+        let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
+        let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
+        let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
+        let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
+
+        let ops = [
+            UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None),
+            UploadOp::Delete(Delete {
+                layers: vec![
+                    (layer0.layer_desc().layer_name(), layer0.metadata()),
+                    (layer1.layer_desc().layer_name(), layer1.metadata()),
+                ],
+            }),
+            UploadOp::UploadLayer(layer1.clone(), layer1.metadata(), None),
+            UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None),
+            UploadOp::Delete(Delete {
+                layers: vec![(layer3.layer_desc().layer_name(), layer3.metadata())],
+            }),
+        ];
+
+        queue.queued_operations.extend(ops.clone());
+
+        // Operations 0, 3, and 4 are scheduled immediately.
+        let tasks = queue.schedule_ready();
+        assert_same_ops(tasks.iter().map(|t| &t.op), [&ops[0], &ops[3], &ops[4]]);
+        assert_eq!(queue.queued_operations.len(), 2);
+
+        Ok(())
+    }
+
+    /// Non-conflicting uploads are parallelized.
+    #[test]
+    fn schedule_upload_parallel() -> anyhow::Result<()> {
+        let mut queue = UploadQueue::Uninitialized;
+        let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?;
+        let tli = make_timeline();
+
+        // Enqueue three different layer uploads.
+        let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
+        let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
+        let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
+
+        let ops = [
+            UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None),
+            UploadOp::UploadLayer(layer1.clone(), layer1.metadata(), None),
+            UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None),
+        ];
+
+        queue.queued_operations.extend(ops.clone());
+
+        // All uploads should be scheduled concurrently.
+        let tasks = queue.schedule_ready();
+
+        assert_same_ops(tasks.iter().map(|t| &t.op), &ops);
+        assert!(queue.queued_operations.is_empty());
+
+        Ok(())
+    }
+
+    /// Index uploads are coalesced.
+    #[test]
+    fn schedule_index_coalesce() -> anyhow::Result<()> {
+        let mut queue = UploadQueue::Uninitialized;
+        let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?;
+
+        // Enqueue three uploads of the current empty index.
+        let index = Box::new(queue.clean.0.clone());
+
+        let ops = [
+            UploadOp::UploadMetadata {
+                uploaded: index.clone(),
+            },
+            UploadOp::UploadMetadata {
+                uploaded: index.clone(),
+            },
+            UploadOp::UploadMetadata {
+                uploaded: index.clone(),
+            },
+        ];
+
+        queue.queued_operations.extend(ops.clone());
+
+        // The index uploads are coalesced into a single operation.
+        let tasks = queue.schedule_ready();
+        assert_eq!(tasks.len(), 1);
+        assert_same_op(&tasks[0].op, &ops[2]);
+        assert_same_ops(&tasks[0].coalesced_ops, &ops[0..2]);
+
+        assert!(queue.queued_operations.is_empty());
+
+        Ok(())
+    }
+
+    /// Chains of upload/index operations lead to parallel layer uploads and serial index uploads.
+    /// This is the common case with layer flushes.
+    #[test]
+    fn schedule_index_upload_chain() -> anyhow::Result<()> {
+        let mut queue = UploadQueue::Uninitialized;
+        let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?;
+        let tli = make_timeline();
+
+        // Enqueue three layer uploads, each followed by an index upload that references it.
+        let index = Box::new(queue.clean.0.clone());
+        let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
+        let index0 = index_with(&index, &layer0);
+        let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
+        let index1 = index_with(&index0, &layer1);
+        let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
+        let index2 = index_with(&index1, &layer2);
+
+        let ops = [
+            UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None),
+            UploadOp::UploadMetadata {
+                uploaded: index0.clone(),
+            },
+            UploadOp::UploadLayer(layer1.clone(), layer1.metadata(), None),
+            UploadOp::UploadMetadata {
+                uploaded: index1.clone(),
+            },
+            UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None),
+            UploadOp::UploadMetadata {
+                uploaded: index2.clone(),
+            },
+        ];
+
+        queue.queued_operations.extend(ops.clone());
+
+        // The layer uploads should be scheduled immediately. The indexes must wait.
+        let upload_tasks = queue.schedule_ready();
+        assert_same_ops(
+            upload_tasks.iter().map(|t| &t.op),
+            [&ops[0], &ops[2], &ops[4]],
+        );
+
+        // layer2 completes first. None of the indexes can upload yet.
+        queue.complete(upload_tasks[2].task_id);
+        assert!(queue.schedule_ready().is_empty());
+
+        // layer0 completes. index0 can upload. It completes.
+        queue.complete(upload_tasks[0].task_id);
+        let index_tasks = queue.schedule_ready();
+        assert_eq!(index_tasks.len(), 1);
+        assert_same_op(&index_tasks[0].op, &ops[1]);
+        queue.complete(index_tasks[0].task_id);
+
+        // layer 1 completes. This unblocks index 1 and 2, which coalesce into
+        // a single upload for index 2.
+        queue.complete(upload_tasks[1].task_id);
+
+        let index_tasks = queue.schedule_ready();
+        assert_eq!(index_tasks.len(), 1);
+        assert_same_op(&index_tasks[0].op, &ops[5]);
+        assert_same_ops(&index_tasks[0].coalesced_ops, &ops[3..4]);
+
+        assert!(queue.queued_operations.is_empty());
+
+        Ok(())
+    }
+
+    /// A delete can't bypass an index upload if an index ahead of it still references it.
+    #[test]
+    fn schedule_index_delete_dereferenced() -> anyhow::Result<()> {
+        let mut queue = UploadQueue::Uninitialized;
+        let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?;
+        let tli = make_timeline();
+
+        // Create a layer to upload.
+        let layer = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
+        let index_upload = index_with(&queue.clean.0, &layer);
+
+        // Remove the layer reference in a new index, then delete the layer.
+        let index_deref = index_without(&index_upload, &layer);
+
+        let ops = [
+            // Initial upload, with a barrier to prevent index coalescing.
+            UploadOp::UploadLayer(layer.clone(), layer.metadata(), None),
+            UploadOp::UploadMetadata {
+                uploaded: index_upload.clone(),
+            },
+            UploadOp::Barrier(tokio::sync::watch::channel(()).0),
+            // Dereference the layer and delete it.
+            UploadOp::UploadMetadata {
+                uploaded: index_deref.clone(),
+            },
+            UploadOp::Delete(Delete {
+                layers: vec![(layer.layer_desc().layer_name(), layer.metadata())],
+            }),
+        ];
+
+        queue.queued_operations.extend(ops.clone());
+
+        // Operations are serialized.
+        for op in ops {
+            let tasks = queue.schedule_ready();
+            assert_eq!(tasks.len(), 1);
+            assert_same_op(&tasks[0].op, &op);
+            queue.complete(tasks[0].task_id);
+        }
+        assert!(queue.queued_operations.is_empty());
+
+        Ok(())
+    }
+
+    /// An upload with a reused layer name doesn't clobber the previous layer. Specifically, a
+    /// dereference/upload/reference cycle can't allow the upload to bypass the reference.
+    #[test]
+    fn schedule_index_upload_dereferenced() -> anyhow::Result<()> {
+        let mut queue = UploadQueue::Uninitialized;
+        let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?;
+        let tli = make_timeline();
+
+        // Create a layer to upload.
+        let layer = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
+
+        // Upload the layer. Then dereference the layer, and upload/reference it again.
+        let index_upload = index_with(&queue.clean.0, &layer);
+        let index_deref = index_without(&index_upload, &layer);
+        let index_ref = index_with(&index_deref, &layer);
+
+        let ops = [
+            // Initial upload, with a barrier to prevent index coalescing.
+            UploadOp::UploadLayer(layer.clone(), layer.metadata(), None),
+            UploadOp::UploadMetadata {
+                uploaded: index_upload.clone(),
+            },
+            UploadOp::Barrier(tokio::sync::watch::channel(()).0),
+            // Dereference the layer.
+            UploadOp::UploadMetadata {
+                uploaded: index_deref.clone(),
+            },
+            // Replace and reference the layer.
+            UploadOp::UploadLayer(layer.clone(), layer.metadata(), None),
+            UploadOp::UploadMetadata {
+                uploaded: index_ref.clone(),
+            },
+        ];
+
+        queue.queued_operations.extend(ops.clone());
+
+        // Operations are serialized.
+        for op in ops {
+            let tasks = queue.schedule_ready();
+            assert_eq!(tasks.len(), 1);
+            assert_same_op(&tasks[0].op, &op);
+            queue.complete(tasks[0].task_id);
+        }
+        assert!(queue.queued_operations.is_empty());
+
+        Ok(())
+    }
+
+    /// Nothing can bypass a shutdown, and it waits for inprogress tasks. It's never returned from
+    /// next_ready(), but is left at the head of the queue.
+    #[test]
+    fn schedule_shutdown() -> anyhow::Result<()> {
+        let mut queue = UploadQueue::Uninitialized;
+        let queue = queue.initialize_empty_remote(&TimelineMetadata::example(), 0)?;
+        let tli = make_timeline();
+
+        let index = Box::new(queue.clean.0.clone()); // empty, doesn't matter
+        let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
+        let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
+        let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
+        let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
+
+        // Enqueue non-conflicting upload, delete, and index before and after a shutdown.
+        let ops = [
+            UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None),
+            UploadOp::Delete(Delete {
+                layers: vec![(layer1.layer_desc().layer_name(), layer1.metadata())],
+            }),
+            UploadOp::UploadMetadata {
+                uploaded: index.clone(),
+            },
+            UploadOp::Shutdown,
+            UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None),
+            UploadOp::Delete(Delete {
+                layers: vec![(layer3.layer_desc().layer_name(), layer3.metadata())],
+            }),
+            UploadOp::UploadMetadata {
+                uploaded: index.clone(),
+            },
+        ];
+
+        queue.queued_operations.extend(ops.clone());
+
+        // Schedule the initial operations ahead of the shutdown.
+        let tasks = queue.schedule_ready();
+
+        assert_same_ops(tasks.iter().map(|t| &t.op), &ops[0..3]);
+        assert!(matches!(
+            queue.queued_operations.front(),
+            Some(&UploadOp::Shutdown)
+        ));
+
+        // Complete the initial operations. The shutdown isn't triggered while they're pending.
+        for task in tasks {
+            assert!(queue.schedule_ready().is_empty());
+            queue.complete(task.task_id);
+        }
+
+        // The shutdown is triggered the next time we try to pull an operation. It isn't returned,
+        // but is left in the queue.
+        assert!(!queue.shutdown_ready.is_closed());
+        assert!(queue.next_ready().is_none());
+        assert!(queue.shutdown_ready.is_closed());
+
+        Ok(())
+    }
+
+    /// Scheduling respects inprogress_limit: no more than that many operations are handed out at once.
+    #[test]
+    fn schedule_inprogress_limit() -> anyhow::Result<()> {
+        // Create a queue with inprogress_limit=2.
+        let mut queue = UploadQueue::Uninitialized;
+        let queue = queue.initialize_empty_remote(&TimelineMetadata::example(), 2)?;
+        let tli = make_timeline();
+
+        // Enqueue four layer uploads with distinct names, i.e. twice the in-progress limit.
+        let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
+        let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
+        let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
+        let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51");
+
+        let ops = [
+            UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None),
+            UploadOp::UploadLayer(layer1.clone(), layer1.metadata(), None),
+            UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None),
+            UploadOp::UploadLayer(layer3.clone(), layer3.metadata(), None),
+        ];
+
+        queue.queued_operations.extend(ops.clone());
+
+        // Schedule all ready operations. Only the first 2 are scheduled; nothing more is ready.
+        let tasks = queue.schedule_ready();
+        assert_same_ops(tasks.iter().map(|t| &t.op), &ops[0..2]);
+        assert!(queue.next_ready().is_none());
+
+        // When one completes, exactly one more (the next in queue order) is scheduled.
+        queue.complete(tasks[0].task_id);
+        let tasks = queue.schedule_ready();
+        assert_same_ops(tasks.iter().map(|t| &t.op), &ops[2..3]);
+
+        Ok(())
+    }
+
+    /// Tests that can_bypass takes name, generation and shard index into account for all operations.
+    #[test]
+    fn can_bypass_path() -> anyhow::Result<()> {
+        let tli = make_timeline();
+
+        let name0 = &"000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51";
+        let name1 = &"100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51";
+
+        // Asserts that layers a and b either can or can't bypass each other. Ops are paired by
+        // kind via zip (not cross-combined); Delete and UploadMetadata pairs are special-cased.
+        #[track_caller]
+        fn assert_can_bypass(a: ResidentLayer, b: ResidentLayer, can_bypass: bool) {
+            let index = IndexPart::empty(TimelineMetadata::example());
+            for (a, b) in make_ops(a).into_iter().zip(make_ops(b)) {
+                match (&a, &b) {
+                    // Deletes can always bypass each other.
+                    (UploadOp::Delete(_), UploadOp::Delete(_)) => assert!(a.can_bypass(&b, &index)),
+                    // Indexes can never bypass each other.
+                    (UploadOp::UploadMetadata { .. }, UploadOp::UploadMetadata { .. }) => {
+                        assert!(!a.can_bypass(&b, &index))
+                    }
+                    // For the remaining pair (UploadLayer with UploadLayer), assert as requested.
+                    (a, b) => assert_eq!(a.can_bypass(b, &index), can_bypass),
+                }
+            }
+        }
+
+        fn make_ops(layer: ResidentLayer) -> Vec<UploadOp> {
+            let mut index = IndexPart::empty(TimelineMetadata::example());
+            index
+                .layer_metadata
+                .insert(layer.layer_desc().layer_name(), layer.metadata());
+            vec![
+                UploadOp::UploadLayer(layer.clone(), layer.metadata(), None),
+                UploadOp::Delete(Delete {
+                    layers: vec![(layer.layer_desc().layer_name(), layer.metadata())],
+                }),
+                UploadOp::UploadMetadata {
+                    uploaded: Box::new(index),
+                },
+            ]
+        }
+
+        // Makes a ResidentLayer. `shard` None means unsharded, Some(n) means shard n of 8.
+        let layer = |name: &'static str, shard: Option<u8>, generation: u32| -> ResidentLayer {
+            let shard = shard
+                .map(|n| ShardIndex::new(ShardNumber(n), ShardCount(8)))
+                .unwrap_or(ShardIndex::unsharded());
+            let metadata = LayerFileMetadata {
+                shard,
+                generation: Generation::Valid(generation),
+                file_size: 0,
+            };
+            make_layer_with_metadata(&tli, name, metadata)
+        };
+
+        // Same name and metadata can't bypass. This goes both for unsharded and sharded, as well as
+        // 0 or >0 generation.
+        assert_can_bypass(layer(name0, None, 0), layer(name0, None, 0), false);
+        assert_can_bypass(layer(name0, Some(0), 0), layer(name0, Some(0), 0), false);
+        assert_can_bypass(layer(name0, None, 1), layer(name0, None, 1), false);
+
+        // Different names can bypass.
+        assert_can_bypass(layer(name0, None, 0), layer(name1, None, 0), true);
+
+        // Different shards can bypass. Shard 0 is different from unsharded.
+        assert_can_bypass(layer(name0, Some(0), 0), layer(name0, Some(1), 0), true);
+        assert_can_bypass(layer(name0, Some(0), 0), layer(name0, None, 0), true);
+
+        // Different generations can bypass, both sharded and unsharded.
+        assert_can_bypass(layer(name0, None, 0), layer(name0, None, 1), true);
+        assert_can_bypass(layer(name0, Some(1), 0), layer(name0, Some(1), 1), true);
+
+        Ok(())
+    }
+}
diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index e5b23fed51..7253af8507 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -308,7 +308,7 @@ impl WalIngest {
             epoch -= 1;
         }
 
-        Ok((epoch as u64) << 32 | xid as u64)
+        Ok(((epoch as u64) << 32) | xid as u64)
     }
 
     async fn ingest_clear_vm_bits(
diff --git a/pgxn/hnsw/Makefile b/pgxn/hnsw/Makefile
deleted file mode 100644
index 66436b5920..0000000000
--- a/pgxn/hnsw/Makefile
+++ /dev/null
@@ -1,26 +0,0 @@
-EXTENSION = hnsw
-EXTVERSION = 0.1.0
-
-MODULE_big = hnsw
-DATA = $(wildcard *--*.sql)
-OBJS = hnsw.o hnswalg.o
-
-TESTS = $(wildcard test/sql/*.sql)
-REGRESS = $(patsubst test/sql/%.sql,%,$(TESTS))
-REGRESS_OPTS = --inputdir=test --load-extension=hnsw
-
-# For auto-vectorization:
-# - GCC (needs -ftree-vectorize OR -O3) - https://gcc.gnu.org/projects/tree-ssa/vectorization.html
-PG_CFLAGS += -O3
-PG_CXXFLAGS +=  -O3 -std=c++11
-PG_LDFLAGS += -lstdc++
-
-all: $(EXTENSION)--$(EXTVERSION).sql
-
-PG_CONFIG ?= pg_config
-PGXS := $(shell $(PG_CONFIG) --pgxs)
-include $(PGXS)
-
-dist:
-	mkdir -p dist
-	git archive --format zip --prefix=$(EXTENSION)-$(EXTVERSION)/ --output dist/$(EXTENSION)-$(EXTVERSION).zip master
diff --git a/pgxn/hnsw/README.md b/pgxn/hnsw/README.md
deleted file mode 100644
index bc9c8d571c..0000000000
--- a/pgxn/hnsw/README.md
+++ /dev/null
@@ -1,25 +0,0 @@
-# Revisiting the Inverted Indices for Billion-Scale Approximate Nearest Neighbors
-
-This ANN extension of Postgres is based
-on [ivf-hnsw](https://github.com/dbaranchuk/ivf-hnsw.git) implementation of [HNSW](https://www.pinecone.io/learn/hnsw),
-the code for the current state-of-the-art billion-scale nearest neighbor search system presented in the paper:
-
-[Revisiting the Inverted Indices for Billion-Scale Approximate Nearest Neighbors](http://openaccess.thecvf.com/content_ECCV_2018/html/Dmitry_Baranchuk_Revisiting_the_Inverted_ECCV_2018_paper.html),
-<br>
-Dmitry Baranchuk, Artem Babenko, Yury Malkov
-
-# Postgres extension
-
-HNSW index is hold in memory (built on demand) and it's maxial size is limited
-by `maxelements` index parameter. Another required parameter is nubmer of dimensions (if it is not specified in column type).
-Optional parameter `ef` specifies number of neighbors which are considered during index construction and search (corresponds `efConstruction` and `efSearch` parameters
-described in the article).
-
-# Example of usage:
-
-```
-create extension hnsw;
-create table embeddings(id integer primary key, payload real[]);
-create index on embeddings using hnsw(payload) with (maxelements=1000000, dims=100, m=32);
-select id from embeddings order by payload <-> array[1.0, 2.0,...] limit 100;
-```
\ No newline at end of file
diff --git a/pgxn/hnsw/hnsw--0.1.0.sql b/pgxn/hnsw/hnsw--0.1.0.sql
deleted file mode 100644
index ebf424326d..0000000000
--- a/pgxn/hnsw/hnsw--0.1.0.sql
+++ /dev/null
@@ -1,29 +0,0 @@
--- complain if script is sourced in psql, rather than via CREATE EXTENSION
-\echo Use "CREATE EXTENSION hnsw" to load this file. \quit
-
--- functions
-
-CREATE FUNCTION l2_distance(real[], real[]) RETURNS real
-	AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
-
--- operators
-
-CREATE OPERATOR <-> (
-	LEFTARG = real[], RIGHTARG = real[], PROCEDURE = l2_distance,
-	COMMUTATOR = '<->'
-);
-
--- access method
-
-CREATE FUNCTION hnsw_handler(internal) RETURNS index_am_handler
-	AS 'MODULE_PATHNAME' LANGUAGE C;
-
-CREATE ACCESS METHOD hnsw TYPE INDEX HANDLER hnsw_handler;
-
-COMMENT ON ACCESS METHOD hnsw IS 'hnsw index access method';
-
--- opclasses
-
-CREATE OPERATOR CLASS knn_ops
-	DEFAULT FOR TYPE real[] USING hnsw AS
-	OPERATOR 1 <-> (real[], real[]) FOR ORDER BY float_ops;
diff --git a/pgxn/hnsw/hnsw.c b/pgxn/hnsw/hnsw.c
deleted file mode 100644
index e624cb831f..0000000000
--- a/pgxn/hnsw/hnsw.c
+++ /dev/null
@@ -1,590 +0,0 @@
-#include "postgres.h"
-
-#include "access/amapi.h"
-#include "access/generic_xlog.h"
-#include "access/relation.h"
-#include "access/reloptions.h"
-#include "access/tableam.h"
-#include "catalog/index.h"
-#include "commands/vacuum.h"
-#include "nodes/execnodes.h"
-#include "storage/bufmgr.h"
-#include "utils/guc.h"
-#include "utils/selfuncs.h"
-
-#include <math.h>
-#include <float.h>
-
-#include "hnsw.h"
-
-PG_MODULE_MAGIC;
-
-typedef struct {
-	int32 vl_len_;		/* varlena header (do not touch directly!) */
-	int dims;
-	int maxelements;
-	int efConstruction;
-	int efSearch;
-	int M;
-} HnswOptions;
-
-static relopt_kind hnsw_relopt_kind;
-
-typedef struct {
-	HierarchicalNSW* hnsw;
-	size_t curr;
-	size_t n_results;
-	ItemPointer results;
-} HnswScanOpaqueData;
-
-typedef HnswScanOpaqueData* HnswScanOpaque;
-
-typedef struct {
-	Oid relid;
-	uint32 status;
-	HierarchicalNSW* hnsw;
-} HnswHashEntry;
-
-
-#define SH_PREFIX			 hnsw_index
-#define SH_ELEMENT_TYPE		 HnswHashEntry
-#define SH_KEY_TYPE			 Oid
-#define SH_KEY				 relid
-#define SH_STORE_HASH
-#define SH_GET_HASH(tb, a)	 ((a)->relid)
-#define SH_HASH_KEY(tb, key) (key)
-#define SH_EQUAL(tb, a, b)	((a) == (b))
-#define SH_SCOPE			static inline
-#define SH_DEFINE
-#define SH_DECLARE
-#include "lib/simplehash.h"
-
-#define INDEX_HASH_SIZE     11
-
-#define DEFAULT_EF_SEARCH   64
-
-PGDLLEXPORT void _PG_init(void);
-
-static hnsw_index_hash *hnsw_indexes;
-
-/*
- * Initialize index options and variables
- */
-void
-_PG_init(void)
-{
-	hnsw_relopt_kind = add_reloption_kind();
-	add_int_reloption(hnsw_relopt_kind, "dims", "Number of dimensions",
-					  0, 0, INT_MAX, AccessExclusiveLock);
-	add_int_reloption(hnsw_relopt_kind, "maxelements", "Maximal number of elements",
-					  0, 0, INT_MAX, AccessExclusiveLock);
-	add_int_reloption(hnsw_relopt_kind, "m", "Number of neighbors of each vertex",
-					  100, 0, INT_MAX, AccessExclusiveLock);
-	add_int_reloption(hnsw_relopt_kind, "efconstruction", "Number of inspected neighbors during index construction",
-					  16, 1, INT_MAX, AccessExclusiveLock);
-	add_int_reloption(hnsw_relopt_kind, "efsearch", "Number of inspected neighbors during index search",
-					  64, 1, INT_MAX, AccessExclusiveLock);
-	hnsw_indexes = hnsw_index_create(TopMemoryContext, INDEX_HASH_SIZE, NULL);
-}
-
-
-static void
-hnsw_build_callback(Relation index, ItemPointer tid, Datum *values,
-					bool *isnull, bool tupleIsAlive, void *state)
-{
-	HierarchicalNSW* hnsw = (HierarchicalNSW*) state;
-	ArrayType* array;
-	int n_items;
-	label_t label = 0;
-
-	/* Skip nulls */
-	if (isnull[0])
-		return;
-
-	array = DatumGetArrayTypeP(values[0]);
-	n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array));
-	if (n_items != hnsw_dimensions(hnsw))
-	{
-		elog(ERROR, "Wrong number of dimensions: %d instead of %d expected",
-			 n_items, hnsw_dimensions(hnsw));
-	}
-
-	memcpy(&label, tid, sizeof(*tid));
-	hnsw_add_point(hnsw, (coord_t*)ARR_DATA_PTR(array), label);
-}
-
-static void
-hnsw_populate(HierarchicalNSW* hnsw, Relation indexRel, Relation heapRel)
-{
-	IndexInfo* indexInfo = BuildIndexInfo(indexRel);
-	Assert(indexInfo->ii_NumIndexAttrs == 1);
-	table_index_build_scan(heapRel, indexRel, indexInfo,
-						   true, true, hnsw_build_callback, (void *) hnsw, NULL);
-}
-
-#ifdef __APPLE__
-
-#include <sys/types.h>
-#include <sys/sysctl.h>
-
-static void
-hnsw_check_available_memory(Size requested)
-{
-	size_t total;
-	if (sysctlbyname("hw.memsize", NULL, &total, NULL, 0) < 0)
-		elog(ERROR, "Failed to get amount of RAM: %m");
-
-	if ((Size)NBuffers*BLCKSZ + requested >= total)
-		elog(ERROR, "HNSW index requeries %ld bytes while only %ld are available",
-			requested, total - (Size)NBuffers*BLCKSZ);
-}
-
-#else
-
-#include <sys/sysinfo.h>
-
-static void
-hnsw_check_available_memory(Size requested)
-{
-	struct sysinfo si;
-	Size total;
-	if (sysinfo(&si) < 0)
-		elog(ERROR, "Failed to get amount of RAM: %m");
-
-	total = si.totalram*si.mem_unit;
-	if ((Size)NBuffers*BLCKSZ + requested >= total)
-		elog(ERROR, "HNSW index requeries %ld bytes while only %ld are available",
-			requested, total - (Size)NBuffers*BLCKSZ);
-}
-
-#endif
-
-static HierarchicalNSW*
-hnsw_get_index(Relation indexRel, Relation heapRel)
-{
-	HierarchicalNSW* hnsw;
-	Oid indexoid = RelationGetRelid(indexRel);
-	HnswHashEntry* entry = hnsw_index_lookup(hnsw_indexes, indexoid);
-	if (entry == NULL)
-	{
-		size_t dims, maxelements;
-		size_t M;
-		size_t maxM;
-		size_t size_links_level0;
-		size_t size_data_per_element;
-		size_t data_size;
-		dsm_handle handle = indexoid << 1; /* make it even */
-		void* impl_private = NULL;
-		void* mapped_address = NULL;
-		Size  mapped_size = 0;
-		Size  shmem_size;
-		bool exists = true;
-		bool found;
-		HnswOptions *opts = (HnswOptions *) indexRel->rd_options;
-		if (opts == NULL || opts->maxelements == 0 || opts->dims == 0) {
-			elog(ERROR, "HNSW index requires 'maxelements' and 'dims' to be specified");
-		}
-		dims = opts->dims;
-		maxelements = opts->maxelements;
-		M = opts->M;
-		maxM = M * 2;
-		data_size = dims * sizeof(coord_t);
-		size_links_level0 = (maxM + 1) * sizeof(idx_t);
-		size_data_per_element = size_links_level0 + data_size + sizeof(label_t);
-		shmem_size =  hnsw_sizeof() + maxelements * size_data_per_element;
-
-		hnsw_check_available_memory(shmem_size);
-
-		/* first try to attach to existed index */
-		if (!dsm_impl_op(DSM_OP_ATTACH, handle, 0, &impl_private,
-						 &mapped_address, &mapped_size, DEBUG1))
-		{
-			/* index doesn't exists: try to create it */
-			if (!dsm_impl_op(DSM_OP_CREATE, handle, shmem_size, &impl_private,
-							 &mapped_address, &mapped_size, DEBUG1))
-			{
-				/* We can do it under shared lock, so some other backend may
-				 * try to initialize index. If create is failed because index already
-				 * created by somebody else, then try to attach to it once again
-				 */
-				if (!dsm_impl_op(DSM_OP_ATTACH, handle, 0, &impl_private,
-								 &mapped_address, &mapped_size, ERROR))
-				{
-					return NULL;
-				}
-			}
-			else
-			{
-				exists = false;
-			}
-		}
-		Assert(mapped_size == shmem_size);
-		hnsw = (HierarchicalNSW*)mapped_address;
-
-		if (!exists)
-		{
-			hnsw_init(hnsw, dims, maxelements, M, maxM, opts->efConstruction);
-			hnsw_populate(hnsw, indexRel, heapRel);
-		}
-		entry = hnsw_index_insert(hnsw_indexes, indexoid, &found);
-		Assert(!found);
-		entry->hnsw = hnsw;
-	}
-	else
-	{
-		hnsw = entry->hnsw;
-	}
-	return hnsw;
-}
-
-/*
- * Start or restart an index scan
- */
-static IndexScanDesc
-hnsw_beginscan(Relation index, int nkeys, int norderbys)
-{
-	IndexScanDesc scan = RelationGetIndexScan(index, nkeys, norderbys);
-	HnswScanOpaque so = (HnswScanOpaque) palloc(sizeof(HnswScanOpaqueData));
-	Relation heap = relation_open(index->rd_index->indrelid, NoLock);
-	so->hnsw = hnsw_get_index(index, heap);
-	relation_close(heap, NoLock);
-	so->curr = 0;
-	so->n_results = 0;
-	so->results = NULL;
-	scan->opaque = so;
-	return scan;
-}
-
-/*
- * Start or restart an index scan
- */
-static void
-hnsw_rescan(IndexScanDesc scan, ScanKey keys, int nkeys, ScanKey orderbys, int norderbys)
-{
-	HnswScanOpaque so = (HnswScanOpaque) scan->opaque;
-	if (so->results)
-	{
-		pfree(so->results);
-		so->results = NULL;
-	}
-	so->curr = 0;
-	if (orderbys && scan->numberOfOrderBys > 0)
-		memmove(scan->orderByData, orderbys, scan->numberOfOrderBys * sizeof(ScanKeyData));
-}
-
-/*
- * Fetch the next tuple in the given scan
- */
-static bool
-hnsw_gettuple(IndexScanDesc scan, ScanDirection dir)
-{
-	HnswScanOpaque so = (HnswScanOpaque) scan->opaque;
-
-	/*
-	 * Index can be used to scan backward, but Postgres doesn't support
-	 * backward scan on operators
-	 */
-	Assert(ScanDirectionIsForward(dir));
-
-	if (so->curr == 0)
-	{
-		Datum		value;
-		ArrayType*	array;
-		int         n_items;
-		size_t      n_results;
-		label_t*    results;
-		HnswOptions *opts = (HnswOptions *) scan->indexRelation->rd_options;
-		size_t      efSearch = opts ? opts->efSearch : DEFAULT_EF_SEARCH;
-
-		/* Safety check */
-		if (scan->orderByData == NULL)
-			elog(ERROR, "cannot scan HNSW index without order");
-
-		/* No items will match if null */
-		if (scan->orderByData->sk_flags & SK_ISNULL)
-			return false;
-
-		value = scan->orderByData->sk_argument;
-		array = DatumGetArrayTypeP(value);
-		n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array));
-		if (n_items != hnsw_dimensions(so->hnsw))
-		{
-			elog(ERROR, "Wrong number of dimensions: %d instead of %d expected",
-				 n_items, hnsw_dimensions(so->hnsw));
-		}
-
-		if (!hnsw_search(so->hnsw, (coord_t*)ARR_DATA_PTR(array), efSearch, &n_results, &results))
-			elog(ERROR, "HNSW index search failed");
-		so->results = (ItemPointer)palloc(n_results*sizeof(ItemPointerData));
-		so->n_results = n_results;
-		for (size_t i = 0; i < n_results; i++)
-		{
-			memcpy(&so->results[i], &results[i], sizeof(so->results[i]));
-		}
-		free(results);
-	}
-	if (so->curr >= so->n_results)
-	{
-		return false;
-	}
-	else
-	{
-		scan->xs_heaptid = so->results[so->curr++];
-		scan->xs_recheckorderby = false;
-		return true;
-	}
-}
-
-/*
- * End a scan and release resources
- */
-static void
-hnsw_endscan(IndexScanDesc scan)
-{
-	HnswScanOpaque so = (HnswScanOpaque) scan->opaque;
-	if (so->results)
-		pfree(so->results);
-	pfree(so);
-	scan->opaque = NULL;
-}
-
-
-/*
- * Estimate the cost of an index scan
- */
-static void
-hnsw_costestimate(PlannerInfo *root, IndexPath *path, double loop_count,
-				 Cost *indexStartupCost, Cost *indexTotalCost,
-				 Selectivity *indexSelectivity, double *indexCorrelation
-				 ,double *indexPages
-)
-{
-	GenericCosts costs;
-
-	/* Never use index without order */
-	if (path->indexorderbys == NULL)
-	{
-		*indexStartupCost = DBL_MAX;
-		*indexTotalCost = DBL_MAX;
-		*indexSelectivity = 0;
-		*indexCorrelation = 0;
-		*indexPages = 0;
-		return;
-	}
-
-	MemSet(&costs, 0, sizeof(costs));
-
-	genericcostestimate(root, path, loop_count, &costs);
-
-	/* Startup cost and total cost are same */
-	*indexStartupCost = costs.indexTotalCost;
-	*indexTotalCost = costs.indexTotalCost;
-	*indexSelectivity = costs.indexSelectivity;
-	*indexCorrelation = costs.indexCorrelation;
-	*indexPages = costs.numIndexPages;
-}
-
-/*
- * Parse and validate the reloptions
- */
-static bytea *
-hnsw_options(Datum reloptions, bool validate)
-{
-	static const relopt_parse_elt tab[] = {
-		{"dims", RELOPT_TYPE_INT, offsetof(HnswOptions, dims)},
-		{"maxelements", RELOPT_TYPE_INT, offsetof(HnswOptions, maxelements)},
-		{"efconstruction", RELOPT_TYPE_INT, offsetof(HnswOptions, efConstruction)},
-		{"efsearch", RELOPT_TYPE_INT, offsetof(HnswOptions, efSearch)},
-		{"m", RELOPT_TYPE_INT, offsetof(HnswOptions, M)}
-	};
-
-	return (bytea *) build_reloptions(reloptions, validate,
-									  hnsw_relopt_kind,
-									  sizeof(HnswOptions),
-									  tab, lengthof(tab));
-}
-
-/*
- * Validate catalog entries for the specified operator class
- */
-static bool
-hnsw_validate(Oid opclassoid)
-{
-	return true;
-}
-
-/*
- * Build the index for a logged table
- */
-static IndexBuildResult *
-hnsw_build(Relation heap, Relation index, IndexInfo *indexInfo)
-{
-	HierarchicalNSW* hnsw = hnsw_get_index(index, heap);
-	IndexBuildResult* result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
-	result->heap_tuples = result->index_tuples = hnsw_count(hnsw);
-
-	return result;
-}
-
-/*
- * Insert a tuple into the index
- */
-static bool
-hnsw_insert(Relation index, Datum *values, bool *isnull, ItemPointer heap_tid,
-			  Relation heap, IndexUniqueCheck checkUnique,
-			  bool indexUnchanged,
-			  IndexInfo *indexInfo)
-{
-	HierarchicalNSW* hnsw = hnsw_get_index(index, heap);
-	Datum value;
-	ArrayType* array;
-	int n_items;
-	label_t label = 0;
-
-	/* Skip nulls */
-	if (isnull[0])
-		return false;
-
-	/* Detoast value */
-	value = PointerGetDatum(PG_DETOAST_DATUM(values[0]));
-	array = DatumGetArrayTypeP(value);
-	n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array));
-	if (n_items != hnsw_dimensions(hnsw))
-	{
-		elog(ERROR, "Wrong number of dimensions: %d instead of %d expected",
-			 n_items, hnsw_dimensions(hnsw));
-	}
-	memcpy(&label, heap_tid, sizeof(*heap_tid));
-	if (!hnsw_add_point(hnsw, (coord_t*)ARR_DATA_PTR(array), label))
-		elog(ERROR, "HNSW index insert failed");
-	return true;
-}
-
-/*
- * Build the index for an unlogged table
- */
-static void
-hnsw_buildempty(Relation index)
-{
-	/* index will be constructed on dema nd when accessed */
-}
-
-/*
- * Clean up after a VACUUM operation
- */
-static IndexBulkDeleteResult *
-hnsw_vacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
-{
-	Relation	rel = info->index;
-
-	if (stats == NULL)
-		return NULL;
-
-	stats->num_pages = RelationGetNumberOfBlocks(rel);
-
-	return stats;
-}
-
-/*
- * Bulk delete tuples from the index
- */
-static IndexBulkDeleteResult *
-hnsw_bulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
-				IndexBulkDeleteCallback callback, void *callback_state)
-{
-	if (stats == NULL)
-		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
-	return stats;
-}
-
-/*
- * Define index handler
- *
- * See https://www.postgresql.org/docs/current/index-api.html
- */
-PGDLLEXPORT PG_FUNCTION_INFO_V1(hnsw_handler);
-Datum
-hnsw_handler(PG_FUNCTION_ARGS)
-{
-	IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
-
-	amroutine->amstrategies = 0;
-	amroutine->amsupport = 0;
-	amroutine->amoptsprocnum = 0;
-	amroutine->amcanorder = false;
-	amroutine->amcanorderbyop = true;
-	amroutine->amcanbackward = false;	/* can change direction mid-scan */
-	amroutine->amcanunique = false;
-	amroutine->amcanmulticol = false;
-	amroutine->amoptionalkey = true;
-	amroutine->amsearcharray = false;
-	amroutine->amsearchnulls = false;
-	amroutine->amstorage = false;
-	amroutine->amclusterable = false;
-	amroutine->ampredlocks = false;
-	amroutine->amcanparallel = false;
-	amroutine->amcaninclude = false;
-	amroutine->amusemaintenanceworkmem = false; /* not used during VACUUM */
-	amroutine->amparallelvacuumoptions = VACUUM_OPTION_PARALLEL_BULKDEL;
-	amroutine->amkeytype = InvalidOid;
-
-	/* Interface functions */
-	amroutine->ambuild = hnsw_build;
-	amroutine->ambuildempty = hnsw_buildempty;
-	amroutine->aminsert = hnsw_insert;
-	amroutine->ambulkdelete = hnsw_bulkdelete;
-	amroutine->amvacuumcleanup = hnsw_vacuumcleanup;
-	amroutine->amcanreturn = NULL;	/* tuple not included in heapsort */
-	amroutine->amcostestimate = hnsw_costestimate;
-	amroutine->amoptions = hnsw_options;
-	amroutine->amproperty = NULL;	/* TODO AMPROP_DISTANCE_ORDERABLE */
-	amroutine->ambuildphasename = NULL;
-	amroutine->amvalidate = hnsw_validate;
-	amroutine->amadjustmembers = NULL;
-	amroutine->ambeginscan = hnsw_beginscan;
-	amroutine->amrescan = hnsw_rescan;
-	amroutine->amgettuple = hnsw_gettuple;
-	amroutine->amgetbitmap = NULL;
-	amroutine->amendscan = hnsw_endscan;
-	amroutine->ammarkpos = NULL;
-	amroutine->amrestrpos = NULL;
-
-	/* Interface functions to support parallel index scans */
-	amroutine->amestimateparallelscan = NULL;
-	amroutine->aminitparallelscan = NULL;
-	amroutine->amparallelrescan = NULL;
-
-	PG_RETURN_POINTER(amroutine);
-}
-
-/*
- * Get the L2 distance between vectors
- */
-PGDLLEXPORT PG_FUNCTION_INFO_V1(l2_distance);
-Datum
-l2_distance(PG_FUNCTION_ARGS)
-{
-	ArrayType  *a = PG_GETARG_ARRAYTYPE_P(0);
-	ArrayType  *b = PG_GETARG_ARRAYTYPE_P(1);
-	int         a_dim = ArrayGetNItems(ARR_NDIM(a), ARR_DIMS(a));
-	int         b_dim = ArrayGetNItems(ARR_NDIM(b), ARR_DIMS(b));
-	dist_t 		distance = 0.0;
-	dist_t		diff;
-	coord_t	   *ax = (coord_t*)ARR_DATA_PTR(a);
-	coord_t	   *bx = (coord_t*)ARR_DATA_PTR(b);
-
-	if (a_dim != b_dim)
-	{
-		ereport(ERROR,
-				(errcode(ERRCODE_DATA_EXCEPTION),
-				 errmsg("different array dimensions %d and %d", a_dim, b_dim)));
-	}
-
-	for (int i = 0; i < a_dim; i++)
-	{
-		diff = ax[i] - bx[i];
-		distance += diff * diff;
-	}
-
-	PG_RETURN_FLOAT4((dist_t)sqrt(distance));
-}
diff --git a/pgxn/hnsw/hnsw.control b/pgxn/hnsw/hnsw.control
deleted file mode 100644
index fbfa1a5b47..0000000000
--- a/pgxn/hnsw/hnsw.control
+++ /dev/null
@@ -1,4 +0,0 @@
-comment = '** Deprecated ** Please use pg_embedding instead'
-default_version = '0.1.0'
-module_pathname = '$libdir/hnsw'
-relocatable = true
diff --git a/pgxn/hnsw/hnsw.h b/pgxn/hnsw/hnsw.h
deleted file mode 100644
index d4065ab8fe..0000000000
--- a/pgxn/hnsw/hnsw.h
+++ /dev/null
@@ -1,15 +0,0 @@
-#pragma once
-
-typedef float    coord_t;
-typedef float    dist_t;
-typedef uint32_t idx_t;
-typedef uint64_t label_t;
-
-typedef struct HierarchicalNSW HierarchicalNSW;
-
-bool hnsw_search(HierarchicalNSW* hnsw, const coord_t *point, size_t efSearch, size_t* n_results, label_t** results);
-bool hnsw_add_point(HierarchicalNSW* hnsw, const coord_t *point, label_t label);
-void hnsw_init(HierarchicalNSW* hnsw, size_t dim, size_t maxelements, size_t M, size_t maxM, size_t efConstruction);
-int  hnsw_dimensions(HierarchicalNSW* hnsw);
-size_t hnsw_count(HierarchicalNSW* hnsw);
-size_t hnsw_sizeof(void);
diff --git a/pgxn/hnsw/hnswalg.cpp b/pgxn/hnsw/hnswalg.cpp
deleted file mode 100644
index f6de3b8314..0000000000
--- a/pgxn/hnsw/hnswalg.cpp
+++ /dev/null
@@ -1,379 +0,0 @@
-#include "hnswalg.h"
-
-#if defined(__GNUC__)
-#define PORTABLE_ALIGN32 __attribute__((aligned(32)))
-#define PREFETCH(addr,hint) __builtin_prefetch(addr, 0, hint)
-#else
-#define PORTABLE_ALIGN32 __declspec(align(32))
-#define PREFETCH(addr,hint)
-#endif
-
-HierarchicalNSW::HierarchicalNSW(size_t dim_, size_t maxelements_, size_t M_, size_t maxM_, size_t efConstruction_)
-{
-    dim = dim_;
-    data_size = dim * sizeof(coord_t);
-
-    efConstruction = efConstruction_;
-
-    maxelements = maxelements_;
-    M = M_;
-    maxM = maxM_;
-    size_links_level0 = (maxM + 1) * sizeof(idx_t);
-    size_data_per_element = size_links_level0 + data_size  + sizeof(label_t);
-    offset_data = size_links_level0;
-	offset_label = offset_data + data_size;
-
-    enterpoint_node = 0;
-    cur_element_count = 0;
-#ifdef __x86_64__
-    use_avx2 = __builtin_cpu_supports("avx2");
-#endif
-}
-
-std::priority_queue<std::pair<dist_t, idx_t>> HierarchicalNSW::searchBaseLayer(const coord_t *point, size_t ef)
-{
-	std::vector<uint32_t> visited;
-	visited.resize((cur_element_count + 31) >> 5);
-
-    std::priority_queue<std::pair<dist_t, idx_t >> topResults;
-    std::priority_queue<std::pair<dist_t, idx_t >> candidateSet;
-
-    dist_t dist = fstdistfunc(point, getDataByInternalId(enterpoint_node));
-
-    topResults.emplace(dist, enterpoint_node);
-    candidateSet.emplace(-dist, enterpoint_node);
-    visited[enterpoint_node >> 5] = 1 << (enterpoint_node & 31);
-    dist_t lowerBound = dist;
-
-    while (!candidateSet.empty())
-    {
-        std::pair<dist_t, idx_t> curr_el_pair = candidateSet.top();
-        if (-curr_el_pair.first > lowerBound)
-            break;
-
-        candidateSet.pop();
-        idx_t curNodeNum = curr_el_pair.second;
-
-        idx_t* data = get_linklist0(curNodeNum);
-        size_t size = *data++;
-
-        PREFETCH(getDataByInternalId(*data), 0);
-
-        for (size_t j = 0; j < size; ++j) {
-            size_t tnum = *(data + j);
-
-            PREFETCH(getDataByInternalId(*(data + j + 1)), 0);
-
-            if (!(visited[tnum >> 5] & (1 << (tnum & 31)))) {
-				visited[tnum >> 5] |= 1 << (tnum & 31);
-
-                dist = fstdistfunc(point, getDataByInternalId(tnum));
-
-                if (topResults.top().first > dist || topResults.size() < ef) {
-                    candidateSet.emplace(-dist, tnum);
-
-                    PREFETCH(get_linklist0(candidateSet.top().second), 0);
-                    topResults.emplace(dist, tnum);
-
-                    if (topResults.size() > ef)
-                        topResults.pop();
-
-                    lowerBound = topResults.top().first;
-                }
-            }
-        }
-    }
-    return topResults;
-}
-
-
-void HierarchicalNSW::getNeighborsByHeuristic(std::priority_queue<std::pair<dist_t, idx_t>> &topResults, size_t NN)
-{
-    if (topResults.size() < NN)
-        return;
-
-    std::priority_queue<std::pair<dist_t, idx_t>> resultSet;
-    std::vector<std::pair<dist_t, idx_t>> returnlist;
-
-    while (topResults.size() > 0) {
-        resultSet.emplace(-topResults.top().first, topResults.top().second);
-        topResults.pop();
-    }
-
-    while (resultSet.size()) {
-        if (returnlist.size() >= NN)
-            break;
-        std::pair<dist_t, idx_t> curen = resultSet.top();
-        dist_t dist_to_query = -curen.first;
-        resultSet.pop();
-        bool good = true;
-        for (std::pair<dist_t, idx_t> curen2 : returnlist) {
-            dist_t curdist = fstdistfunc(getDataByInternalId(curen2.second),
-                                         getDataByInternalId(curen.second));
-            if (curdist < dist_to_query) {
-                good = false;
-                break;
-            }
-        }
-        if (good) returnlist.push_back(curen);
-    }
-    for (std::pair<dist_t, idx_t> elem : returnlist)
-        topResults.emplace(-elem.first, elem.second);
-}
-
-void HierarchicalNSW::mutuallyConnectNewElement(const coord_t *point, idx_t cur_c,
-                               std::priority_queue<std::pair<dist_t, idx_t>> topResults)
-{
-    getNeighborsByHeuristic(topResults, M);
-
-    std::vector<idx_t> res;
-    res.reserve(M);
-    while (topResults.size() > 0) {
-        res.push_back(topResults.top().second);
-        topResults.pop();
-    }
-    {
-        idx_t* data = get_linklist0(cur_c);
-        if (*data)
-            throw std::runtime_error("Should be blank");
-
-        *data++ = res.size();
-
-        for (size_t idx = 0; idx < res.size(); idx++) {
-            if (data[idx])
-                throw std::runtime_error("Should be blank");
-            data[idx] = res[idx];
-        }
-    }
-    for (size_t idx = 0; idx < res.size(); idx++) {
-        if (res[idx] == cur_c)
-            throw std::runtime_error("Connection to the same element");
-
-        size_t resMmax = maxM;
-        idx_t *ll_other = get_linklist0(res[idx]);
-        idx_t sz_link_list_other = *ll_other;
-
-        if (sz_link_list_other > resMmax || sz_link_list_other < 0)
-            throw std::runtime_error("Bad sz_link_list_other");
-
-        if (sz_link_list_other < resMmax) {
-            idx_t *data = ll_other + 1;
-            data[sz_link_list_other] = cur_c;
-            *ll_other = sz_link_list_other + 1;
-        } else {
-            // finding the "weakest" element to replace it with the new one
-            idx_t *data = ll_other + 1;
-            dist_t d_max = fstdistfunc(getDataByInternalId(cur_c), getDataByInternalId(res[idx]));
-            // Heuristic:
-            std::priority_queue<std::pair<dist_t, idx_t>> candidates;
-            candidates.emplace(d_max, cur_c);
-
-            for (size_t j = 0; j < sz_link_list_other; j++)
-                candidates.emplace(fstdistfunc(getDataByInternalId(data[j]), getDataByInternalId(res[idx])), data[j]);
-
-            getNeighborsByHeuristic(candidates, resMmax);
-
-            size_t indx = 0;
-            while (!candidates.empty()) {
-                data[indx] = candidates.top().second;
-                candidates.pop();
-                indx++;
-            }
-            *ll_other = indx;
-        }
-    }
-}
-
-void HierarchicalNSW::addPoint(const coord_t *point, label_t label)
-{
-    if (cur_element_count >= maxelements) {
-        throw std::runtime_error("The number of elements exceeds the specified limit");
-    }
-    idx_t cur_c = cur_element_count++;
-    memset((char *) get_linklist0(cur_c), 0, size_data_per_element);
-    memcpy(getDataByInternalId(cur_c), point, data_size);
-    memcpy(getExternalLabel(cur_c), &label, sizeof label);
-
-    // Do nothing for the first element
-    if (cur_c != 0) {
-        std::priority_queue <std::pair<dist_t, idx_t>> topResults = searchBaseLayer(point, efConstruction);
-        mutuallyConnectNewElement(point, cur_c, topResults);
-    }
-};
-
-std::priority_queue<std::pair<dist_t, label_t>> HierarchicalNSW::searchKnn(const coord_t *query, size_t k)
-{
-	std::priority_queue<std::pair<dist_t, label_t>> topResults;
-	auto topCandidates = searchBaseLayer(query, k);
-    while (topCandidates.size() > k) {
-        topCandidates.pop();
-	}
-	while (!topCandidates.empty()) {
-		std::pair<dist_t, idx_t> rez = topCandidates.top();
-		label_t label;
-		memcpy(&label, getExternalLabel(rez.second), sizeof(label));
-		topResults.push(std::pair<dist_t, label_t>(rez.first, label));
-		topCandidates.pop();
-	}
-
-    return topResults;
-};
-
-dist_t fstdistfunc_scalar(const coord_t *x, const coord_t *y, size_t n)
-{
-    dist_t 	distance = 0.0;
-
-    for (size_t i = 0; i < n; i++)
-    {
-        dist_t diff = x[i] - y[i];
-        distance += diff * diff;
-    }
-    return distance;
-
-}
-
-#ifdef __x86_64__
-#include <immintrin.h>
-
-__attribute__((target("avx2")))
-dist_t fstdistfunc_avx2(const coord_t *x, const coord_t *y, size_t n)
-{
-    const size_t TmpResSz = sizeof(__m256) / sizeof(float);
-    float PORTABLE_ALIGN32 TmpRes[TmpResSz];
-    size_t qty16 = n / 16;
-    const float *pEnd1 = x + (qty16 * 16);
-    __m256 diff, v1, v2;
-    __m256 sum = _mm256_set1_ps(0);
-
-    while (x < pEnd1) {
-        v1 = _mm256_loadu_ps(x);
-        x += 8;
-        v2 = _mm256_loadu_ps(y);
-        y += 8;
-        diff = _mm256_sub_ps(v1, v2);
-        sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff));
-
-        v1 = _mm256_loadu_ps(x);
-        x += 8;
-        v2 = _mm256_loadu_ps(y);
-        y += 8;
-        diff = _mm256_sub_ps(v1, v2);
-        sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff));
-    }
-    _mm256_store_ps(TmpRes, sum);
-    float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7];
-    return (res);
-}
-
-dist_t fstdistfunc_sse(const coord_t *x, const coord_t *y, size_t n)
-{
-    const size_t TmpResSz = sizeof(__m128) / sizeof(float);
-    float PORTABLE_ALIGN32 TmpRes[TmpResSz];
-    size_t qty16 = n / 16;
-    const float *pEnd1 = x + (qty16 * 16);
-
-    __m128 diff, v1, v2;
-    __m128 sum = _mm_set1_ps(0);
-
-    while (x < pEnd1) {
-        v1 = _mm_loadu_ps(x);
-        x += 4;
-        v2 = _mm_loadu_ps(y);
-        y += 4;
-        diff = _mm_sub_ps(v1, v2);
-        sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
-
-        v1 = _mm_loadu_ps(x);
-        x += 4;
-        v2 = _mm_loadu_ps(y);
-        y += 4;
-        diff = _mm_sub_ps(v1, v2);
-        sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
-
-        v1 = _mm_loadu_ps(x);
-        x += 4;
-        v2 = _mm_loadu_ps(y);
-        y += 4;
-        diff = _mm_sub_ps(v1, v2);
-        sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
-
-        v1 = _mm_loadu_ps(x);
-        x += 4;
-        v2 = _mm_loadu_ps(y);
-        y += 4;
-        diff = _mm_sub_ps(v1, v2);
-        sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
-    }
-    _mm_store_ps(TmpRes, sum);
-    float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
-    return res;
-}
-#endif
-
-dist_t HierarchicalNSW::fstdistfunc(const coord_t *x, const coord_t *y)
-{
-#ifndef __x86_64__
-    return fstdistfunc_scalar(x, y, dim);
-#else
-    if(use_avx2)
-        return fstdistfunc_avx2(x, y, dim);
-
-    return fstdistfunc_sse(x, y, dim);
-#endif
-}
-
-bool hnsw_search(HierarchicalNSW* hnsw, const coord_t *point, size_t efSearch, size_t* n_results, label_t** results)
-{
-	try
-	{
-		auto result = hnsw->searchKnn(point, efSearch);
-		size_t nResults = result.size();
-		*results = (label_t*)malloc(nResults*sizeof(label_t));
-		for (size_t i = nResults; i-- != 0;)
-		{
-			(*results)[i] = result.top().second;
-			result.pop();
-		}
-		*n_results = nResults;
-		return true;
-	}
-	catch (std::exception& x)
-	{
-		return false;
-	}
-}
-
-bool hnsw_add_point(HierarchicalNSW* hnsw, const coord_t *point, label_t label)
-{
-	try
-	{
-		hnsw->addPoint(point, label);
-		return true;
-	}
-	catch (std::exception& x)
-	{
-		fprintf(stderr, "Catch %s\n", x.what());
-		return false;
-	}
-}
-
-void hnsw_init(HierarchicalNSW* hnsw, size_t dims, size_t maxelements, size_t M, size_t maxM, size_t efConstruction)
-{
-	new ((void*)hnsw) HierarchicalNSW(dims, maxelements, M, maxM, efConstruction);
-}
-
-
-int hnsw_dimensions(HierarchicalNSW* hnsw)
-{
-	return (int)hnsw->dim;
-}
-
-size_t hnsw_count(HierarchicalNSW* hnsw)
-{
-	return hnsw->cur_element_count;
-}
-
-size_t hnsw_sizeof(void)
-{
-	return sizeof(HierarchicalNSW);
-}
diff --git a/pgxn/hnsw/hnswalg.h b/pgxn/hnsw/hnswalg.h
deleted file mode 100644
index f38aeac362..0000000000
--- a/pgxn/hnsw/hnswalg.h
+++ /dev/null
@@ -1,69 +0,0 @@
-#pragma once
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <stdint.h>
-#include <unordered_map>
-#include <unordered_set>
-#include <map>
-#include <cmath>
-#include <queue>
-#include <stdexcept>
-
-extern "C" {
-#include "hnsw.h"
-}
-
-struct HierarchicalNSW
-{
-	size_t maxelements;
-	size_t cur_element_count;
-
-	idx_t  enterpoint_node;
-
-	size_t dim;
-	size_t data_size;
-	size_t offset_data;
-	size_t offset_label;
-	size_t size_data_per_element;
-	size_t M;
-	size_t maxM;
-	size_t size_links_level0;
-	size_t efConstruction;
-
-#ifdef __x86_64__
-	bool	use_avx2;
-#endif
-
-	char   data_level0_memory[0]; // varying size
-
-  public:
-	HierarchicalNSW(size_t dim, size_t maxelements, size_t M, size_t maxM, size_t efConstruction);
-	~HierarchicalNSW();
-
-
-	inline coord_t *getDataByInternalId(idx_t internal_id) const {
-		return (coord_t *)&data_level0_memory[internal_id * size_data_per_element + offset_data];
-	}
-
-	inline idx_t *get_linklist0(idx_t internal_id) const {
-		return (idx_t*)&data_level0_memory[internal_id * size_data_per_element];
-	}
-
-	inline label_t *getExternalLabel(idx_t internal_id) const {
-		return (label_t *)&data_level0_memory[internal_id * size_data_per_element + offset_label];
-	}
-
-	std::priority_queue<std::pair<dist_t, idx_t>> searchBaseLayer(const coord_t *x, size_t ef);
-
-	void getNeighborsByHeuristic(std::priority_queue<std::pair<dist_t, idx_t>> &topResults, size_t NN);
-
-	void mutuallyConnectNewElement(const coord_t *x, idx_t id, std::priority_queue<std::pair<dist_t, idx_t>> topResults);
-
-	void addPoint(const coord_t *point, label_t label);
-
-	std::priority_queue<std::pair<dist_t, label_t>> searchKnn(const coord_t *query_data, size_t k);
-
-	dist_t fstdistfunc(const coord_t *x, const coord_t *y);
-};
diff --git a/pgxn/hnsw/test/expected/knn.out b/pgxn/hnsw/test/expected/knn.out
deleted file mode 100644
index a1cee4525e..0000000000
--- a/pgxn/hnsw/test/expected/knn.out
+++ /dev/null
@@ -1,28 +0,0 @@
-SET enable_seqscan = off;
-CREATE TABLE t (val real[]);
-INSERT INTO t (val) VALUES ('{0,0,0}'), ('{1,2,3}'), ('{1,1,1}'), (NULL);
-CREATE INDEX ON t USING hnsw (val) WITH (maxelements = 10, dims=3, m=3);
-INSERT INTO t (val) VALUES (array[1,2,4]);
-explain SELECT * FROM t ORDER BY val <-> array[3,3,3];
-                             QUERY PLAN                             
---------------------------------------------------------------------
- Index Scan using t_val_idx on t  (cost=4.02..8.06 rows=3 width=36)
-   Order By: (val <-> '{3,3,3}'::real[])
-(2 rows)
-
-SELECT * FROM t ORDER BY val <-> array[3,3,3];
-   val   
----------
- {1,2,3}
- {1,2,4}
- {1,1,1}
- {0,0,0}
-(4 rows)
-
-SELECT COUNT(*) FROM t;
- count 
--------
-     5
-(1 row)
-
-DROP TABLE t;
diff --git a/pgxn/hnsw/test/sql/knn.sql b/pgxn/hnsw/test/sql/knn.sql
deleted file mode 100644
index 0635bda4a2..0000000000
--- a/pgxn/hnsw/test/sql/knn.sql
+++ /dev/null
@@ -1,13 +0,0 @@
-SET enable_seqscan = off;
-
-CREATE TABLE t (val real[]);
-INSERT INTO t (val) VALUES ('{0,0,0}'), ('{1,2,3}'), ('{1,1,1}'), (NULL);
-CREATE INDEX ON t USING hnsw (val) WITH (maxelements = 10, dims=3, m=3);
-
-INSERT INTO t (val) VALUES (array[1,2,4]);
-
-explain SELECT * FROM t ORDER BY val <-> array[3,3,3];
-SELECT * FROM t ORDER BY val <-> array[3,3,3];
-SELECT COUNT(*) FROM t;
-
-DROP TABLE t;
diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml
index 2f63ee3acc..f362a45035 100644
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -106,6 +106,7 @@ jose-jwk = { version = "0.1.2", features = ["p256", "p384", "rsa"] }
 signature = "2"
 ecdsa = "0.16"
 p256 = { version = "0.13", features = ["jwk"] }
+ed25519-dalek = { version = "2", default-features = false, features = ["rand_core"] }
 rsa = "0.9"
 
 workspace_hack.workspace = true
diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs
index 5f65b17374..d7ffff0483 100644
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -187,10 +187,6 @@ pub async fn worker(
     let rx = futures::stream::poll_fn(move |cx| rx.poll_recv(cx));
     let rx = rx.map(RequestData::from);
 
-    let storage = GenericRemoteStorage::from_config(&remote_storage_config)
-        .await
-        .context("remote storage init")?;
-
     let properties = WriterProperties::builder()
         .set_data_page_size_limit(config.parquet_upload_page_size)
         .set_compression(config.parquet_upload_compression);
@@ -224,18 +220,18 @@ pub async fn worker(
         let rx_disconnect = futures::stream::poll_fn(move |cx| rx_disconnect.poll_recv(cx));
         let rx_disconnect = rx_disconnect.map(RequestData::from);
 
-        let storage_disconnect =
-            GenericRemoteStorage::from_config(&disconnect_events_storage_config)
-                .await
-                .context("remote storage for disconnect events init")?;
         let parquet_config_disconnect = parquet_config.clone();
         tokio::try_join!(
-            worker_inner(storage, rx, parquet_config),
-            worker_inner(storage_disconnect, rx_disconnect, parquet_config_disconnect)
+            worker_inner(remote_storage_config, rx, parquet_config),
+            worker_inner(
+                disconnect_events_storage_config,
+                rx_disconnect,
+                parquet_config_disconnect
+            )
         )
         .map(|_| ())
     } else {
-        worker_inner(storage, rx, parquet_config).await
+        worker_inner(remote_storage_config, rx, parquet_config).await
     }
 }
 
@@ -251,18 +247,32 @@ struct ParquetConfig {
     test_remote_failures: u64,
 }
 
+impl ParquetConfig {
+    async fn storage(
+        &self,
+        storage_config: &RemoteStorageConfig,
+    ) -> anyhow::Result<GenericRemoteStorage> {
+        let storage = GenericRemoteStorage::from_config(storage_config)
+            .await
+            .context("remote storage init")?;
+
+        #[cfg(any(test, feature = "testing"))]
+        if self.test_remote_failures > 0 {
+            return Ok(GenericRemoteStorage::unreliable_wrapper(
+                storage,
+                self.test_remote_failures,
+            ));
+        }
+
+        Ok(storage)
+    }
+}
+
 async fn worker_inner(
-    storage: GenericRemoteStorage,
+    storage_config: RemoteStorageConfig,
     rx: impl Stream<Item = RequestData>,
     config: ParquetConfig,
 ) -> anyhow::Result<()> {
-    #[cfg(any(test, feature = "testing"))]
-    let storage = if config.test_remote_failures > 0 {
-        GenericRemoteStorage::unreliable_wrapper(storage, config.test_remote_failures)
-    } else {
-        storage
-    };
-
     let mut rx = std::pin::pin!(rx);
 
     let mut rows = Vec::with_capacity(config.rows_per_group);
@@ -285,7 +295,7 @@ async fn worker_inner(
         }
         if len > config.file_size || force {
             last_upload = time::Instant::now();
-            let file = upload_parquet(w, len, &storage).await?;
+            let file = upload_parquet(w, len, &storage_config, &config).await?;
             w = SerializedFileWriter::new(file, schema.clone(), config.propeties.clone())?;
             len = 0;
         }
@@ -298,7 +308,7 @@ async fn worker_inner(
     }
 
     if !w.flushed_row_groups().is_empty() {
-        let _rtchk: Writer<BytesMut> = upload_parquet(w, len, &storage).await?;
+        let _rtchk: Writer<BytesMut> = upload_parquet(w, len, &storage_config, &config).await?;
     }
 
     Ok(())
@@ -340,7 +350,8 @@ where
 async fn upload_parquet(
     mut w: SerializedFileWriter<Writer<BytesMut>>,
     len: i64,
-    storage: &GenericRemoteStorage,
+    storage_config: &RemoteStorageConfig,
+    config: &ParquetConfig,
 ) -> anyhow::Result<Writer<BytesMut>> {
     let len_uncompressed = w
         .flushed_row_groups()
@@ -377,6 +388,15 @@ async fn upload_parquet(
         size, compression, "uploading request parquet file"
     );
 
+    // A bug in azure-sdk means that the identity-token-file that expires after
+    // 1 hour is not refreshed. This identity-token is used to fetch the actual azure storage
+    // tokens that last for 24 hours. After this 24 hour period, azure-sdk tries to refresh
+    // the storage token, but the identity token has now expired.
+    // <https://github.com/Azure/azure-sdk-for-rust/issues/1739>
+    //
+    // To work around this, we recreate the storage every time.
+    let storage = config.storage(storage_config).await?;
+
     let year = now.year();
     let month = now.month();
     let day = now.day();
@@ -431,8 +451,8 @@ mod tests {
     use rand::rngs::StdRng;
     use rand::{Rng, SeedableRng};
     use remote_storage::{
-        GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind, S3Config,
-        DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
+        RemoteStorageConfig, RemoteStorageKind, S3Config, DEFAULT_MAX_KEYS_PER_LIST_RESPONSE,
+        DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
     };
     use tokio::sync::mpsc;
     use tokio::time;
@@ -559,12 +579,11 @@ mod tests {
             timeout: std::time::Duration::from_secs(120),
             small_timeout: std::time::Duration::from_secs(30),
         };
-        let storage = GenericRemoteStorage::from_config(&remote_storage_config)
+
+        worker_inner(remote_storage_config, rx, config)
             .await
             .unwrap();
 
-        worker_inner(storage, rx, config).await.unwrap();
-
         let mut files = WalkDir::new(tmpdir.as_std_path())
             .into_iter()
             .filter_map(|entry| entry.ok())
diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs
index b398c3ddd0..6d5fb13681 100644
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -3,9 +3,9 @@ use std::sync::Arc;
 use std::time::Duration;
 
 use async_trait::async_trait;
+use ed25519_dalek::SigningKey;
 use hyper_util::rt::{TokioExecutor, TokioIo, TokioTimer};
-use p256::ecdsa::SigningKey;
-use p256::elliptic_curve::JwkEcKey;
+use jose_jwk::jose_b64;
 use rand::rngs::OsRng;
 use tokio::net::{lookup_host, TcpStream};
 use tracing::field::display;
@@ -354,9 +354,15 @@ impl PoolingBackend {
     }
 }
 
-fn create_random_jwk() -> (SigningKey, JwkEcKey) {
-    let key = SigningKey::random(&mut OsRng);
-    let jwk = p256::PublicKey::from(key.verifying_key()).to_jwk();
+fn create_random_jwk() -> (SigningKey, jose_jwk::Key) {
+    let key = SigningKey::generate(&mut OsRng);
+
+    let jwk = jose_jwk::Key::Okp(jose_jwk::Okp {
+        crv: jose_jwk::OkpCurves::Ed25519,
+        x: jose_b64::serde::Bytes::from(key.verifying_key().to_bytes().to_vec()),
+        d: None,
+    });
+
     (key, jwk)
 }
 
diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs
index c51a2bc9ba..fe33f0ff65 100644
--- a/proxy/src/serverless/local_conn_pool.rs
+++ b/proxy/src/serverless/local_conn_pool.rs
@@ -16,17 +16,16 @@ use std::sync::Arc;
 use std::task::{ready, Poll};
 use std::time::Duration;
 
+use ed25519_dalek::{Signature, Signer, SigningKey};
 use futures::future::poll_fn;
 use futures::Future;
 use indexmap::IndexMap;
 use jose_jwk::jose_b64::base64ct::{Base64UrlUnpadded, Encoding};
-use p256::ecdsa::{Signature, SigningKey};
 use parking_lot::RwLock;
 use postgres_client::tls::NoTlsStream;
 use postgres_client::types::ToSql;
 use postgres_client::AsyncMessage;
 use serde_json::value::RawValue;
-use signature::Signer;
 use tokio::net::TcpStream;
 use tokio::time::Instant;
 use tokio_util::sync::CancellationToken;
@@ -42,7 +41,7 @@ use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo};
 use crate::metrics::Metrics;
 
 pub(crate) const EXT_NAME: &str = "pg_session_jwt";
-pub(crate) const EXT_VERSION: &str = "0.1.2";
+pub(crate) const EXT_VERSION: &str = "0.2.0";
 pub(crate) const EXT_SCHEMA: &str = "auth";
 
 #[derive(Clone)]
@@ -339,8 +338,8 @@ fn sign_jwt(sk: &SigningKey, payload: &[u8]) -> String {
     let cap = jwt.capacity();
 
     // we only need an empty header with the alg specified.
-    // base64url(r#"{"alg":"ES256"}"#) == "eyJhbGciOiJFUzI1NiJ9"
-    jwt.push_str("eyJhbGciOiJFUzI1NiJ9.");
+    // base64url(r#"{"alg":"EdDSA"}"#) == "eyJhbGciOiJFZERTQSJ9"
+    jwt.push_str("eyJhbGciOiJFZERTQSJ9.");
 
     // encode the jwt payload in-place
     base64::encode_config_buf(payload, base64::URL_SAFE_NO_PAD, &mut jwt);
@@ -366,14 +365,14 @@ fn sign_jwt(sk: &SigningKey, payload: &[u8]) -> String {
 #[cfg(test)]
 #[expect(clippy::unwrap_used)]
 mod tests {
-    use p256::ecdsa::SigningKey;
+    use ed25519_dalek::SigningKey;
     use typed_json::json;
 
     use super::resign_jwt;
 
     #[test]
     fn jwt_token_snapshot() {
-        let key = SigningKey::from_bytes(&[1; 32].into()).unwrap();
+        let key = SigningKey::from_bytes(&[1; 32]);
         let data =
             json!({"foo":"bar","jti":"foo\nbar","nested":{"jti":"tricky nesting"}}).to_string();
 
@@ -381,12 +380,17 @@ mod tests {
 
         // To validate the JWT, copy the JWT string and paste it into https://jwt.io/.
         // In the public-key box, paste the following jwk public key
-        // `{"kty":"EC","crv":"P-256","x":"b_A7lJJBzh2t1DUZ5pYOCoW0GmmgXDKBA6orzhWUyhY","y":"PE91OlW_AdxT9sCwx-7ni0DG_30lqW4igrmJzvccFEo"}`
+        // `{"kty":"OKP","crv":"Ed25519","x":"iojj3XQJ8ZX9UtstPLpdcspnCb8dlBIb83SIAbQPb1w"}`
+        // Note - jwt.io doesn't support EdDSA :(
+        // https://github.com/jsonwebtoken/jsonwebtoken.github.io/issues/509
 
-        // let pub_key = p256::ecdsa::VerifyingKey::from(&key);
-        // let pub_key = p256::PublicKey::from(pub_key);
-        // println!("{}", pub_key.to_jwk_string());
+        // let jwk = jose_jwk::Key::Okp(jose_jwk::Okp {
+        //     crv: jose_jwk::OkpCurves::Ed25519,
+        //     x: jose_jwk::jose_b64::serde::Bytes::from(key.verifying_key().to_bytes().to_vec()),
+        //     d: None,
+        // });
+        // println!("{}", serde_json::to_string(&jwk).unwrap());
 
-        assert_eq!(jwt, "eyJhbGciOiJFUzI1NiJ9.eyJmb28iOiJiYXIiLCJqdGkiOjIsIm5lc3RlZCI6eyJqdGkiOiJ0cmlja3kgbmVzdGluZyJ9fQ.pYf0LxoJ8sDgpmsYOgrbNecOSipnPBEGwnZzB-JhW2cONrKlqRsgXwK8_cOsyolGy-hTTe8GXbWTl_UdpF5RyA");
+        assert_eq!(jwt, "eyJhbGciOiJFZERTQSJ9.eyJmb28iOiJiYXIiLCJqdGkiOjIsIm5lc3RlZCI6eyJqdGkiOiJ0cmlja3kgbmVzdGluZyJ9fQ.Cvyc2By33KI0f0obystwdy8PN111L3Sc9_Mr2CU3XshtSqSdxuRxNEZGbb_RvyJf2IzheC_s7aBZ-jLeQ9N0Bg");
     }
 }
diff --git a/safekeeper/benches/receive_wal.rs b/safekeeper/benches/receive_wal.rs
index 996c4d9b8c..19c6662e74 100644
--- a/safekeeper/benches/receive_wal.rs
+++ b/safekeeper/benches/receive_wal.rs
@@ -21,14 +21,13 @@ const KB: usize = 1024;
 const MB: usize = 1024 * KB;
 const GB: usize = 1024 * MB;
 
-/// Use jemalloc, and configure it to sample allocations for profiles every 1 MB.
-/// This mirrors the configuration in bin/safekeeper.rs.
+/// Use jemalloc and enable profiling, to mirror bin/safekeeper.rs.
 #[global_allocator]
 static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
 
 #[allow(non_upper_case_globals)]
 #[export_name = "malloc_conf"]
-pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0";
+pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0";
 
 // Register benchmarks with Criterion.
 criterion_group!(
diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs
index 13f6e34575..bc7af02185 100644
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -51,10 +51,12 @@ use utils::{
 #[global_allocator]
 static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
 
-/// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20).
+/// Configure jemalloc to profile heap allocations by sampling stack traces every 2 MB (1 << 21).
+/// This adds roughly 3% overhead for allocations on average, which is acceptable considering
+/// performance-sensitive code will avoid allocations as far as possible anyway.
 #[allow(non_upper_case_globals)]
 #[export_name = "malloc_conf"]
-pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0";
+pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0";
 
 const PID_FILE_NAME: &str = "safekeeper.pid";
 const ID_FILE_NAME: &str = "safekeeper.id";
diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml
index 5f3319512d..caaa22d0a5 100644
--- a/storage_controller/Cargo.toml
+++ b/storage_controller/Cargo.toml
@@ -55,4 +55,4 @@ r2d2 = { version = "0.8.10" }
 utils = { path = "../libs/utils/" }
 metrics = { path = "../libs/metrics/" }
 control_plane = { path = "../control_plane" }
-workspace_hack = { version = "0.1", path = "../workspace_hack" }
+workspace_hack = { version = "0.1", path = "../workspace_hack" }
\ No newline at end of file
diff --git a/storage_controller/migrations/2025-01-09-160454_safekeepers_remove_active/down.sql b/storage_controller/migrations/2025-01-09-160454_safekeepers_remove_active/down.sql
new file mode 100644
index 0000000000..c2624f858b
--- /dev/null
+++ b/storage_controller/migrations/2025-01-09-160454_safekeepers_remove_active/down.sql
@@ -0,0 +1,4 @@
+-- this sadly isn't a "true" revert of the migration, as the column is now at the end of the table.
+-- But preserving order is not a trivial operation.
+-- https://wiki.postgresql.org/wiki/Alter_column_position
+ALTER TABLE safekeepers ADD active BOOLEAN NOT NULL DEFAULT false;
diff --git a/storage_controller/migrations/2025-01-09-160454_safekeepers_remove_active/up.sql b/storage_controller/migrations/2025-01-09-160454_safekeepers_remove_active/up.sql
new file mode 100644
index 0000000000..d76f044eda
--- /dev/null
+++ b/storage_controller/migrations/2025-01-09-160454_safekeepers_remove_active/up.sql
@@ -0,0 +1 @@
+ALTER TABLE safekeepers DROP active;
diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs
index 69db48f8d1..3884a6df46 100644
--- a/storage_controller/src/compute_hook.rs
+++ b/storage_controller/src/compute_hook.rs
@@ -124,7 +124,10 @@ impl ComputeHookTenant {
                 if let Some(shard_idx) = shard_idx {
                     sharded.shards.remove(shard_idx);
                 } else {
-                    tracing::warn!("Shard not found while handling detach")
+                    // This is a valid but niche case, where the tenant was previously attached
+                    // as a Secondary location and then detached, so has no previously notified
+                    // state.
+                    tracing::info!("Shard not found while handling detach")
                 }
             }
             ComputeHookTenant::Unsharded(_) => {
@@ -761,7 +764,10 @@ impl ComputeHook {
         let mut state_locked = self.state.lock().unwrap();
         match state_locked.entry(tenant_shard_id.tenant_id) {
             Entry::Vacant(_) => {
-                tracing::warn!("Compute hook tenant not found for detach");
+                // This is a valid but niche case, where the tenant was previously attached
+                // as a Secondary location and then detached, so has no previously notified
+                // state.
+                tracing::info!("Compute hook tenant not found for detach");
             }
             Entry::Occupied(mut e) => {
                 let sharded = e.get().is_sharded();
diff --git a/storage_controller/src/drain_utils.rs b/storage_controller/src/drain_utils.rs
index 47f4276ff2..8b7be88078 100644
--- a/storage_controller/src/drain_utils.rs
+++ b/storage_controller/src/drain_utils.rs
@@ -112,7 +112,7 @@ impl TenantShardDrain {
             }
         }
 
-        match scheduler.node_preferred(tenant_shard.intent.get_secondary()) {
+        match tenant_shard.preferred_secondary(scheduler) {
             Some(node) => Some(node),
             None => {
                 tracing::warn!(
diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index 5385e4ee0b..c8df4ffe28 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -690,7 +690,8 @@ async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError
     };
 
     let state = get_state(&req);
-    let nodes = state.service.node_list().await?;
+    let mut nodes = state.service.node_list().await?;
+    nodes.sort_by_key(|n| n.get_id());
     let api_nodes = nodes.into_iter().map(|n| n.describe()).collect::<Vec<_>>();
 
     json_response(StatusCode::OK, api_nodes)
@@ -1005,6 +1006,29 @@ async fn handle_tenant_shard_migrate(
     )
 }
 
+async fn handle_tenant_shard_migrate_secondary(
+    service: Arc<Service>,
+    req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
+    let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
+    let migrate_req = json_request::<TenantShardMigrateRequest>(&mut req).await?;
+    json_response(
+        StatusCode::OK,
+        service
+            .tenant_shard_migrate_secondary(tenant_shard_id, migrate_req)
+            .await?,
+    )
+}
+
 async fn handle_tenant_shard_cancel_reconcile(
     service: Arc<Service>,
     req: Request<Body>,
@@ -1855,6 +1879,16 @@ pub fn make_router(
                 RequestName("control_v1_tenant_migrate"),
             )
         })
+        .put(
+            "/control/v1/tenant/:tenant_shard_id/migrate_secondary",
+            |r| {
+                tenant_service_handler(
+                    r,
+                    handle_tenant_shard_migrate_secondary,
+                    RequestName("control_v1_tenant_migrate_secondary"),
+                )
+            },
+        )
         .put(
             "/control/v1/tenant/:tenant_shard_id/cancel_reconcile",
             |r| {
diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs
index 6d5885eba6..4164e3dc2b 100644
--- a/storage_controller/src/metrics.rs
+++ b/storage_controller/src/metrics.rs
@@ -53,6 +53,16 @@ pub(crate) struct StorageControllerMetricGroup {
     /// How many shards are not scheduled into their preferred AZ
     pub(crate) storage_controller_schedule_az_violation: measured::Gauge,
 
+    /// How many shard locations (secondary or attached) on each node
+    pub(crate) storage_controller_node_shards: measured::GaugeVec<NodeLabelGroupSet>,
+
+    /// How many _attached_ shard locations on each node
+    pub(crate) storage_controller_node_attached_shards: measured::GaugeVec<NodeLabelGroupSet>,
+
+    /// How many _home_ shard locations on each node (i.e. the node's AZ matches the shard's
+    /// preferred AZ)
+    pub(crate) storage_controller_node_home_shards: measured::GaugeVec<NodeLabelGroupSet>,
+
     /// How many shards would like to reconcile but were blocked by concurrency limits
     pub(crate) storage_controller_pending_reconciles: measured::Gauge,
 
@@ -132,6 +142,15 @@ impl Default for StorageControllerMetrics {
     }
 }
 
+#[derive(measured::LabelGroup, Clone)]
+#[label(set = NodeLabelGroupSet)]
+pub(crate) struct NodeLabelGroup<'a> {
+    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
+    pub(crate) az: &'a str,
+    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
+    pub(crate) node_id: &'a str,
+}
+
 #[derive(measured::LabelGroup)]
 #[label(set = ReconcileCompleteLabelGroupSet)]
 pub(crate) struct ReconcileCompleteLabelGroup {
diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs
index 4cc9b0070d..f5c2d329e0 100644
--- a/storage_controller/src/node.rs
+++ b/storage_controller/src/node.rs
@@ -299,6 +299,7 @@ impl Node {
             id: self.id,
             availability: self.availability.clone().into(),
             scheduling: self.scheduling,
+            availability_zone_id: self.availability_zone_id.0.clone(),
             listen_http_addr: self.listen_http_addr.clone(),
             listen_http_port: self.listen_http_port,
             listen_pg_addr: self.listen_pg_addr.clone(),
diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs
index cebf3e9594..eb0bfc879e 100644
--- a/storage_controller/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
@@ -708,10 +708,11 @@ impl Persistence {
         Ok(())
     }
 
+    /// Note that passing None for a shard clears the preferred AZ (rather than leaving it unmodified)
     pub(crate) async fn set_tenant_shard_preferred_azs(
         &self,
-        preferred_azs: Vec<(TenantShardId, AvailabilityZone)>,
-    ) -> DatabaseResult<Vec<(TenantShardId, AvailabilityZone)>> {
+        preferred_azs: Vec<(TenantShardId, Option<AvailabilityZone>)>,
+    ) -> DatabaseResult<Vec<(TenantShardId, Option<AvailabilityZone>)>> {
         use crate::schema::tenant_shards::dsl::*;
 
         self.with_measured_conn(DatabaseOperation::SetPreferredAzs, move |conn| {
@@ -722,7 +723,7 @@ impl Persistence {
                     .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
                     .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
                     .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
-                    .set(preferred_az_id.eq(preferred_az.0.clone()))
+                    .set(preferred_az_id.eq(preferred_az.as_ref().map(|az| az.0.clone())))
                     .execute(conn)?;
 
                 if updated == 1 {
@@ -1258,7 +1259,6 @@ pub(crate) struct SafekeeperPersistence {
     pub(crate) version: i64,
     pub(crate) host: String,
     pub(crate) port: i32,
-    pub(crate) active: bool,
     pub(crate) http_port: i32,
     pub(crate) availability_zone_id: String,
     pub(crate) scheduling_policy: String,
@@ -1270,7 +1270,6 @@ impl SafekeeperPersistence {
             SkSchedulingPolicy::from_str(&self.scheduling_policy).map_err(|e| {
                 DatabaseError::Logical(format!("can't construct SkSchedulingPolicy: {e:?}"))
             })?;
-        // omit the `active` flag on purpose: it is deprecated.
         Ok(SafekeeperDescribeResponse {
             id: NodeId(self.id as u64),
             region_id: self.region_id.clone(),
@@ -1295,7 +1294,8 @@ pub(crate) struct SafekeeperUpsert {
     pub(crate) version: i64,
     pub(crate) host: String,
     pub(crate) port: i32,
-    pub(crate) active: bool,
+    /// The active flag will not be stored in the database and will be ignored.
+    pub(crate) active: Option<bool>,
     pub(crate) http_port: i32,
     pub(crate) availability_zone_id: String,
 }
@@ -1311,7 +1311,6 @@ impl SafekeeperUpsert {
             version: self.version,
             host: &self.host,
             port: self.port,
-            active: self.active,
             http_port: self.http_port,
             availability_zone_id: &self.availability_zone_id,
             // None means a wish to not update this column. We expose abilities to update it via other means.
@@ -1328,7 +1327,6 @@ struct InsertUpdateSafekeeper<'a> {
     version: i64,
     host: &'a str,
     port: i32,
-    active: bool,
     http_port: i32,
     availability_zone_id: &'a str,
     scheduling_policy: Option<&'a str>,
diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs
index e0a854fff7..adced3b77d 100644
--- a/storage_controller/src/reconciler.rs
+++ b/storage_controller/src/reconciler.rs
@@ -696,6 +696,11 @@ impl Reconciler {
     /// First we apply special case handling (e.g. for live migrations), and then a
     /// general case reconciliation where we walk through the intent by pageserver
     /// and call out to the pageserver to apply the desired state.
+    ///
+    /// An Ok(()) result indicates that we successfully attached the tenant, but _not_ that
+    /// all locations for the tenant are in the expected state. When nodes that are to be detached
+    /// or configured as secondary are unavailable, we may return Ok(()) but leave the shard in a
+    /// state where it still requires later reconciliation.
     pub(crate) async fn reconcile(&mut self) -> Result<(), ReconcileError> {
         // Prepare: if we have uncertain `observed` state for our would-be attachement location, then refresh it
         self.maybe_refresh_observed().await?;
@@ -784,10 +789,18 @@ impl Reconciler {
                     tracing::info!(node_id=%node.get_id(), "Observed configuration already correct.")
                 }
                 _ => {
-                    // In all cases other than a matching observed configuration, we will
-                    // reconcile this location.
-                    tracing::info!(node_id=%node.get_id(), "Observed configuration requires update.");
-                    changes.push((node.clone(), wanted_conf))
+                    // Only try and configure secondary locations on nodes that are available.  This
+                    // allows the reconciler to "succeed" while some secondaries are offline (e.g. after
+                    // a node failure, where the failed node will have a secondary intent)
+                    if node.is_available() {
+                        tracing::info!(node_id=%node.get_id(), "Observed configuration requires update.");
+                        changes.push((node.clone(), wanted_conf))
+                    } else {
+                        tracing::info!(node_id=%node.get_id(), "Skipping configuration as secondary, node is unavailable");
+                        self.observed
+                            .locations
+                            .insert(node.get_id(), ObservedStateLocation { conf: None });
+                    }
                 }
             }
         }
@@ -813,7 +826,21 @@ impl Reconciler {
             if self.cancel.is_cancelled() {
                 return Err(ReconcileError::Cancel);
             }
-            self.location_config(&node, conf, None, false).await?;
+            // We only try to configure secondary locations if the node is available.  This does
+            // not stop us succeeding with the reconcile, because our core goal is to make the
+            // shard _available_ (the attached location), and configuring secondary locations
+            // can be done lazily when the node becomes available (via background reconciliation).
+            if node.is_available() {
+                self.location_config(&node, conf, None, false).await?;
+            } else {
+                // If the node is unavailable, we skip it and still consider the reconciliation
+                // successful. This is a common case: when a pageserver is marked unavailable, we
+                // demote a location on that unavailable pageserver to secondary.
+                tracing::info!("Skipping configuring secondary location {node}, it is unavailable");
+                self.observed
+                    .locations
+                    .insert(node.get_id(), ObservedStateLocation { conf: None });
+            }
         }
 
         // The condition below identifies a detach. We must have no attached intent and
diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs
index 51a4cf35be..f5cab9dd57 100644
--- a/storage_controller/src/scheduler.rs
+++ b/storage_controller/src/scheduler.rs
@@ -1,4 +1,4 @@
-use crate::{node::Node, tenant_shard::TenantShard};
+use crate::{metrics::NodeLabelGroup, node::Node, tenant_shard::TenantShard};
 use itertools::Itertools;
 use pageserver_api::{controller_api::AvailabilityZone, models::PageserverUtilization};
 use serde::Serialize;
@@ -32,6 +32,9 @@ pub(crate) struct SchedulerNode {
     shard_count: usize,
     /// How many shards are currently attached on this node, via their [`crate::tenant_shard::IntentState`].
     attached_shard_count: usize,
+    /// How many shards have a location on this node (via [`crate::tenant_shard::IntentState`]) _and_ this node
+    /// is in their preferred AZ (i.e. this is their 'home' location)
+    home_shard_count: usize,
     /// Availability zone id in which the node resides
     az: AvailabilityZone,
 
@@ -47,6 +50,12 @@ pub(crate) trait NodeSchedulingScore: Debug + Ord + Copy + Sized {
         preferred_az: &Option<AvailabilityZone>,
         context: &ScheduleContext,
     ) -> Option<Self>;
+
+    /// Return a score that drops any components based on node utilization: this is useful
+    /// for finding scores for scheduling optimisation, when we want to avoid rescheduling
+    /// shards due to e.g. disk usage, to avoid flapping.
+    fn for_optimization(&self) -> Self;
+
     fn is_overloaded(&self) -> bool;
     fn node_id(&self) -> NodeId;
 }
@@ -136,17 +145,13 @@ impl PartialOrd for SecondaryAzMatch {
 /// Ordering is given by member declaration order (top to bottom).
 #[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
 pub(crate) struct NodeAttachmentSchedulingScore {
-    /// The number of shards belonging to the tenant currently being
-    /// scheduled that are attached to this node.
-    affinity_score: AffinityScore,
     /// Flag indicating whether this node matches the preferred AZ
     /// of the shard. For equal affinity scores, nodes in the matching AZ
     /// are considered first.
     az_match: AttachmentAzMatch,
-    /// Size of [`ScheduleContext::attached_nodes`] for the current node.
-    /// This normally tracks the number of attached shards belonging to the
-    /// tenant being scheduled that are already on this node.
-    attached_shards_in_context: usize,
+    /// The number of shards belonging to the tenant currently being
+    /// scheduled that are attached to this node.
+    affinity_score: AffinityScore,
     /// Utilisation score that combines shard count and disk utilisation
     utilization_score: u64,
     /// Total number of shards attached to this node. When nodes have identical utilisation, this
@@ -177,13 +182,25 @@ impl NodeSchedulingScore for NodeAttachmentSchedulingScore {
                 .copied()
                 .unwrap_or(AffinityScore::FREE),
             az_match: AttachmentAzMatch(AzMatch::new(&node.az, preferred_az.as_ref())),
-            attached_shards_in_context: context.attached_nodes.get(node_id).copied().unwrap_or(0),
             utilization_score: utilization.cached_score(),
             total_attached_shard_count: node.attached_shard_count,
             node_id: *node_id,
         })
     }
 
+    /// For use in scheduling optimisation, where we only want to consider the aspects
+    /// of the score that can only be resolved by moving things (such as inter-shard affinity
+    /// and AZ affinity), and ignore aspects that reflect the total utilization of a node (which
+    /// can fluctuate for other reasons)
+    fn for_optimization(&self) -> Self {
+        Self {
+            utilization_score: 0,
+            total_attached_shard_count: 0,
+            node_id: NodeId(0),
+            ..*self
+        }
+    }
+
     fn is_overloaded(&self) -> bool {
         PageserverUtilization::is_overloaded(self.utilization_score)
     }
@@ -208,9 +225,9 @@ pub(crate) struct NodeSecondarySchedulingScore {
     affinity_score: AffinityScore,
     /// Utilisation score that combines shard count and disk utilisation
     utilization_score: u64,
-    /// Total number of shards attached to this node. When nodes have identical utilisation, this
-    /// acts as an anti-affinity between attached shards.
-    total_attached_shard_count: usize,
+    /// Anti-affinity with other non-home locations: this gives the behavior that secondaries
+    /// will spread out across the nodes in an AZ.
+    total_non_home_shard_count: usize,
     /// Convenience to make selection deterministic in tests and empty systems
     node_id: NodeId,
 }
@@ -237,11 +254,20 @@ impl NodeSchedulingScore for NodeSecondarySchedulingScore {
                 .copied()
                 .unwrap_or(AffinityScore::FREE),
             utilization_score: utilization.cached_score(),
-            total_attached_shard_count: node.attached_shard_count,
+            total_non_home_shard_count: (node.shard_count - node.home_shard_count),
             node_id: *node_id,
         })
     }
 
+    fn for_optimization(&self) -> Self {
+        Self {
+            utilization_score: 0,
+            total_non_home_shard_count: 0,
+            node_id: NodeId(0),
+            ..*self
+        }
+    }
+
     fn is_overloaded(&self) -> bool {
         PageserverUtilization::is_overloaded(self.utilization_score)
     }
@@ -293,6 +319,10 @@ impl AffinityScore {
     pub(crate) fn inc(&mut self) {
         self.0 += 1;
     }
+
+    pub(crate) fn dec(&mut self) {
+        self.0 -= 1;
+    }
 }
 
 impl std::ops::Add for AffinityScore {
@@ -324,9 +354,6 @@ pub(crate) struct ScheduleContext {
     /// Sparse map of nodes: omitting a node implicitly makes its affinity [`AffinityScore::FREE`]
     pub(crate) nodes: HashMap<NodeId, AffinityScore>,
 
-    /// Specifically how many _attached_ locations are on each node
-    pub(crate) attached_nodes: HashMap<NodeId, usize>,
-
     pub(crate) mode: ScheduleMode,
 }
 
@@ -334,7 +361,6 @@ impl ScheduleContext {
     pub(crate) fn new(mode: ScheduleMode) -> Self {
         Self {
             nodes: HashMap::new(),
-            attached_nodes: HashMap::new(),
             mode,
         }
     }
@@ -348,25 +374,31 @@ impl ScheduleContext {
         }
     }
 
-    pub(crate) fn push_attached(&mut self, node_id: NodeId) {
-        let entry = self.attached_nodes.entry(node_id).or_default();
-        *entry += 1;
-    }
-
-    pub(crate) fn get_node_affinity(&self, node_id: NodeId) -> AffinityScore {
-        self.nodes
-            .get(&node_id)
-            .copied()
-            .unwrap_or(AffinityScore::FREE)
-    }
-
-    pub(crate) fn get_node_attachments(&self, node_id: NodeId) -> usize {
-        self.attached_nodes.get(&node_id).copied().unwrap_or(0)
+    /// Remove `shard`'s contributions to this context.  This is useful when considering scheduling
+    /// this shard afresh, where we don't want it to e.g. experience anti-affinity to its current location.
+    pub(crate) fn project_detach(&self, shard: &TenantShard) -> Self {
+        let mut new_context = self.clone();
+
+        if let Some(attached) = shard.intent.get_attached() {
+            if let Some(score) = new_context.nodes.get_mut(attached) {
+                score.dec();
+            }
+        }
+
+        for secondary in shard.intent.get_secondary() {
+            if let Some(score) = new_context.nodes.get_mut(secondary) {
+                score.dec();
+            }
+        }
+
+        new_context
     }
 
+    /// For tests, track the sum of AffinityScore values, which is effectively how many
+    /// attached or secondary locations have been registered with this context.
     #[cfg(test)]
-    pub(crate) fn attach_count(&self) -> usize {
-        self.attached_nodes.values().sum()
+    pub(crate) fn location_count(&self) -> usize {
+        self.nodes.values().map(|i| i.0).sum()
     }
 }
 
@@ -388,6 +420,7 @@ impl Scheduler {
                 SchedulerNode {
                     shard_count: 0,
                     attached_shard_count: 0,
+                    home_shard_count: 0,
                     may_schedule: node.may_schedule(),
                     az: node.get_availability_zone_id().clone(),
                 },
@@ -415,6 +448,7 @@ impl Scheduler {
                 SchedulerNode {
                     shard_count: 0,
                     attached_shard_count: 0,
+                    home_shard_count: 0,
                     may_schedule: node.may_schedule(),
                     az: node.get_availability_zone_id().clone(),
                 },
@@ -427,6 +461,9 @@ impl Scheduler {
                     Some(node) => {
                         node.shard_count += 1;
                         node.attached_shard_count += 1;
+                        if Some(&node.az) == shard.preferred_az() {
+                            node.home_shard_count += 1;
+                        }
                     }
                     None => anyhow::bail!(
                         "Tenant {} references nonexistent node {}",
@@ -438,7 +475,12 @@ impl Scheduler {
 
             for node_id in shard.intent.get_secondary() {
                 match expect_nodes.get_mut(node_id) {
-                    Some(node) => node.shard_count += 1,
+                    Some(node) => {
+                        node.shard_count += 1;
+                        if Some(&node.az) == shard.preferred_az() {
+                            node.home_shard_count += 1;
+                        }
+                    }
                     None => anyhow::bail!(
                         "Tenant {} references nonexistent node {}",
                         shard.tenant_shard_id,
@@ -482,13 +524,20 @@ impl Scheduler {
     ///
     /// It is an error to call this for a node that is not known to the scheduler (i.e. passed into
     /// [`Self::new`] or [`Self::node_upsert`])
-    pub(crate) fn update_node_ref_counts(&mut self, node_id: NodeId, update: RefCountUpdate) {
+    pub(crate) fn update_node_ref_counts(
+        &mut self,
+        node_id: NodeId,
+        preferred_az: Option<&AvailabilityZone>,
+        update: RefCountUpdate,
+    ) {
         let Some(node) = self.nodes.get_mut(&node_id) else {
             debug_assert!(false);
             tracing::error!("Scheduler missing node {node_id}");
             return;
         };
 
+        let is_home_az = Some(&node.az) == preferred_az;
+
         match update {
             RefCountUpdate::PromoteSecondary => {
                 node.attached_shard_count += 1;
@@ -496,19 +545,31 @@ impl Scheduler {
             RefCountUpdate::Attach => {
                 node.shard_count += 1;
                 node.attached_shard_count += 1;
+                if is_home_az {
+                    node.home_shard_count += 1;
+                }
             }
             RefCountUpdate::Detach => {
                 node.shard_count -= 1;
                 node.attached_shard_count -= 1;
+                if is_home_az {
+                    node.home_shard_count -= 1;
+                }
             }
             RefCountUpdate::DemoteAttached => {
                 node.attached_shard_count -= 1;
             }
             RefCountUpdate::AddSecondary => {
                 node.shard_count += 1;
+                if is_home_az {
+                    node.home_shard_count += 1;
+                }
             }
             RefCountUpdate::RemoveSecondary => {
                 node.shard_count -= 1;
+                if is_home_az {
+                    node.home_shard_count -= 1;
+                }
             }
         }
 
@@ -594,6 +655,7 @@ impl Scheduler {
                 entry.insert(SchedulerNode {
                     shard_count: 0,
                     attached_shard_count: 0,
+                    home_shard_count: 0,
                     may_schedule: node.may_schedule(),
                     az: node.get_availability_zone_id().clone(),
                 });
@@ -607,33 +669,20 @@ impl Scheduler {
         }
     }
 
-    /// Where we have several nodes to choose from, for example when picking a secondary location
-    /// to promote to an attached location, this method may be used to pick the best choice based
-    /// on the scheduler's knowledge of utilization and availability.
-    ///
-    /// If the input is empty, or all the nodes are not elegible for scheduling, return None: the
-    /// caller can pick a node some other way.
-    pub(crate) fn node_preferred(&self, nodes: &[NodeId]) -> Option<NodeId> {
-        if nodes.is_empty() {
-            return None;
-        }
-
-        // TODO: When the utilization score returned by the pageserver becomes meaningful,
-        // schedule based on that instead of the shard count.
-        let node = nodes
-            .iter()
-            .map(|node_id| {
-                let may_schedule = self
-                    .nodes
-                    .get(node_id)
-                    .map(|n| !matches!(n.may_schedule, MaySchedule::No))
-                    .unwrap_or(false);
-                (*node_id, may_schedule)
-            })
-            .max_by_key(|(_n, may_schedule)| *may_schedule);
-
-        // If even the preferred node has may_schedule==false, return None
-        node.and_then(|(node_id, may_schedule)| if may_schedule { Some(node_id) } else { None })
+    /// Calculate a single node's score, used in optimizer logic to compare specific
+    /// nodes' scores.
+    pub(crate) fn compute_node_score<Score>(
+        &mut self,
+        node_id: NodeId,
+        preferred_az: &Option<AvailabilityZone>,
+        context: &ScheduleContext,
+    ) -> Option<Score>
+    where
+        Score: NodeSchedulingScore,
+    {
+        self.nodes
+            .get_mut(&node_id)
+            .and_then(|node| Score::generate(&node_id, node, preferred_az, context))
     }
 
     /// Compute a schedulling score for each node that the scheduler knows of
@@ -727,7 +776,7 @@ impl Scheduler {
             tracing::info!(
             "scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?})",
             scores.iter().map(|i| i.node_id().0).collect::<Vec<_>>()
-        );
+       );
         }
 
         // Note that we do not update shard count here to reflect the scheduling: that
@@ -743,47 +792,74 @@ impl Scheduler {
     }
 
     /// For choosing which AZ to schedule a new shard into, use this.  It will return the
-    /// AZ with the lowest median utilization.
+    /// AZ with the lowest number of shards currently scheduled in this AZ as their home
+    /// location.
     ///
     /// We use an AZ-wide measure rather than simply selecting the AZ of the least-loaded
     /// node, because while tenants start out single sharded, when they grow and undergo
-    /// shard-split, they will occupy space on many nodes within an AZ.
+    /// shard-split, they will occupy space on many nodes within an AZ.  It is important
+    /// that we pick the AZ in a way that balances this _future_ load.
     ///
-    /// We use median rather than total free space or mean utilization, because
-    /// we wish to avoid preferring AZs that have low-load nodes resulting from
-    /// recent replacements.
-    ///
-    /// The practical result is that we will pick an AZ based on its median node, and
-    /// then actually _schedule_ the new shard onto the lowest-loaded node in that AZ.
+    /// Once we've picked an AZ, subsequent scheduling within that AZ will be driven by
+    /// nodes' utilization scores.
     pub(crate) fn get_az_for_new_tenant(&self) -> Option<AvailabilityZone> {
         if self.nodes.is_empty() {
             return None;
         }
 
-        let mut scores_by_az = HashMap::new();
-        for (node_id, node) in &self.nodes {
-            let az_scores = scores_by_az.entry(&node.az).or_insert_with(Vec::new);
-            let score = match &node.may_schedule {
-                MaySchedule::Yes(utilization) => utilization.score(),
-                MaySchedule::No => PageserverUtilization::full().score(),
-            };
-            az_scores.push((node_id, node, score));
+        #[derive(Default)]
+        struct AzScore {
+            home_shard_count: usize,
+            scheduleable: bool,
         }
 
-        // Sort by utilization.  Also include the node ID to break ties.
-        for scores in scores_by_az.values_mut() {
-            scores.sort_by_key(|i| (i.2, i.0));
+        let mut azs: HashMap<&AvailabilityZone, AzScore> = HashMap::new();
+        for node in self.nodes.values() {
+            let az = azs.entry(&node.az).or_default();
+            az.home_shard_count += node.home_shard_count;
+            az.scheduleable |= matches!(node.may_schedule, MaySchedule::Yes(_));
         }
 
-        let mut median_by_az = scores_by_az
+        // If any AZs are schedulable, then filter out the non-schedulable ones (i.e. AZs where
+        // all nodes are overloaded or otherwise unschedulable).
+        if azs.values().any(|i| i.scheduleable) {
+            azs.retain(|_, i| i.scheduleable);
+        }
+
+        // Find the AZ with the lowest number of shards currently allocated
+        Some(
+            azs.into_iter()
+                .min_by_key(|i| (i.1.home_shard_count, i.0))
+                .unwrap()
+                .0
+                .clone(),
+        )
+    }
+
+    pub(crate) fn get_node_az(&self, node_id: &NodeId) -> Option<AvailabilityZone> {
+        self.nodes.get(node_id).map(|n| n.az.clone())
+    }
+
+    /// For use when choosing a preferred secondary location: filter out nodes that are not
+    /// available, and gather their AZs.
+    pub(crate) fn filter_usable_nodes(
+        &self,
+        nodes: &[NodeId],
+    ) -> Vec<(NodeId, Option<AvailabilityZone>)> {
+        nodes
             .iter()
-            .map(|(az, nodes)| (*az, nodes.get(nodes.len() / 2).unwrap().2))
-            .collect::<Vec<_>>();
-        // Sort by utilization.  Also include the AZ to break ties.
-        median_by_az.sort_by_key(|i| (i.1, i.0));
-
-        // Return the AZ with the lowest median utilization
-        Some(median_by_az.first().unwrap().0.clone())
+            .filter_map(|node_id| {
+                let node = self
+                    .nodes
+                    .get(node_id)
+                    .expect("Referenced nodes always exist");
+                if matches!(node.may_schedule, MaySchedule::Yes(_)) {
+                    Some((*node_id, Some(node.az.clone())))
+                } else {
+                    None
+                }
+            })
+            .collect()
     }
 
     /// Unit test access to internal state
@@ -796,6 +872,33 @@ impl Scheduler {
     pub(crate) fn get_node_attached_shard_count(&self, node_id: NodeId) -> usize {
         self.nodes.get(&node_id).unwrap().attached_shard_count
     }
+
+    /// Some metrics that we only calculate periodically: this is simpler than
+    /// rigorously updating them on every change.
+    pub(crate) fn update_metrics(&self) {
+        for (node_id, node) in &self.nodes {
+            let node_id_str = format!("{}", node_id);
+            let label_group = NodeLabelGroup {
+                az: &node.az.0,
+                node_id: &node_id_str,
+            };
+
+            crate::metrics::METRICS_REGISTRY
+                .metrics_group
+                .storage_controller_node_shards
+                .set(label_group.clone(), node.shard_count as i64);
+
+            crate::metrics::METRICS_REGISTRY
+                .metrics_group
+                .storage_controller_node_attached_shards
+                .set(label_group.clone(), node.attached_shard_count as i64);
+
+            crate::metrics::METRICS_REGISTRY
+                .metrics_group
+                .storage_controller_node_home_shards
+                .set(label_group.clone(), node.home_shard_count as i64);
+        }
+    }
 }
 
 #[cfg(test)]
@@ -843,7 +946,14 @@ pub(crate) mod test_utils {
 
 #[cfg(test)]
 mod tests {
-    use pageserver_api::{controller_api::NodeAvailability, models::utilization::test_utilization};
+    use pageserver_api::{
+        controller_api::NodeAvailability, models::utilization::test_utilization,
+        shard::ShardIdentity,
+    };
+    use utils::{
+        id::TenantId,
+        shard::{ShardCount, ShardNumber, TenantShardId},
+    };
 
     use super::*;
 
@@ -853,8 +963,8 @@ mod tests {
         let nodes = test_utils::make_test_nodes(2, &[]);
 
         let mut scheduler = Scheduler::new(nodes.values());
-        let mut t1_intent = IntentState::new();
-        let mut t2_intent = IntentState::new();
+        let mut t1_intent = IntentState::new(None);
+        let mut t2_intent = IntentState::new(None);
 
         let context = ScheduleContext::default();
 
@@ -930,7 +1040,7 @@ mod tests {
             let scheduled = scheduler
                 .schedule_shard::<AttachedShardTag>(&[], &None, context)
                 .unwrap();
-            let mut intent = IntentState::new();
+            let mut intent = IntentState::new(None);
             intent.set_attached(scheduler, Some(scheduled));
             scheduled_intents.push(intent);
             assert_eq!(scheduled, expect_node);
@@ -1063,7 +1173,7 @@ mod tests {
             let scheduled = scheduler
                 .schedule_shard::<Tag>(&[], &preferred_az, context)
                 .unwrap();
-            let mut intent = IntentState::new();
+            let mut intent = IntentState::new(preferred_az.clone());
             intent.set_attached(scheduler, Some(scheduled));
             scheduled_intents.push(intent);
             assert_eq!(scheduled, expect_node);
@@ -1089,9 +1199,9 @@ mod tests {
             &mut context,
         );
 
-        // Node 2 is not in "az-a", but it has the lowest affinity so we prefer that.
+        // Nodes 1 and 3 (az-a) have the same affinity score, so prefer the lowest node id.
         assert_scheduler_chooses::<AttachedShardTag>(
-            NodeId(2),
+            NodeId(1),
             Some(az_a_tag.clone()),
             &mut scheduled_intents,
             &mut scheduler,
@@ -1107,26 +1217,6 @@ mod tests {
             &mut context,
         );
 
-        // Avoid nodes in "az-b" for the secondary location.
-        // Nodes 1 and 3 are identically loaded, so prefer the lowest node id.
-        assert_scheduler_chooses::<SecondaryShardTag>(
-            NodeId(1),
-            Some(az_b_tag.clone()),
-            &mut scheduled_intents,
-            &mut scheduler,
-            &mut context,
-        );
-
-        // Avoid nodes in "az-b" for the secondary location.
-        // Node 3 has lower affinity score than 1, so prefer that.
-        assert_scheduler_chooses::<SecondaryShardTag>(
-            NodeId(3),
-            Some(az_b_tag.clone()),
-            &mut scheduled_intents,
-            &mut scheduler,
-            &mut context,
-        );
-
         for mut intent in scheduled_intents {
             intent.clear(&mut scheduler);
         }
@@ -1150,34 +1240,292 @@ mod tests {
 
         let mut scheduler = Scheduler::new(nodes.values());
 
-        /// Force the utilization of a node in Scheduler's state to a particular
-        /// number of bytes used.
-        fn set_utilization(scheduler: &mut Scheduler, node_id: NodeId, shard_count: u32) {
-            let mut node = Node::new(
-                node_id,
-                "".to_string(),
-                0,
-                "".to_string(),
-                0,
-                scheduler.nodes.get(&node_id).unwrap().az.clone(),
-            );
-            node.set_availability(NodeAvailability::Active(test_utilization::simple(
-                shard_count,
-                0,
-            )));
-            scheduler.node_upsert(&node);
+        /// Force the `home_shard_count` of a node directly: this is the metric used
+        /// by the scheduler when picking AZs.
+        fn set_shard_count(scheduler: &mut Scheduler, node_id: NodeId, shard_count: usize) {
+            let node = scheduler.nodes.get_mut(&node_id).unwrap();
+            node.home_shard_count = shard_count;
         }
 
         // Initial empty state.  Scores are tied, scheduler prefers lower AZ ID.
         assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_a_tag.clone()));
 
-        // Put some utilization on one node in AZ A: this should change nothing, as the median hasn't changed
-        set_utilization(&mut scheduler, NodeId(1), 1000000);
-        assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_a_tag.clone()));
-
-        // Put some utilization on a second node in AZ A: now the median has changed, so the scheduler
-        // should prefer the other AZ.
-        set_utilization(&mut scheduler, NodeId(2), 1000000);
+        // Home shard count is higher in AZ A, so AZ B will be preferred
+        set_shard_count(&mut scheduler, NodeId(1), 10);
         assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_b_tag.clone()));
+
+        // Total home shard count is higher in AZ B, so we revert to preferring AZ A
+        set_shard_count(&mut scheduler, NodeId(4), 6);
+        set_shard_count(&mut scheduler, NodeId(5), 6);
+        assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_a_tag.clone()));
+    }
+
+    /// Test that when selecting AZs for many new tenants, we get the expected balance across nodes
+    #[test]
+    fn az_selection_many() {
+        let az_a_tag = AvailabilityZone("az-a".to_string());
+        let az_b_tag = AvailabilityZone("az-b".to_string());
+        let az_c_tag = AvailabilityZone("az-c".to_string());
+        let nodes = test_utils::make_test_nodes(
+            6,
+            &[
+                az_a_tag.clone(),
+                az_b_tag.clone(),
+                az_c_tag.clone(),
+                az_a_tag.clone(),
+                az_b_tag.clone(),
+                az_c_tag.clone(),
+            ],
+        );
+
+        let mut scheduler = Scheduler::new(nodes.values());
+
+        // We should get 1/6th of these on each node, give or take a few...
+        let total_tenants = 300;
+
+        // ...where the 'few' is the number of AZs, because the scheduling will sometimes overshoot
+        // on one AZ before correcting itself.  This is because we select the 'home' AZ based on
+        // an AZ-wide metric, but we select the location for secondaries on a purely node-based
+        // metric (while excluding the home AZ).
+        let grace = 3;
+
+        let mut scheduled_shards = Vec::new();
+        for _i in 0..total_tenants {
+            let preferred_az = scheduler.get_az_for_new_tenant().unwrap();
+
+            let mut node_home_counts = scheduler
+                .nodes
+                .iter()
+                .map(|(node_id, node)| (node_id, node.home_shard_count))
+                .collect::<Vec<_>>();
+            node_home_counts.sort_by_key(|i| i.0);
+            eprintln!("Selected {}, vs nodes {:?}", preferred_az, node_home_counts);
+
+            let tenant_shard_id = TenantShardId {
+                tenant_id: TenantId::generate(),
+                shard_number: ShardNumber(0),
+                shard_count: ShardCount(1),
+            };
+
+            let shard_identity = ShardIdentity::new(
+                tenant_shard_id.shard_number,
+                tenant_shard_id.shard_count,
+                pageserver_api::shard::ShardStripeSize(1),
+            )
+            .unwrap();
+            let mut shard = TenantShard::new(
+                tenant_shard_id,
+                shard_identity,
+                pageserver_api::controller_api::PlacementPolicy::Attached(1),
+                Some(preferred_az),
+            );
+
+            let mut context = ScheduleContext::default();
+            shard.schedule(&mut scheduler, &mut context).unwrap();
+            eprintln!("Scheduled shard at {:?}", shard.intent);
+
+            scheduled_shards.push(shard);
+        }
+
+        for (node_id, node) in &scheduler.nodes {
+            eprintln!(
+                "Node {}: {} {} {}",
+                node_id, node.shard_count, node.attached_shard_count, node.home_shard_count
+            );
+        }
+
+        for node in scheduler.nodes.values() {
+            assert!((node.home_shard_count as i64 - total_tenants as i64 / 6).abs() < grace);
+        }
+
+        for mut shard in scheduled_shards {
+            shard.intent.clear(&mut scheduler);
+        }
+    }
+
+    #[test]
+    /// Make sure that when we have an odd number of nodes per AZ and an even number of shards, we
+    /// still get scheduling stability.
+    fn odd_nodes_stability() {
+        let az_a = AvailabilityZone("az-a".to_string());
+        let az_b = AvailabilityZone("az-b".to_string());
+
+        let nodes = test_utils::make_test_nodes(
+            10,
+            &[
+                az_a.clone(),
+                az_a.clone(),
+                az_a.clone(),
+                az_a.clone(),
+                az_a.clone(),
+                az_b.clone(),
+                az_b.clone(),
+                az_b.clone(),
+                az_b.clone(),
+                az_b.clone(),
+            ],
+        );
+        let mut scheduler = Scheduler::new(nodes.values());
+
+        // Need to keep these alive because they contribute to shard counts via RAII
+        let mut scheduled_shards = Vec::new();
+
+        let mut context = ScheduleContext::default();
+
+        fn schedule_shard(
+            tenant_shard_id: TenantShardId,
+            expect_attached: NodeId,
+            expect_secondary: NodeId,
+            scheduled_shards: &mut Vec<TenantShard>,
+            scheduler: &mut Scheduler,
+            preferred_az: Option<AvailabilityZone>,
+            context: &mut ScheduleContext,
+        ) {
+            let shard_identity = ShardIdentity::new(
+                tenant_shard_id.shard_number,
+                tenant_shard_id.shard_count,
+                pageserver_api::shard::ShardStripeSize(1),
+            )
+            .unwrap();
+            let mut shard = TenantShard::new(
+                tenant_shard_id,
+                shard_identity,
+                pageserver_api::controller_api::PlacementPolicy::Attached(1),
+                preferred_az,
+            );
+
+            shard.schedule(scheduler, context).unwrap();
+
+            assert_eq!(shard.intent.get_attached().unwrap(), expect_attached);
+            assert_eq!(
+                shard.intent.get_secondary().first().unwrap(),
+                &expect_secondary
+            );
+
+            scheduled_shards.push(shard);
+        }
+
+        let tenant_id = TenantId::generate();
+
+        schedule_shard(
+            TenantShardId {
+                tenant_id,
+                shard_number: ShardNumber(0),
+                shard_count: ShardCount(8),
+            },
+            NodeId(1),
+            NodeId(6),
+            &mut scheduled_shards,
+            &mut scheduler,
+            Some(az_a.clone()),
+            &mut context,
+        );
+
+        schedule_shard(
+            TenantShardId {
+                tenant_id,
+                shard_number: ShardNumber(1),
+                shard_count: ShardCount(8),
+            },
+            NodeId(2),
+            NodeId(7),
+            &mut scheduled_shards,
+            &mut scheduler,
+            Some(az_a.clone()),
+            &mut context,
+        );
+
+        schedule_shard(
+            TenantShardId {
+                tenant_id,
+                shard_number: ShardNumber(2),
+                shard_count: ShardCount(8),
+            },
+            NodeId(3),
+            NodeId(8),
+            &mut scheduled_shards,
+            &mut scheduler,
+            Some(az_a.clone()),
+            &mut context,
+        );
+
+        schedule_shard(
+            TenantShardId {
+                tenant_id,
+                shard_number: ShardNumber(3),
+                shard_count: ShardCount(8),
+            },
+            NodeId(4),
+            NodeId(9),
+            &mut scheduled_shards,
+            &mut scheduler,
+            Some(az_a.clone()),
+            &mut context,
+        );
+
+        schedule_shard(
+            TenantShardId {
+                tenant_id,
+                shard_number: ShardNumber(4),
+                shard_count: ShardCount(8),
+            },
+            NodeId(5),
+            NodeId(10),
+            &mut scheduled_shards,
+            &mut scheduler,
+            Some(az_a.clone()),
+            &mut context,
+        );
+
+        schedule_shard(
+            TenantShardId {
+                tenant_id,
+                shard_number: ShardNumber(5),
+                shard_count: ShardCount(8),
+            },
+            NodeId(1),
+            NodeId(6),
+            &mut scheduled_shards,
+            &mut scheduler,
+            Some(az_a.clone()),
+            &mut context,
+        );
+
+        schedule_shard(
+            TenantShardId {
+                tenant_id,
+                shard_number: ShardNumber(6),
+                shard_count: ShardCount(8),
+            },
+            NodeId(2),
+            NodeId(7),
+            &mut scheduled_shards,
+            &mut scheduler,
+            Some(az_a.clone()),
+            &mut context,
+        );
+
+        schedule_shard(
+            TenantShardId {
+                tenant_id,
+                shard_number: ShardNumber(7),
+                shard_count: ShardCount(8),
+            },
+            NodeId(3),
+            NodeId(8),
+            &mut scheduled_shards,
+            &mut scheduler,
+            Some(az_a.clone()),
+            &mut context,
+        );
+
+        // Assert that the optimizer suggests no changes, i.e. our initial scheduling was stable.
+        for shard in &scheduled_shards {
+            assert_eq!(shard.optimize_attachment(&mut scheduler, &context), None);
+        }
+
+        for mut shard in scheduled_shards {
+            shard.intent.clear(&mut scheduler);
+        }
     }
 }
diff --git a/storage_controller/src/schema.rs b/storage_controller/src/schema.rs
index 44c91619ab..14c30c296d 100644
--- a/storage_controller/src/schema.rs
+++ b/storage_controller/src/schema.rs
@@ -36,7 +36,6 @@ diesel::table! {
         version -> Int8,
         host -> Text,
         port -> Int4,
-        active -> Bool,
         http_port -> Int4,
         availability_zone_id -> Text,
         scheduling_policy -> Varchar,
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 265b2798d2..cbb9103880 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -1404,7 +1404,11 @@ impl Service {
 
             // We will populate intent properly later in [`Self::startup_reconcile`], initially populate
             // it with what we can infer: the node for which a generation was most recently issued.
-            let mut intent = IntentState::new();
+            let mut intent = IntentState::new(
+                tsp.preferred_az_id
+                    .as_ref()
+                    .map(|az| AvailabilityZone(az.clone())),
+            );
             if let Some(generation_pageserver) = tsp.generation_pageserver.map(|n| NodeId(n as u64))
             {
                 if nodes.contains_key(&generation_pageserver) {
@@ -2474,18 +2478,29 @@ impl Service {
         tenant_id: TenantId,
         _guard: &TracingExclusiveGuard<TenantOperations>,
     ) -> Result<(), ApiError> {
-        let present_in_memory = {
+        // Check whether the tenant is already present in memory; if it is not, select the AZ
+        // that it will be loaded into.
+        let load_in_az = {
             let locked = self.inner.read().unwrap();
-            locked
+            let existing = locked
                 .tenants
                 .range(TenantShardId::tenant_range(tenant_id))
-                .next()
-                .is_some()
-        };
+                .next();
 
-        if present_in_memory {
-            return Ok(());
-        }
+            // If the tenant is not present in memory, we expect to load it from database,
+            // so let's figure out what AZ to load it into while we have self.inner locked.
+            if existing.is_none() {
+                locked
+                    .scheduler
+                    .get_az_for_new_tenant()
+                    .ok_or(ApiError::BadRequest(anyhow::anyhow!(
+                        "No AZ with nodes found to load tenant"
+                    )))?
+            } else {
+                // We already have this tenant in memory
+                return Ok(());
+            }
+        };
 
         let tenant_shards = self.persistence.load_tenant(tenant_id).await?;
         if tenant_shards.is_empty() {
@@ -2494,8 +2509,20 @@ impl Service {
             ));
         }
 
-        // TODO: choose a fresh AZ to use for this tenant when un-detaching: there definitely isn't a running
-        // compute, so no benefit to making AZ sticky across detaches.
+        // Update the persistent shards with the AZ that we are about to apply to in-memory state
+        self.persistence
+            .set_tenant_shard_preferred_azs(
+                tenant_shards
+                    .iter()
+                    .map(|t| {
+                        (
+                            t.get_tenant_shard_id().expect("Corrupt shard in database"),
+                            Some(load_in_az.clone()),
+                        )
+                    })
+                    .collect(),
+            )
+            .await?;
 
         let mut locked = self.inner.write().unwrap();
         tracing::info!(
@@ -2505,7 +2532,7 @@ impl Service {
         );
 
         locked.tenants.extend(tenant_shards.into_iter().map(|p| {
-            let intent = IntentState::new();
+            let intent = IntentState::new(Some(load_in_az.clone()));
             let shard =
                 TenantShard::from_persistent(p, intent).expect("Corrupt shard row in database");
 
@@ -4236,6 +4263,22 @@ impl Service {
                 }
 
                 tracing::info!("Restoring parent shard {tenant_shard_id}");
+
+                // Drop any intents that refer to unavailable nodes, to enable this abort to proceed even
+                // if the original attachment location is offline.
+                if let Some(node_id) = shard.intent.get_attached() {
+                    if !nodes.get(node_id).unwrap().is_available() {
+                        tracing::info!("Demoting attached intent for {tenant_shard_id} on unavailable node {node_id}");
+                        shard.intent.demote_attached(scheduler, *node_id);
+                    }
+                }
+                for node_id in shard.intent.get_secondary().clone() {
+                    if !nodes.get(&node_id).unwrap().is_available() {
+                        tracing::info!("Dropping secondary intent for {tenant_shard_id} on unavailable node {node_id}");
+                        shard.intent.remove_secondary(scheduler, node_id);
+                    }
+                }
+
                 shard.splitting = SplitState::Idle;
                 if let Err(e) = shard.schedule(scheduler, &mut ScheduleContext::default()) {
                     // If this shard can't be scheduled now (perhaps due to offline nodes or
@@ -4389,15 +4432,13 @@ impl Service {
 
                     let mut child_state =
                         TenantShard::new(child, child_shard, policy.clone(), preferred_az.clone());
-                    child_state.intent = IntentState::single(scheduler, Some(pageserver));
+                    child_state.intent =
+                        IntentState::single(scheduler, Some(pageserver), preferred_az.clone());
                     child_state.observed = ObservedState {
                         locations: child_observed,
                     };
                     child_state.generation = Some(generation);
                     child_state.config = config.clone();
-                    if let Some(preferred_az) = &preferred_az {
-                        child_state.set_preferred_az(preferred_az.clone());
-                    }
 
                     // The child's TenantShard::splitting is intentionally left at the default value of Idle,
                     // as at this point in the split process we have succeeded and this part is infallible:
@@ -5014,6 +5055,8 @@ impl Service {
                         // If our new attached node was a secondary, it no longer should be.
                         shard.intent.remove_secondary(scheduler, migrate_req.node_id);
 
+                        shard.intent.set_attached(scheduler, Some(migrate_req.node_id));
+
                         // If we were already attached to something, demote that to a secondary
                         if let Some(old_attached) = old_attached {
                             if n > 0 {
@@ -5025,8 +5068,6 @@ impl Service {
                                 shard.intent.push_secondary(scheduler, old_attached);
                             }
                         }
-
-                        shard.intent.set_attached(scheduler, Some(migrate_req.node_id));
                     }
                     PlacementPolicy::Secondary => {
                         shard.intent.clear(scheduler);
@@ -5055,6 +5096,69 @@ impl Service {
         Ok(TenantShardMigrateResponse {})
     }
 
+    pub(crate) async fn tenant_shard_migrate_secondary(
+        &self,
+        tenant_shard_id: TenantShardId,
+        migrate_req: TenantShardMigrateRequest,
+    ) -> Result<TenantShardMigrateResponse, ApiError> {
+        let waiter = {
+            let mut locked = self.inner.write().unwrap();
+            let (nodes, tenants, scheduler) = locked.parts_mut();
+
+            let Some(node) = nodes.get(&migrate_req.node_id) else {
+                return Err(ApiError::BadRequest(anyhow::anyhow!(
+                    "Node {} not found",
+                    migrate_req.node_id
+                )));
+            };
+
+            if !node.is_available() {
+                // Warn but proceed: the caller may intend to manually adjust the placement of
+                // a shard even if the node is down, e.g. if intervening during an incident.
+                tracing::warn!("Migrating to unavailable node {node}");
+            }
+
+            let Some(shard) = tenants.get_mut(&tenant_shard_id) else {
+                return Err(ApiError::NotFound(
+                    anyhow::anyhow!("Tenant shard not found").into(),
+                ));
+            };
+
+            if shard.intent.get_secondary().len() == 1
+                && shard.intent.get_secondary()[0] == migrate_req.node_id
+            {
+                tracing::info!(
+                    "Migrating secondary to {node}: intent is unchanged {:?}",
+                    shard.intent
+                );
+            } else if shard.intent.get_attached() == &Some(migrate_req.node_id) {
+                tracing::info!("Migrating secondary to {node}: already attached where we were asked to create a secondary");
+            } else {
+                let old_secondaries = shard.intent.get_secondary().clone();
+                for secondary in old_secondaries {
+                    shard.intent.remove_secondary(scheduler, secondary);
+                }
+
+                shard.intent.push_secondary(scheduler, migrate_req.node_id);
+                shard.sequence = shard.sequence.next();
+                tracing::info!(
+                    "Migrating secondary to {node}: new intent {:?}",
+                    shard.intent
+                );
+            }
+
+            self.maybe_reconcile_shard(shard, nodes)
+        };
+
+        if let Some(waiter) = waiter {
+            waiter.wait_timeout(RECONCILE_TIMEOUT).await?;
+        } else {
+            tracing::info!("Migration is a no-op");
+        }
+
+        Ok(TenantShardMigrateResponse {})
+    }
+
     /// 'cancel' in this context means cancel any ongoing reconcile
     pub(crate) async fn tenant_shard_cancel_reconcile(
         &self,
@@ -5256,7 +5360,8 @@ impl Service {
         expect_nodes.sort_by_key(|n| n.node_id);
         nodes.sort_by_key(|n| n.node_id);
 
-        if nodes != expect_nodes {
+        // Errors relating to nodes are deferred so that we don't skip the shard checks below if we have a node error
+        let node_result = if nodes != expect_nodes {
             tracing::error!("Consistency check failed on nodes.");
             tracing::error!(
                 "Nodes in memory: {}",
@@ -5268,10 +5373,12 @@ impl Service {
                 serde_json::to_string(&nodes)
                     .map_err(|e| ApiError::InternalServerError(e.into()))?
             );
-            return Err(ApiError::InternalServerError(anyhow::anyhow!(
+            Err(ApiError::InternalServerError(anyhow::anyhow!(
                 "Node consistency failure"
-            )));
-        }
+            )))
+        } else {
+            Ok(())
+        };
 
         let mut persistent_shards = self.persistence.load_active_tenant_shards().await?;
         persistent_shards
@@ -5281,6 +5388,7 @@ impl Service {
 
         if persistent_shards != expect_shards {
             tracing::error!("Consistency check failed on shards.");
+
             tracing::error!(
                 "Shards in memory: {}",
                 serde_json::to_string(&expect_shards)
@@ -5291,12 +5399,57 @@ impl Service {
                 serde_json::to_string(&persistent_shards)
                     .map_err(|e| ApiError::InternalServerError(e.into()))?
             );
+
+            // The total dump log lines above are useful in testing but in the field grafana will
+            // usually just drop them because they're so large. So we also do some explicit logging
+            // of just the diffs.
+            let persistent_shards = persistent_shards
+                .into_iter()
+                .map(|tsp| (tsp.get_tenant_shard_id().unwrap(), tsp))
+                .collect::<HashMap<_, _>>();
+            let expect_shards = expect_shards
+                .into_iter()
+                .map(|tsp| (tsp.get_tenant_shard_id().unwrap(), tsp))
+                .collect::<HashMap<_, _>>();
+            for (tenant_shard_id, persistent_tsp) in &persistent_shards {
+                match expect_shards.get(tenant_shard_id) {
+                    None => {
+                        tracing::error!(
+                            "Shard {} found in database but not in memory",
+                            tenant_shard_id
+                        );
+                    }
+                    Some(expect_tsp) => {
+                        if expect_tsp != persistent_tsp {
+                            tracing::error!(
+                                "Shard {} is inconsistent.  In memory: {}, database has: {}",
+                                tenant_shard_id,
+                                serde_json::to_string(expect_tsp).unwrap(),
+                                serde_json::to_string(&persistent_tsp).unwrap()
+                            );
+                        }
+                    }
+                }
+            }
+
+            // Having already logged any differences, log any shards that simply aren't present in the database
+            for (tenant_shard_id, memory_tsp) in &expect_shards {
+                if !persistent_shards.contains_key(tenant_shard_id) {
+                    tracing::error!(
+                        "Shard {} found in memory but not in database: {}",
+                        tenant_shard_id,
+                        serde_json::to_string(memory_tsp)
+                            .map_err(|e| ApiError::InternalServerError(e.into()))?
+                    );
+                }
+            }
+
             return Err(ApiError::InternalServerError(anyhow::anyhow!(
                 "Shard consistency failure"
             )));
         }
 
-        Ok(())
+        node_result
     }
 
     /// For debug/support: a JSON dump of the [`Scheduler`].  Returns a response so that
@@ -5600,7 +5753,7 @@ impl Service {
             register_req.listen_http_port,
             register_req.listen_pg_addr,
             register_req.listen_pg_port,
-            register_req.availability_zone_id,
+            register_req.availability_zone_id.clone(),
         );
 
         // TODO: idempotency if the node already exists in the database
@@ -5620,8 +5773,9 @@ impl Service {
             .set(locked.nodes.len() as i64);
 
         tracing::info!(
-            "Registered pageserver {}, now have {} pageservers",
+            "Registered pageserver {} ({}), now have {} pageservers",
             register_req.node_id,
+            register_req.availability_zone_id,
             locked.nodes.len()
         );
         Ok(())
@@ -6236,7 +6390,7 @@ impl Service {
     /// available.  A return value of 0 indicates that everything is fully reconciled already.
     fn reconcile_all(&self) -> usize {
         let mut locked = self.inner.write().unwrap();
-        let (nodes, tenants, _scheduler) = locked.parts_mut();
+        let (nodes, tenants, scheduler) = locked.parts_mut();
         let pageservers = nodes.clone();
 
         // This function is an efficient place to update lazy statistics, since we are walking
@@ -6297,6 +6451,9 @@ impl Service {
             }
         }
 
+        // Some metrics are calculated from SchedulerNode state, update these periodically
+        scheduler.update_metrics();
+
         // Process any deferred tenant drops
         for (tenant_id, guard) in drop_detached_tenants {
             self.maybe_drop_tenant(tenant_id, &mut locked, &guard);
@@ -6355,6 +6512,7 @@ impl Service {
                 // Shard was dropped between planning and execution;
                 continue;
             };
+            tracing::info!(tenant_shard_id=%tenant_shard_id, "Applying optimization: {optimization:?}");
             if shard.apply_optimization(scheduler, optimization) {
                 optimizations_applied += 1;
                 if self.maybe_reconcile_shard(shard, nodes).is_some() {
@@ -6385,7 +6543,13 @@ impl Service {
 
         let mut work = Vec::new();
         let mut locked = self.inner.write().unwrap();
-        let (nodes, tenants, scheduler) = locked.parts_mut();
+        let (_nodes, tenants, scheduler) = locked.parts_mut();
+
+        // We are going to plan a bunch of optimisations before applying any of them, so the
+        // utilisation stats on nodes will be effectively stale for every optimisation after the
+        // first one.  To avoid this causing unstable migrations/flapping, it's important that the
+        // code in TenantShard for finding optimisations uses [`NodeAttachmentSchedulingScore::disregard_utilization`]
+        // to ignore the utilisation component of the score.
 
         for (_tenant_id, schedule_context, shards) in
             TenantShardContextIterator::new(tenants, ScheduleMode::Speculative)
@@ -6416,13 +6580,28 @@ impl Service {
                     continue;
                 }
 
-                // TODO: optimization calculations are relatively expensive: create some fast-path for
-                // the common idle case (avoiding the search on tenants that we have recently checked)
+                // Fast path: we may quickly identify shards that don't have any possible optimisations
+                if !shard.maybe_optimizable(scheduler, &schedule_context) {
+                    if cfg!(feature = "testing") {
+                        // Check that maybe_optimizable doesn't disagree with the actual optimization functions.
+                        // Only do this in testing builds because it is not a correctness-critical check, so we shouldn't
+                        // panic in prod if we hit this, or spend cycles on it in prod.
+                        assert!(shard
+                            .optimize_attachment(scheduler, &schedule_context)
+                            .is_none());
+                        assert!(shard
+                            .optimize_secondary(scheduler, &schedule_context)
+                            .is_none());
+                    }
+                    continue;
+                }
+
                 if let Some(optimization) =
-                    // If idle, maybe ptimize attachments: if a shard has a secondary location that is preferable to
+                    // If idle, maybe optimize attachments: if a shard has a secondary location that is preferable to
                     // its primary location based on soft constraints, cut it over.
-                    shard.optimize_attachment(nodes, &schedule_context)
+                    shard.optimize_attachment(scheduler, &schedule_context)
                 {
+                    tracing::info!(tenant_shard_id=%shard.tenant_shard_id, "Identified optimization for attachment: {optimization:?}");
                     work.push((shard.tenant_shard_id, optimization));
                     break;
                 } else if let Some(optimization) =
@@ -6432,6 +6611,7 @@ impl Service {
                     // in the same tenant with secondary locations on the node where they originally split.
                     shard.optimize_secondary(scheduler, &schedule_context)
                 {
+                    tracing::info!(tenant_shard_id=%shard.tenant_shard_id, "Identified optimization for secondary: {optimization:?}");
                     work.push((shard.tenant_shard_id, optimization));
                     break;
                 }
@@ -6480,8 +6660,10 @@ impl Service {
                         }
                     }
                 }
-                ScheduleOptimizationAction::ReplaceSecondary(_) => {
-                    // No extra checks needed to replace a secondary: this does not interrupt client access
+                ScheduleOptimizationAction::ReplaceSecondary(_)
+                | ScheduleOptimizationAction::CreateSecondary(_)
+                | ScheduleOptimizationAction::RemoveSecondary(_) => {
+                    // No extra checks needed to manage secondaries: this does not interrupt client access
                     validated_work.push((tenant_shard_id, optimization))
                 }
             };
@@ -6553,26 +6735,35 @@ impl Service {
     /// we have this helper to move things along faster.
     #[cfg(feature = "testing")]
     async fn kick_secondary_download(&self, tenant_shard_id: TenantShardId) {
-        let (attached_node, secondary_node) = {
+        let (attached_node, secondaries) = {
             let locked = self.inner.read().unwrap();
             let Some(shard) = locked.tenants.get(&tenant_shard_id) else {
+                tracing::warn!(
+                    "Skipping kick of secondary download for {tenant_shard_id}: not found"
+                );
                 return;
             };
-            let (Some(attached), Some(secondary)) = (
-                shard.intent.get_attached(),
-                shard.intent.get_secondary().first(),
-            ) else {
+
+            let Some(attached) = shard.intent.get_attached() else {
+                tracing::warn!(
+                    "Skipping kick of secondary download for {tenant_shard_id}: no attached"
+                );
                 return;
             };
-            (
-                locked.nodes.get(attached).unwrap().clone(),
-                locked.nodes.get(secondary).unwrap().clone(),
-            )
+
+            let secondaries = shard
+                .intent
+                .get_secondary()
+                .iter()
+                .map(|n| locked.nodes.get(n).unwrap().clone())
+                .collect::<Vec<_>>();
+
+            (locked.nodes.get(attached).unwrap().clone(), secondaries)
         };
 
         // Make remote API calls to upload + download heatmaps: we ignore errors because this is just
         // a 'kick' to let scheduling optimisation run more promptly.
-        attached_node
+        match attached_node
             .with_client_retries(
                 |client| async move { client.tenant_heatmap_upload(tenant_shard_id).await },
                 &self.config.jwt_token,
@@ -6581,22 +6772,57 @@ impl Service {
                 SHORT_RECONCILE_TIMEOUT,
                 &self.cancel,
             )
-            .await;
+            .await
+        {
+            Some(Err(e)) => {
+                tracing::info!(
+                    "Failed to upload heatmap from {attached_node} for {tenant_shard_id}: {e}"
+                );
+            }
+            None => {
+                tracing::info!(
+                    "Cancelled while uploading heatmap from {attached_node} for {tenant_shard_id}"
+                );
+            }
+            Some(Ok(_)) => {
+                tracing::info!(
+                    "Successfully uploaded heatmap from {attached_node} for {tenant_shard_id}"
+                );
+            }
+        }
 
-        secondary_node
-            .with_client_retries(
-                |client| async move {
-                    client
-                        .tenant_secondary_download(tenant_shard_id, Some(Duration::from_secs(1)))
-                        .await
-                },
-                &self.config.jwt_token,
-                3,
-                10,
-                SHORT_RECONCILE_TIMEOUT,
-                &self.cancel,
-            )
-            .await;
+        for secondary_node in secondaries {
+            match secondary_node
+                .with_client_retries(
+                    |client| async move {
+                        client
+                            .tenant_secondary_download(
+                                tenant_shard_id,
+                                Some(Duration::from_secs(1)),
+                            )
+                            .await
+                    },
+                    &self.config.jwt_token,
+                    3,
+                    10,
+                    SHORT_RECONCILE_TIMEOUT,
+                    &self.cancel,
+                )
+                .await
+            {
+                Some(Err(e)) => {
+                    tracing::info!(
+                "Failed to download heatmap from {secondary_node} for {tenant_shard_id}: {e}"
+            );
+                }
+                None => {
+                    tracing::info!("Cancelled while downloading heatmap from {secondary_node} for {tenant_shard_id}");
+                }
+                Some(Ok(progress)) => {
+                    tracing::info!("Successfully downloaded heatmap from {secondary_node} for {tenant_shard_id}: {progress:?}");
+                }
+            }
+        }
     }
 
     /// Look for shards which are oversized and in need of splitting
@@ -7032,9 +7258,15 @@ impl Service {
     fn fill_node_plan(&self, node_id: NodeId) -> Vec<TenantShardId> {
         let mut locked = self.inner.write().unwrap();
         let fill_requirement = locked.scheduler.compute_fill_requirement(node_id);
+        let (nodes, tenants, _scheduler) = locked.parts_mut();
 
-        let mut tids_by_node = locked
-            .tenants
+        let node_az = nodes
+            .get(&node_id)
+            .expect("Node must exist")
+            .get_availability_zone_id()
+            .clone();
+
+        let mut tids_by_node = tenants
             .iter_mut()
             .filter_map(|(tid, tenant_shard)| {
                 if !matches!(
@@ -7047,6 +7279,25 @@ impl Service {
                     return None;
                 }
 
+                // AZ check: when filling nodes after a restart, our intent is to move _back_ the
+                // shards which belong on this node, not to promote shards whose scheduling preference
+                // would be on their currently attached node.  So we will avoid promoting shards whose
+                // home AZ doesn't match the AZ of the node we're filling.
+                match tenant_shard.preferred_az() {
+                    None => {
+                        // Shard doesn't have an AZ preference: it is eligible to be moved.
+                    }
+                    Some(az) if az == &node_az => {
+                        // This shard's home AZ matches the AZ of the node we're filling: it is
+                        // eligible to be moved, so fall through.
+                    }
+                    Some(_) => {
+                        // This shard's home AZ is somewhere other than the node we're filling:
+                        // do not include it in the fill plan.
+                        return None;
+                    }
+                }
+
                 if tenant_shard.intent.get_secondary().contains(&node_id) {
                     if let Some(primary) = tenant_shard.intent.get_attached() {
                         return Some((*primary, *tid));
diff --git a/storage_controller/src/service/context_iterator.rs b/storage_controller/src/service/context_iterator.rs
index d38010a27e..dd6913e988 100644
--- a/storage_controller/src/service/context_iterator.rs
+++ b/storage_controller/src/service/context_iterator.rs
@@ -43,9 +43,6 @@ impl<'a> Iterator for TenantShardContextIterator<'a> {
 
             // Accumulate the schedule context for all the shards in a tenant
             schedule_context.avoid(&shard.intent.all_pageservers());
-            if let Some(attached) = shard.intent.get_attached() {
-                schedule_context.push_attached(*attached);
-            }
             tenant_shards.push(shard);
 
             if tenant_shard_id.shard_number.0 == tenant_shard_id.shard_count.count() - 1 {
@@ -115,7 +112,7 @@ mod tests {
         assert_eq!(tenant_id, t1_id);
         assert_eq!(shards[0].tenant_shard_id.shard_number, ShardNumber(0));
         assert_eq!(shards.len(), 1);
-        assert_eq!(context.attach_count(), 1);
+        assert_eq!(context.location_count(), 2);
 
         let (tenant_id, context, shards) = iter.next().unwrap();
         assert_eq!(tenant_id, t2_id);
@@ -124,13 +121,13 @@ mod tests {
         assert_eq!(shards[2].tenant_shard_id.shard_number, ShardNumber(2));
         assert_eq!(shards[3].tenant_shard_id.shard_number, ShardNumber(3));
         assert_eq!(shards.len(), 4);
-        assert_eq!(context.attach_count(), 4);
+        assert_eq!(context.location_count(), 8);
 
         let (tenant_id, context, shards) = iter.next().unwrap();
         assert_eq!(tenant_id, t3_id);
         assert_eq!(shards[0].tenant_shard_id.shard_number, ShardNumber(0));
         assert_eq!(shards.len(), 1);
-        assert_eq!(context.attach_count(), 1);
+        assert_eq!(context.location_count(), 2);
 
         for shard in tenants.values_mut() {
             shard.intent.clear(&mut scheduler);
diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs
index c17989a316..79ed628c25 100644
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -11,16 +11,14 @@ use crate::{
     persistence::TenantShardPersistence,
     reconciler::{ReconcileUnits, ReconcilerConfig},
     scheduler::{
-        AffinityScore, AttachedShardTag, MaySchedule, RefCountUpdate, ScheduleContext,
-        SecondaryShardTag,
+        AffinityScore, AttachedShardTag, NodeSchedulingScore, NodeSecondarySchedulingScore,
+        RefCountUpdate, ScheduleContext, SecondaryShardTag, ShardTag,
     },
     service::ReconcileResultRequest,
 };
 use futures::future::{self, Either};
 use itertools::Itertools;
-use pageserver_api::controller_api::{
-    AvailabilityZone, NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy,
-};
+use pageserver_api::controller_api::{AvailabilityZone, PlacementPolicy, ShardSchedulingPolicy};
 use pageserver_api::{
     models::{LocationConfig, LocationConfigMode, TenantConfig},
     shard::{ShardIdentity, TenantShardId},
@@ -33,6 +31,7 @@ use utils::{
     generation::Generation,
     id::NodeId,
     seqwait::{SeqWait, SeqWaitError},
+    shard::ShardCount,
     sync::gate::GateGuard,
 };
 
@@ -147,45 +146,67 @@ pub(crate) struct TenantShard {
     // Support/debug tool: if something is going wrong or flapping with scheduling, this may
     // be set to a non-active state to avoid making changes while the issue is fixed.
     scheduling_policy: ShardSchedulingPolicy,
+}
+
+#[derive(Clone, Debug, Serialize)]
+pub(crate) struct IntentState {
+    attached: Option<NodeId>,
+    secondary: Vec<NodeId>,
 
     // We should attempt to schedule this shard in the provided AZ to
     // decrease chances of cross-AZ compute.
     preferred_az_id: Option<AvailabilityZone>,
 }
 
-#[derive(Default, Clone, Debug, Serialize)]
-pub(crate) struct IntentState {
-    attached: Option<NodeId>,
-    secondary: Vec<NodeId>,
-}
-
 impl IntentState {
-    pub(crate) fn new() -> Self {
+    pub(crate) fn new(preferred_az_id: Option<AvailabilityZone>) -> Self {
         Self {
             attached: None,
             secondary: vec![],
+            preferred_az_id,
         }
     }
-    pub(crate) fn single(scheduler: &mut Scheduler, node_id: Option<NodeId>) -> Self {
+    pub(crate) fn single(
+        scheduler: &mut Scheduler,
+        node_id: Option<NodeId>,
+        preferred_az_id: Option<AvailabilityZone>,
+    ) -> Self {
         if let Some(node_id) = node_id {
-            scheduler.update_node_ref_counts(node_id, RefCountUpdate::Attach);
+            scheduler.update_node_ref_counts(
+                node_id,
+                preferred_az_id.as_ref(),
+                RefCountUpdate::Attach,
+            );
         }
         Self {
             attached: node_id,
             secondary: vec![],
+            preferred_az_id,
         }
     }
 
     pub(crate) fn set_attached(&mut self, scheduler: &mut Scheduler, new_attached: Option<NodeId>) {
         if self.attached != new_attached {
             if let Some(old_attached) = self.attached.take() {
-                scheduler.update_node_ref_counts(old_attached, RefCountUpdate::Detach);
+                scheduler.update_node_ref_counts(
+                    old_attached,
+                    self.preferred_az_id.as_ref(),
+                    RefCountUpdate::Detach,
+                );
             }
             if let Some(new_attached) = &new_attached {
-                scheduler.update_node_ref_counts(*new_attached, RefCountUpdate::Attach);
+                scheduler.update_node_ref_counts(
+                    *new_attached,
+                    self.preferred_az_id.as_ref(),
+                    RefCountUpdate::Attach,
+                );
             }
             self.attached = new_attached;
         }
+
+        if let Some(new_attached) = &new_attached {
+            assert!(!self.secondary.contains(new_attached));
+        }
     }
 
     /// Like set_attached, but the node is from [`Self::secondary`].  This swaps the node from
@@ -204,15 +225,28 @@ impl IntentState {
         let demoted = self.attached;
         self.attached = Some(promote_secondary);
 
-        scheduler.update_node_ref_counts(promote_secondary, RefCountUpdate::PromoteSecondary);
+        scheduler.update_node_ref_counts(
+            promote_secondary,
+            self.preferred_az_id.as_ref(),
+            RefCountUpdate::PromoteSecondary,
+        );
         if let Some(demoted) = demoted {
-            scheduler.update_node_ref_counts(demoted, RefCountUpdate::DemoteAttached);
+            scheduler.update_node_ref_counts(
+                demoted,
+                self.preferred_az_id.as_ref(),
+                RefCountUpdate::DemoteAttached,
+            );
         }
     }
 
     pub(crate) fn push_secondary(&mut self, scheduler: &mut Scheduler, new_secondary: NodeId) {
-        debug_assert!(!self.secondary.contains(&new_secondary));
-        scheduler.update_node_ref_counts(new_secondary, RefCountUpdate::AddSecondary);
+        assert!(!self.secondary.contains(&new_secondary));
+        assert!(self.attached != Some(new_secondary));
+        scheduler.update_node_ref_counts(
+            new_secondary,
+            self.preferred_az_id.as_ref(),
+            RefCountUpdate::AddSecondary,
+        );
         self.secondary.push(new_secondary);
     }
 
@@ -220,27 +254,43 @@ impl IntentState {
     pub(crate) fn remove_secondary(&mut self, scheduler: &mut Scheduler, node_id: NodeId) {
         let index = self.secondary.iter().position(|n| *n == node_id);
         if let Some(index) = index {
-            scheduler.update_node_ref_counts(node_id, RefCountUpdate::RemoveSecondary);
+            scheduler.update_node_ref_counts(
+                node_id,
+                self.preferred_az_id.as_ref(),
+                RefCountUpdate::RemoveSecondary,
+            );
             self.secondary.remove(index);
         }
     }
 
     pub(crate) fn clear_secondary(&mut self, scheduler: &mut Scheduler) {
         for secondary in self.secondary.drain(..) {
-            scheduler.update_node_ref_counts(secondary, RefCountUpdate::RemoveSecondary);
+            scheduler.update_node_ref_counts(
+                secondary,
+                self.preferred_az_id.as_ref(),
+                RefCountUpdate::RemoveSecondary,
+            );
         }
     }
 
     /// Remove the last secondary node from the list of secondaries
     pub(crate) fn pop_secondary(&mut self, scheduler: &mut Scheduler) {
         if let Some(node_id) = self.secondary.pop() {
-            scheduler.update_node_ref_counts(node_id, RefCountUpdate::RemoveSecondary);
+            scheduler.update_node_ref_counts(
+                node_id,
+                self.preferred_az_id.as_ref(),
+                RefCountUpdate::RemoveSecondary,
+            );
         }
     }
 
     pub(crate) fn clear(&mut self, scheduler: &mut Scheduler) {
         if let Some(old_attached) = self.attached.take() {
-            scheduler.update_node_ref_counts(old_attached, RefCountUpdate::Detach);
+            scheduler.update_node_ref_counts(
+                old_attached,
+                self.preferred_az_id.as_ref(),
+                RefCountUpdate::Detach,
+            );
         }
 
         self.clear_secondary(scheduler);
@@ -275,7 +325,11 @@ impl IntentState {
         if self.attached == Some(node_id) {
             self.attached = None;
             self.secondary.push(node_id);
-            scheduler.update_node_ref_counts(node_id, RefCountUpdate::DemoteAttached);
+            scheduler.update_node_ref_counts(
+                node_id,
+                self.preferred_az_id.as_ref(),
+                RefCountUpdate::DemoteAttached,
+            );
             true
         } else {
             false
@@ -315,6 +369,7 @@ pub(crate) struct ObservedStateLocation {
     /// we know that we might have some state on this node.
     pub(crate) conf: Option<LocationConfig>,
 }
+
 pub(crate) struct ReconcilerWaiter {
     // For observability purposes, remember the ID of the shard we're
     // waiting for.
@@ -360,6 +415,10 @@ pub(crate) enum ScheduleOptimizationAction {
     ReplaceSecondary(ReplaceSecondary),
     // Migrate attachment to an existing secondary location
     MigrateAttachment(MigrateAttachment),
+    // Create a secondary location, with the intent of later migrating to it
+    CreateSecondary(NodeId),
+    // Remove a secondary location that we previously created to facilitate a migration
+    RemoveSecondary(NodeId),
 }
 
 #[derive(Eq, PartialEq, Debug, Clone)]
@@ -486,7 +545,7 @@ impl TenantShard {
         Self {
             tenant_shard_id,
             policy,
-            intent: IntentState::default(),
+            intent: IntentState::new(preferred_az_id),
             generation: Some(Generation::new(0)),
             shard,
             observed: ObservedState::default(),
@@ -500,7 +559,6 @@ impl TenantShard {
             last_error: Arc::default(),
             pending_compute_notification: false,
             scheduling_policy: ShardSchedulingPolicy::default(),
-            preferred_az_id,
         }
     }
 
@@ -563,7 +621,7 @@ impl TenantShard {
             return Ok((false, node_id));
         }
 
-        if let Some(promote_secondary) = scheduler.node_preferred(&self.intent.secondary) {
+        if let Some(promote_secondary) = self.preferred_secondary(scheduler) {
             // Promote a secondary
             tracing::debug!("Promoted secondary {} to attached", promote_secondary);
             self.intent.promote_attached(scheduler, promote_secondary);
@@ -572,7 +630,7 @@ impl TenantShard {
             // Pick a fresh node: either we had no secondaries or none were schedulable
             let node_id = scheduler.schedule_shard::<AttachedShardTag>(
                 &self.intent.secondary,
-                &self.preferred_az_id,
+                &self.intent.preferred_az_id,
                 context,
             )?;
             tracing::debug!("Selected {} as attached", node_id);
@@ -594,9 +652,6 @@ impl TenantShard {
         let r = self.do_schedule(scheduler, context);
 
         context.avoid(&self.intent.all_pageservers());
-        if let Some(attached) = self.intent.get_attached() {
-            context.push_attached(*attached);
-        }
 
         r
     }
@@ -631,24 +686,7 @@ impl TenantShard {
         use PlacementPolicy::*;
         match self.policy {
             Attached(secondary_count) => {
-                let retain_secondaries = if self.intent.attached.is_none()
-                    && scheduler.node_preferred(&self.intent.secondary).is_some()
-                {
-                    // If we have no attached, and one of the secondaries is elegible to be promoted, retain
-                    // one more secondary than we usually would, as one of them will become attached futher down this function.
-                    secondary_count + 1
-                } else {
-                    secondary_count
-                };
-
-                while self.intent.secondary.len() > retain_secondaries {
-                    // We have no particular preference for one secondary location over another: just
-                    // arbitrarily drop from the end
-                    self.intent.pop_secondary(scheduler);
-                    modified = true;
-                }
-
-                // Should have exactly one attached, and N secondaries
+                // Should have exactly one attached, and at least N secondaries
                 let (modified_attached, attached_node_id) =
                     self.schedule_attached(scheduler, context)?;
                 modified |= modified_attached;
@@ -657,7 +695,7 @@ impl TenantShard {
                 while self.intent.secondary.len() < secondary_count {
                     let node_id = scheduler.schedule_shard::<SecondaryShardTag>(
                         &used_pageservers,
-                        &self.preferred_az_id,
+                        &self.intent.preferred_az_id,
                         context,
                     )?;
                     self.intent.push_secondary(scheduler, node_id);
@@ -674,7 +712,7 @@ impl TenantShard {
                     // Populate secondary by scheduling a fresh node
                     let node_id = scheduler.schedule_shard::<SecondaryShardTag>(
                         &[],
-                        &self.preferred_az_id,
+                        &self.intent.preferred_az_id,
                         context,
                     )?;
                     self.intent.push_secondary(scheduler, node_id);
@@ -718,7 +756,7 @@ impl TenantShard {
     ) -> Result<(), ScheduleError> {
         let promote_to = match promote_to {
             Some(node) => node,
-            None => match scheduler.node_preferred(self.intent.get_secondary()) {
+            None => match self.preferred_secondary(scheduler) {
                 Some(node) => node,
                 None => {
                     return Err(ScheduleError::ImpossibleConstraint);
@@ -745,90 +783,276 @@ impl TenantShard {
         Ok(())
     }
 
+    /// Returns None if either the candidate's or the current location's score is unavailable, i.e. cannot draw a conclusion
+    fn is_better_location<T: ShardTag>(
+        &self,
+        scheduler: &mut Scheduler,
+        schedule_context: &ScheduleContext,
+        current: NodeId,
+        candidate: NodeId,
+    ) -> Option<bool> {
+        let Some(candidate_score) = scheduler.compute_node_score::<T::Score>(
+            candidate,
+            &self.intent.preferred_az_id,
+            schedule_context,
+        ) else {
+            // The candidate node is unavailable for scheduling or otherwise couldn't get a score
+            return None;
+        };
+
+        match scheduler.compute_node_score::<T::Score>(
+            current,
+            &self.intent.preferred_az_id,
+            schedule_context,
+        ) {
+            Some(current_score) => {
+                // Ignore utilization components when comparing scores: we don't want to migrate
+                // because of transient load variations, it risks making the system thrash, and
+                // migrating for utilization requires a separate high level view of the system to
+                // e.g. prioritize moving larger or smaller tenants, rather than arbitrarily
+                // moving things around in the order that we hit this function.
+                let candidate_score = candidate_score.for_optimization();
+                let current_score = current_score.for_optimization();
+
+                if candidate_score < current_score {
+                    tracing::info!("Found a lower scoring location! {candidate} is better than {current} ({candidate_score:?} is better than {current_score:?})");
+                    Some(true)
+                } else {
+                    // The candidate node is no better than our current location, so don't migrate
+                    tracing::debug!(
+                        "Candidate node {candidate} is no better than our current location {current} (candidate {candidate_score:?} vs current {current_score:?})",
+                    );
+                    Some(false)
+                }
+            }
+            None => {
+                // The current node is unavailable for scheduling, so we can't make any sensible
+                // decisions about optimisation.  This should be a transient state -- if the node
+                // is offline then it will get evacuated, if it is blocked by a scheduling mode
+                // then we will respect that mode by doing nothing.
+                tracing::debug!("Current node {current} is unavailable for scheduling");
+                None
+            }
+        }
+    }
+
+    fn find_better_location<T: ShardTag>(
+        &self,
+        scheduler: &mut Scheduler,
+        schedule_context: &ScheduleContext,
+        current: NodeId,
+        hard_exclude: &[NodeId],
+    ) -> Option<NodeId> {
+        // Look for a lower-scoring location to attach to
+        let Ok(candidate_node) = scheduler.schedule_shard::<T>(
+            hard_exclude,
+            &self.intent.preferred_az_id,
+            schedule_context,
+        ) else {
+            // A scheduling error means we have no possible candidate replacements
+            tracing::debug!("No candidate node found");
+            return None;
+        };
+
+        if candidate_node == current {
+            // We're already at the best possible location, so don't migrate
+            tracing::debug!("Candidate node {candidate_node} is already in use");
+            return None;
+        }
+
+        self.is_better_location::<T>(scheduler, schedule_context, current, candidate_node)
+            .and_then(|better| if better { Some(candidate_node) } else { None })
+    }
+
+    /// This function is an optimization, used to avoid doing large numbers of scheduling operations
+    /// when looking for optimizations.  This function uses knowledge of how scores work to do some
+    /// fast checks for whether it may be possible to improve a score.
+    ///
+    /// If we return true, it only means that optimization _might_ be possible, not that it necessarily is.  If we
+    /// return false, it definitely means that calling [`Self::optimize_attachment`] or [`Self::optimize_secondary`] would do no
+    /// work.
+    pub(crate) fn maybe_optimizable(
+        &self,
+        scheduler: &mut Scheduler,
+        schedule_context: &ScheduleContext,
+    ) -> bool {
+        // Sharded tenant: check if any locations have a nonzero affinity score
+        if self.shard.count >= ShardCount(1) {
+            let schedule_context = schedule_context.project_detach(self);
+            for node in self.intent.all_pageservers() {
+                if let Some(af) = schedule_context.nodes.get(&node) {
+                    if *af > AffinityScore(0) {
+                        return true;
+                    }
+                }
+            }
+        }
+
+        // Attached tenant: check if the attachment is outside the preferred AZ
+        if let PlacementPolicy::Attached(_) = self.policy {
+            if let Some(attached) = self.intent.get_attached() {
+                if scheduler.get_node_az(attached) != self.intent.preferred_az_id {
+                    return true;
+                }
+            }
+        }
+
+        // Tenant with secondary locations: check if any are within the preferred AZ
+        for secondary in self.intent.get_secondary() {
+            if scheduler.get_node_az(secondary) == self.intent.preferred_az_id {
+                return true;
+            }
+        }
+
+        // Does the tenant have excess secondaries?
+        if self.intent.get_secondary().len() > self.policy.want_secondaries() {
+            return true;
+        }
+
+        // Fall through: no optimizations possible
+        false
+    }
+
     /// Optimize attachments: if a shard has a secondary location that is preferable to
     /// its primary location based on soft constraints, switch that secondary location
     /// to be attached.
     #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
     pub(crate) fn optimize_attachment(
         &self,
-        nodes: &HashMap<NodeId, Node>,
+        scheduler: &mut Scheduler,
         schedule_context: &ScheduleContext,
     ) -> Option<ScheduleOptimization> {
         let attached = (*self.intent.get_attached())?;
-        if self.intent.secondary.is_empty() {
-            // We can only do useful work if we have both attached and secondary locations: this
-            // function doesn't schedule new locations, only swaps between attached and secondaries.
-            return None;
-        }
 
-        let current_affinity_score = schedule_context.get_node_affinity(attached);
-        let current_attachment_count = schedule_context.get_node_attachments(attached);
+        let schedule_context = schedule_context.project_detach(self);
 
-        // Generate score for each node, dropping any un-schedulable nodes.
-        let all_pageservers = self.intent.all_pageservers();
-        let mut scores = all_pageservers
-            .iter()
-            .flat_map(|node_id| {
-                let node = nodes.get(node_id);
-                if node.is_none() {
-                    None
-                } else if matches!(
-                    node.unwrap().get_scheduling(),
-                    NodeSchedulingPolicy::Filling
-                ) {
-                    // If the node is currently filling, don't count it as a candidate to avoid,
-                    // racing with the background fill.
-                    None
-                } else if matches!(node.unwrap().may_schedule(), MaySchedule::No) {
-                    None
-                } else {
-                    let affinity_score = schedule_context.get_node_affinity(*node_id);
-                    let attachment_count = schedule_context.get_node_attachments(*node_id);
-                    Some((*node_id, affinity_score, attachment_count))
-                }
-            })
-            .collect::<Vec<_>>();
-
-        // Sort precedence:
-        //  1st - prefer nodes with the lowest total affinity score
-        //  2nd - prefer nodes with the lowest number of attachments in this context
-        //  3rd - if all else is equal, sort by node ID for determinism in tests.
-        scores.sort_by_key(|i| (i.1, i.2, i.0));
-
-        if let Some((preferred_node, preferred_affinity_score, preferred_attachment_count)) =
-            scores.first()
-        {
-            if attached != *preferred_node {
-                // The best alternative must be more than 1 better than us, otherwise we could end
-                // up flapping back next time we're called (e.g. there's no point migrating from
-                // a location with score 1 to a score zero, because on next location the situation
-                // would be the same, but in reverse).
-                if current_affinity_score > *preferred_affinity_score + AffinityScore(1)
-                    || current_attachment_count > *preferred_attachment_count + 1
-                {
-                    tracing::info!(
-                        "Identified optimization: migrate attachment {attached}->{preferred_node} (secondaries {:?})",
-                        self.intent.get_secondary()
-                    );
-                    return Some(ScheduleOptimization {
-                        sequence: self.sequence,
-                        action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment {
-                            old_attached_node_id: attached,
-                            new_attached_node_id: *preferred_node,
-                        }),
-                    });
-                }
-            } else {
-                tracing::debug!(
-                    "Node {} is already preferred (score {:?})",
-                    preferred_node,
-                    preferred_affinity_score
-                );
+        // If we already have a secondary that is a better (lower-scoring) location than our
+        // current attached location, then simply migrate to it.
+        for secondary in self.intent.get_secondary() {
+            if let Some(true) = self.is_better_location::<AttachedShardTag>(
+                scheduler,
+                &schedule_context,
+                attached,
+                *secondary,
+            ) {
+                return Some(ScheduleOptimization {
+                    sequence: self.sequence,
+                    action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment {
+                        old_attached_node_id: attached,
+                        new_attached_node_id: *secondary,
+                    }),
+                });
             }
         }
 
-        // Fall-through: we didn't find an optimization
-        None
+        // Given that none of our current secondaries is a better location than our current
+        // attached location (checked above), we may trim any secondaries that are not needed
+        // for the placement policy.
+        if self.intent.get_secondary().len() > self.policy.want_secondaries() {
+            // This code path cleans up extra secondaries after migrating, and/or
+            // trims extra secondaries after a PlacementPolicy::Attached(N) was
+            // modified to decrease N.
+
+            let secondary_scores = self
+                .intent
+                .get_secondary()
+                .iter()
+                .map(|node_id| {
+                    (
+                        *node_id,
+                        scheduler.compute_node_score::<NodeSecondarySchedulingScore>(
+                            *node_id,
+                            &self.intent.preferred_az_id,
+                            &schedule_context,
+                        ),
+                    )
+                })
+                .collect::<Vec<_>>();
+
+            if secondary_scores.iter().any(|score| score.1.is_none()) {
+                // Don't have full list of scores, so can't make a good decision about which to drop unless
+                // there is an obvious one in the wrong AZ (for a secondary, that means sitting in our preferred AZ)
+                for secondary in self.intent.get_secondary() {
+                    if scheduler.get_node_az(secondary) == self.intent.preferred_az_id {
+                        return Some(ScheduleOptimization {
+                            sequence: self.sequence,
+                            action: ScheduleOptimizationAction::RemoveSecondary(*secondary),
+                        });
+                    }
+                }
+
+                // Fall through: we didn't identify one to remove.  This ought to be rare.
+                tracing::warn!("Keeping extra secondaries: can't determine which of {:?} to remove (some nodes offline?)",
+                self.intent.get_secondary()
+            );
+            } else {
+                let victim = secondary_scores
+                    .iter()
+                    .max_by_key(|score| score.1.unwrap())
+                    .unwrap()
+                    .0;
+                return Some(ScheduleOptimization {
+                    sequence: self.sequence,
+                    action: ScheduleOptimizationAction::RemoveSecondary(victim),
+                });
+            }
+        }
+
+        let replacement = self.find_better_location::<AttachedShardTag>(
+            scheduler,
+            &schedule_context,
+            attached,
+            &[], // Don't exclude secondaries: our preferred attachment location may be a secondary
+        );
+
+        // We have found a candidate and confirmed that its score is preferable
+        // to our current location. See if we have a secondary location in the preferred location already: if not,
+        // then create one.
+        if let Some(replacement) = replacement {
+            // If we are currently in non-preferred AZ, then the scheduler might suggest a location that is better, but still
+            // not in our preferred AZ.  Migration has a cost in resources and an impact on the workload, so we want to avoid doing
+            // multiple hops where we might go to some other AZ before eventually finding a suitable location in our preferred
+            // AZ: skip this optimization if it is not in our final, preferred AZ.
+            //
+            // This should be a transient state, there should always be capacity eventually in our preferred AZ (even if nodes
+            // there are too overloaded for scheduler to suggest them, more should be provisioned eventually).
+            if self.intent.preferred_az_id.is_some()
+                && scheduler.get_node_az(&replacement) != self.intent.preferred_az_id
+            {
+                tracing::debug!(
+                    "Candidate node {replacement} is not in preferred AZ {:?}",
+                    self.intent.preferred_az_id
+                );
+
+                // This should only happen if our current location is not in the preferred AZ, otherwise
+                // [`Self::find_better_location`] should have rejected any other location outside the preferred AZ, because
+                // AZ is the highest priority part of NodeAttachmentSchedulingScore.
+                debug_assert!(scheduler.get_node_az(&attached) != self.intent.preferred_az_id);
+
+                return None;
+            }
+
+            if !self.intent.get_secondary().contains(&replacement) {
+                Some(ScheduleOptimization {
+                    sequence: self.sequence,
+                    action: ScheduleOptimizationAction::CreateSecondary(replacement),
+                })
+            } else {
+                // We already have a secondary in the preferred location, let's try migrating to it.  Our caller
+                // will check the warmth of the destination before deciding whether to really execute this.
+                Some(ScheduleOptimization {
+                    sequence: self.sequence,
+                    action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment {
+                        old_attached_node_id: attached,
+                        new_attached_node_id: replacement,
+                    }),
+                })
+            }
+        } else {
+            // We didn't find somewhere we'd rather be, and we don't have any excess secondaries
+            // to clean up: no action required.
+            None
+        }
     }
 
     #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
@@ -837,50 +1061,40 @@ impl TenantShard {
         scheduler: &mut Scheduler,
         schedule_context: &ScheduleContext,
     ) -> Option<ScheduleOptimization> {
-        if self.intent.secondary.is_empty() {
-            // We can only do useful work if we have both attached and secondary locations: this
-            // function doesn't schedule new locations, only swaps between attached and secondaries.
+        if self.intent.get_secondary().len() > self.policy.want_secondaries() {
+            // We have extra secondaries, perhaps to facilitate a migration of the attached location:
+            // do nothing, it is up to [`Self::optimize_attachment`] to clean them up.  When that's done,
+            // and we are called again, we will proceed.
+            tracing::debug!("Too many secondaries: skipping");
             return None;
         }
 
+        let schedule_context = schedule_context.project_detach(self);
+
         for secondary in self.intent.get_secondary() {
-            let Some(affinity_score) = schedule_context.nodes.get(secondary) else {
-                // We're already on a node unaffected any affinity constraints,
-                // so we won't change it.
-                continue;
+            // Make sure we don't try to migrate a secondary to our attached location: this case happens
+            // easily in environments without multiple AZs.
+            let exclude = match self.intent.attached {
+                Some(attached) => vec![attached],
+                None => vec![],
             };
 
-            // Let the scheduler suggest a node, where it would put us if we were scheduling afresh
-            // This implicitly limits the choice to nodes that are available, and prefers nodes
-            // with lower utilization.
-            let Ok(candidate_node) = scheduler.schedule_shard::<SecondaryShardTag>(
-                &self.intent.all_pageservers(),
-                &self.preferred_az_id,
-                schedule_context,
-            ) else {
-                // A scheduling error means we have no possible candidate replacements
-                continue;
-            };
-
-            let candidate_affinity_score = schedule_context
-                .nodes
-                .get(&candidate_node)
-                .unwrap_or(&AffinityScore::FREE);
-
-            // The best alternative must be more than 1 better than us, otherwise we could end
-            // up flapping back next time we're called.
-            if *candidate_affinity_score + AffinityScore(1) < *affinity_score {
-                // If some other node is available and has a lower score than this node, then
-                // that other node is a good place to migrate to.
-                tracing::info!(
-                    "Identified optimization: replace secondary {secondary}->{candidate_node} (current secondaries {:?})",
-                    self.intent.get_secondary()
-                );
+            let replacement = self.find_better_location::<SecondaryShardTag>(
+                scheduler,
+                &schedule_context,
+                *secondary,
+                &exclude,
+            );
+            assert!(replacement != Some(*secondary));
+            if let Some(replacement) = replacement {
+                // We have found a candidate and confirmed that its score is preferable
+                // to this secondary's current location: propose replacing the secondary
+                // on the old node with one on the candidate node.
                 return Some(ScheduleOptimization {
                     sequence: self.sequence,
                     action: ScheduleOptimizationAction::ReplaceSecondary(ReplaceSecondary {
                         old_node_id: *secondary,
-                        new_node_id: candidate_node,
+                        new_node_id: replacement,
                     }),
                 });
             }
@@ -921,11 +1135,54 @@ impl TenantShard {
                 self.intent.remove_secondary(scheduler, old_node_id);
                 self.intent.push_secondary(scheduler, new_node_id);
             }
+            ScheduleOptimizationAction::CreateSecondary(new_node_id) => {
+                self.intent.push_secondary(scheduler, new_node_id);
+            }
+            ScheduleOptimizationAction::RemoveSecondary(old_secondary) => {
+                self.intent.remove_secondary(scheduler, old_secondary);
+            }
         }
 
         true
     }
 
+    /// When a shard has several secondary locations, we need to pick one in situations where
+    /// we promote one of them to an attached location:
+    ///  - When draining a node for restart
+    ///  - When responding to a node failure
+    ///
+    /// In this context, 'preferred' does not mean the node with the best scheduling score: instead
+    /// we want to pick the node which is best for use _temporarily_ while the previous attached location
+    /// is unavailable (e.g. because it's down or deploying).  That means we prefer to use secondary
+    /// locations in a non-preferred AZ, as they're more likely to have a warm cache than a temporary
+    /// secondary in the preferred AZ (which are usually only created for migrations, and if they exist
+    /// they're probably not warmed up yet).
+    ///
+    /// If there are no secondaries, or none of them is eligible for scheduling, return None: the
+    /// caller needs to pick a node some other way.
+    pub(crate) fn preferred_secondary(&self, scheduler: &Scheduler) -> Option<NodeId> {
+        let candidates = scheduler.filter_usable_nodes(&self.intent.secondary);
+
+        // Rank candidates so that nodes _not_ in our preferred AZ sort first (key 0 before key 1): we would
+        // rather promote a long-lived secondary (which would have been scheduled in a non-preferred AZ) than a
+        // short-lived one created for optimization/migration (which would have been scheduled in our preferred
+        // AZ).  Ties within a rank are broken by node ID order, because the tuples compare lexicographically.
+        let mut candidates = candidates
+            .iter()
+            .map(|(node_id, node_az)| {
+                if node_az == &self.intent.preferred_az_id {
+                    (1, *node_id)
+                } else {
+                    (0, *node_id)
+                }
+            })
+            .collect::<Vec<_>>();
+
+        candidates.sort();
+
+        candidates.first().map(|i| i.1)
+    }
+
     /// Query whether the tenant's observed state for attached node matches its intent state, and if so,
     /// yield the node ID.  This is appropriate for emitting compute hook notifications: we are checking that
     /// the node in question is not only where we intend to attach, but that the tenant is indeed already attached there.
@@ -1122,10 +1379,15 @@ impl TenantShard {
         let result = reconciler.reconcile().await;
 
         // If we know we had a pending compute notification from some previous action, send a notification irrespective
-        // of whether the above reconcile() did any work
+        // of whether the above reconcile() did any work.  It has to be Ok() though, because otherwise we might be
+        // sending a notification of a location that isn't really attached.
         if result.is_ok() && must_notify {
             // If this fails we will send the need to retry in [`ReconcileResult::pending_compute_notification`]
             reconciler.compute_notify().await.ok();
+        } else if must_notify {
+            // Carry this flag so that the reconciler's result will indicate that it still needs to retry
+            // the compute hook notification eventually.
+            reconciler.compute_notify_failure = true;
         }
 
         // Update result counter
@@ -1202,7 +1464,7 @@ impl TenantShard {
             detach,
             reconciler_config,
             config: self.config.clone(),
-            preferred_az: self.preferred_az_id.clone(),
+            preferred_az: self.intent.preferred_az_id.clone(),
             observed: self.observed.clone(),
             original_observed: self.observed.clone(),
             compute_hook: compute_hook.clone(),
@@ -1423,7 +1685,6 @@ impl TenantShard {
             pending_compute_notification: false,
             delayed_reconcile: false,
             scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(),
-            preferred_az_id: tsp.preferred_az_id.map(AvailabilityZone),
         })
     }
 
@@ -1439,16 +1700,16 @@ impl TenantShard {
             config: serde_json::to_string(&self.config).unwrap(),
             splitting: SplitState::default(),
             scheduling_policy: serde_json::to_string(&self.scheduling_policy).unwrap(),
-            preferred_az_id: self.preferred_az_id.as_ref().map(|az| az.0.clone()),
+            preferred_az_id: self.intent.preferred_az_id.as_ref().map(|az| az.0.clone()),
         }
     }
 
     pub(crate) fn preferred_az(&self) -> Option<&AvailabilityZone> {
-        self.preferred_az_id.as_ref()
+        self.intent.preferred_az_id.as_ref()
     }
 
-    pub(crate) fn set_preferred_az(&mut self, preferred_az_id: AvailabilityZone) {
-        self.preferred_az_id = Some(preferred_az_id);
+    pub(crate) fn set_preferred_az(&mut self, preferred_az_id: Option<AvailabilityZone>) {
+        self.intent.preferred_az_id = preferred_az_id;
     }
 
     /// Returns all the nodes to which this tenant shard is attached according to the
@@ -1751,65 +2012,90 @@ pub(crate) mod tests {
     }
 
     #[test]
-    fn optimize_attachment() -> anyhow::Result<()> {
-        let nodes = make_test_nodes(3, &[]);
+    /// Simple case: moving attachment to somewhere better where we already have a secondary
+    fn optimize_attachment_simple() -> anyhow::Result<()> {
+        let nodes = make_test_nodes(
+            3,
+            &[
+                AvailabilityZone("az-a".to_string()),
+                AvailabilityZone("az-b".to_string()),
+                AvailabilityZone("az-c".to_string()),
+            ],
+        );
         let mut scheduler = Scheduler::new(nodes.values());
 
         let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1));
+        shard_a.intent.preferred_az_id = Some(AvailabilityZone("az-a".to_string()));
         let mut shard_b = make_test_tenant_shard(PlacementPolicy::Attached(1));
+        shard_b.intent.preferred_az_id = Some(AvailabilityZone("az-a".to_string()));
 
         // Initially: both nodes attached on shard 1, and both have secondary locations
         // on different nodes.
-        shard_a.intent.set_attached(&mut scheduler, Some(NodeId(1)));
-        shard_a.intent.push_secondary(&mut scheduler, NodeId(2));
+        shard_a.intent.set_attached(&mut scheduler, Some(NodeId(2)));
+        shard_a.intent.push_secondary(&mut scheduler, NodeId(1));
         shard_b.intent.set_attached(&mut scheduler, Some(NodeId(1)));
-        shard_b.intent.push_secondary(&mut scheduler, NodeId(3));
+        shard_b.intent.push_secondary(&mut scheduler, NodeId(2));
 
-        let mut schedule_context = ScheduleContext::default();
-        schedule_context.avoid(&shard_a.intent.all_pageservers());
-        schedule_context.push_attached(shard_a.intent.get_attached().unwrap());
-        schedule_context.avoid(&shard_b.intent.all_pageservers());
-        schedule_context.push_attached(shard_b.intent.get_attached().unwrap());
+        fn make_schedule_context(shard_a: &TenantShard, shard_b: &TenantShard) -> ScheduleContext {
+            let mut schedule_context = ScheduleContext::default();
+            schedule_context.avoid(&shard_a.intent.all_pageservers());
+            schedule_context.avoid(&shard_b.intent.all_pageservers());
+            schedule_context
+        }
 
-        let optimization_a = shard_a.optimize_attachment(&nodes, &schedule_context);
-
-        // Either shard should recognize that it has the option to switch to a secondary location where there
-        // would be no other shards from the same tenant, and request to do so.
+        let schedule_context = make_schedule_context(&shard_a, &shard_b);
+        let optimization_a = shard_a.optimize_attachment(&mut scheduler, &schedule_context);
         assert_eq!(
             optimization_a,
             Some(ScheduleOptimization {
                 sequence: shard_a.sequence,
                 action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment {
-                    old_attached_node_id: NodeId(1),
-                    new_attached_node_id: NodeId(2)
+                    old_attached_node_id: NodeId(2),
+                    new_attached_node_id: NodeId(1)
                 })
             })
         );
-
-        // Note that these optimizing two shards in the same tenant with the same ScheduleContext is
-        // mutually exclusive (the optimization of one invalidates the stats) -- it is the responsibility
-        // of [`Service::optimize_all`] to avoid trying
-        // to do optimizations for multiple shards in the same tenant at the same time.  Generating
-        // both optimizations is just done for test purposes
-        let optimization_b = shard_b.optimize_attachment(&nodes, &schedule_context);
-        assert_eq!(
-            optimization_b,
-            Some(ScheduleOptimization {
-                sequence: shard_b.sequence,
-                action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment {
-                    old_attached_node_id: NodeId(1),
-                    new_attached_node_id: NodeId(3)
-                })
-            })
-        );
-
-        // Applying these optimizations should result in the end state proposed
         shard_a.apply_optimization(&mut scheduler, optimization_a.unwrap());
-        assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(2)));
-        assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(1)]);
-        shard_b.apply_optimization(&mut scheduler, optimization_b.unwrap());
-        assert_eq!(shard_b.intent.get_attached(), &Some(NodeId(3)));
-        assert_eq!(shard_b.intent.get_secondary(), &vec![NodeId(1)]);
+
+        // // Either shard should recognize that it has the option to switch to a secondary location where there
+        // // would be no other shards from the same tenant, and request to do so.
+        // assert_eq!(
+        //     optimization_a_prepare,
+        //     Some(ScheduleOptimization {
+        //         sequence: shard_a.sequence,
+        //         action: ScheduleOptimizationAction::CreateSecondary(NodeId(2))
+        //     })
+        // );
+        // shard_a.apply_optimization(&mut scheduler, optimization_a_prepare.unwrap());
+
+        // let schedule_context = make_schedule_context(&shard_a, &shard_b);
+        // let optimization_a_migrate = shard_a.optimize_attachment(&mut scheduler, &schedule_context);
+        // assert_eq!(
+        //     optimization_a_migrate,
+        //     Some(ScheduleOptimization {
+        //         sequence: shard_a.sequence,
+        //         action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment {
+        //             old_attached_node_id: NodeId(1),
+        //             new_attached_node_id: NodeId(2)
+        //         })
+        //     })
+        // );
+        // shard_a.apply_optimization(&mut scheduler, optimization_a_migrate.unwrap());
+
+        // let schedule_context = make_schedule_context(&shard_a, &shard_b);
+        // let optimization_a_cleanup = shard_a.optimize_attachment(&mut scheduler, &schedule_context);
+        // assert_eq!(
+        //     optimization_a_cleanup,
+        //     Some(ScheduleOptimization {
+        //         sequence: shard_a.sequence,
+        //         action: ScheduleOptimizationAction::RemoveSecondary(NodeId(1))
+        //     })
+        // );
+        // shard_a.apply_optimization(&mut scheduler, optimization_a_cleanup.unwrap());
+
+        // // Shard B should not be moved anywhere, since the pressure on node 1 was relieved by moving shard A
+        // let schedule_context = make_schedule_context(&shard_a, &shard_b);
+        // assert_eq!(shard_b.optimize_attachment(&mut scheduler, &schedule_context), None);
 
         shard_a.intent.clear(&mut scheduler);
         shard_b.intent.clear(&mut scheduler);
@@ -1817,6 +2103,190 @@ pub(crate) mod tests {
         Ok(())
     }
 
+    #[test]
+    /// Multi-step case: the better attachment location has no secondary yet, so the optimizer has to
+    /// create one there, migrate the attachment onto it, and finally trim the surplus secondary.
+    fn optimize_attachment_multistep() -> anyhow::Result<()> {
+        let nodes = make_test_nodes(
+            3,
+            &[
+                AvailabilityZone("az-a".to_string()),
+                AvailabilityZone("az-b".to_string()),
+                AvailabilityZone("az-c".to_string()),
+            ],
+        );
+        let mut scheduler = Scheduler::new(nodes.values());
+
+        // Two shards of a tenant that wants to be in AZ A
+        let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1));
+        shard_a.intent.preferred_az_id = Some(AvailabilityZone("az-a".to_string()));
+        let mut shard_b = make_test_tenant_shard(PlacementPolicy::Attached(1));
+        shard_b.intent.preferred_az_id = Some(AvailabilityZone("az-a".to_string()));
+
+        // Both shards are initially attached in non-home AZ _and_ have secondaries in non-home AZs
+        shard_a.intent.set_attached(&mut scheduler, Some(NodeId(2)));
+        shard_a.intent.push_secondary(&mut scheduler, NodeId(3));
+        shard_b.intent.set_attached(&mut scheduler, Some(NodeId(3)));
+        shard_b.intent.push_secondary(&mut scheduler, NodeId(2));
+
+        fn make_schedule_context(shard_a: &TenantShard, shard_b: &TenantShard) -> ScheduleContext {
+            let mut schedule_context = ScheduleContext::default();
+            schedule_context.avoid(&shard_a.intent.all_pageservers());
+            schedule_context.avoid(&shard_b.intent.all_pageservers());
+            schedule_context
+        }
+        // Step 1: shard A has no location on node 1 yet, so the first proposal is to create a secondary there
+        let schedule_context = make_schedule_context(&shard_a, &shard_b);
+        let optimization_a_prepare = shard_a.optimize_attachment(&mut scheduler, &schedule_context);
+        assert_eq!(
+            optimization_a_prepare,
+            Some(ScheduleOptimization {
+                sequence: shard_a.sequence,
+                action: ScheduleOptimizationAction::CreateSecondary(NodeId(1))
+            })
+        );
+        shard_a.apply_optimization(&mut scheduler, optimization_a_prepare.unwrap());
+        // Step 2: with the secondary on node 1 in place, the attachment should migrate from node 2 to node 1
+        let schedule_context = make_schedule_context(&shard_a, &shard_b);
+        let optimization_a_migrate = shard_a.optimize_attachment(&mut scheduler, &schedule_context);
+        assert_eq!(
+            optimization_a_migrate,
+            Some(ScheduleOptimization {
+                sequence: shard_a.sequence,
+                action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment {
+                    old_attached_node_id: NodeId(2),
+                    new_attached_node_id: NodeId(1)
+                })
+            })
+        );
+        shard_a.apply_optimization(&mut scheduler, optimization_a_migrate.unwrap());
+        // Step 3: the shard now holds more secondaries than Attached(1) asks for: expect node 3's to be removed
+        let schedule_context = make_schedule_context(&shard_a, &shard_b);
+        let optimization_a_cleanup = shard_a.optimize_attachment(&mut scheduler, &schedule_context);
+        assert_eq!(
+            optimization_a_cleanup,
+            Some(ScheduleOptimization {
+                sequence: shard_a.sequence,
+                action: ScheduleOptimizationAction::RemoveSecondary(NodeId(3))
+            })
+        );
+        shard_a.apply_optimization(&mut scheduler, optimization_a_cleanup.unwrap());
+
+        // // Shard B should not be moved anywhere, since the pressure on node 1 was relieved by moving shard A
+        // let schedule_context = make_schedule_context(&shard_a, &shard_b);
+        // assert_eq!(shard_b.optimize_attachment(&mut scheduler, &schedule_context), None);
+
+        shard_a.intent.clear(&mut scheduler);
+        shard_b.intent.clear(&mut scheduler);
+
+        Ok(())
+    }
+
+    #[test]
+    /// Check that multi-step migration works when moving to somewhere that is only better by
+    /// 1 AffinityScore -- this ensures that we don't have a bug like the intermediate secondary
+    /// counting toward the affinity score such that it prevents the rest of the migration from happening.
+    fn optimize_attachment_marginal() -> anyhow::Result<()> {
+        let nodes = make_test_nodes(2, &[]);
+        let mut scheduler = Scheduler::new(nodes.values());
+
+        // Multi-sharded tenant, we will craft a situation where affinity
+        // scores differ only slightly
+        let mut shards = make_test_tenant(PlacementPolicy::Attached(0), ShardCount::new(4), None);
+
+        // 1 attached on node 1
+        shards[0]
+            .intent
+            .set_attached(&mut scheduler, Some(NodeId(1)));
+        // 3 attached on node 2
+        shards[1]
+            .intent
+            .set_attached(&mut scheduler, Some(NodeId(2)));
+        shards[2]
+            .intent
+            .set_attached(&mut scheduler, Some(NodeId(2)));
+        shards[3]
+            .intent
+            .set_attached(&mut scheduler, Some(NodeId(2)));
+
+        // The scheduler should figure out that, for one of the shards on node 2 (we drive shards[1]), we need to:
+        // - Create a secondary on node 1
+        // - Migrate the attachment to node 1
+        // - Remove the leftover location on node 2
+
+        fn make_schedule_context(shards: &Vec<TenantShard>) -> ScheduleContext {
+            let mut schedule_context = ScheduleContext::default();
+            for shard in shards {
+                schedule_context.avoid(&shard.intent.all_pageservers());
+            }
+            schedule_context
+        }
+        // Step 1: expect a proposal to create a secondary for shards[1] on node 1
+        let schedule_context = make_schedule_context(&shards);
+        let optimization_a_prepare =
+            shards[1].optimize_attachment(&mut scheduler, &schedule_context);
+        assert_eq!(
+            optimization_a_prepare,
+            Some(ScheduleOptimization {
+                sequence: shards[1].sequence,
+                action: ScheduleOptimizationAction::CreateSecondary(NodeId(1))
+            })
+        );
+        shards[1].apply_optimization(&mut scheduler, optimization_a_prepare.unwrap());
+        // Step 2: the new secondary on node 1 must not block the migration of the attachment onto it
+        let schedule_context = make_schedule_context(&shards);
+        let optimization_a_migrate =
+            shards[1].optimize_attachment(&mut scheduler, &schedule_context);
+        assert_eq!(
+            optimization_a_migrate,
+            Some(ScheduleOptimization {
+                sequence: shards[1].sequence,
+                action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment {
+                    old_attached_node_id: NodeId(2),
+                    new_attached_node_id: NodeId(1)
+                })
+            })
+        );
+        shards[1].apply_optimization(&mut scheduler, optimization_a_migrate.unwrap());
+        // Step 3: the policy is Attached(0), so the location left behind on node 2 should be removed
+        let schedule_context = make_schedule_context(&shards);
+        let optimization_a_cleanup =
+            shards[1].optimize_attachment(&mut scheduler, &schedule_context);
+        assert_eq!(
+            optimization_a_cleanup,
+            Some(ScheduleOptimization {
+                sequence: shards[1].sequence,
+                action: ScheduleOptimizationAction::RemoveSecondary(NodeId(2))
+            })
+        );
+        shards[1].apply_optimization(&mut scheduler, optimization_a_cleanup.unwrap());
+
+        // Everything should be stable now: no shard should propose a further attachment optimization
+        let schedule_context = make_schedule_context(&shards);
+        assert_eq!(
+            shards[0].optimize_attachment(&mut scheduler, &schedule_context),
+            None
+        );
+        assert_eq!(
+            shards[1].optimize_attachment(&mut scheduler, &schedule_context),
+            None
+        );
+        assert_eq!(
+            shards[2].optimize_attachment(&mut scheduler, &schedule_context),
+            None
+        );
+        assert_eq!(
+            shards[3].optimize_attachment(&mut scheduler, &schedule_context),
+            None
+        );
+
+        for mut shard in shards {
+            shard.intent.clear(&mut scheduler);
+        }
+
+        Ok(())
+    }
+
     #[test]
     fn optimize_secondary() -> anyhow::Result<()> {
         let nodes = make_test_nodes(4, &[]);
@@ -1834,9 +2304,7 @@ pub(crate) mod tests {
 
         let mut schedule_context = ScheduleContext::default();
         schedule_context.avoid(&shard_a.intent.all_pageservers());
-        schedule_context.push_attached(shard_a.intent.get_attached().unwrap());
         schedule_context.avoid(&shard_b.intent.all_pageservers());
-        schedule_context.push_attached(shard_b.intent.get_attached().unwrap());
 
         let optimization_a = shard_a.optimize_secondary(&mut scheduler, &schedule_context);
 
@@ -1867,7 +2335,6 @@ pub(crate) mod tests {
     // called repeatedly in the background.
     // Returns the applied optimizations
     fn optimize_til_idle(
-        nodes: &HashMap<NodeId, Node>,
         scheduler: &mut Scheduler,
         shards: &mut [TenantShard],
     ) -> Vec<ScheduleOptimization> {
@@ -1879,14 +2346,18 @@ pub(crate) mod tests {
 
             for shard in shards.iter() {
                 schedule_context.avoid(&shard.intent.all_pageservers());
-                if let Some(attached) = shard.intent.get_attached() {
-                    schedule_context.push_attached(*attached);
-                }
             }
 
             for shard in shards.iter_mut() {
-                let optimization = shard.optimize_attachment(nodes, &schedule_context);
+                let optimization = shard.optimize_attachment(scheduler, &schedule_context);
+                tracing::info!(
+                    "optimize_attachment({})={:?}",
+                    shard.tenant_shard_id,
+                    optimization
+                );
                 if let Some(optimization) = optimization {
+                    // Check that maybe_optimizable wouldn't have wrongly claimed this optimization didn't exist
+                    assert!(shard.maybe_optimizable(scheduler, &schedule_context));
                     optimizations.push(optimization.clone());
                     shard.apply_optimization(scheduler, optimization);
                     any_changed = true;
@@ -1894,7 +2365,15 @@ pub(crate) mod tests {
                 }
 
                 let optimization = shard.optimize_secondary(scheduler, &schedule_context);
+                tracing::info!(
+                    "optimize_secondary({})={:?}",
+                    shard.tenant_shard_id,
+                    optimization
+                );
                 if let Some(optimization) = optimization {
+                    // Check that maybe_optimizable wouldn't have wrongly claimed this optimization didn't exist
+                    assert!(shard.maybe_optimizable(scheduler, &schedule_context));
+
                     optimizations.push(optimization.clone());
                     shard.apply_optimization(scheduler, optimization);
                     any_changed = true;
@@ -1918,14 +2397,34 @@ pub(crate) mod tests {
     /// that it converges.
     #[test]
     fn optimize_add_nodes() -> anyhow::Result<()> {
-        let nodes = make_test_nodes(4, &[]);
+        let nodes = make_test_nodes(
+            9,
+            &[
+                // Initial 6 nodes
+                AvailabilityZone("az-a".to_string()),
+                AvailabilityZone("az-a".to_string()),
+                AvailabilityZone("az-b".to_string()),
+                AvailabilityZone("az-b".to_string()),
+                AvailabilityZone("az-c".to_string()),
+                AvailabilityZone("az-c".to_string()),
+                // Three we will add later
+                AvailabilityZone("az-a".to_string()),
+                AvailabilityZone("az-b".to_string()),
+                AvailabilityZone("az-c".to_string()),
+            ],
+        );
 
-        // Only show the scheduler a couple of nodes
+        // Only show the scheduler two nodes in each AZ to start with
         let mut scheduler = Scheduler::new([].iter());
-        scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap());
-        scheduler.node_upsert(nodes.get(&NodeId(2)).unwrap());
+        for i in 1..=6 {
+            scheduler.node_upsert(nodes.get(&NodeId(i)).unwrap());
+        }
 
-        let mut shards = make_test_tenant(PlacementPolicy::Attached(1), ShardCount::new(4), None);
+        let mut shards = make_test_tenant(
+            PlacementPolicy::Attached(1),
+            ShardCount::new(4),
+            Some(AvailabilityZone("az-a".to_string())),
+        );
         let mut schedule_context = ScheduleContext::default();
         for shard in &mut shards {
             assert!(shard
@@ -1933,30 +2432,50 @@ pub(crate) mod tests {
                 .is_ok());
         }
 
-        // We should see equal number of locations on the two nodes.
-        assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 4);
+        // Initial: attached locations land in the tenant's home AZ.
+        assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 2);
         assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 2);
-
-        assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 4);
+        assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 2);
         assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 2);
 
-        // Add another two nodes: we should see the shards spread out when their optimize
-        // methods are called
-        scheduler.node_upsert(nodes.get(&NodeId(3)).unwrap());
-        scheduler.node_upsert(nodes.get(&NodeId(4)).unwrap());
-        optimize_til_idle(&nodes, &mut scheduler, &mut shards);
+        // Initial: secondary locations in a remote AZ
+        assert_eq!(scheduler.get_node_shard_count(NodeId(3)), 1);
+        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(3)), 0);
+        assert_eq!(scheduler.get_node_shard_count(NodeId(4)), 1);
+        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(4)), 0);
+        assert_eq!(scheduler.get_node_shard_count(NodeId(5)), 1);
+        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(5)), 0);
+        assert_eq!(scheduler.get_node_shard_count(NodeId(6)), 1);
+        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(6)), 0);
 
-        assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 2);
+        // Add another three nodes: we should see the shards spread out when their optimize
+        // methods are called
+        scheduler.node_upsert(nodes.get(&NodeId(7)).unwrap());
+        scheduler.node_upsert(nodes.get(&NodeId(8)).unwrap());
+        scheduler.node_upsert(nodes.get(&NodeId(9)).unwrap());
+        optimize_til_idle(&mut scheduler, &mut shards);
+
+        // We expect one attached location was moved to the new node in the tenant's home AZ
+        assert_eq!(scheduler.get_node_shard_count(NodeId(7)), 1);
+        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(7)), 1);
+        // The original node has one less attached shard
+        assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 1);
         assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 1);
 
+        // One of the original nodes still has two attachments, since there are an odd number of nodes
         assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 2);
-        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 1);
+        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 2);
 
-        assert_eq!(scheduler.get_node_shard_count(NodeId(3)), 2);
-        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(3)), 1);
-
-        assert_eq!(scheduler.get_node_shard_count(NodeId(4)), 2);
-        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(4)), 1);
+        // None of our secondaries moved, since we already had enough nodes for those to be
+        // scheduled perfectly
+        assert_eq!(scheduler.get_node_shard_count(NodeId(3)), 1);
+        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(3)), 0);
+        assert_eq!(scheduler.get_node_shard_count(NodeId(4)), 1);
+        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(4)), 0);
+        assert_eq!(scheduler.get_node_shard_count(NodeId(5)), 1);
+        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(5)), 0);
+        assert_eq!(scheduler.get_node_shard_count(NodeId(6)), 1);
+        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(6)), 0);
 
         for shard in shards.iter_mut() {
             shard.intent.clear(&mut scheduler);
@@ -1996,10 +2515,10 @@ pub(crate) mod tests {
             shard.schedule(&mut scheduler, context).unwrap();
         }
 
-        let applied_to_a = optimize_til_idle(&nodes, &mut scheduler, &mut a);
+        let applied_to_a = optimize_til_idle(&mut scheduler, &mut a);
         assert_eq!(applied_to_a, vec![]);
 
-        let applied_to_b = optimize_til_idle(&nodes, &mut scheduler, &mut b);
+        let applied_to_b = optimize_til_idle(&mut scheduler, &mut b);
         assert_eq!(applied_to_b, vec![]);
 
         for shard in a.iter_mut().chain(b.iter_mut()) {
diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py
index c5295360c3..fa541bad17 100644
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -131,7 +131,6 @@ PAGESERVER_GLOBAL_METRICS: tuple[str, ...] = (
     "pageserver_getpage_reconstruct_seconds_sum",
     *[f"pageserver_basebackup_query_seconds_{x}" for x in ["bucket", "count", "sum"]],
     *histogram("pageserver_smgr_query_seconds_global"),
-    *histogram("pageserver_layers_visited_per_read_global"),
     *histogram("pageserver_getpage_get_reconstruct_data_seconds"),
     *histogram("pageserver_wait_lsn_seconds"),
     *histogram("pageserver_remote_operation_seconds"),
diff --git a/test_runner/performance/test_sharding_autosplit.py b/test_runner/performance/test_sharding_autosplit.py
index caa89955e3..76c3ad01a4 100644
--- a/test_runner/performance/test_sharding_autosplit.py
+++ b/test_runner/performance/test_sharding_autosplit.py
@@ -2,6 +2,7 @@ from __future__ import annotations
 
 import concurrent.futures
 import re
+import threading
 from pathlib import Path
 
 import pytest
@@ -188,7 +189,20 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
 
         check_pgbench_output(out_path)
 
-    with concurrent.futures.ThreadPoolExecutor(max_workers=tenant_count) as pgbench_threads:
+    stop_pump = threading.Event()
+
+    def pump_controller():
+        # Background loop: call reconcile_all() on the storage controller, pausing up to
+        # 0.1s between calls, until stop_pump is set.  This forces the controller to run
+        # its background work faster than it otherwise would, which helps us:
+        #  A) to create a test that runs in a shorter time
+        #  B) to create a test that is more intensive, because the shard migrations that
+        #     follow splits happen more rapidly.
+        while not stop_pump.is_set():
+            env.storage_controller.reconcile_all()
+            stop_pump.wait(0.1)
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=tenant_count + 1) as pgbench_threads:
         pgbench_futs = []
         for tenant_state in tenants.values():
             fut = pgbench_threads.submit(run_pgbench_init, tenant_state.endpoint)
@@ -198,6 +212,8 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
         for fut in pgbench_futs:
             fut.result()
 
+        pump_fut = pgbench_threads.submit(pump_controller)
+
         pgbench_futs = []
         for tenant_state in tenants.values():
             fut = pgbench_threads.submit(run_pgbench_main, tenant_state.endpoint)
@@ -207,6 +223,9 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
         for fut in pgbench_futs:
             fut.result()
 
+        stop_pump.set()
+        pump_fut.result()
+
     def assert_all_split():
         for tenant_id in tenants.keys():
             shards = tenant_get_shards(env, tenant_id)
diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py
index 49f41483ec..d45db28c78 100644
--- a/test_runner/performance/test_storage_controller_scale.py
+++ b/test_runner/performance/test_storage_controller_scale.py
@@ -13,11 +13,13 @@ from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnv,
     NeonEnvBuilder,
+    NeonPageserver,
     PageserverAvailability,
     PageserverSchedulingPolicy,
 )
 from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
 from fixtures.pg_version import PgVersion
+from fixtures.utils import wait_until
 
 
 def get_consistent_node_shard_counts(env: NeonEnv, total_shards) -> defaultdict[str, int]:
@@ -85,8 +87,12 @@ def test_storage_controller_many_tenants(
     )
 
     AZS = ["alpha", "bravo", "charlie"]
+
+    def az_selector(node_id):
+        return "az-" + AZS[(node_id - 1) % len(AZS)]
+
     neon_env_builder.pageserver_config_override = lambda ps_cfg: ps_cfg.update(
-        {"availability_zone": f"az-{AZS[ps_cfg['id'] % len(AZS)]}"}
+        {"availability_zone": az_selector(ps_cfg["id"])}
     )
 
     # A small sleep on each call into the notify hook, to simulate the latency of doing a database write
@@ -168,6 +174,31 @@ def test_storage_controller_many_tenants(
         log.info(f"Resident memory: {rss} ({ rss / total_shards} per shard)")
         assert rss < expect_memory_per_shard * total_shards
 
+    def assert_all_tenants_scheduled_in_home_az():
+        """Assert every shard is attached in its preferred AZ, with one secondary elsewhere."""
+        for tenant_id in tenant_ids:
+            desc = env.storage_controller.tenant_describe(tenant_id)
+            preferred_az = None
+            for shard in desc["shards"]:
+                # All shards in a tenant should have the same preferred AZ
+                if preferred_az is None:
+                    preferred_az = shard["preferred_az_id"]
+                else:
+                    assert preferred_az == shard["preferred_az_id"]
+                # Attachment should be in the preferred AZ
+                assert shard["preferred_az_id"] == az_selector(
+                    shard["node_attached"]
+                ), f"Shard {shard['tenant_shard_id']} not in {shard['preferred_az_id']}"
+
+                # Secondary locations should not be in the preferred AZ
+                for node_secondary in shard["node_secondary"]:
+                    assert (
+                        shard["preferred_az_id"] != az_selector(node_secondary)
+                    ), f"Shard {shard['tenant_shard_id']} secondary should not be in {shard['preferred_az_id']}"
+
+                # There should only be one secondary location (i.e. no migrations in flight)
+                assert len(shard["node_secondary"]) == 1
+
     # Issue more concurrent operations than the storage controller's reconciler concurrency semaphore
     # permits, to ensure that we are exercising stressing that.
     api_concurrency = 135
@@ -242,6 +273,22 @@ def test_storage_controller_many_tenants(
             f"Created {len(tenants_with_timelines)} timelines in {time.time() - t1}, {len(tenants_with_timelines) / (time.time() - t1)}/s"
         )
 
+    # Check initial scheduling
+    assert_all_tenants_scheduled_in_home_az()
+    az_attached_counts: defaultdict[str, int] = defaultdict(int)
+    az_secondary_counts: defaultdict[str, int] = defaultdict(int)
+    node_attached_counts: defaultdict[str, int] = defaultdict(int)
+    for tenant_id in tenants.keys():
+        desc = env.storage_controller.tenant_describe(tenant_id)
+        for shard in desc["shards"]:
+            az_attached_counts[az_selector(shard["node_attached"])] += 1
+            node_attached_counts[shard["node_attached"]] += 1
+            for node_secondary in shard["node_secondary"]:
+                az_secondary_counts[az_selector(node_secondary)] += 1
+
+    log.info(f"Initial node attached counts: {node_attached_counts}")
+    log.info(f"Initial AZ shard counts: {az_attached_counts}, {az_secondary_counts}")
+
     # Plan operations: ensure each tenant with a timeline gets at least
     # one of each operation type.  Then add other tenants to make up the
     # numbers.
@@ -450,11 +497,77 @@ def test_storage_controller_many_tenants(
         env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120)
         env.storage_controller.consistency_check()
 
+    # Since we did `reconcile_until_idle` during the above loop, the system should be left in
+    # an optimally scheduled state.  Validate that this includes all the tenants being scheduled
+    # in their home AZ.
+    assert_all_tenants_scheduled_in_home_az()
+
     # Consistency check is safe here: restarting pageservers should not have caused any Reconcilers to spawn,
     # as they were not offline long enough to trigger any scheduling changes.
     env.storage_controller.consistency_check()
     check_memory()
 
+    # Simulate loss of an AZ
+    victim_az = "az-alpha"
+    killed_pageservers = []
+    for ps in env.pageservers:
+        if az_selector(ps.id) == victim_az:
+            ps.stop(immediate=True)
+            killed_pageservers.append(ps)
+            log.info(f"Killed pageserver {ps.id}")
+
+    assert killed_pageservers
+
+    # Wait for the controller to notice the pageservers are dead
+    def assert_pageservers_availability(
+        pageservers: list[NeonPageserver], expected_availability: PageserverAvailability
+    ):
+        """Assert controller nodes matching `pageservers` report the expected availability."""
+        nodes = env.storage_controller.nodes()
+        checked_any = False
+        node_ids = [ps.id for ps in pageservers]
+        for node in nodes:
+            if node["id"] in node_ids:
+                checked_any = True
+                assert (
+                    node["availability"] == expected_availability
+                ), f"Node {node['id']} is not {expected_availability} yet: {node['availability']}"
+        assert checked_any  # at least one node matched; otherwise the check is vacuous
+
+    wait_until(
+        lambda: assert_pageservers_availability(killed_pageservers, PageserverAvailability.OFFLINE),
+        timeout=60,
+    )
+
+    # Let the controller finish all its rescheduling
+    env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120)
+
+    # Check that all the tenants are rescheduled to the remaining pageservers
+    for tenant_id in tenant_ids:
+        desc = env.storage_controller.tenant_describe(tenant_id)
+        for shard in desc["shards"]:
+            # Attachment should be outside the AZ where we killed the pageservers
+            assert (
+                az_selector(shard["node_attached"]) != victim_az
+            ), f"Shard {shard['tenant_shard_id']} still in {victim_az} (node {shard['node_attached']})"
+
+    # Bring back the pageservers
+    for ps in killed_pageservers:
+        ps.start()
+
+    wait_until(
+        lambda: assert_pageservers_availability(killed_pageservers, PageserverAvailability.ACTIVE),
+        timeout=60,
+    )
+
+    # A very long timeout is required: we will be migrating all the tenants on all the pageservers
+    # in the region that we just restored.  Assume it'll take up to twice as long as it took to fill
+    # a single node
+    env.storage_controller.reconcile_until_idle(
+        max_interval=0.1, timeout_secs=DRAIN_FILL_TIMEOUT * 4
+    )
+    assert_all_tenants_scheduled_in_home_az()
+
     # Stop the storage controller before tearing down fixtures, because it otherwise might log
     # errors trying to call our `ComputeReconfigure`.
     env.storage_controller.stop()
diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py
index ae48a8fc27..fe0422088a 100644
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -84,9 +84,6 @@ page_cache_size=10
     log.info("Checking layer access metrics ...")
 
     layer_access_metric_names = [
-        "pageserver_layers_visited_per_read_global_sum",
-        "pageserver_layers_visited_per_read_global_count",
-        "pageserver_layers_visited_per_read_global_bucket",
         "pageserver_layers_visited_per_vectored_read_global_sum",
         "pageserver_layers_visited_per_vectored_read_global_count",
         "pageserver_layers_visited_per_vectored_read_global_bucket",
@@ -97,12 +94,6 @@ page_cache_size=10
         layer_access_metrics = metrics.query_all(name)
         log.info(f"Got metrics: {layer_access_metrics}")
 
-    non_vectored_sum = metrics.query_one("pageserver_layers_visited_per_read_global_sum")
-    non_vectored_count = metrics.query_one("pageserver_layers_visited_per_read_global_count")
-    if non_vectored_count.value != 0:
-        non_vectored_average = non_vectored_sum.value / non_vectored_count.value
-    else:
-        non_vectored_average = 0
     vectored_sum = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_sum")
     vectored_count = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_count")
     if vectored_count.value > 0:
@@ -113,11 +104,10 @@ page_cache_size=10
         assert vectored_sum.value == 0
         vectored_average = 0
 
-    log.info(f"{non_vectored_average=} {vectored_average=}")
+    log.info(f"{vectored_average=}")
 
     # The upper bound for average number of layer visits below (8)
     # was chosen empirically for this workload.
-    assert non_vectored_average < 8
     assert vectored_average < 8
 
 
diff --git a/test_runner/regress/test_compute_metrics.py b/test_runner/regress/test_compute_metrics.py
index 71963355b7..5dcc93acff 100644
--- a/test_runner/regress/test_compute_metrics.py
+++ b/test_runner/regress/test_compute_metrics.py
@@ -219,7 +219,7 @@ if SQL_EXPORTER is None:
             #
             # The "host" network mode allows sql_exporter to talk to the
             # endpoint which is running on the host.
-            super().__init__("docker.io/burningalchemist/sql_exporter:0.16.0", network_mode="host")
+            super().__init__("docker.io/burningalchemist/sql_exporter:0.17.0", network_mode="host")
 
             self.__logs_dir = logs_dir
             self.__port = port
@@ -252,7 +252,7 @@ if SQL_EXPORTER is None:
             log.info("Waiting for sql_exporter to be ready")
             wait_for_logs(
                 self,
-                rf'level=info msg="Listening on" address=\[::\]:{self.__port}',
+                rf'msg="Listening on" address=\[::\]:{self.__port}',
                 timeout=5,
             )
 
@@ -344,10 +344,7 @@ else:
                         time.sleep(0.5)
                         continue
 
-                    if (
-                        f'level=info msg="Listening on" address=[::]:{self._sql_exporter_port}'
-                        in line
-                    ):
+                    if f'msg="Listening on" address=[::]:{self._sql_exporter_port}' in line:
                         break
 
         @override
diff --git a/test_runner/regress/test_physical_replication.py b/test_runner/regress/test_physical_replication.py
index 6cb11b825d..17819fd367 100644
--- a/test_runner/regress/test_physical_replication.py
+++ b/test_runner/regress/test_physical_replication.py
@@ -187,7 +187,7 @@ def test_physical_replication_config_mismatch_too_many_known_xids(neon_simple_en
         origin=primary,
         endpoint_id="secondary",
         config_lines=[
-            "max_connections=2",
+            "max_connections=5",
             "autovacuum_max_workers=1",
             "max_worker_processes=5",
             "max_wal_senders=1",
diff --git a/test_runner/regress/test_proxy_websockets.py b/test_runner/regress/test_proxy_websockets.py
index ea01252ce4..f14317a39f 100644
--- a/test_runner/regress/test_proxy_websockets.py
+++ b/test_runner/regress/test_proxy_websockets.py
@@ -1,10 +1,15 @@
 from __future__ import annotations
 
+import asyncio
 import ssl
 
+import asyncpg
 import pytest
+import websocket_tunnel
 import websockets
+from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonProxy
+from fixtures.port_distributor import PortDistributor
 
 
 @pytest.mark.asyncio
@@ -196,3 +201,53 @@ async def test_websockets_pipelined(static_proxy: NeonProxy):
         # close
         await websocket.send(b"X\x00\x00\x00\x04")
         await websocket.wait_closed()
+
+
+@pytest.mark.asyncio
+async def test_websockets_tunneled(static_proxy: NeonProxy, port_distributor: PortDistributor):
+    """Run a query against the proxy's websocket endpoint through a local websocket_tunnel."""
+    static_proxy.safe_psql("create user ws_auth with password 'ws' superuser")
+    user = "ws_auth"
+    password = "ws"
+
+    ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
+    ssl_context.load_verify_locations(str(static_proxy.test_output_dir / "proxy.crt"))
+
+    # Launch a tunnel service so that we can speak the websockets protocol to
+    # the proxy
+    tunnel_port = port_distributor.get_port()
+    tunnel_server = await websocket_tunnel.start_server(
+        "127.0.0.1",
+        tunnel_port,
+        f"wss://{static_proxy.domain}:{static_proxy.external_http_port}/sql",
+        ssl_context,
+    )
+    log.info(f"websockets tunnel listening for connections on port {tunnel_port}")
+
+    async with tunnel_server:
+
+        async def run_tunnel():
+            try:
+                async with tunnel_server:
+                    await tunnel_server.serve_forever()
+            except Exception as e:
+                log.error(f"Error in tunnel task: {e}")
+
+        tunnel_task = asyncio.create_task(run_tunnel())
+
+        # Ok, the tunnel is now running. Check that we can connect to the proxy's
+        # websocket interface, through the tunnel
+        tunnel_connstring = f"postgres://{user}:{password}@127.0.0.1:{tunnel_port}/postgres"
+
+        log.info(f"connecting to {tunnel_connstring}")
+        conn = await asyncpg.connect(tunnel_connstring)
+        res = await conn.fetchval("SELECT 123")
+        assert res == 123
+        await conn.close()
+        log.info("Ran a query successfully through the tunnel")
+
+    tunnel_server.close()  # NOTE(review): may be redundant after the async-with exit - confirm
+    try:
+        await tunnel_task
+    except asyncio.CancelledError:  # NOTE(review): presumably raised once the server closes
+        pass
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index 673904a1cd..86a6b7428b 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -520,14 +520,18 @@ def test_sharding_split_smoke(
     shard_count = 2
     # Shard count we split into
     split_shard_count = 4
-    # We will have 2 shards per pageserver once done (including secondaries)
-    neon_env_builder.num_pageservers = split_shard_count
+    # In preferred AZ & other AZ we will end up with one shard per pageserver
+    neon_env_builder.num_pageservers = split_shard_count * 2
 
     # Two AZs
     def assign_az(ps_cfg):
         az = f"az-{(ps_cfg['id'] - 1) % 2}"
         ps_cfg["availability_zone"] = az
 
+        # We will run more pageservers than tests usually do, so give them tiny page caches
+        # in case we're on a test node under memory pressure.
+        ps_cfg["page_cache_size"] = 128
+
     neon_env_builder.pageserver_config_override = assign_az
 
     # 1MiB stripes: enable getting some meaningful data distribution without
@@ -679,8 +683,8 @@ def test_sharding_split_smoke(
     # - shard_count reconciles for the original setup of the tenant
     # - shard_count reconciles for detaching the original secondary locations during split
     # - split_shard_count reconciles during shard splitting, for setting up secondaries.
-    # - split_shard_count/2 of the child shards will need to fail over to their secondaries (since we have 8 shards and 4 pageservers, only 4 will move)
-    expect_reconciles = shard_count * 2 + split_shard_count + split_shard_count / 2
+    # - 3 * (split_shard_count/2) reconciles to migrate half the shards via temporary secondaries
+    expect_reconciles = shard_count * 2 + split_shard_count + 3 * (split_shard_count / 2)
 
     reconcile_ok = env.storage_controller.get_metric_value(
         "storage_controller_reconcile_complete_total", filter={"status": "ok"}
@@ -745,10 +749,14 @@ def test_sharding_split_smoke(
     # dominated by shard count.
     log.info(f"total: {total}")
     assert total == {
-        1: 2,
-        2: 2,
-        3: 2,
-        4: 2,
+        1: 1,
+        2: 1,
+        3: 1,
+        4: 1,
+        5: 1,
+        6: 1,
+        7: 1,
+        8: 1,
     }
 
     # The controller is not required to lay out the attached locations in any particular way, but
@@ -1387,13 +1395,7 @@ def test_sharding_split_failures(
                 else:
                     attached_count += 1
 
-        if exclude_ps_id is not None:
-            # For a node failure case, we expect there to be a secondary location
-            # scheduled on the offline node, so expect one fewer secondary in total
-            assert secondary_count == initial_shard_count - 1
-        else:
-            assert secondary_count == initial_shard_count
-
+        assert secondary_count == initial_shard_count
         assert attached_count == initial_shard_count
 
     def assert_split_done(exclude_ps_id: int | None = None) -> None:
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index da6d5b8622..8ffb6ba6b2 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -822,6 +822,122 @@ def test_storage_controller_stuck_compute_hook(
     env.storage_controller.consistency_check()
 
 
+@run_only_on_default_postgres("postgres behavior is not relevant")
+def test_storage_controller_compute_hook_retry(
+    httpserver: HTTPServer,
+    neon_env_builder: NeonEnvBuilder,
+    httpserver_listen_address: ListenAddress,
+):
+    """
+    Test that when a reconciler can't do its compute hook notification, it will keep
+    trying until it succeeds.
+
+    Reproducer for https://github.com/neondatabase/cloud/issues/22612
+    """
+
+    neon_env_builder.num_pageservers = 2
+    (host, port) = httpserver_listen_address
+    neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify"
+
+    handle_params = {"status": 200}
+
+    notifications = []
+
+    def handler(request: Request):
+        status = handle_params["status"]
+        log.info(f"Notify request[{status}]: {request}")
+        notifications.append(request.json)
+        return Response(status=status)
+
+    httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler)
+
+    # Start running
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    tenant_id = TenantId.generate()
+    env.create_tenant(tenant_id, placement_policy='{"Attached": 1}')
+
+    # Initial notification from tenant creation
+    assert len(notifications) == 1
+    expect: dict[str, list[dict[str, int]] | str | None | int] = {
+        "tenant_id": str(tenant_id),
+        "stripe_size": None,
+        "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}],
+        "preferred_az": DEFAULT_AZ_ID,
+    }
+    assert notifications[0] == expect
+
+    # Block notifications, and fail a node
+    handle_params["status"] = 423
+    env.pageservers[0].stop()
+    env.storage_controller.allowed_errors.append(NOTIFY_BLOCKED_LOG)
+    env.storage_controller.allowed_errors.extend(NOTIFY_FAILURE_LOGS)
+
+    # Avoid waiting for heartbeats
+    env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Offline"})
+
+    # Make reconciler run and fail: it should leave itself in a state where the shard will retry notification later,
+    # and we will check that that happens
+    notifications = []  # rebinds the list that handler() appends to (shared closure variable)
+    try:
+        assert env.storage_controller.reconcile_all() == 1
+    except StorageControllerApiException as e:
+        assert "Control plane tenant busy" in str(e)
+    assert len(notifications) == 1
+    assert (
+        env.storage_controller.tenant_describe(tenant_id)["shards"][0][
+            "is_pending_compute_notification"
+        ]
+        is True
+    )
+
+    # Try reconciling again, it should try notifying again
+    notifications = []
+    try:
+        assert env.storage_controller.reconcile_all() == 1
+    except StorageControllerApiException as e:
+        assert "Control plane tenant busy" in str(e)
+    assert len(notifications) == 1
+    assert (
+        env.storage_controller.tenant_describe(tenant_id)["shards"][0][
+            "is_pending_compute_notification"
+        ]
+        is True
+    )
+
+    # The describe API should still report the notification as pending when queried again
+    assert (
+        env.storage_controller.tenant_describe(tenant_id)["shards"][0][
+            "is_pending_compute_notification"
+        ]
+        is True
+    )
+
+    # Unblock notifications: reconcile should work now
+    handle_params["status"] = 200
+    notifications = []
+    assert env.storage_controller.reconcile_all() == 1
+    assert len(notifications) == 1
+    assert (
+        env.storage_controller.tenant_describe(tenant_id)["shards"][0][
+            "is_pending_compute_notification"
+        ]
+        is False
+    )
+
+    # Reconciler should be idle now that it succeeded in its compute notification
+    notifications = []
+    assert env.storage_controller.reconcile_all() == 0
+    assert len(notifications) == 0
+    assert (
+        env.storage_controller.tenant_describe(tenant_id)["shards"][0][
+            "is_pending_compute_notification"
+        ]
+        is False
+    )
+
+
 @run_only_on_default_postgres("this test doesn't start an endpoint")
 def test_storage_controller_compute_hook_revert(
     httpserver: HTTPServer,
@@ -936,7 +1052,7 @@ def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder):
     that just hits the endpoints to check that they don't bitrot.
     """
 
-    neon_env_builder.num_pageservers = 2
+    neon_env_builder.num_pageservers = 3
     env = neon_env_builder.init_start()
 
     tenant_id = TenantId.generate()
@@ -961,7 +1077,7 @@ def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder):
         "GET", f"{env.storage_controller_api}/debug/v1/scheduler"
     )
     # Two nodes, in a dict of node_id->node
-    assert len(response.json()["nodes"]) == 2
+    assert len(response.json()["nodes"]) == 3
     assert sum(v["shard_count"] for v in response.json()["nodes"].values()) == 3
     assert all(v["may_schedule"] for v in response.json()["nodes"].values())
 
@@ -972,13 +1088,25 @@ def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder):
         headers=env.storage_controller.headers(TokenScope.ADMIN),
     )
 
+    # Secondary migration API: superficial check that it migrates
+    secondary_dest = env.pageservers[2].id
+    env.storage_controller.request(
+        "PUT",
+        f"{env.storage_controller_api}/control/v1/tenant/{tenant_id}-0002/migrate_secondary",
+        headers=env.storage_controller.headers(TokenScope.ADMIN),
+        json={"tenant_shard_id": f"{tenant_id}-0002", "node_id": secondary_dest},
+    )
+    assert env.storage_controller.tenant_describe(tenant_id)["shards"][0]["node_secondary"] == [
+        secondary_dest
+    ]
+
     # Node unclean drop API
     response = env.storage_controller.request(
         "POST",
         f"{env.storage_controller_api}/debug/v1/node/{env.pageservers[1].id}/drop",
         headers=env.storage_controller.headers(TokenScope.ADMIN),
     )
-    assert len(env.storage_controller.node_list()) == 1
+    assert len(env.storage_controller.node_list()) == 2
 
     # Tenant unclean drop API
     response = env.storage_controller.request(
@@ -1696,7 +1824,13 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder):
     """
     output_dir = neon_env_builder.test_output_dir
     shard_count = 4
-    env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
+    neon_env_builder.num_pageservers = 2
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    tenant_id = TenantId.generate()
+    env.create_tenant(tenant_id, placement_policy='{"Attached":1}', shard_count=shard_count)
+
     base_args = [env.neon_binpath / "storcon_cli", "--api", env.storage_controller_api]
 
     def storcon_cli(args):
@@ -1725,7 +1859,7 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder):
     # List nodes
     node_lines = storcon_cli(["nodes"])
     # Table header, footer, and one line of data
-    assert len(node_lines) == 5
+    assert len(node_lines) == 7
     assert "localhost" in node_lines[3]
 
     # Pause scheduling onto a node
@@ -1743,10 +1877,21 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder):
     storcon_cli(["node-configure", "--node-id", "1", "--availability", "offline"])
     assert "Offline" in storcon_cli(["nodes"])[3]
 
+    # Restore node, verify status changes in CLI output
+    env.pageservers[0].start()
+
+    def is_online():
+        assert not any("Offline" in line for line in storcon_cli(["nodes"]))
+
+    wait_until(is_online)
+
+    # Let everything stabilize after node failure to avoid interfering with subsequent steps
+    env.storage_controller.reconcile_until_idle(timeout_secs=10)
+
     # List tenants
     tenant_lines = storcon_cli(["tenants"])
     assert len(tenant_lines) == 5
-    assert str(env.initial_tenant) in tenant_lines[3]
+    assert str(tenant_id) in tenant_lines[3]
 
     # Setting scheduling policies intentionally result in warnings, they're for rare use.
     env.storage_controller.allowed_errors.extend(
@@ -1754,23 +1899,58 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder):
     )
 
     # Describe a tenant
-    tenant_lines = storcon_cli(["tenant-describe", "--tenant-id", str(env.initial_tenant)])
+    tenant_lines = storcon_cli(["tenant-describe", "--tenant-id", str(tenant_id)])
     assert len(tenant_lines) >= 3 + shard_count * 2
-    assert str(env.initial_tenant) in tenant_lines[0]
+    assert str(tenant_id) in tenant_lines[0]
+
+    # Migrate an attached location
+    def other_ps_id(current_ps_id):
+        # The test runs two pageservers: return the ID of the one that is not current_ps_id
+        first_id, second_id = env.pageservers[0].id, env.pageservers[1].id
+        if current_ps_id == second_id:
+            return first_id
+        return second_id
+
+    storcon_cli(
+        [
+            "tenant-shard-migrate",
+            "--tenant-shard-id",
+            f"{tenant_id}-0004",
+            "--node",
+            str(
+                other_ps_id(
+                    env.storage_controller.tenant_describe(tenant_id)["shards"][0]["node_attached"]
+                )
+            ),
+        ]
+    )
+
+    # Migrate a secondary location
+    storcon_cli(
+        [
+            "tenant-shard-migrate-secondary",
+            "--tenant-shard-id",
+            f"{tenant_id}-0004",
+            "--node",
+            str(
+                other_ps_id(
+                    env.storage_controller.tenant_describe(tenant_id)["shards"][0][
+                        "node_secondary"
+                    ][0]
+                )
+            ),
+        ]
+    )
 
     # Pause changes on a tenant
-    storcon_cli(["tenant-policy", "--tenant-id", str(env.initial_tenant), "--scheduling", "stop"])
+    storcon_cli(["tenant-policy", "--tenant-id", str(tenant_id), "--scheduling", "stop"])
     assert "Stop" in storcon_cli(["tenants"])[3]
 
     # Cancel ongoing reconcile on a tenant
-    storcon_cli(
-        ["tenant-shard-cancel-reconcile", "--tenant-shard-id", f"{env.initial_tenant}-0104"]
-    )
+    storcon_cli(["tenant-shard-cancel-reconcile", "--tenant-shard-id", f"{tenant_id}-0104"])
 
     # Change a tenant's placement
-    storcon_cli(
-        ["tenant-policy", "--tenant-id", str(env.initial_tenant), "--placement", "secondary"]
-    )
+    storcon_cli(["tenant-policy", "--tenant-id", str(tenant_id), "--placement", "secondary"])
     assert "Secondary" in storcon_cli(["tenants"])[3]
 
     # Modify a tenant's config
@@ -1778,7 +1958,7 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder):
         [
             "patch-tenant-config",
             "--tenant-id",
-            str(env.initial_tenant),
+            str(tenant_id),
             "--config",
             json.dumps({"pitr_interval": "1m"}),
         ]
@@ -3033,11 +3213,12 @@ def eq_safekeeper_records(a: dict[str, Any], b: dict[str, Any]) -> bool:
 @run_only_on_default_postgres("this is like a 'unit test' against storcon db")
 def test_shard_preferred_azs(neon_env_builder: NeonEnvBuilder):
     def assign_az(ps_cfg):
-        az = f"az-{ps_cfg['id']}"
+        az = f"az-{ps_cfg['id'] % 2}"
+        log.info(f"Assigned AZ {az}")
         ps_cfg["availability_zone"] = az
 
     neon_env_builder.pageserver_config_override = assign_az
-    neon_env_builder.num_pageservers = 2
+    neon_env_builder.num_pageservers = 4
     env = neon_env_builder.init_configs()
     env.start()
 
@@ -3052,8 +3233,14 @@ def test_shard_preferred_azs(neon_env_builder: NeonEnvBuilder):
 
         assert shards[0]["preferred_az_id"] == expected_az
 
+    # When all other schedule scoring parameters are equal, tenants should round-robin on AZs
+    assert env.storage_controller.tenant_describe(tids[0])["shards"][0]["preferred_az_id"] == "az-0"
+    assert env.storage_controller.tenant_describe(tids[1])["shards"][0]["preferred_az_id"] == "az-1"
+    assert env.storage_controller.tenant_describe(tids[2])["shards"][0]["preferred_az_id"] == "az-0"
+
+    # Try modifying preferred AZ
     updated = env.storage_controller.set_preferred_azs(
-        {TenantShardId(tid, 0, 0): "foo" for tid in tids}
+        {TenantShardId(tid, 0, 0): "az-0" for tid in tids}
     )
 
     assert set(updated) == set([TenantShardId(tid, 0, 0) for tid in tids])
@@ -3061,29 +3248,24 @@ def test_shard_preferred_azs(neon_env_builder: NeonEnvBuilder):
     for tid in tids:
         shards = env.storage_controller.tenant_describe(tid)["shards"]
         assert len(shards) == 1
-        assert shards[0]["preferred_az_id"] == "foo"
+        assert shards[0]["preferred_az_id"] == "az-0"
 
-    # Generate a layer to avoid shard split handling on ps from tripping
-    # up on debug assert.
-    timeline_id = TimelineId.generate()
-    env.create_timeline("bar", tids[0], timeline_id)
-
-    workload = Workload(env, tids[0], timeline_id, branch_name="bar")
-    workload.init()
-    workload.write_rows(256)
-    workload.validate()
+    # Having modified preferred AZ, we should get moved there
+    env.storage_controller.reconcile_until_idle(max_interval=0.1)
+    for tid in tids:
+        shard = env.storage_controller.tenant_describe(tid)["shards"][0]
+        attached_to = shard["node_attached"]
+        attached_in_az = env.get_pageserver(attached_to).az_id
+        assert shard["preferred_az_id"] == attached_in_az == "az-0"
 
     env.storage_controller.tenant_shard_split(tids[0], shard_count=2)
+    env.storage_controller.reconcile_until_idle(max_interval=0.1)
     shards = env.storage_controller.tenant_describe(tids[0])["shards"]
     assert len(shards) == 2
     for shard in shards:
         attached_to = shard["node_attached"]
-        expected_az = env.get_pageserver(attached_to).az_id
-
-        # The scheduling optimization logic is not yet AZ-aware, so doesn't succeed
-        # in putting the tenant shards in the preferred AZ.
-        # To be fixed in https://github.com/neondatabase/neon/pull/9916
-        # assert shard["preferred_az_id"] == expected_az
+        attached_in_az = env.get_pageserver(attached_to).az_id
+        assert shard["preferred_az_id"] == attached_in_az == "az-0"
 
 
 @run_only_on_default_postgres("Postgres version makes no difference here")
diff --git a/test_runner/websocket_tunnel.py b/test_runner/websocket_tunnel.py
new file mode 100755
index 0000000000..facdb19140
--- /dev/null
+++ b/test_runner/websocket_tunnel.py
@@ -0,0 +1,154 @@
+#!/usr/bin/env python3
+#
+# This program helps to test the WebSocket tunneling in proxy. It listens for a TCP
+# connection on a port, and when you connect to it, it opens a websocket connection,
+# and forwards all the traffic to the websocket connection, wrapped in WebSocket binary
+# frames.
+#
+# This is used in the test_proxy::test_websockets test, but it is handy for manual testing too.
+#
+# Usage for manual testing:
+#
+# ## Launch Postgres on port 3000:
+# postgres -D data -p3000
+#
+# ## Launch proxy with WSS enabled:
+# openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj '/CN=*.neon.localtest.me'
+# ./target/debug/proxy --wss 127.0.0.1:40433 --http 127.0.0.1:28080 --mgmt 127.0.0.1:9099 --proxy 127.0.0.1:4433 --tls-key server.key --tls-cert server.crt --auth-backend postgres
+#
+# ## Launch the tunnel:
+#
+# poetry run ./test_runner/websocket_tunnel.py --ws-port 40433 --ws-url "wss://ep-test.neon.localtest.me"
+#
+# ## Now you can connect with psql:
+# psql "postgresql://heikki@localhost:40444/postgres"
+#
+
+import argparse
+import asyncio
+import logging
+import ssl
+from ssl import Purpose
+
+import websockets
+from fixtures.log_helper import log
+
+
+# Enable verbose logging of all the traffic: emit the "websockets" logger at DEBUG level
+def enable_verbose_logging():
+    ws_logger = logging.getLogger("websockets")
+    ws_logger.addHandler(logging.StreamHandler())
+    ws_logger.setLevel(logging.DEBUG)
+
+
+async def start_server(tcp_listen_host, tcp_listen_port, ws_url, ctx):
+    # Each accepted TCP connection is passed to handle_client along with ws_url and ctx
+    return await asyncio.start_server(
+        lambda r, w: handle_client(r, w, ws_url, ctx), tcp_listen_host, tcp_listen_port
+    )
+
+
+async def handle_tcp_to_websocket(tcp_reader, ws):
+    """Forward bytes from the TCP stream to the websocket until the TCP side hits EOF."""
+    try:
+        # read() returns b"" at EOF; stop there instead of sending an empty frame
+        while data := await tcp_reader.read(1024):
+            await ws.send(data)
+    except websockets.exceptions.ConnectionClosedError as e:
+        log.debug(f"connection closed: {e}")
+    except websockets.exceptions.ConnectionClosedOK:
+        log.debug("connection closed")
+    except Exception as e:
+        log.error(e)
+
+
+async def handle_websocket_to_tcp(ws, tcp_writer):
+    try:
+        async for frame in ws:  # copy each received websocket message to the TCP side
+            tcp_writer.write(frame)
+            await tcp_writer.drain()  # let the TCP write buffer flush before reading more
+    except websockets.exceptions.ConnectionClosedError as err:
+        log.debug(f"connection closed: {err}")
+    except websockets.exceptions.ConnectionClosedOK:
+        log.debug("connection closed")
+    except Exception as err:
+        log.error(err)
+
+
+async def handle_client(tcp_reader, tcp_writer, ws_url: str, ctx: ssl.SSLContext):
+    """Tunnel one accepted TCP connection through a new websocket connection to ws_url."""
+    try:
+        log.info("Received TCP connection. Connecting to websockets proxy.")
+        async with websockets.connect(ws_url, ssl=ctx) as ws:
+            try:
+                log.info("Connected to websockets proxy")
+                async with asyncio.TaskGroup() as tg:
+                    task1 = tg.create_task(handle_tcp_to_websocket(tcp_reader, ws))
+                    task2 = tg.create_task(handle_websocket_to_tcp(ws, tcp_writer))
+
+                    # When either direction ends, close both sides so the other task stops
+                    await asyncio.wait([task1, task2], return_when=asyncio.FIRST_COMPLETED)
+                    tcp_writer.close()
+                    await ws.close()
+            except* Exception as ex:
+                log.error(ex.exceptions)
+    except Exception as e:
+        log.error(e)
+    finally:
+        # Also release the TCP connection when the websocket connection could not be opened
+        tcp_writer.close()
+
+
+async def main():
+    """Parse the command line, start the TCP listener and serve connections forever."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--tcp-listen-addr",
+        default="localhost",
+        help="TCP addr to listen on",
+    )
+    parser.add_argument(
+        "--tcp-listen-port",
+        type=int,
+        default=40444,
+        help="TCP port to listen on",
+    )
+    parser.add_argument(
+        "--ws-url",
+        default="wss://localhost/",
+        help="websocket URL to connect to. This determines the Host header sent to the server",
+    )
+    # NOTE(review): --ws-host/--ws-port are only echoed below; connections go to --ws-url
+    parser.add_argument(
+        "--ws-host",
+        default="127.0.0.1",
+        help="websockets host to connect to",
+    )
+    parser.add_argument(
+        "--ws-port",
+        type=int,
+        default=443,
+        help="websockets port to connect to",
+    )
+    parser.add_argument(
+        "--verbose",
+        action="store_true",
+        help="enable verbose logging",
+    )
+    args = parser.parse_args()
+    if args.verbose:
+        enable_verbose_logging()
+
+    ctx = ssl.create_default_context(Purpose.SERVER_AUTH)
+    ctx.check_hostname = False
+    ctx.verify_mode = ssl.CERT_NONE
+    server = await start_server(args.tcp_listen_addr, args.tcp_listen_port, args.ws_url, ctx)
+    print(
+        f"Listening for connections at {args.tcp_listen_addr}:{args.tcp_listen_port}, forwarding them to {args.ws_host}:{args.ws_port}"
+    )
+    async with server:
+        await server.serve_forever()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index c2f65b3201..210a0ba3af 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit c2f65b3201591e02ce45b66731392f98d3388e73
+Subproject commit 210a0ba3afd8134ea910b203f274b165bd4f05d7
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index f262d631ad..d3141e17a7 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit f262d631ad477a1819e84a183e5a7ef561830085
+Subproject commit d3141e17a7155e3d07c8deba4a10c748a29ba1e6
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index 97f9fde349..f63b141cfb 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit 97f9fde349c6de6d573f5ce96db07eca60ce6185
+Subproject commit f63b141cfb0c813725a6b2574049565bff643018
diff --git a/vendor/postgres-v17 b/vendor/postgres-v17
index 7e3f3974bc..0f8da73ed0 160000
--- a/vendor/postgres-v17
+++ b/vendor/postgres-v17
@@ -1 +1 @@
-Subproject commit 7e3f3974bc8895938308f94d0e96879ffae638cd
+Subproject commit 0f8da73ed08d4fc4ee58cccea008c75bfb20baa8
diff --git a/vendor/revisions.json b/vendor/revisions.json
index bff2f70931..b4d57ab709 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,18 +1,18 @@
 {
   "v17": [
     "17.2",
-    "7e3f3974bc8895938308f94d0e96879ffae638cd"
+    "0f8da73ed08d4fc4ee58cccea008c75bfb20baa8"
   ],
   "v16": [
     "16.6",
-    "97f9fde349c6de6d573f5ce96db07eca60ce6185"
+    "f63b141cfb0c813725a6b2574049565bff643018"
   ],
   "v15": [
     "15.10",
-    "f262d631ad477a1819e84a183e5a7ef561830085"
+    "d3141e17a7155e3d07c8deba4a10c748a29ba1e6"
   ],
   "v14": [
     "14.15",
-    "c2f65b3201591e02ce45b66731392f98d3388e73"
+    "210a0ba3afd8134ea910b203f274b165bd4f05d7"
   ]
 }