Compute release 2025-06-20 07:03 UTC

Compute release 2025-06-13 07:03 UTC
Compute release 2025-06-06 07:03 UTC
2026-06-19 05:10:43 +00:00 · 2025-06-20 07:03:23 +00:00 · 2025-06-13 07:03:35 +00:00 · 2025-06-06 07:03:44 +00:00 · 2025-06-04 16:53:00 +00:00 · 2025-06-04 16:53:00 +00:00
69 changed files with 991 additions and 1483 deletions
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -104,10 +104,11 @@ jobs:

      # Set some environment variables used by all the steps.
      #
-      # CARGO_FLAGS is extra options to pass to all "cargo" subcommands.
+      # CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc.
+      #   It also includes --features, if any
      #
-      # CARGO_PROFILE is passed to "cargo build", "cargo test" etc, but not to
-      #   "cargo metadata", because it doesn't accept --release or --debug options.
+      # CARGO_FEATURES is passed to "cargo metadata". It is separate from CARGO_FLAGS,
+      #   because "cargo metadata" doesn't accept --release or --debug options
      #
      # We run tests with additional features, that are turned off by default (e.g. in release builds), see
      # corresponding Cargo.toml files for their descriptions.
@@ -116,16 +117,16 @@ jobs:
          ARCH: ${{ inputs.arch }}
          SANITIZERS: ${{ inputs.sanitizers }}
        run: |
-          CARGO_FLAGS="--locked --features testing"
+          CARGO_FEATURES="--features testing"
          if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then
            cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
-            CARGO_PROFILE=""
+            CARGO_FLAGS="--locked"
          elif [[ $BUILD_TYPE == "debug" ]]; then
            cov_prefix=""
-            CARGO_PROFILE=""
+            CARGO_FLAGS="--locked"
          elif [[ $BUILD_TYPE == "release" ]]; then
            cov_prefix=""
-            CARGO_PROFILE="--release"
+            CARGO_FLAGS="--locked --release"
          fi
          if [[ $SANITIZERS == 'enabled' ]]; then
            make_vars="WITH_SANITIZERS=yes"
@@ -135,8 +136,8 @@ jobs:
          {
            echo "cov_prefix=${cov_prefix}"
            echo "make_vars=${make_vars}"
+            echo "CARGO_FEATURES=${CARGO_FEATURES}"
            echo "CARGO_FLAGS=${CARGO_FLAGS}"
-            echo "CARGO_PROFILE=${CARGO_PROFILE}"
            echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo"
          } >> $GITHUB_ENV

@@ -188,18 +189,34 @@ jobs:
          path: pg_install/v17
          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}

-      - name: Build all
-        # Note: the Makefile picks up BUILD_TYPE and CARGO_PROFILE from the env variables
-        run: mold -run make ${make_vars} all -j$(nproc) CARGO_BUILD_FLAGS="$CARGO_FLAGS"
+      - name: Build postgres v14
+        if: steps.cache_pg_14.outputs.cache-hit != 'true'
+        run: mold -run make ${make_vars} postgres-v14 -j$(nproc)
+
+      - name: Build postgres v15
+        if: steps.cache_pg_15.outputs.cache-hit != 'true'
+        run: mold -run make ${make_vars} postgres-v15 -j$(nproc)
+
+      - name: Build postgres v16
+        if: steps.cache_pg_16.outputs.cache-hit != 'true'
+        run: mold -run make ${make_vars} postgres-v16 -j$(nproc)
+
+      - name: Build postgres v17
+        if: steps.cache_pg_17.outputs.cache-hit != 'true'
+        run: mold -run make ${make_vars} postgres-v17 -j$(nproc)
+
+      - name: Build neon extensions
+        run: mold -run make ${make_vars} neon-pg-ext -j$(nproc)

      - name: Build walproposer-lib
        run: mold -run make ${make_vars} walproposer-lib -j$(nproc)

-      - name: Build unit tests
-        if: inputs.sanitizers != 'enabled'
+      - name: Run cargo build
+        env:
+          WITH_TESTS: ${{ inputs.sanitizers != 'enabled' && '--tests' || '' }}
        run: |
          export ASAN_OPTIONS=detect_leaks=0
-          ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_PROFILE --tests
+          ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins ${WITH_TESTS}

      # Do install *before* running rust tests because they might recompile the
      # binaries with different features/flags.
@@ -211,7 +228,7 @@ jobs:
          # Install target binaries
          mkdir -p /tmp/neon/bin/
          binaries=$(
-            ${cov_prefix} cargo metadata $CARGO_FLAGS --format-version=1 --no-deps |
+            ${cov_prefix} cargo metadata $CARGO_FEATURES --format-version=1 --no-deps |
            jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name'
          )
          for bin in $binaries; do
@@ -228,7 +245,7 @@ jobs:
            mkdir -p /tmp/neon/test_bin/

            test_exe_paths=$(
-              ${cov_prefix} cargo test $CARGO_FLAGS $CARGO_PROFILE --message-format=json --no-run |
+              ${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES --message-format=json --no-run |
              jq -r '.executable | select(. != null)'
            )
            for bin in $test_exe_paths; do
@@ -262,10 +279,10 @@ jobs:
          export LD_LIBRARY_PATH

          #nextest does not yet support running doctests
-          ${cov_prefix} cargo test --doc $CARGO_FLAGS $CARGO_PROFILE
+          ${cov_prefix} cargo test --doc $CARGO_FLAGS $CARGO_FEATURES

          # run all non-pageserver tests
-          ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_PROFILE -E '!package(pageserver)'
+          ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E '!package(pageserver)'

          # run pageserver tests
          # (When developing new pageserver features gated by config fields, we commonly make the rust
@@ -274,13 +291,13 @@ jobs:
          # pageserver tests from non-pageserver tests cuts down the time it takes for this CI step.)
          NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=tokio-epoll-uring  \
          ${cov_prefix} \
-          cargo nextest run $CARGO_FLAGS $CARGO_PROFILE  -E 'package(pageserver)'
+          cargo nextest run $CARGO_FLAGS $CARGO_FEATURES  -E 'package(pageserver)'

          # Run separate tests for real S3
          export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
          export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
          export REMOTE_STORAGE_S3_REGION=eu-central-1
-          ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_PROFILE -E 'package(remote_storage)' -E 'test(test_real_s3)'
+          ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_s3)'

          # Run separate tests for real Azure Blob Storage
          # XXX: replace region with `eu-central-1`-like region
@@ -289,17 +306,17 @@ jobs:
          export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
          export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
          export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
-          ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_PROFILE -E 'package(remote_storage)' -E 'test(test_real_azure)'
+          ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_azure)'

      - name: Install postgres binaries
        run: |
          # Use tar to copy files matching the pattern, preserving the paths in the destination
          tar c \
            pg_install/v* \
-            build/*/src/test/regress/*.so \
-            build/*/src/test/regress/pg_regress \
-            build/*/src/test/isolation/isolationtester \
-            build/*/src/test/isolation/pg_isolation_regress \
+            pg_install/build/*/src/test/regress/*.so \
+            pg_install/build/*/src/test/regress/pg_regress \
+            pg_install/build/*/src/test/isolation/isolationtester \
+            pg_install/build/*/src/test/isolation/pg_isolation_regress \
            | tar  x -C /tmp/neon

      - name: Upload Neon artifact
--- a/.github/workflows/build-macos.yml
+++ b/.github/workflows/build-macos.yml
@@ -110,7 +110,7 @@ jobs:

  build-walproposer-lib:
    if: |
-      contains(inputs.pg_versions, 'v17') || inputs.rebuild_everything ||
+      inputs.pg_versions != '[]' || inputs.rebuild_everything ||
      contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos')  ||
      contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
      github.ref_name == 'main'
@@ -144,7 +144,7 @@ jobs:
        id: cache_walproposer_lib
        uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
        with:
-          path: build/walproposer-lib
+          path: pg_install/build/walproposer-lib
          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-walproposer_lib-v17-${{ steps.pg_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}

      - name: Checkout submodule vendor/postgres-v17
@@ -169,11 +169,11 @@ jobs:
        run:
          make walproposer-lib -j$(sysctl -n hw.ncpu)

-      - name: Upload "build/walproposer-lib" artifact
+      - name: Upload "pg_install/build/walproposer-lib" artifact
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        with:
-          name: build--walproposer-lib
-          path: build/walproposer-lib
+          name: pg_install--build--walproposer-lib
+          path: pg_install/build/walproposer-lib
          # The artifact is supposed to be used by the next job in the same workflow,
          # so there’s no need to store it for too long.
          retention-days: 1
@@ -226,11 +226,11 @@ jobs:
          name: pg_install--v17
          path: pg_install/v17

-      - name: Download "build/walproposer-lib" artifact
+      - name: Download "pg_install/build/walproposer-lib" artifact
        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
        with:
-          name: build--walproposer-lib
-          path: build/walproposer-lib
+          name: pg_install--build--walproposer-lib
+          path: pg_install/build/walproposer-lib

      # `actions/download-artifact` doesn't preserve permissions:
      # https://github.com/actions/download-artifact?tab=readme-ov-file#permission-loss
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -670,7 +670,7 @@ jobs:
                                             ghcr.io/neondatabase/neon:${{ needs.meta.outputs.build-tag }}-bookworm-arm64

  compute-node-image-arch:
-    needs: [ check-permissions, meta ]
+    needs: [ check-permissions, build-build-tools-image, meta ]
    if: ${{ contains(fromJSON('["push-main", "pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
    permissions:
      id-token: write # aws-actions/configure-aws-credentials
@@ -743,6 +743,7 @@ jobs:
            GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
            PG_VERSION=${{ matrix.version.pg }}
            BUILD_TAG=${{ needs.meta.outputs.release-tag || needs.meta.outputs.build-tag }}
+            TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }}
            DEBIAN_VERSION=${{ matrix.version.debian }}
          provenance: false
          push: true
@@ -762,6 +763,7 @@ jobs:
            GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
            PG_VERSION=${{ matrix.version.pg }}
            BUILD_TAG=${{ needs.meta.outputs.release-tag || needs.meta.outputs.build-tag }}
+            TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }}
            DEBIAN_VERSION=${{ matrix.version.debian }}
          provenance: false
          push: true
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,4 @@
 /artifact_cache
-/build
 /pg_install
 /target
 /tmp_check
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4255,7 +4255,6 @@ dependencies = [
 "tokio-util",
 "tonic 0.13.1",
 "tracing",
- "url",
 "utils",
 "workspace_hack",
 ]
@@ -4473,8 +4472,6 @@ dependencies = [
 "pageserver_api",
 "postgres_ffi",
 "prost 0.13.5",
- "strum",
- "strum_macros",
 "thiserror 1.0.69",
 "tokio",
 "tonic 0.13.1",
--- a/1
+++ b/1
@@ -45,6 +45,7 @@ COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh
 ENV BUILD_TYPE=release
 RUN set -e \
    && mold -run make -j $(nproc) -s neon-pg-ext \
+    && rm -rf pg_install/build \
    && tar -C pg_install -czf /home/nonroot/postgres_install.tar.gz .

 # Prepare cargo-chef recipe
--- a/109
+++ b/109
@@ -1,18 +1,8 @@
 ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))

-# Where to install Postgres, default is ./pg_install, maybe useful for package
-# managers.
+# Where to install Postgres, default is ./pg_install, maybe useful for package managers
 POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/pg_install/

-# CARGO_BUILD_FLAGS: Extra flags to pass to `cargo build`. `--locked`
-# and `--features testing` are popular examples.
-#
-# CARGO_PROFILE: You can also set to override the cargo profile to
-# use. By default, it is derived from BUILD_TYPE.
-
-# All intermediate build artifacts are stored here.
-BUILD_DIR := build
-
 ICU_PREFIX_DIR := /usr/local/icu

 #
@@ -26,12 +16,12 @@ ifeq ($(BUILD_TYPE),release)
 	PG_CONFIGURE_OPTS = --enable-debug --with-openssl
 	PG_CFLAGS += -O2 -g3 $(CFLAGS)
 	PG_LDFLAGS = $(LDFLAGS)
-	CARGO_PROFILE ?= --profile=release
+	# Unfortunately, `--profile=...` is a nightly feature
+	CARGO_BUILD_FLAGS += --release
 else ifeq ($(BUILD_TYPE),debug)
 	PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend
 	PG_CFLAGS += -O0 -g3 $(CFLAGS)
 	PG_LDFLAGS = $(LDFLAGS)
-	CARGO_PROFILE ?= --profile=dev
 else
 	$(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
 endif
@@ -103,7 +93,7 @@ all: neon postgres neon-pg-ext
 .PHONY: neon
 neon: postgres-headers walproposer-lib cargo-target-dir
 	+@echo "Compiling Neon"
-	$(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS) $(CARGO_PROFILE)
+	$(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS)
 .PHONY: cargo-target-dir
 cargo-target-dir:
 	# https://github.com/rust-lang/cargo/issues/14281
@@ -114,20 +104,21 @@ cargo-target-dir:
 # Some rules are duplicated for Postgres v14 and 15. We may want to refactor
 # to avoid the duplication in the future, but it's tolerable for now.
 #
-$(BUILD_DIR)/%/config.status:
-	mkdir -p $(BUILD_DIR)
-	test -e $(BUILD_DIR)/CACHEDIR.TAG || echo "$(CACHEDIR_TAG_CONTENTS)" > $(BUILD_DIR)/CACHEDIR.TAG
+$(POSTGRES_INSTALL_DIR)/build/%/config.status:
+
+	mkdir -p $(POSTGRES_INSTALL_DIR)
+	test -e $(POSTGRES_INSTALL_DIR)/CACHEDIR.TAG || echo "$(CACHEDIR_TAG_CONTENTS)" > $(POSTGRES_INSTALL_DIR)/CACHEDIR.TAG

 	+@echo "Configuring Postgres $* build"
 	@test -s $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure || { \
 		echo "\nPostgres submodule not found in $(ROOT_PROJECT_DIR)/vendor/postgres-$*/, execute "; \
 		echo "'git submodule update --init --recursive --depth 2 --progress .' in project root.\n"; \
 		exit 1; }
-	mkdir -p $(BUILD_DIR)/$*
+	mkdir -p $(POSTGRES_INSTALL_DIR)/build/$*

 	VERSION=$*; \
 	EXTRA_VERSION=$$(cd $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION && git rev-parse HEAD); \
-	(cd $(BUILD_DIR)/$$VERSION && \
+	(cd $(POSTGRES_INSTALL_DIR)/build/$$VERSION && \
 	env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION/configure \
 		CFLAGS='$(PG_CFLAGS)' LDFLAGS='$(PG_LDFLAGS)' \
 		$(PG_CONFIGURE_OPTS) --with-extra-version=" ($$EXTRA_VERSION)" \
@@ -139,54 +130,74 @@ $(BUILD_DIR)/%/config.status:
 # the "build-all-versions" entry points) where direct mention of PostgreSQL
 # versions is used.
 .PHONY: postgres-configure-v17
-postgres-configure-v17: $(BUILD_DIR)/v17/config.status
+postgres-configure-v17: $(POSTGRES_INSTALL_DIR)/build/v17/config.status
 .PHONY: postgres-configure-v16
-postgres-configure-v16: $(BUILD_DIR)/v16/config.status
+postgres-configure-v16: $(POSTGRES_INSTALL_DIR)/build/v16/config.status
 .PHONY: postgres-configure-v15
-postgres-configure-v15: $(BUILD_DIR)/v15/config.status
+postgres-configure-v15: $(POSTGRES_INSTALL_DIR)/build/v15/config.status
 .PHONY: postgres-configure-v14
-postgres-configure-v14: $(BUILD_DIR)/v14/config.status
+postgres-configure-v14: $(POSTGRES_INSTALL_DIR)/build/v14/config.status

 # Install the PostgreSQL header files into $(POSTGRES_INSTALL_DIR)/<version>/include
 .PHONY: postgres-headers-%
 postgres-headers-%: postgres-configure-%
 	+@echo "Installing PostgreSQL $* headers"
-	$(MAKE) -C $(BUILD_DIR)/$*/src/include MAKELEVEL=0 install
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/include MAKELEVEL=0 install

 # Compile and install PostgreSQL
 .PHONY: postgres-%
 postgres-%: postgres-configure-% \
 		  postgres-headers-% # to prevent `make install` conflicts with neon's `postgres-headers`
 	+@echo "Compiling PostgreSQL $*"
-	$(MAKE) -C $(BUILD_DIR)/$* MAKELEVEL=0 install
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$* MAKELEVEL=0 install
 	+@echo "Compiling libpq $*"
-	$(MAKE) -C $(BUILD_DIR)/$*/src/interfaces/libpq install
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/interfaces/libpq install
 	+@echo "Compiling pg_prewarm $*"
-	$(MAKE) -C $(BUILD_DIR)/$*/contrib/pg_prewarm install
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_prewarm install
 	+@echo "Compiling pg_buffercache $*"
-	$(MAKE) -C $(BUILD_DIR)/$*/contrib/pg_buffercache install
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_buffercache install
 	+@echo "Compiling pg_visibility $*"
-	$(MAKE) -C $(BUILD_DIR)/$*/contrib/pg_visibility install
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_visibility install
 	+@echo "Compiling pageinspect $*"
-	$(MAKE) -C $(BUILD_DIR)/$*/contrib/pageinspect install
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install
 	+@echo "Compiling pg_trgm $*"
-	$(MAKE) -C $(BUILD_DIR)/$*/contrib/pg_trgm install
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_trgm install
 	+@echo "Compiling amcheck $*"
-	$(MAKE) -C $(BUILD_DIR)/$*/contrib/amcheck install
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/amcheck install
 	+@echo "Compiling test_decoding $*"
-	$(MAKE) -C $(BUILD_DIR)/$*/contrib/test_decoding install
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/test_decoding install

 .PHONY: postgres-check-%
 postgres-check-%: postgres-%
-	$(MAKE) -C $(BUILD_DIR)/$* MAKELEVEL=0 check
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$* MAKELEVEL=0 check

 .PHONY: neon-pg-ext-%
 neon-pg-ext-%: postgres-%
-	+@echo "Compiling neon-specific Postgres extensions for $*"
-	mkdir -p $(BUILD_DIR)/pgxn-$*
+	+@echo "Compiling neon $*"
+	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-$*
 	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \
-		-C $(BUILD_DIR)/pgxn-$*\
-		-f $(ROOT_PROJECT_DIR)/pgxn/Makefile  install
+		-C $(POSTGRES_INSTALL_DIR)/build/neon-$* \
+		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install
+	+@echo "Compiling neon_walredo $*"
+	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$*
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \
+		-C $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$* \
+		-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install
+	+@echo "Compiling neon_rmgr $*"
+	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-rmgr-$*
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \
+		-C $(POSTGRES_INSTALL_DIR)/build/neon-rmgr-$* \
+		-f $(ROOT_PROJECT_DIR)/pgxn/neon_rmgr/Makefile install
+	+@echo "Compiling neon_test_utils $*"
+	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$*
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \
+		-C $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$* \
+		-f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install
+	+@echo "Compiling neon_utils $*"
+	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-utils-$*
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \
+		-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
+		-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install

 # Build walproposer as a static library. walproposer source code is located
 # in the pgxn/neon directory.
@@ -200,15 +211,15 @@ neon-pg-ext-%: postgres-%
 .PHONY: walproposer-lib
 walproposer-lib: neon-pg-ext-v17
 	+@echo "Compiling walproposer-lib"
-	mkdir -p $(BUILD_DIR)/walproposer-lib
+	mkdir -p $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
 	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v17/bin/pg_config COPT='$(COPT)' \
-		-C $(BUILD_DIR)/walproposer-lib \
+		-C $(POSTGRES_INSTALL_DIR)/build/walproposer-lib \
 		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile walproposer-lib
-	cp $(POSTGRES_INSTALL_DIR)/v17/lib/libpgport.a $(BUILD_DIR)/walproposer-lib
-	cp $(POSTGRES_INSTALL_DIR)/v17/lib/libpgcommon.a $(BUILD_DIR)/walproposer-lib
-	$(AR) d $(BUILD_DIR)/walproposer-lib/libpgport.a \
+	cp $(POSTGRES_INSTALL_DIR)/v17/lib/libpgport.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
+	cp $(POSTGRES_INSTALL_DIR)/v17/lib/libpgcommon.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
+	$(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgport.a \
 		pg_strong_random.o
-	$(AR) d $(BUILD_DIR)/walproposer-lib/libpgcommon.a \
+	$(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgcommon.a \
 		checksum_helper.o \
 		cryptohash_openssl.o \
 		hmac_openssl.o \
@@ -216,7 +227,7 @@ walproposer-lib: neon-pg-ext-v17
 		parse_manifest.o \
 		scram-common.o
 ifeq ($(UNAME_S),Linux)
-	$(AR) d $(BUILD_DIR)/walproposer-lib/libpgcommon.a \
+	$(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgcommon.a \
 		pg_crc32c.o
 endif

@@ -261,7 +272,7 @@ fmt:

 postgres-%-pg-bsd-indent: postgres-%
 	+@echo "Compiling pg_bsd_indent"
-	$(MAKE) -C $(BUILD_DIR)/$*/src/tools/pg_bsd_indent/
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/tools/pg_bsd_indent/

 # Create typedef list for the core. Note that generally it should be combined with
 # buildfarm one to cover platform specific stuff.
@@ -280,7 +291,7 @@ postgres-%-pgindent: postgres-%-pg-bsd-indent postgres-%-typedefs.list
 	cat $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/typedefs.list |\
 		cat - postgres-$*-typedefs.list | sort | uniq > postgres-$*-typedefs-full.list
 	+@echo note: you might want to run it on selected files/dirs instead.
-	INDENT=$(BUILD_DIR)/$*/src/tools/pg_bsd_indent/pg_bsd_indent \
+	INDENT=$(POSTGRES_INSTALL_DIR)/build/$*/src/tools/pg_bsd_indent/pg_bsd_indent \
 		$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/pgindent --typedefs postgres-$*-typedefs-full.list \
 		$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/ \
 		--excludes $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/exclude_file_patterns
@@ -291,9 +302,9 @@ postgres-%-pgindent: postgres-%-pg-bsd-indent postgres-%-typedefs.list
 neon-pgindent: postgres-v17-pg-bsd-indent neon-pg-ext-v17
 	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v17/bin/pg_config COPT='$(COPT)' \
 		FIND_TYPEDEF=$(ROOT_PROJECT_DIR)/vendor/postgres-v17/src/tools/find_typedef \
-		INDENT=$(BUILD_DIR)/v17/src/tools/pg_bsd_indent/pg_bsd_indent \
+		INDENT=$(POSTGRES_INSTALL_DIR)/build/v17/src/tools/pg_bsd_indent/pg_bsd_indent \
 		PGINDENT_SCRIPT=$(ROOT_PROJECT_DIR)/vendor/postgres-v17/src/tools/pgindent/pgindent \
-		-C $(BUILD_DIR)/neon-v17 \
+		-C $(POSTGRES_INSTALL_DIR)/build/neon-v17 \
 		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile pgindent


--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -77,6 +77,9 @@
 # build_and_test.yml github workflow for how that's done.

 ARG PG_VERSION
+ARG REPOSITORY=ghcr.io/neondatabase
+ARG IMAGE=build-tools
+ARG TAG=pinned
 ARG BUILD_TAG
 ARG DEBIAN_VERSION=bookworm
 ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim
@@ -147,7 +150,6 @@ RUN case $DEBIAN_VERSION in \
    zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget ca-certificates pkg-config libssl-dev \
    libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd curl unzip g++ \
    libclang-dev \
-    jsonnet \
    $VERSION_INSTALLS \
    && apt clean && rm -rf /var/lib/apt/lists/* && \
    useradd -ms /bin/bash nonroot -b /home
@@ -1632,7 +1634,18 @@ FROM pg-build AS neon-ext-build
 ARG PG_VERSION

 COPY pgxn/ pgxn/
-RUN make -j $(getconf _NPROCESSORS_ONLN) -C pgxn -s install-compute
+RUN make -j $(getconf _NPROCESSORS_ONLN) \
+        -C pgxn/neon \
+        -s install && \
+    make -j $(getconf _NPROCESSORS_ONLN) \
+        -C pgxn/neon_utils \
+        -s install && \
+    make -j $(getconf _NPROCESSORS_ONLN) \
+        -C pgxn/neon_test_utils \
+        -s install && \
+    make -j $(getconf _NPROCESSORS_ONLN) \
+        -C pgxn/neon_rmgr \
+        -s install

 #########################################################################################
 #
@@ -1722,7 +1735,7 @@ FROM extensions-${EXTENSIONS} AS neon-pg-ext-build
 # Compile the Neon-specific `compute_ctl`, `fast_import`, and `local_proxy` binaries
 #
 #########################################################################################
-FROM build-deps-with-cargo AS compute-tools
+FROM $REPOSITORY/$IMAGE:$TAG AS compute-tools
 ARG BUILD_TAG
 ENV BUILD_TAG=$BUILD_TAG

@@ -1732,7 +1745,7 @@ COPY --chown=nonroot . .
 RUN --mount=type=cache,uid=1000,target=/home/nonroot/.cargo/registry \
    --mount=type=cache,uid=1000,target=/home/nonroot/.cargo/git \
    --mount=type=cache,uid=1000,target=/home/nonroot/target \
-    cargo build --locked --profile release-line-debug-size-lto --bin compute_ctl --bin fast_import --bin local_proxy && \
+    mold -run cargo build --locked --profile release-line-debug-size-lto --bin compute_ctl --bin fast_import --bin local_proxy && \
    mkdir target-bin && \
    cp target/release-line-debug-size-lto/compute_ctl \
       target/release-line-debug-size-lto/fast_import \
@@ -1826,11 +1839,10 @@ RUN rm /usr/local/pgsql/lib/lib*.a
 # Preprocess the sql_exporter configuration files
 #
 #########################################################################################
-FROM build-deps AS sql_exporter_preprocessor
+FROM $REPOSITORY/$IMAGE:$TAG AS sql_exporter_preprocessor
 ARG PG_VERSION

 USER nonroot
-WORKDIR /home/nonroot

 COPY --chown=nonroot compute compute

--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -408,9 +408,7 @@ impl ComputeNode {
        // N.B. keep it in sync with `ZENITH_OPTIONS` in `get_maintenance_client()`.
        const EXTRA_OPTIONS: &str = "-c role=cloud_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0";
        let options = match conn_conf.get_options() {
-            // Allow the control plane to override any options set by the
-            // compute
-            Some(options) => format!("{} {}", EXTRA_OPTIONS, options),
+            Some(options) => format!("{} {}", options, EXTRA_OPTIONS),
            None => EXTRA_OPTIONS.to_string(),
        };
        conn_conf.options(&options);
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -209,10 +209,6 @@ pub struct NeonStorageControllerConf {
    pub use_https_safekeeper_api: bool,

    pub use_local_compute_notifications: bool,
-
-    pub timeline_safekeeper_count: Option<i64>,
-
-    pub kick_secondary_downloads: Option<bool>,
 }

 impl NeonStorageControllerConf {
@@ -243,8 +239,6 @@ impl Default for NeonStorageControllerConf {
            timelines_onto_safekeepers: true,
            use_https_safekeeper_api: false,
            use_local_compute_notifications: true,
-            timeline_safekeeper_count: None,
-            kick_secondary_downloads: None,
        }
    }
 }
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -557,10 +557,6 @@ impl StorageController {
            args.push("--use-local-compute-notifications".to_string());
        }

-        if let Some(value) = self.config.kick_secondary_downloads {
-            args.push(format!("--kick-secondary-downloads={value}"));
-        }
-
        if let Some(ssl_ca_file) = self.env.ssl_ca_cert_path() {
            args.push(format!("--ssl-ca-file={}", ssl_ca_file.to_str().unwrap()));
        }
@@ -632,10 +628,6 @@ impl StorageController {
            args.push("--timelines-onto-safekeepers".to_string());
        }

-        if let Some(sk_cnt) = self.config.timeline_safekeeper_count {
-            args.push(format!("--timeline-safekeeper-count={sk_cnt}"));
-        }
-
        println!("Starting storage controller");

        background_process::start_process(
--- a/libs/compute_api/src/requests.rs
+++ b/libs/compute_api/src/requests.rs
@@ -16,7 +16,6 @@ pub static COMPUTE_AUDIENCE: &str = "compute";
 pub enum ComputeClaimsScope {
    /// An admin-scoped token allows access to all of `compute_ctl`'s authorized
    /// facilities.
-    #[serde(rename = "compute_ctl:admin")]
    Admin,
 }

@@ -25,7 +24,7 @@ impl FromStr for ComputeClaimsScope {

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
-            "compute_ctl:admin" => Ok(ComputeClaimsScope::Admin),
+            "admin" => Ok(ComputeClaimsScope::Admin),
            _ => Err(anyhow::anyhow!("invalid compute claims scope \"{s}\"")),
        }
    }
@@ -81,23 +80,3 @@ pub struct SetRoleGrantsRequest {
    pub privileges: Vec<Privilege>,
    pub role: PgIdent,
 }
-
-#[cfg(test)]
-mod test {
-    use std::str::FromStr;
-
-    use crate::requests::ComputeClaimsScope;
-
-    /// Confirm that whether we parse the scope by string or through serde, the
-    /// same values parse to the same enum variant.
-    #[test]
-    fn compute_request_scopes() {
-        const ADMIN_SCOPE: &str = "compute_ctl:admin";
-
-        let from_serde: ComputeClaimsScope =
-            serde_json::from_str(&format!("\"{ADMIN_SCOPE}\"")).unwrap();
-        let from_str = ComputeClaimsScope::from_str(ADMIN_SCOPE).unwrap();
-
-        assert_eq!(from_serde, from_str);
-    }
-}
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -76,10 +76,6 @@ pub struct PostHogConfig {
    pub private_api_url: String,
    /// Public API URL
    pub public_api_url: String,
-    /// Refresh interval for the feature flag spec
-    #[serde(skip_serializing_if = "Option::is_none")]
-    #[serde(with = "humantime_serde")]
-    pub refresh_interval: Option<Duration>,
 }

 /// `pageserver.toml`
@@ -820,7 +816,7 @@ pub mod tenant_conf_defaults {
    // By default ingest enough WAL for two new L0 layers before checking if new image
    // image layers should be created.
    pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
-    pub const DEFAULT_GC_COMPACTION_ENABLED: bool = true;
+    pub const DEFAULT_GC_COMPACTION_ENABLED: bool = false;
    pub const DEFAULT_GC_COMPACTION_VERIFICATION: bool = true;
    pub const DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB: u64 = 5 * 1024 * 1024; // 5GB
    pub const DEFAULT_GC_COMPACTION_RATIO_PERCENT: u64 = 100;
--- a/libs/pageserver_api/src/upcall_api.rs
+++ b/libs/pageserver_api/src/upcall_api.rs
@@ -23,12 +23,22 @@ pub struct ReAttachRequest {
    pub register: Option<NodeRegisterRequest>,
 }

+fn default_mode() -> LocationConfigMode {
+    LocationConfigMode::AttachedSingle
+}
+
 #[derive(Serialize, Deserialize, Debug)]
 pub struct ReAttachResponseTenant {
    pub id: TenantShardId,
    /// Mandatory if LocationConfigMode is None or set to an Attached* mode
    pub r#gen: Option<u32>,
+
+    /// Default value only for backward compat: this field should be set
+    #[serde(default = "default_mode")]
    pub mode: LocationConfigMode,
+
+    /// Default value only for backward compat: this field should be set
+    #[serde(default = "ShardStripeSize::default")]
    pub stripe_size: ShardStripeSize,
 }
 #[derive(Serialize, Deserialize)]
--- a/libs/posthog_client_lite/src/background_loop.rs
+++ b/libs/posthog_client_lite/src/background_loop.rs
@@ -36,10 +36,7 @@ impl FeatureResolverBackgroundLoop {
        // Main loop of updating the feature flags.
        handle.spawn(
            async move {
-                tracing::info!(
-                    "Starting PostHog feature resolver with refresh period: {:?}",
-                    refresh_period
-                );
+                tracing::info!("Starting PostHog feature resolver");
                let mut ticker = tokio::time::interval(refresh_period);
                ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
                loop {
--- a/libs/proxy/tokio-postgres2/src/cancel_query.rs
+++ b/libs/proxy/tokio-postgres2/src/cancel_query.rs
@@ -1,3 +1,5 @@
+use std::io;
+
 use tokio::net::TcpStream;

 use crate::client::SocketConfig;
@@ -6,7 +8,7 @@ use crate::tls::MakeTlsConnect;
 use crate::{Error, cancel_query_raw, connect_socket};

 pub(crate) async fn cancel_query<T>(
-    config: SocketConfig,
+    config: Option<SocketConfig>,
    ssl_mode: SslMode,
    tls: T,
    process_id: i32,
@@ -15,6 +17,16 @@ pub(crate) async fn cancel_query<T>(
 where
    T: MakeTlsConnect<TcpStream>,
 {
+    let config = match config {
+        Some(config) => config,
+        None => {
+            return Err(Error::connect(io::Error::new(
+                io::ErrorKind::InvalidInput,
+                "unknown host",
+            )));
+        }
+    };
+
    let hostname = match &config.host {
        Host::Tcp(host) => &**host,
    };
--- a/libs/proxy/tokio-postgres2/src/cancel_token.rs
+++ b/libs/proxy/tokio-postgres2/src/cancel_token.rs
@@ -7,16 +7,11 @@ use crate::config::SslMode;
 use crate::tls::{MakeTlsConnect, TlsConnect};
 use crate::{Error, cancel_query, cancel_query_raw};

-/// A cancellation token that allows easy cancellation of a query.
-#[derive(Clone)]
+/// The capability to request cancellation of in-progress queries on a
+/// connection.
+#[derive(Clone, Serialize, Deserialize)]
 pub struct CancelToken {
-    pub socket_config: SocketConfig,
-    pub raw: RawCancelToken,
-}
-
-/// A raw cancellation token that allows cancellation of a query, given a fresh connection to postgres.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct RawCancelToken {
+    pub socket_config: Option<SocketConfig>,
    pub ssl_mode: SslMode,
    pub process_id: i32,
    pub secret_key: i32,
@@ -41,16 +36,14 @@ impl CancelToken {
    {
        cancel_query::cancel_query(
            self.socket_config.clone(),
-            self.raw.ssl_mode,
+            self.ssl_mode,
            tls,
-            self.raw.process_id,
-            self.raw.secret_key,
+            self.process_id,
+            self.secret_key,
        )
        .await
    }
-}

-impl RawCancelToken {
    /// Like `cancel_query`, but uses a stream which is already connected to the server rather than opening a new
    /// connection itself.
    pub async fn cancel_query_raw<S, T>(&self, stream: S, tls: T) -> Result<(), Error>
--- a/libs/proxy/tokio-postgres2/src/client.rs
+++ b/libs/proxy/tokio-postgres2/src/client.rs
@@ -12,7 +12,6 @@ use postgres_protocol2::message::frontend;
 use serde::{Deserialize, Serialize};
 use tokio::sync::mpsc;

-use crate::cancel_token::RawCancelToken;
 use crate::codec::{BackendMessages, FrontendMessage};
 use crate::config::{Host, SslMode};
 use crate::query::RowStream;
@@ -332,12 +331,10 @@ impl Client {
    /// connection associated with this client.
    pub fn cancel_token(&self) -> CancelToken {
        CancelToken {
-            socket_config: self.socket_config.clone(),
-            raw: RawCancelToken {
-                ssl_mode: self.ssl_mode,
-                process_id: self.process_id,
-                secret_key: self.secret_key,
-            },
+            socket_config: Some(self.socket_config.clone()),
+            ssl_mode: self.ssl_mode,
+            process_id: self.process_id,
+            secret_key: self.secret_key,
        }
    }

--- a/libs/proxy/tokio-postgres2/src/lib.rs
+++ b/libs/proxy/tokio-postgres2/src/lib.rs
@@ -3,7 +3,7 @@

 use postgres_protocol2::message::backend::ReadyForQueryBody;

-pub use crate::cancel_token::{CancelToken, RawCancelToken};
+pub use crate::cancel_token::CancelToken;
 pub use crate::client::{Client, SocketConfig};
 pub use crate::config::Config;
 pub use crate::connect_raw::RawConnection;
--- a/libs/walproposer/build.rs
+++ b/libs/walproposer/build.rs
@@ -13,24 +13,22 @@ fn main() -> anyhow::Result<()> {
    // Tell cargo to invalidate the built crate whenever the wrapper changes
    println!("cargo:rerun-if-changed=bindgen_deps.h");

-    let root_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../..");
-
    // Finding the location of built libraries and Postgres C headers:
    // - if POSTGRES_INSTALL_DIR is set look into it, otherwise look into `<project_root>/pg_install`
    // - if there's a `bin/pg_config` file use it for getting include server, otherwise use `<project_root>/pg_install/{PG_MAJORVERSION}/include/postgresql/server`
    let pg_install_dir = if let Some(postgres_install_dir) = env::var_os("POSTGRES_INSTALL_DIR") {
        postgres_install_dir.into()
    } else {
-        root_path.join("pg_install")
+        PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../pg_install")
    };

    let pg_install_abs = std::fs::canonicalize(pg_install_dir)?;
-    let walproposer_lib_dir = root_path.join("build/walproposer-lib");
+    let walproposer_lib_dir = pg_install_abs.join("build/walproposer-lib");
    let walproposer_lib_search_str = walproposer_lib_dir
        .to_str()
        .ok_or(anyhow!("Bad non-UTF path"))?;

-    let pgxn_neon = root_path.join("pgxn/neon");
+    let pgxn_neon = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../pgxn/neon");
    let pgxn_neon = std::fs::canonicalize(pgxn_neon)?;
    let pgxn_neon = pgxn_neon.to_str().ok_or(anyhow!("Bad non-UTF path"))?;

--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -12,9 +12,6 @@ testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing", "

 fuzz-read-path = ["testing"]

-# Enables benchmarking only APIs
-benchmarking = []
-
 [dependencies]
 anyhow.workspace = true
 arc-swap.workspace = true
@@ -130,7 +127,6 @@ harness = false
 [[bench]]
 name = "bench_ingest"
 harness = false
-required-features = ["benchmarking"]

 [[bench]]
 name = "upload_queue"
--- a/pageserver/benches/bench_ingest.rs
+++ b/pageserver/benches/bench_ingest.rs
@@ -1,29 +1,22 @@
 use std::env;
 use std::num::NonZeroUsize;
-use std::sync::Arc;

 use bytes::Bytes;
 use camino::Utf8PathBuf;
 use criterion::{Criterion, criterion_group, criterion_main};
-use futures::stream::FuturesUnordered;
 use pageserver::config::PageServerConf;
 use pageserver::context::{DownloadBehavior, RequestContext};
-use pageserver::keyspace::KeySpace;
 use pageserver::l0_flush::{L0FlushConfig, L0FlushGlobalState};
 use pageserver::task_mgr::TaskKind;
-use pageserver::tenant::storage_layer::IoConcurrency;
-use pageserver::tenant::storage_layer::{InMemoryLayer, ValuesReconstructState};
+use pageserver::tenant::storage_layer::InMemoryLayer;
 use pageserver::{page_cache, virtual_file};
-use pageserver_api::config::GetVectoredConcurrentIo;
 use pageserver_api::key::Key;
 use pageserver_api::models::virtual_file::IoMode;
 use pageserver_api::shard::TenantShardId;
-use tokio_stream::StreamExt;
+use strum::IntoEnumIterator;
 use tokio_util::sync::CancellationToken;
 use utils::bin_ser::BeSer;
 use utils::id::{TenantId, TimelineId};
-use utils::lsn::Lsn;
-use utils::sync::gate::Gate;
 use wal_decoder::models::value::Value;
 use wal_decoder::serialized_batch::SerializedValueBatch;

@@ -37,7 +30,7 @@ fn murmurhash32(mut h: u32) -> u32 {
    h
 }

-#[derive(serde::Serialize, Clone, Copy, Debug, PartialEq)]
+#[derive(serde::Serialize, Clone, Copy, Debug)]
 enum KeyLayout {
    /// Sequential unique keys
    Sequential,
@@ -47,30 +40,19 @@ enum KeyLayout {
    RandomReuse(u32),
 }

-#[derive(serde::Serialize, Clone, Copy, Debug, PartialEq)]
+#[derive(serde::Serialize, Clone, Copy, Debug)]
 enum WriteDelta {
    Yes,
    No,
 }

-#[derive(serde::Serialize, Clone, Copy, Debug, PartialEq)]
-enum ConcurrentReads {
-    Yes,
-    No,
-}
-
 async fn ingest(
    conf: &'static PageServerConf,
    put_size: usize,
    put_count: usize,
    key_layout: KeyLayout,
    write_delta: WriteDelta,
-    concurrent_reads: ConcurrentReads,
 ) -> anyhow::Result<()> {
-    if concurrent_reads == ConcurrentReads::Yes {
-        assert_eq!(key_layout, KeyLayout::Sequential);
-    }
-
    let mut lsn = utils::lsn::Lsn(1000);
    let mut key = Key::from_i128(0x0);

@@ -86,18 +68,16 @@ async fn ingest(
    let gate = utils::sync::gate::Gate::default();
    let cancel = CancellationToken::new();

-    let layer = Arc::new(
-        InMemoryLayer::create(
-            conf,
-            timeline_id,
-            tenant_shard_id,
-            lsn,
-            &gate,
-            &cancel,
-            &ctx,
-        )
-        .await?,
-    );
+    let layer = InMemoryLayer::create(
+        conf,
+        timeline_id,
+        tenant_shard_id,
+        lsn,
+        &gate,
+        &cancel,
+        &ctx,
+    )
+    .await?;

    let data = Value::Image(Bytes::from(vec![0u8; put_size]));
    let data_ser_size = data.serialized_size().unwrap() as usize;
@@ -106,61 +86,6 @@ async fn ingest(
        pageserver::context::DownloadBehavior::Download,
    );

-    const READ_BATCH_SIZE: u32 = 32;
-    let (tx, mut rx) = tokio::sync::watch::channel::<Option<Key>>(None);
-    let reader_cancel = CancellationToken::new();
-    let reader_handle = if concurrent_reads == ConcurrentReads::Yes {
-        Some(tokio::task::spawn({
-            let cancel = reader_cancel.clone();
-            let layer = layer.clone();
-            let ctx = ctx.attached_child();
-            async move {
-                let gate = Gate::default();
-                let gate_guard = gate.enter().unwrap();
-                let io_concurrency = IoConcurrency::spawn_from_conf(
-                    GetVectoredConcurrentIo::SidecarTask,
-                    gate_guard,
-                );
-
-                rx.wait_for(|key| key.is_some()).await.unwrap();
-
-                while !cancel.is_cancelled() {
-                    let key = match *rx.borrow() {
-                        Some(some) => some,
-                        None => unreachable!(),
-                    };
-
-                    let mut start_key = key;
-                    start_key.field6 = key.field6.saturating_sub(READ_BATCH_SIZE);
-                    let key_range = start_key..key.next();
-
-                    let mut reconstruct_state = ValuesReconstructState::new(io_concurrency.clone());
-
-                    layer
-                        .get_values_reconstruct_data(
-                            KeySpace::single(key_range),
-                            Lsn(1)..Lsn(u64::MAX),
-                            &mut reconstruct_state,
-                            &ctx,
-                        )
-                        .await
-                        .unwrap();
-
-                    let mut collect_futs = std::mem::take(&mut reconstruct_state.keys)
-                        .into_values()
-                        .map(|state| state.sink_pending_ios())
-                        .collect::<FuturesUnordered<_>>();
-                    while collect_futs.next().await.is_some() {}
-                }
-
-                drop(io_concurrency);
-                gate.close().await;
-            }
-        }))
-    } else {
-        None
-    };
-
    const BATCH_SIZE: usize = 16;
    let mut batch = Vec::new();

@@ -188,27 +113,19 @@ async fn ingest(

        batch.push((key.to_compact(), lsn, data_ser_size, data.clone()));
        if batch.len() >= BATCH_SIZE {
-            let last_key = Key::from_compact(batch.last().unwrap().0);
-
            let this_batch = std::mem::take(&mut batch);
            let serialized = SerializedValueBatch::from_values(this_batch);
            layer.put_batch(serialized, &ctx).await?;
-
-            tx.send(Some(last_key)).unwrap();
        }
    }
    if !batch.is_empty() {
-        let last_key = Key::from_compact(batch.last().unwrap().0);
-
        let this_batch = std::mem::take(&mut batch);
        let serialized = SerializedValueBatch::from_values(this_batch);
        layer.put_batch(serialized, &ctx).await?;
-
-        tx.send(Some(last_key)).unwrap();
    }
    layer.freeze(lsn + 1).await;

-    if write_delta == WriteDelta::Yes {
+    if matches!(write_delta, WriteDelta::Yes) {
        let l0_flush_state = L0FlushGlobalState::new(L0FlushConfig::Direct {
            max_concurrency: NonZeroUsize::new(1).unwrap(),
        });
@@ -219,11 +136,6 @@ async fn ingest(
        tokio::fs::remove_file(path).await?;
    }

-    reader_cancel.cancel();
-    if let Some(handle) = reader_handle {
-        handle.await.unwrap();
-    }
-
    Ok(())
 }

@@ -235,7 +147,6 @@ fn ingest_main(
    put_count: usize,
    key_layout: KeyLayout,
    write_delta: WriteDelta,
-    concurrent_reads: ConcurrentReads,
 ) {
    pageserver::virtual_file::set_io_mode(io_mode);

@@ -245,15 +156,7 @@ fn ingest_main(
        .unwrap();

    runtime.block_on(async move {
-        let r = ingest(
-            conf,
-            put_size,
-            put_count,
-            key_layout,
-            write_delta,
-            concurrent_reads,
-        )
-        .await;
+        let r = ingest(conf, put_size, put_count, key_layout, write_delta).await;
        if let Err(e) = r {
            panic!("{e:?}");
        }
@@ -292,7 +195,6 @@ fn criterion_benchmark(c: &mut Criterion) {
        key_size: usize,
        key_layout: KeyLayout,
        write_delta: WriteDelta,
-        concurrent_reads: ConcurrentReads,
    }
    #[derive(Clone)]
    struct HandPickedParameters {
@@ -343,7 +245,7 @@ fn criterion_benchmark(c: &mut Criterion) {
    ];
    let exploded_parameters = {
        let mut out = Vec::new();
-        for concurrent_reads in [ConcurrentReads::Yes, ConcurrentReads::No] {
+        for io_mode in IoMode::iter() {
            for param in expect.clone() {
                let HandPickedParameters {
                    volume_mib,
@@ -351,18 +253,12 @@ fn criterion_benchmark(c: &mut Criterion) {
                    key_layout,
                    write_delta,
                } = param;
-
-                if key_layout != KeyLayout::Sequential && concurrent_reads == ConcurrentReads::Yes {
-                    continue;
-                }
-
                out.push(ExplodedParameters {
-                    io_mode: IoMode::DirectRw,
+                    io_mode,
                    volume_mib,
                    key_size,
                    key_layout,
                    write_delta,
-                    concurrent_reads,
                });
            }
        }
@@ -376,10 +272,9 @@ fn criterion_benchmark(c: &mut Criterion) {
                key_size,
                key_layout,
                write_delta,
-                concurrent_reads,
            } = self;
            format!(
-                "io_mode={io_mode:?} volume_mib={volume_mib:?} key_size_bytes={key_size:?} key_layout={key_layout:?} write_delta={write_delta:?} concurrent_reads={concurrent_reads:?}"
+                "io_mode={io_mode:?} volume_mib={volume_mib:?} key_size_bytes={key_size:?} key_layout={key_layout:?} write_delta={write_delta:?}"
            )
        }
    }
@@ -392,23 +287,12 @@ fn criterion_benchmark(c: &mut Criterion) {
            key_size,
            key_layout,
            write_delta,
-            concurrent_reads,
        } = params;
        let put_count = volume_mib * 1024 * 1024 / key_size;
        group.throughput(criterion::Throughput::Bytes((key_size * put_count) as u64));
        group.sample_size(10);
        group.bench_function(id, |b| {
-            b.iter(|| {
-                ingest_main(
-                    conf,
-                    io_mode,
-                    key_size,
-                    put_count,
-                    key_layout,
-                    write_delta,
-                    concurrent_reads,
-                )
-            })
+            b.iter(|| ingest_main(conf, io_mode, key_size, put_count, key_layout, write_delta))
        });
    }
 }
--- a/pageserver/page_api/Cargo.toml
+++ b/pageserver/page_api/Cargo.toml
@@ -11,8 +11,6 @@ futures.workspace = true
 pageserver_api.workspace = true
 postgres_ffi.workspace = true
 prost.workspace = true
-strum.workspace = true
-strum_macros.workspace = true
 thiserror.workspace = true
 tokio.workspace = true
 tonic.workspace = true
--- a/pageserver/page_api/src/model.rs
+++ b/pageserver/page_api/src/model.rs
@@ -459,7 +459,7 @@ impl GetPageResponse {
 /// These are effectively equivalent to gRPC statuses. However, we use a bidirectional stream
 /// (potentially shared by many backends), and a gRPC status response would terminate the stream so
 /// we send GetPageResponse messages with these codes instead.
-#[derive(Clone, Copy, Debug, PartialEq, strum_macros::Display)]
+#[derive(Clone, Copy, Debug)]
 pub enum GetPageStatusCode {
    /// Unknown status. For forwards compatibility: used when an older client version receives a new
    /// status code from a newer server version.
--- a/pageserver/pagebench/Cargo.toml
+++ b/pageserver/pagebench/Cargo.toml
@@ -25,7 +25,6 @@ tokio.workspace = true
 tokio-stream.workspace = true
 tokio-util.workspace = true
 tonic.workspace = true
-url.workspace = true

 pageserver_client.workspace = true
 pageserver_api.workspace = true
--- a/pageserver/pagebench/src/cmd/basebackup.rs
+++ b/pageserver/pagebench/src/cmd/basebackup.rs
@@ -13,6 +13,7 @@ use pageserver_client::mgmt_api::ForceAwaitLogicalSize;
 use pageserver_client::page_service::BasebackupRequest;
 use pageserver_page_api as page_api;
 use rand::prelude::*;
+use reqwest::Url;
 use tokio::io::AsyncRead;
 use tokio::sync::Barrier;
 use tokio::task::JoinSet;
@@ -20,7 +21,6 @@ use tokio_util::compat::{TokioAsyncReadCompatExt as _, TokioAsyncWriteCompatExt
 use tokio_util::io::StreamReader;
 use tonic::async_trait;
 use tracing::{info, instrument};
-use url::Url;
 use utils::id::TenantTimelineId;
 use utils::lsn::Lsn;
 use utils::shard::ShardIndex;
@@ -156,16 +156,12 @@ async fn main_impl(

    let mut work_senders = HashMap::new();
    let mut tasks = Vec::new();
-    let scheme = match Url::parse(&args.page_service_connstring) {
-        Ok(url) => url.scheme().to_lowercase().to_string(),
-        Err(url::ParseError::RelativeUrlWithoutBase) => "postgresql".to_string(),
-        Err(err) => return Err(anyhow!("invalid connstring: {err}")),
-    };
+    let connurl = Url::parse(&args.page_service_connstring)?;
    for &tl in &timelines {
        let (sender, receiver) = tokio::sync::mpsc::channel(1); // TODO: not sure what the implications of this are
        work_senders.insert(tl, sender);

-        let client: Box<dyn Client> = match scheme.as_str() {
+        let client: Box<dyn Client> = match connurl.scheme() {
            "postgresql" | "postgres" => Box::new(
                LibpqClient::new(&args.page_service_connstring, tl, !args.no_compression).await?,
            ),
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -10,31 +10,33 @@ use anyhow::Context;
 use async_trait::async_trait;
 use bytes::Bytes;
 use camino::Utf8PathBuf;
-use futures::{Stream, StreamExt as _};
 use pageserver_api::key::Key;
 use pageserver_api::keyspace::KeySpaceAccum;
 use pageserver_api::pagestream_api::{PagestreamGetPageRequest, PagestreamRequest};
 use pageserver_api::reltag::RelTag;
 use pageserver_api::shard::TenantShardId;
-use pageserver_page_api as page_api;
+use pageserver_page_api::proto;
 use rand::prelude::*;
 use tokio::task::JoinSet;
 use tokio_util::sync::CancellationToken;
 use tracing::info;
-use url::Url;
 use utils::id::TenantTimelineId;
 use utils::lsn::Lsn;
-use utils::shard::ShardIndex;

 use crate::util::tokio_thread_local_stats::AllThreadLocalStats;
 use crate::util::{request_stats, tokio_thread_local_stats};

+#[derive(clap::ValueEnum, Clone, Debug)]
+enum Protocol {
+    Libpq,
+    Grpc,
+}
+
 /// GetPage@LatestLSN, uniformly distributed across the compute-accessible keyspace.
 #[derive(clap::Parser)]
 pub(crate) struct Args {
    #[clap(long, default_value = "http://localhost:9898")]
    mgmt_api_endpoint: String,
-    /// Pageserver connection string. Supports postgresql:// and grpc:// protocols.
    #[clap(long, default_value = "postgres://postgres@localhost:64000")]
    page_service_connstring: String,
    #[clap(long)]
@@ -43,9 +45,8 @@ pub(crate) struct Args {
    num_clients: NonZeroUsize,
    #[clap(long)]
    runtime: Option<humantime::Duration>,
-    /// If true, enable compression (only for gRPC).
-    #[clap(long)]
-    compression: bool,
+    #[clap(long, value_enum, default_value = "libpq")]
+    protocol: Protocol,
    /// Each client sends requests at the given rate.
    ///
    /// If a request takes too long and we should be issuing a new request already,
@@ -324,32 +325,18 @@ async fn main_impl(
                .unwrap();

        Box::pin(async move {
-            let scheme = match Url::parse(&args.page_service_connstring) {
-                Ok(url) => url.scheme().to_lowercase().to_string(),
-                Err(url::ParseError::RelativeUrlWithoutBase) => "postgresql".to_string(),
-                Err(err) => panic!("invalid connstring: {err}"),
-            };
-            let client: Box<dyn Client> = match scheme.as_str() {
-                "postgresql" | "postgres" => {
-                    assert!(!args.compression, "libpq does not support compression");
-                    Box::new(
-                        LibpqClient::new(&args.page_service_connstring, worker_id.timeline)
-                            .await
-                            .unwrap(),
-                    )
-                }
-
-                "grpc" => Box::new(
-                    GrpcClient::new(
-                        &args.page_service_connstring,
-                        worker_id.timeline,
-                        args.compression,
-                    )
-                    .await
-                    .unwrap(),
+            let client: Box<dyn Client> = match args.protocol {
+                Protocol::Libpq => Box::new(
+                    LibpqClient::new(args.page_service_connstring.clone(), worker_id.timeline)
+                        .await
+                        .unwrap(),
                ),

-                scheme => panic!("unsupported scheme {scheme}"),
+                Protocol::Grpc => Box::new(
+                    GrpcClient::new(args.page_service_connstring.clone(), worker_id.timeline)
+                        .await
+                        .unwrap(),
+                ),
            };
            run_worker(args, client, ss, cancel, rps_period, ranges, weights).await
        })
@@ -556,8 +543,8 @@ struct LibpqClient {
 }

 impl LibpqClient {
-    async fn new(connstring: &str, ttid: TenantTimelineId) -> anyhow::Result<Self> {
-        let inner = pageserver_client::page_service::Client::new(connstring.to_string())
+    async fn new(connstring: String, ttid: TenantTimelineId) -> anyhow::Result<Self> {
+        let inner = pageserver_client::page_service::Client::new(connstring)
            .await?
            .pagestream(ttid.tenant_id, ttid.timeline_id)
            .await?;
@@ -613,36 +600,34 @@ impl Client for LibpqClient {
    }
 }

-/// A gRPC Pageserver client.
+/// A gRPC Pageserver client built directly on the raw, no-frills `proto::PageServiceClient`.
 struct GrpcClient {
-    req_tx: tokio::sync::mpsc::Sender<page_api::GetPageRequest>,
-    resp_rx: Pin<Box<dyn Stream<Item = Result<page_api::GetPageResponse, tonic::Status>> + Send>>,
+    req_tx: tokio::sync::mpsc::Sender<proto::GetPageRequest>,
+    resp_rx: tonic::Streaming<proto::GetPageResponse>,
 }

 impl GrpcClient {
-    async fn new(
-        connstring: &str,
-        ttid: TenantTimelineId,
-        compression: bool,
-    ) -> anyhow::Result<Self> {
-        let mut client = page_api::Client::new(
-            connstring.to_string(),
-            ttid.tenant_id,
-            ttid.timeline_id,
-            ShardIndex::unsharded(),
-            None,
-            compression.then_some(tonic::codec::CompressionEncoding::Zstd),
-        )
-        .await?;
+    async fn new(connstring: String, ttid: TenantTimelineId) -> anyhow::Result<Self> {
+        let mut client = pageserver_page_api::proto::PageServiceClient::connect(connstring).await?;

        // The channel has a buffer size of 1, since 0 is not allowed. It does not matter, since the
        // benchmark will control the queue depth (i.e. in-flight requests) anyway, and requests are
        // buffered by Tonic and the OS too.
        let (req_tx, req_rx) = tokio::sync::mpsc::channel(1);
        let req_stream = tokio_stream::wrappers::ReceiverStream::new(req_rx);
-        let resp_rx = Box::pin(client.get_pages(req_stream).await?);
+        let mut req = tonic::Request::new(req_stream);
+        let metadata = req.metadata_mut();
+        metadata.insert("neon-tenant-id", ttid.tenant_id.to_string().try_into()?);
+        metadata.insert("neon-timeline-id", ttid.timeline_id.to_string().try_into()?);
+        metadata.insert("neon-shard-id", "0000".try_into()?);

-        Ok(Self { req_tx, resp_rx })
+        let resp = client.get_pages(req).await?;
+        let resp_stream = resp.into_inner();
+
+        Ok(Self {
+            req_tx,
+            resp_rx: resp_stream,
+        })
    }
 }

@@ -656,27 +641,27 @@ impl Client for GrpcClient {
        rel: RelTag,
        blks: Vec<u32>,
    ) -> anyhow::Result<()> {
-        let req = page_api::GetPageRequest {
+        let req = proto::GetPageRequest {
            request_id: req_id,
-            request_class: page_api::GetPageClass::Normal,
-            read_lsn: page_api::ReadLsn {
-                request_lsn: req_lsn,
-                not_modified_since_lsn: Some(mod_lsn),
-            },
-            rel,
-            block_numbers: blks,
+            request_class: proto::GetPageClass::Normal as i32,
+            read_lsn: Some(proto::ReadLsn {
+                request_lsn: req_lsn.0,
+                not_modified_since_lsn: mod_lsn.0,
+            }),
+            rel: Some(rel.into()),
+            block_number: blks,
        };
        self.req_tx.send(req).await?;
        Ok(())
    }

    async fn recv_get_page(&mut self) -> anyhow::Result<(u64, Vec<Bytes>)> {
-        let resp = self.resp_rx.next().await.unwrap().unwrap();
+        let resp = self.resp_rx.message().await?.unwrap();
        anyhow::ensure!(
-            resp.status_code == page_api::GetPageStatusCode::Ok,
+            resp.status_code == proto::GetPageStatusCode::Ok as i32,
            "unexpected status code: {}",
-            resp.status_code,
+            resp.status_code
        );
-        Ok((resp.request_id, resp.page_images))
+        Ok((resp.request_id, resp.page_image))
    }
 }
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -583,7 +583,7 @@ fn start_pageserver(
            deletion_queue_client,
            l0_flush_global_state,
            basebackup_prepare_sender,
-            feature_resolver: feature_resolver.clone(),
+            feature_resolver,
        },
        shutdown_pageserver.clone(),
    );
@@ -715,7 +715,6 @@ fn start_pageserver(
                disk_usage_eviction_state,
                deletion_queue.new_client(),
                secondary_controller,
-                feature_resolver,
            )
            .context("Failed to initialize router state")?,
        );
--- a/pageserver/src/feature_resolver.rs
+++ b/pageserver/src/feature_resolver.rs
@@ -1,6 +1,5 @@
 use std::{collections::HashMap, sync::Arc, time::Duration};

-use arc_swap::ArcSwap;
 use pageserver_api::config::NodeMetadata;
 use posthog_client_lite::{
    CaptureEvent, FeatureResolverBackgroundLoop, PostHogClientConfig, PostHogEvaluationError,
@@ -13,13 +12,10 @@ use utils::id::TenantId;

 use crate::{config::PageServerConf, metrics::FEATURE_FLAG_EVALUATION};

-const DEFAULT_POSTHOG_REFRESH_INTERVAL: Duration = Duration::from_secs(600);
-
 #[derive(Clone)]
 pub struct FeatureResolver {
    inner: Option<Arc<FeatureResolverBackgroundLoop>>,
    internal_properties: Option<Arc<HashMap<String, PostHogFlagFilterPropertyValue>>>,
-    force_overrides_for_testing: Arc<ArcSwap<HashMap<String, String>>>,
 }

 impl FeatureResolver {
@@ -27,7 +23,6 @@ impl FeatureResolver {
        Self {
            inner: None,
            internal_properties: None,
-            force_overrides_for_testing: Arc::new(ArcSwap::new(Arc::new(HashMap::new()))),
        }
    }

@@ -144,23 +139,18 @@ impl FeatureResolver {
                }
                tenants
            };
-            inner.clone().spawn(
-                handle,
-                posthog_config
-                    .refresh_interval
-                    .unwrap_or(DEFAULT_POSTHOG_REFRESH_INTERVAL),
-                fake_tenants,
-            );
+            // TODO: make refresh period configurable
+            inner
+                .clone()
+                .spawn(handle, Duration::from_secs(60), fake_tenants);
            Ok(FeatureResolver {
                inner: Some(inner),
                internal_properties: Some(internal_properties),
-                force_overrides_for_testing: Arc::new(ArcSwap::new(Arc::new(HashMap::new()))),
            })
        } else {
            Ok(FeatureResolver {
                inner: None,
                internal_properties: None,
-                force_overrides_for_testing: Arc::new(ArcSwap::new(Arc::new(HashMap::new()))),
            })
        }
    }
@@ -200,11 +190,6 @@ impl FeatureResolver {
        flag_key: &str,
        tenant_id: TenantId,
    ) -> Result<String, PostHogEvaluationError> {
-        let force_overrides = self.force_overrides_for_testing.load();
-        if let Some(value) = force_overrides.get(flag_key) {
-            return Ok(value.clone());
-        }
-
        if let Some(inner) = &self.inner {
            let res = inner.feature_store().evaluate_multivariate(
                flag_key,
@@ -243,15 +228,6 @@ impl FeatureResolver {
        flag_key: &str,
        tenant_id: TenantId,
    ) -> Result<(), PostHogEvaluationError> {
-        let force_overrides = self.force_overrides_for_testing.load();
-        if let Some(value) = force_overrides.get(flag_key) {
-            return if value == "true" {
-                Ok(())
-            } else {
-                Err(PostHogEvaluationError::NoConditionGroupMatched)
-            };
-        }
-
        if let Some(inner) = &self.inner {
            let res = inner.feature_store().evaluate_boolean(
                flag_key,
@@ -283,22 +259,8 @@ impl FeatureResolver {
            inner.feature_store().is_feature_flag_boolean(flag_key)
        } else {
            Err(PostHogEvaluationError::NotAvailable(
-                "PostHog integration is not enabled, cannot auto-determine the flag type"
-                    .to_string(),
+                "PostHog integration is not enabled".to_string(),
            ))
        }
    }
-
-    /// Force override a feature flag for testing. This is only for testing purposes. Assume the caller only call it
-    /// from a single thread so it won't race.
-    pub fn force_override_for_testing(&self, flag_key: &str, value: Option<&str>) {
-        let mut force_overrides = self.force_overrides_for_testing.load().as_ref().clone();
-        if let Some(value) = value {
-            force_overrides.insert(flag_key.to_string(), value.to_string());
-        } else {
-            force_overrides.remove(flag_key);
-        }
-        self.force_overrides_for_testing
-            .store(Arc::new(force_overrides));
-    }
 }
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -59,7 +59,6 @@ use crate::config::PageServerConf;
 use crate::context;
 use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder};
 use crate::deletion_queue::DeletionQueueClient;
-use crate::feature_resolver::FeatureResolver;
 use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::task_mgr::TaskKind;
 use crate::tenant::config::LocationConf;
@@ -108,7 +107,6 @@ pub struct State {
    deletion_queue_client: DeletionQueueClient,
    secondary_controller: SecondaryController,
    latest_utilization: tokio::sync::Mutex<Option<(std::time::Instant, bytes::Bytes)>>,
-    feature_resolver: FeatureResolver,
 }

 impl State {
@@ -122,7 +120,6 @@ impl State {
        disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
        deletion_queue_client: DeletionQueueClient,
        secondary_controller: SecondaryController,
-        feature_resolver: FeatureResolver,
    ) -> anyhow::Result<Self> {
        let allowlist_routes = &[
            "/v1/status",
@@ -143,7 +140,6 @@ impl State {
            deletion_queue_client,
            secondary_controller,
            latest_utilization: Default::default(),
-            feature_resolver,
        })
    }
 }
@@ -3679,8 +3675,8 @@ async fn tenant_evaluate_feature_flag(
    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

-    let flag: String = parse_request_param(&request, "flag_key")?;
-    let as_type: Option<String> = parse_query_param(&request, "as")?;
+    let flag: String = must_parse_query_param(&request, "flag")?;
+    let as_type: String = must_parse_query_param(&request, "as")?;

    let state = get_state(&request);

@@ -3689,11 +3685,11 @@ async fn tenant_evaluate_feature_flag(
            .tenant_manager
            .get_attached_tenant_shard(tenant_shard_id)?;
        let properties = tenant.feature_resolver.collect_properties(tenant_shard_id.tenant_id);
-        if as_type.as_deref() == Some("boolean") {
+        if as_type == "boolean" {
            let result = tenant.feature_resolver.evaluate_boolean(&flag, tenant_shard_id.tenant_id);
            let result = result.map(|_| true).map_err(|e| e.to_string());
            json_response(StatusCode::OK, json!({ "result": result, "properties": properties }))
-        } else if as_type.as_deref() == Some("multivariate") {
+        } else if as_type == "multivariate" {
            let result = tenant.feature_resolver.evaluate_multivariate(&flag, tenant_shard_id.tenant_id).map_err(|e| e.to_string());
            json_response(StatusCode::OK, json!({ "result": result, "properties": properties }))
        } else {
@@ -3713,35 +3709,6 @@ async fn tenant_evaluate_feature_flag(
    .await
 }

-async fn force_override_feature_flag_for_testing_put(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    check_permission(&request, None)?;
-
-    let flag: String = parse_request_param(&request, "flag_key")?;
-    let value: String = must_parse_query_param(&request, "value")?;
-    let state = get_state(&request);
-    state
-        .feature_resolver
-        .force_override_for_testing(&flag, Some(&value));
-    json_response(StatusCode::OK, ())
-}
-
-async fn force_override_feature_flag_for_testing_delete(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    check_permission(&request, None)?;
-
-    let flag: String = parse_request_param(&request, "flag_key")?;
-    let state = get_state(&request);
-    state
-        .feature_resolver
-        .force_override_for_testing(&flag, None);
-    json_response(StatusCode::OK, ())
-}
-
 /// Common functionality of all the HTTP API handlers.
 ///
 /// - Adds a tracing span to each request (by `request_span`)
@@ -4118,14 +4085,8 @@ pub fn make_router(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/activate_post_import",
            |r| api_handler(r, activate_post_import_handler),
        )
-        .get("/v1/tenant/:tenant_shard_id/feature_flag/:flag_key", |r| {
+        .get("/v1/tenant/:tenant_shard_id/feature_flag", |r| {
            api_handler(r, tenant_evaluate_feature_flag)
        })
-        .put("/v1/feature_flag/:flag_key", |r| {
-            testing_api_handler("force override feature flag - put", r, force_override_feature_flag_for_testing_put)
-        })
-        .delete("/v1/feature_flag/:flag_key", |r| {
-            testing_api_handler("force override feature flag - delete", r, force_override_feature_flag_for_testing_delete)
-        })
        .any(handler_404))
 }
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -3426,7 +3426,7 @@ impl TimelineMetrics {
    pub fn dec_frozen_layer(&self, layer: &InMemoryLayer) {
        assert!(matches!(layer.info(), InMemoryLayerInfo::Frozen { .. }));
        let labels = self.make_frozen_layer_labels(layer);
-        let size = layer.len();
+        let size = layer.try_len().expect("frozen layer should have no writer");
        TIMELINE_LAYER_COUNT
            .get_metric_with_label_values(&labels)
            .unwrap()
@@ -3441,7 +3441,7 @@ impl TimelineMetrics {
    pub fn inc_frozen_layer(&self, layer: &InMemoryLayer) {
        assert!(matches!(layer.info(), InMemoryLayerInfo::Frozen { .. }));
        let labels = self.make_frozen_layer_labels(layer);
-        let size = layer.len();
+        let size = layer.try_len().expect("frozen layer should have no writer");
        TIMELINE_LAYER_COUNT
            .get_metric_with_label_values(&labels)
            .unwrap()
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -3544,9 +3544,8 @@ impl proto::PageService for GrpcPageServiceHandler {
        &self,
        req: tonic::Request<proto::GetBaseBackupRequest>,
    ) -> Result<tonic::Response<Self::GetBaseBackupStream>, tonic::Status> {
-        // Send chunks of 256 KB to avoid large memory allocations. pagebench basebackup shows this
-        // to be the sweet spot where throughput is saturated.
-        const CHUNK_SIZE: usize = 256 * 1024;
+        // Send 64 KB chunks to avoid large memory allocations.
+        const CHUNK_SIZE: usize = 64 * 1024;

        let timeline = self.get_request_timeline(&req).await?;
        let ctx = self.ctx.with_scope_timeline(&timeline);
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -61,10 +61,8 @@ pub(crate) struct LocationConf {
    /// The detailed shard identity.  This structure is already scoped within
    /// a TenantShardId, but we need the full ShardIdentity to enable calculating
    /// key->shard mappings.
-    ///
-    /// NB: we store this even for unsharded tenants, so that we agree with storcon on the intended
-    /// stripe size. Otherwise, a split request that does not specify a stripe size may use a
-    /// different default than storcon, which can lead to incorrect stripe sizes and corruption.
+    // TODO(vlad): Remove this default once all configs have a shard identity on disk.
+    #[serde(default = "ShardIdentity::unsharded")]
    pub(crate) shard: ShardIdentity,

    /// The pan-cluster tenant configuration, the same on all locations
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -3,7 +3,7 @@

 use std::io;
 use std::sync::Arc;
-use std::sync::atomic::{AtomicU64, Ordering};
+use std::sync::atomic::AtomicU64;

 use camino::Utf8PathBuf;
 use num_traits::Num;
@@ -18,7 +18,6 @@ use crate::assert_u64_eq_usize::{U64IsUsize, UsizeIsU64};
 use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::page_cache;
-use crate::tenant::storage_layer::inmemory_layer::GlobalResourceUnits;
 use crate::tenant::storage_layer::inmemory_layer::vectored_dio_read::File;
 use crate::virtual_file::owned_buffers_io::io_buf_aligned::IoBufAlignedMut;
 use crate::virtual_file::owned_buffers_io::slice::SliceMutExt;
@@ -31,13 +30,9 @@ pub struct EphemeralFile {
    _tenant_shard_id: TenantShardId,
    _timeline_id: TimelineId,
    page_cache_file_id: page_cache::FileId,
+    bytes_written: u64,
    file: TempVirtualFileCoOwnedByEphemeralFileAndBufferedWriter,
-
-    buffered_writer: tokio::sync::RwLock<BufferedWriter>,
-
-    bytes_written: AtomicU64,
-
-    resource_units: std::sync::Mutex<GlobalResourceUnits>,
+    buffered_writer: BufferedWriter,
 }

 type BufferedWriter = owned_buffers_io::write::BufferedWriter<
@@ -99,8 +94,9 @@ impl EphemeralFile {
            _tenant_shard_id: tenant_shard_id,
            _timeline_id: timeline_id,
            page_cache_file_id,
+            bytes_written: 0,
            file: file.clone(),
-            buffered_writer: tokio::sync::RwLock::new(BufferedWriter::new(
+            buffered_writer: BufferedWriter::new(
                file,
                0,
                || IoBufferMut::with_capacity(TAIL_SZ),
@@ -108,9 +104,7 @@ impl EphemeralFile {
                cancel.child_token(),
                ctx,
                info_span!(parent: None, "ephemeral_file_buffered_writer", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), timeline_id=%timeline_id, path = %filename),
-            )),
-            bytes_written: AtomicU64::new(0),
-            resource_units: std::sync::Mutex::new(GlobalResourceUnits::new()),
+            ),
        })
    }
 }
@@ -157,17 +151,15 @@ impl std::ops::Deref for TempVirtualFileCoOwnedByEphemeralFileAndBufferedWriter

 #[derive(Debug, thiserror::Error)]
 pub(crate) enum EphemeralFileWriteError {
+    #[error("{0}")]
+    TooLong(String),
    #[error("cancelled")]
    Cancelled,
 }

 impl EphemeralFile {
    pub(crate) fn len(&self) -> u64 {
-        // TODO(vlad): The value returned here is not always correct if
-        // we have more than one concurrent writer. Writes are always
-        // sequenced, but we could grab the buffered writer lock if we wanted
-        // to.
-        self.bytes_written.load(Ordering::Acquire)
+        self.bytes_written
    }

    pub(crate) fn page_cache_file_id(&self) -> page_cache::FileId {
@@ -194,7 +186,7 @@ impl EphemeralFile {
    /// Panics if the write is short because there's no way we can recover from that.
    /// TODO: make upstack handle this as an error.
    pub(crate) async fn write_raw(
-        &self,
+        &mut self,
        srcbuf: &[u8],
        ctx: &RequestContext,
    ) -> Result<u64, EphemeralFileWriteError> {
@@ -206,13 +198,22 @@ impl EphemeralFile {
    }

    async fn write_raw_controlled(
-        &self,
+        &mut self,
        srcbuf: &[u8],
        ctx: &RequestContext,
    ) -> Result<(u64, Option<owned_buffers_io::write::FlushControl>), EphemeralFileWriteError> {
-        let mut writer = self.buffered_writer.write().await;
+        let pos = self.bytes_written;

-        let (nwritten, control) = writer
+        let new_bytes_written = pos.checked_add(srcbuf.len().into_u64()).ok_or_else(|| {
+            EphemeralFileWriteError::TooLong(format!(
+                "write would grow EphemeralFile beyond u64::MAX: len={pos} writen={srcbuf_len}",
+                srcbuf_len = srcbuf.len(),
+            ))
+        })?;
+
+        // Write the payload
+        let (nwritten, control) = self
+            .buffered_writer
            .write_buffered_borrowed_controlled(srcbuf, ctx)
            .await
            .map_err(|e| match e {
@@ -224,61 +225,22 @@ impl EphemeralFile {
            "buffered writer has no short writes"
        );

-        // There's no realistic risk of overflow here. We won't have exabytes sized files on disk.
-        let pos = self
-            .bytes_written
-            .fetch_add(srcbuf.len().into_u64(), Ordering::AcqRel);
-
-        let mut resource_units = self.resource_units.lock().unwrap();
-        resource_units.maybe_publish_size(self.bytes_written.load(Ordering::Relaxed));
+        self.bytes_written = new_bytes_written;

        Ok((pos, control))
    }
-
-    pub(crate) fn tick(&self) -> Option<u64> {
-        let mut resource_units = self.resource_units.lock().unwrap();
-        let len = self.bytes_written.load(Ordering::Relaxed);
-        resource_units.publish_size(len)
-    }
 }

 impl super::storage_layer::inmemory_layer::vectored_dio_read::File for EphemeralFile {
    async fn read_exact_at_eof_ok<B: IoBufAlignedMut + Send>(
        &self,
        start: u64,
-        mut dst: tokio_epoll_uring::Slice<B>,
+        dst: tokio_epoll_uring::Slice<B>,
        ctx: &RequestContext,
    ) -> std::io::Result<(tokio_epoll_uring::Slice<B>, usize)> {
-        // We will fill the slice in back to front. Hence, we need
-        // the slice to be fully initialized.
-        // TODO(vlad): Is there a nicer way of doing this?
-        dst.as_mut_rust_slice_full_zeroed();
+        let submitted_offset = self.buffered_writer.bytes_submitted();

-        let writer = self.buffered_writer.read().await;
-
-        // Read bytes written while under lock. This is a hack to deal with concurrent
-        // writes updating the number of bytes written. `bytes_written` is not DIO alligned
-        // but we may end the read there.
-        //
-        // TODO(vlad): Feels like there's a nicer path where we align the end if it
-        // shoots over the end of the file.
-        let bytes_written = self.bytes_written.load(Ordering::Acquire);
-
-        let dst_cap = dst.bytes_total().into_u64();
-        let end = {
-            // saturating_add is correct here because the max file size is u64::MAX, so,
-            // if start + dst.len() > u64::MAX, then we know it will be a short read
-            let mut end: u64 = start.saturating_add(dst_cap);
-            if end > bytes_written {
-                end = bytes_written;
-            }
-            end
-        };
-
-        let submitted_offset = writer.bytes_submitted();
-        let maybe_flushed = writer.inspect_maybe_flushed();
-
-        let mutable = match writer.inspect_mutable() {
+        let mutable = match self.buffered_writer.inspect_mutable() {
            Some(mutable) => &mutable[0..mutable.pending()],
            None => {
                // Timeline::cancel and hence buffered writer flush was cancelled.
@@ -287,6 +249,19 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral
            }
        };

+        let maybe_flushed = self.buffered_writer.inspect_maybe_flushed();
+
+        let dst_cap = dst.bytes_total().into_u64();
+        let end = {
+            // saturating_add is correct here because the max file size is u64::MAX, so,
+            // if start + dst.len() > u64::MAX, then we know it will be a short read
+            let mut end: u64 = start.saturating_add(dst_cap);
+            if end > self.bytes_written {
+                end = self.bytes_written;
+            }
+            end
+        };
+
        // inclusive, exclusive
        #[derive(Debug)]
        struct Range<N>(N, N);
@@ -331,33 +306,13 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral

        let mutable_range = Range(std::cmp::max(start, submitted_offset), end);

-        // There are three sources from which we might have to read data:
-        // 1. The file itself
-        // 2. The buffer which contains changes currently being flushed
-        // 3. The buffer which contains chnages yet to be flushed
-        //
-        // For better concurrency, we do them in reverse order: perform the in-memory
-        // reads while holding the writer lock, drop the writer lock and read from the
-        // file if required.
-
-        let dst = if mutable_range.len() > 0 {
-            let offset_in_buffer = mutable_range
-                .0
-                .checked_sub(submitted_offset)
-                .unwrap()
-                .into_usize();
-            let to_copy =
-                &mutable[offset_in_buffer..(offset_in_buffer + mutable_range.len().into_usize())];
+        let dst = if written_range.len() > 0 {
            let bounds = dst.bounds();
-            let mut view = dst.slice({
-                let start =
-                    written_range.len().into_usize() + maybe_flushed_range.len().into_usize();
-                let end = start.checked_add(mutable_range.len().into_usize()).unwrap();
-                start..end
-            });
-            view.as_mut_rust_slice_full_zeroed()
-                .copy_from_slice(to_copy);
-            Slice::from_buf_bounds(Slice::into_inner(view), bounds)
+            let slice = self
+                .file
+                .read_exact_at(dst.slice(0..written_range.len().into_usize()), start, ctx)
+                .await?;
+            Slice::from_buf_bounds(Slice::into_inner(slice), bounds)
        } else {
            dst
        };
@@ -387,15 +342,24 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral
            dst
        };

-        drop(writer);
-
-        let dst = if written_range.len() > 0 {
+        let dst = if mutable_range.len() > 0 {
+            let offset_in_buffer = mutable_range
+                .0
+                .checked_sub(submitted_offset)
+                .unwrap()
+                .into_usize();
+            let to_copy =
+                &mutable[offset_in_buffer..(offset_in_buffer + mutable_range.len().into_usize())];
            let bounds = dst.bounds();
-            let slice = self
-                .file
-                .read_exact_at(dst.slice(0..written_range.len().into_usize()), start, ctx)
-                .await?;
-            Slice::from_buf_bounds(Slice::into_inner(slice), bounds)
+            let mut view = dst.slice({
+                let start =
+                    written_range.len().into_usize() + maybe_flushed_range.len().into_usize();
+                let end = start.checked_add(mutable_range.len().into_usize()).unwrap();
+                start..end
+            });
+            view.as_mut_rust_slice_full_zeroed()
+                .copy_from_slice(to_copy);
+            Slice::from_buf_bounds(Slice::into_inner(view), bounds)
        } else {
            dst
        };
@@ -496,15 +460,13 @@ mod tests {
        let gate = utils::sync::gate::Gate::default();
        let cancel = CancellationToken::new();

-        let file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &cancel, &ctx)
+        let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &cancel, &ctx)
            .await
            .unwrap();

-        let writer = file.buffered_writer.read().await;
-        let mutable = writer.mutable();
+        let mutable = file.buffered_writer.mutable();
        let cap = mutable.capacity();
        let align = mutable.align();
-        drop(writer);

        let write_nbytes = cap * 2 + cap / 2;

@@ -542,11 +504,10 @@ mod tests {
        let file_contents = std::fs::read(file.file.path()).unwrap();
        assert!(file_contents == content[0..cap * 2]);

-        let writer = file.buffered_writer.read().await;
-        let maybe_flushed_buffer_contents = writer.inspect_maybe_flushed().unwrap();
+        let maybe_flushed_buffer_contents = file.buffered_writer.inspect_maybe_flushed().unwrap();
        assert_eq!(&maybe_flushed_buffer_contents[..], &content[cap..cap * 2]);

-        let mutable_buffer_contents = writer.mutable();
+        let mutable_buffer_contents = file.buffered_writer.mutable();
        assert_eq!(mutable_buffer_contents, &content[cap * 2..write_nbytes]);
    }

@@ -556,14 +517,12 @@ mod tests {

        let gate = utils::sync::gate::Gate::default();
        let cancel = CancellationToken::new();
-        let file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &cancel, &ctx)
+        let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &cancel, &ctx)
            .await
            .unwrap();

        // mutable buffer and maybe_flushed buffer each has `cap` bytes.
-        let writer = file.buffered_writer.read().await;
-        let cap = writer.mutable().capacity();
-        drop(writer);
+        let cap = file.buffered_writer.mutable().capacity();

        let content: Vec<u8> = rand::thread_rng()
            .sample_iter(rand::distributions::Standard)
@@ -581,13 +540,12 @@ mod tests {
            2 * cap.into_u64(),
            "buffered writer requires one write to be flushed if we write 2.5x buffer capacity"
        );
-        let writer = file.buffered_writer.read().await;
        assert_eq!(
-            &writer.inspect_maybe_flushed().unwrap()[0..cap],
+            &file.buffered_writer.inspect_maybe_flushed().unwrap()[0..cap],
            &content[cap..cap * 2]
        );
        assert_eq!(
-            &writer.mutable()[0..cap / 2],
+            &file.buffered_writer.mutable()[0..cap / 2],
            &content[cap * 2..cap * 2 + cap / 2]
        );
    }
@@ -605,15 +563,13 @@ mod tests {
        let gate = utils::sync::gate::Gate::default();
        let cancel = CancellationToken::new();

-        let file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &cancel, &ctx)
+        let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &cancel, &ctx)
            .await
            .unwrap();

-        let writer = file.buffered_writer.read().await;
-        let mutable = writer.mutable();
+        let mutable = file.buffered_writer.mutable();
        let cap = mutable.capacity();
        let align = mutable.align();
-        drop(writer);
        let content: Vec<u8> = rand::thread_rng()
            .sample_iter(rand::distributions::Standard)
            .take(cap * 2 + cap / 2)
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -109,7 +109,7 @@ pub(crate) enum OnDiskValue {

 /// Reconstruct data accumulated for a single key during a vectored get
 #[derive(Debug, Default)]
-pub struct VectoredValueReconstructState {
+pub(crate) struct VectoredValueReconstructState {
    pub(crate) on_disk_values: Vec<(Lsn, OnDiskValueIoWaiter)>,

    pub(crate) situation: ValueReconstructSituation,
@@ -244,60 +244,13 @@ impl VectoredValueReconstructState {

        res
    }
-
-    /// Benchmarking utility to await for the completion of all pending ios
-    ///
-    /// # Cancel-Safety
-    ///
-    /// Technically fine to stop polling this future, but, the IOs will still
-    /// be executed to completion by the sidecar task and hold on to / consume resources.
-    /// Better not do it to make reasonsing about the system easier.
-    #[cfg(feature = "benchmarking")]
-    pub async fn sink_pending_ios(self) -> Result<(), std::io::Error> {
-        let mut res = Ok(());
-
-        // We should try hard not to bail early, so that by the time we return from this
-        // function, all IO for this value is done. It's not required -- we could totally
-        // stop polling the IO futures in the sidecar task, they need to support that,
-        // but just stopping to poll doesn't reduce the IO load on the disk. It's easier
-        // to reason about the system if we just wait for all IO to complete, even if
-        // we're no longer interested in the result.
-        //
-        // Revisit this when IO futures are replaced with a more sophisticated IO system
-        // and an IO scheduler, where we know which IOs were submitted and which ones
-        // just queued. Cf the comment on IoConcurrency::spawn_io.
-        for (_lsn, waiter) in self.on_disk_values {
-            let value_recv_res = waiter
-                .wait_completion()
-                // we rely on the caller to poll us to completion, so this is not a bail point
-                .await;
-
-            match (&mut res, value_recv_res) {
-                (Err(_), _) => {
-                    // We've already failed, no need to process more.
-                }
-                (Ok(_), Err(_wait_err)) => {
-                    // This shouldn't happen - likely the sidecar task panicked.
-                    unreachable!();
-                }
-                (Ok(_), Ok(Err(err))) => {
-                    let err: std::io::Error = err;
-                    res = Err(err);
-                }
-                (Ok(_ok), Ok(Ok(OnDiskValue::RawImage(_img)))) => {}
-                (Ok(_ok), Ok(Ok(OnDiskValue::WalRecordOrImage(_buf)))) => {}
-            }
-        }
-
-        res
-    }
 }

 /// Bag of data accumulated during a vectored get..
-pub struct ValuesReconstructState {
+pub(crate) struct ValuesReconstructState {
    /// The keys will be removed after `get_vectored` completes. The caller outside `Timeline`
    /// should not expect to get anything from this hashmap.
-    pub keys: HashMap<Key, VectoredValueReconstructState>,
+    pub(crate) keys: HashMap<Key, VectoredValueReconstructState>,
    /// The keys which are already retrieved
    keys_done: KeySpaceRandomAccum,

@@ -319,7 +272,7 @@ pub struct ValuesReconstructState {
 /// The desired end state is that we always do parallel IO.
 /// This struct and the dispatching in the impl will be removed once
 /// we've built enough confidence.
-pub enum IoConcurrency {
+pub(crate) enum IoConcurrency {
    Sequential,
    SidecarTask {
        task_id: usize,
@@ -364,7 +317,10 @@ impl IoConcurrency {
        Self::spawn(SelectedIoConcurrency::Sequential)
    }

-    pub fn spawn_from_conf(conf: GetVectoredConcurrentIo, gate_guard: GateGuard) -> IoConcurrency {
+    pub(crate) fn spawn_from_conf(
+        conf: GetVectoredConcurrentIo,
+        gate_guard: GateGuard,
+    ) -> IoConcurrency {
        let selected = match conf {
            GetVectoredConcurrentIo::Sequential => SelectedIoConcurrency::Sequential,
            GetVectoredConcurrentIo::SidecarTask => SelectedIoConcurrency::SidecarTask(gate_guard),
@@ -469,6 +425,16 @@ impl IoConcurrency {
        }
    }

+    pub(crate) fn clone(&self) -> Self {
+        match self {
+            IoConcurrency::Sequential => IoConcurrency::Sequential,
+            IoConcurrency::SidecarTask { task_id, ios_tx } => IoConcurrency::SidecarTask {
+                task_id: *task_id,
+                ios_tx: ios_tx.clone(),
+            },
+        }
+    }
+
    /// Submit an IO to be executed in the background. DEADLOCK RISK, read the full doc string.
    ///
    /// The IO is represented as an opaque future.
@@ -607,18 +573,6 @@ impl IoConcurrency {
    }
 }

-impl Clone for IoConcurrency {
-    fn clone(&self) -> Self {
-        match self {
-            IoConcurrency::Sequential => IoConcurrency::Sequential,
-            IoConcurrency::SidecarTask { task_id, ios_tx } => IoConcurrency::SidecarTask {
-                task_id: *task_id,
-                ios_tx: ios_tx.clone(),
-            },
-        }
-    }
-}
-
 /// Make noise in case the [`ValuesReconstructState`] gets dropped while
 /// there are still IOs in flight.
 /// Refer to `collect_pending_ios` for why we prefer not to do that.
@@ -649,7 +603,7 @@ impl Drop for ValuesReconstructState {
 }

 impl ValuesReconstructState {
-    pub fn new(io_concurrency: IoConcurrency) -> Self {
+    pub(crate) fn new(io_concurrency: IoConcurrency) -> Self {
        Self {
            keys: HashMap::new(),
            keys_done: KeySpaceRandomAccum::new(),
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -70,15 +70,23 @@ pub struct InMemoryLayer {
    /// We use a separate lock for the index to reduce the critical section
    /// during which reads cannot be planned.
    ///
-    /// Note that the file backing [`InMemoryLayer::file`] is append-only,
-    /// so it is not necessary to hold a lock on the index while reading or writing from the file.
+    /// If you need access to both the index and the underlying file at the same time,
+    /// respect the following locking order to avoid deadlocks:
+    /// 1. [`InMemoryLayer::inner`]
+    /// 2. [`InMemoryLayer::index`]
+    ///
+    /// Note that the file backing [`InMemoryLayer::inner`] is append-only,
+    /// so it is not necessary to hold simultaneous locks on index.
+    /// This avoids holding index locks across IO, and is crucial for avoiding read tail latency.
    /// In particular:
-    /// 1. It is safe to read and release [`InMemoryLayer::index`] before reading from [`InMemoryLayer::file`].
-    /// 2. It is safe to write to [`InMemoryLayer::file`] before locking and updating [`InMemoryLayer::index`].
+    /// 1. It is safe to read and release [`InMemoryLayer::index`] before locking and reading from [`InMemoryLayer::inner`].
+    /// 2. It is safe to write and release [`InMemoryLayer::inner`] before locking and updating [`InMemoryLayer::index`].
    index: RwLock<BTreeMap<CompactKey, VecMap<Lsn, IndexEntry>>>,

-    /// Wrapper for the actual on-disk file. Uses interior mutability for concurrent reads/writes.
-    file: EphemeralFile,
+    /// The above fields never change, except for `end_lsn`, which is only set once,
+    /// and `index` (see rationale there).
+    /// All other changing parts are in `inner`, and protected by an `RwLock`.
+    inner: RwLock<InMemoryLayerInner>,

    estimated_in_mem_size: AtomicU64,
 }
@@ -88,10 +96,20 @@ impl std::fmt::Debug for InMemoryLayer {
        f.debug_struct("InMemoryLayer")
            .field("start_lsn", &self.start_lsn)
            .field("end_lsn", &self.end_lsn)
+            .field("inner", &self.inner)
            .finish()
    }
 }

+pub struct InMemoryLayerInner {
+    /// The values are stored in a serialized format in this file.
+    /// Each serialized Value is preceded by a 'u32' length field.
+    /// PerSeg::page_versions map stores offsets into this file.
+    file: EphemeralFile,
+
+    resource_units: GlobalResourceUnits,
+}
+
 /// Support the same max blob length as blob_io, because ultimately
 /// all the InMemoryLayer contents end up being written into a delta layer,
 /// using the [`crate::tenant::blob_io`].
@@ -240,6 +258,12 @@ struct IndexEntryUnpacked {
    pos: u64,
 }

+impl std::fmt::Debug for InMemoryLayerInner {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("InMemoryLayerInner").finish()
+    }
+}
+
 /// State shared by all in-memory (ephemeral) layers.  Updated infrequently during background ticks in Timeline,
 /// to minimize contention.
 ///
@@ -256,7 +280,7 @@ pub(crate) struct GlobalResources {
 }

 // Per-timeline RAII struct for its contribution to [`GlobalResources`]
-pub(crate) struct GlobalResourceUnits {
+struct GlobalResourceUnits {
    // How many dirty bytes have I added to the global dirty_bytes: this guard object is responsible
    // for decrementing the global counter by this many bytes when dropped.
    dirty_bytes: u64,
@@ -268,7 +292,7 @@ impl GlobalResourceUnits {
    // updated when the Timeline "ticks" in the background.
    const MAX_SIZE_DRIFT: u64 = 10 * 1024 * 1024;

-    pub(crate) fn new() -> Self {
+    fn new() -> Self {
        GLOBAL_RESOURCES
            .dirty_layers
            .fetch_add(1, AtomicOrdering::Relaxed);
@@ -280,7 +304,7 @@ impl GlobalResourceUnits {
    ///
    /// Returns the effective layer size limit that should be applied, if any, to keep
    /// the total number of dirty bytes below the configured maximum.
-    pub(crate) fn publish_size(&mut self, size: u64) -> Option<u64> {
+    fn publish_size(&mut self, size: u64) -> Option<u64> {
        let new_global_dirty_bytes = match size.cmp(&self.dirty_bytes) {
            Ordering::Equal => GLOBAL_RESOURCES.dirty_bytes.load(AtomicOrdering::Relaxed),
            Ordering::Greater => {
@@ -325,7 +349,7 @@ impl GlobalResourceUnits {

    // Call publish_size if the input size differs from last published size by more than
    // the drift limit
-    pub(crate) fn maybe_publish_size(&mut self, size: u64) {
+    fn maybe_publish_size(&mut self, size: u64) {
        let publish = match size.cmp(&self.dirty_bytes) {
            Ordering::Equal => false,
            Ordering::Greater => size - self.dirty_bytes > Self::MAX_SIZE_DRIFT,
@@ -374,8 +398,8 @@ impl InMemoryLayer {
        }
    }

-    pub(crate) fn len(&self) -> u64 {
-        self.file.len()
+    pub(crate) fn try_len(&self) -> Option<u64> {
+        self.inner.try_read().map(|i| i.file.len()).ok()
    }

    pub(crate) fn assert_writable(&self) {
@@ -406,7 +430,7 @@ impl InMemoryLayer {

    // Look up the keys in the provided keyspace and update
    // the reconstruct state with whatever is found.
-    pub async fn get_values_reconstruct_data(
+    pub(crate) async fn get_values_reconstruct_data(
        self: &Arc<InMemoryLayer>,
        keyspace: KeySpace,
        lsn_range: Range<Lsn>,
@@ -455,13 +479,14 @@ impl InMemoryLayer {
                }
            }
        }
-        drop(index); // release the lock before we spawn the IO
+        drop(index); // release the lock before we spawn the IO; if it's serial-mode IO we will deadlock on the read().await below
        let read_from = Arc::clone(self);
        let read_ctx = ctx.attached_child();
        reconstruct_state
            .spawn_io(async move {
+                let inner = read_from.inner.read().await;
                let f = vectored_dio_read::execute(
-                    &read_from.file,
+                    &inner.file,
                    reads
                        .iter()
                        .flat_map(|(_, value_reads)| value_reads.iter().map(|v| &v.read)),
@@ -493,6 +518,7 @@ impl InMemoryLayer {
                // This is kinda forced for InMemoryLayer because we need to inner.read() anyway,
                // but it's less obvious for DeltaLayer and ImageLayer. So, keep this explicit
                // drop for consistency among all three layer types.
+                drop(inner);
                drop(read_from);
            })
            .await;
@@ -523,6 +549,12 @@ impl std::fmt::Display for InMemoryLayer {
 }

 impl InMemoryLayer {
+    /// Get layer size.
+    pub async fn size(&self) -> Result<u64> {
+        let inner = self.inner.read().await;
+        Ok(inner.file.len())
+    }
+
    pub fn estimated_in_mem_size(&self) -> u64 {
        self.estimated_in_mem_size.load(AtomicOrdering::Relaxed)
    }
@@ -555,7 +587,10 @@ impl InMemoryLayer {
            end_lsn: OnceLock::new(),
            opened_at: Instant::now(),
            index: RwLock::new(BTreeMap::new()),
-            file,
+            inner: RwLock::new(InMemoryLayerInner {
+                file,
+                resource_units: GlobalResourceUnits::new(),
+            }),
            estimated_in_mem_size: AtomicU64::new(0),
        })
    }
@@ -564,37 +599,41 @@ impl InMemoryLayer {
    ///
    /// Errors are not retryable, the [`InMemoryLayer`] must be discarded, and not be read from.
    /// The reason why it's not retryable is that the [`EphemeralFile`] writes are not retryable.
-    ///
-    /// This method shall not be called concurrently. We enforce this property via [`crate::tenant::Timeline::write_lock`].
-    ///
    /// TODO: it can be made retryable if we aborted the process on EphemeralFile write errors.
    pub async fn put_batch(
        &self,
        serialized_batch: SerializedValueBatch,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
-        self.assert_writable();
+        let (base_offset, metadata) = {
+            let mut inner = self.inner.write().await;
+            self.assert_writable();

-        let base_offset = self.file.len();
+            let base_offset = inner.file.len();

-        let SerializedValueBatch {
-            raw,
-            metadata,
-            max_lsn: _,
-            len: _,
-        } = serialized_batch;
+            let SerializedValueBatch {
+                raw,
+                metadata,
+                max_lsn: _,
+                len: _,
+            } = serialized_batch;

-        // Write the batch to the file
-        self.file.write_raw(&raw, ctx).await?;
-        let new_size = self.file.len();
+            // Write the batch to the file
+            inner.file.write_raw(&raw, ctx).await?;
+            let new_size = inner.file.len();

-        let expected_new_len = base_offset
-            .checked_add(raw.len().into_u64())
-            // write_raw would error if we were to overflow u64.
-            // also IndexEntry and higher levels in
-            //the code don't allow the file to grow that large
-            .unwrap();
-        assert_eq!(new_size, expected_new_len);
+            let expected_new_len = base_offset
+                .checked_add(raw.len().into_u64())
+                // write_raw would error if we were to overflow u64.
+                // also IndexEntry and higher levels in
+                //the code don't allow the file to grow that large
+                .unwrap();
+            assert_eq!(new_size, expected_new_len);
+
+            inner.resource_units.maybe_publish_size(new_size);
+
+            (base_offset, metadata)
+        };

        // Update the index with the new entries
        let mut index = self.index.write().await;
@@ -647,8 +686,10 @@ impl InMemoryLayer {
        self.opened_at
    }

-    pub(crate) fn tick(&self) -> Option<u64> {
-        self.file.tick()
+    pub(crate) async fn tick(&self) -> Option<u64> {
+        let mut inner = self.inner.write().await;
+        let size = inner.file.len();
+        inner.resource_units.publish_size(size)
    }

    pub(crate) async fn put_tombstones(&self, _key_ranges: &[(Range<Key>, Lsn)]) -> Result<()> {
@@ -712,6 +753,12 @@ impl InMemoryLayer {
        gate: &utils::sync::gate::Gate,
        cancel: CancellationToken,
    ) -> Result<Option<(PersistentLayerDesc, Utf8PathBuf)>> {
+        // Grab the lock in read-mode. We hold it over the I/O, but because this
+        // layer is not writeable anymore, no one should be trying to acquire the
+        // write lock on it, so we shouldn't block anyone. See the comment on
+        // [`InMemoryLayer::freeze`] to understand how locking between the append path
+        // and layer flushing works.
+        let inner = self.inner.read().await;
        let index = self.index.read().await;

        use l0_flush::Inner;
@@ -746,7 +793,7 @@ impl InMemoryLayer {

        match l0_flush_global_state {
            l0_flush::Inner::Direct { .. } => {
-                let file_contents = self.file.load_to_io_buf(ctx).await?;
+                let file_contents = inner.file.load_to_io_buf(ctx).await?;
                let file_contents = file_contents.freeze();

                for (key, vec_map) in index.iter() {
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -816,7 +816,7 @@ impl From<layer_manager::Shutdown> for FlushLayerError {
 }

 #[derive(thiserror::Error, Debug)]
-pub enum GetVectoredError {
+pub(crate) enum GetVectoredError {
    #[error("timeline shutting down")]
    Cancelled,

@@ -849,7 +849,7 @@ impl From<GetReadyAncestorError> for GetVectoredError {
 }

 #[derive(thiserror::Error, Debug)]
-pub enum GetReadyAncestorError {
+pub(crate) enum GetReadyAncestorError {
    #[error("ancestor LSN wait error")]
    AncestorLsnTimeout(#[from] WaitLsnError),

@@ -939,7 +939,7 @@ impl std::fmt::Debug for Timeline {
 }

 #[derive(thiserror::Error, Debug, Clone)]
-pub enum WaitLsnError {
+pub(crate) enum WaitLsnError {
    // Called on a timeline which is shutting down
    #[error("Shutdown")]
    Shutdown,
@@ -1902,11 +1902,16 @@ impl Timeline {
            return;
        };

-        let current_size = open_layer.len();
+        let Some(current_size) = open_layer.try_len() else {
+            // Unexpected: since we hold the write guard, nobody else should be writing to this layer, so
+            // read lock to get size should always succeed.
+            tracing::warn!("Lock conflict while reading size of open layer");
+            return;
+        };

        let current_lsn = self.get_last_record_lsn();

-        let checkpoint_distance_override = open_layer.tick();
+        let checkpoint_distance_override = open_layer.tick().await;

        if let Some(size_override) = checkpoint_distance_override {
            if current_size > size_override {
@@ -6538,7 +6543,7 @@ impl Timeline {

        debug!("retain_lsns: {:?}", retain_lsns);

-        let max_retain_lsn = retain_lsns.iter().max();
+        let mut layers_to_remove = Vec::new();

        // Scan all layers in the timeline (remote or on-disk).
        //
@@ -6548,110 +6553,108 @@ impl Timeline {
        // 3. it doesn't need to be retained for 'retain_lsns';
        // 4. it does not need to be kept for LSNs holding valid leases.
        // 5. newer on-disk image layers cover the layer's whole key range
-        let layers_to_remove = {
-            let mut layers_to_remove = Vec::new();
+        //
+        // TODO holding a write lock is too aggressive and avoidable
+        let mut guard = self
+            .layers
+            .write(LayerManagerLockHolder::GarbageCollection)
+            .await;
+        let layers = guard.layer_map()?;
+        'outer: for l in layers.iter_historic_layers() {
+            result.layers_total += 1;

-            let guard = self
-                .layers
-                .read(LayerManagerLockHolder::GarbageCollection)
-                .await;
-            let layers = guard.layer_map()?;
-            'outer: for l in layers.iter_historic_layers() {
-                result.layers_total += 1;
-
-                // 1. Is it newer than GC horizon cutoff point?
-                if l.get_lsn_range().end > space_cutoff {
-                    debug!(
-                        "keeping {} because it's newer than space_cutoff {}",
-                        l.layer_name(),
-                        space_cutoff,
-                    );
-                    result.layers_needed_by_cutoff += 1;
-                    continue 'outer;
-                }
-
-                // 2. It is newer than PiTR cutoff point?
-                if l.get_lsn_range().end > time_cutoff {
-                    debug!(
-                        "keeping {} because it's newer than time_cutoff {}",
-                        l.layer_name(),
-                        time_cutoff,
-                    );
-                    result.layers_needed_by_pitr += 1;
-                    continue 'outer;
-                }
-
-                // 3. Is it needed by a child branch?
-                // NOTE With that we would keep data that
-                // might be referenced by child branches forever.
-                // We can track this in child timeline GC and delete parent layers when
-                // they are no longer needed. This might be complicated with long inheritance chains.
-                if let Some(retain_lsn) = max_retain_lsn {
-                    // start_lsn is inclusive
-                    if &l.get_lsn_range().start <= retain_lsn {
-                        debug!(
-                            "keeping {} because it's still might be referenced by child branch forked at {} is_dropped: xx is_incremental: {}",
-                            l.layer_name(),
-                            retain_lsn,
-                            l.is_incremental(),
-                        );
-                        result.layers_needed_by_branches += 1;
-                        continue 'outer;
-                    }
-                }
-
-                // 4. Is there a valid lease that requires us to keep this layer?
-                if let Some(lsn) = &max_lsn_with_valid_lease {
-                    // keep if layer start <= any of the lease
-                    if &l.get_lsn_range().start <= lsn {
-                        debug!(
-                            "keeping {} because there is a valid lease preventing GC at {}",
-                            l.layer_name(),
-                            lsn,
-                        );
-                        result.layers_needed_by_leases += 1;
-                        continue 'outer;
-                    }
-                }
-
-                // 5. Is there a later on-disk layer for this relation?
-                //
-                // The end-LSN is exclusive, while disk_consistent_lsn is
-                // inclusive. For example, if disk_consistent_lsn is 100, it is
-                // OK for a delta layer to have end LSN 101, but if the end LSN
-                // is 102, then it might not have been fully flushed to disk
-                // before crash.
-                //
-                // For example, imagine that the following layers exist:
-                //
-                // 1000      - image (A)
-                // 1000-2000 - delta (B)
-                // 2000      - image (C)
-                // 2000-3000 - delta (D)
-                // 3000      - image (E)
-                //
-                // If GC horizon is at 2500, we can remove layers A and B, but
-                // we cannot remove C, even though it's older than 2500, because
-                // the delta layer 2000-3000 depends on it.
-                if !layers
-                    .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))
-                {
-                    debug!("keeping {} because it is the latest layer", l.layer_name());
-                    result.layers_not_updated += 1;
-                    continue 'outer;
-                }
-
-                // We didn't find any reason to keep this file, so remove it.
+            // 1. Is it newer than GC horizon cutoff point?
+            if l.get_lsn_range().end > space_cutoff {
                info!(
-                    "garbage collecting {} is_dropped: xx is_incremental: {}",
+                    "keeping {} because it's newer than space_cutoff {}",
                    l.layer_name(),
-                    l.is_incremental(),
+                    space_cutoff,
                );
-                layers_to_remove.push(l);
+                result.layers_needed_by_cutoff += 1;
+                continue 'outer;
            }

-            layers_to_remove
-        };
+            // 2. Is it newer than PiTR cutoff point?
+            if l.get_lsn_range().end > time_cutoff {
+                info!(
+                    "keeping {} because it's newer than time_cutoff {}",
+                    l.layer_name(),
+                    time_cutoff,
+                );
+                result.layers_needed_by_pitr += 1;
+                continue 'outer;
+            }
+
+            // 3. Is it needed by a child branch?
+            // NOTE With that we would keep data that
+            // might be referenced by child branches forever.
+            // We can track this in child timeline GC and delete parent layers when
+            // they are no longer needed. This might be complicated with long inheritance chains.
+            //
+            // TODO Vec is not a great choice for `retain_lsns`
+            for retain_lsn in &retain_lsns {
+                // start_lsn is inclusive
+                if &l.get_lsn_range().start <= retain_lsn {
+                    info!(
+                        "keeping {} because it's still might be referenced by child branch forked at {} is_dropped: xx is_incremental: {}",
+                        l.layer_name(),
+                        retain_lsn,
+                        l.is_incremental(),
+                    );
+                    result.layers_needed_by_branches += 1;
+                    continue 'outer;
+                }
+            }
+
+            // 4. Is there a valid lease that requires us to keep this layer?
+            if let Some(lsn) = &max_lsn_with_valid_lease {
+                // keep if layer start <= any of the leases
+                if &l.get_lsn_range().start <= lsn {
+                    info!(
+                        "keeping {} because there is a valid lease preventing GC at {}",
+                        l.layer_name(),
+                        lsn,
+                    );
+                    result.layers_needed_by_leases += 1;
+                    continue 'outer;
+                }
+            }
+
+            // 5. Is there a later on-disk layer for this relation?
+            //
+            // The end-LSN is exclusive, while disk_consistent_lsn is
+            // inclusive. For example, if disk_consistent_lsn is 100, it is
+            // OK for a delta layer to have end LSN 101, but if the end LSN
+            // is 102, then it might not have been fully flushed to disk
+            // before crash.
+            //
+            // For example, imagine that the following layers exist:
+            //
+            // 1000      - image (A)
+            // 1000-2000 - delta (B)
+            // 2000      - image (C)
+            // 2000-3000 - delta (D)
+            // 3000      - image (E)
+            //
+            // If GC horizon is at 2500, we can remove layers A and B, but
+            // we cannot remove C, even though it's older than 2500, because
+            // the delta layer 2000-3000 depends on it.
+            if !layers
+                .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))
+            {
+                info!("keeping {} because it is the latest layer", l.layer_name());
+                result.layers_not_updated += 1;
+                continue 'outer;
+            }
+
+            // We didn't find any reason to keep this file, so remove it.
+            info!(
+                "garbage collecting {} is_dropped: xx is_incremental: {}",
+                l.layer_name(),
+                l.is_incremental(),
+            );
+            layers_to_remove.push(l);
+        }

        if !layers_to_remove.is_empty() {
            // Persist the new GC cutoff value before we actually remove anything.
@@ -6667,19 +6670,15 @@ impl Timeline {
                    }
                })?;

-            let mut guard = self
-                .layers
-                .write(LayerManagerLockHolder::GarbageCollection)
-                .await;
-
            let gc_layers = layers_to_remove
                .iter()
-                .flat_map(|desc| guard.try_get_from_key(&desc.key()).cloned())
+                .map(|x| guard.get_from_desc(x))
                .collect::<Vec<Layer>>();

            result.layers_removed = gc_layers.len() as u64;

            self.remote_client.schedule_gc_update(&gc_layers)?;
+
            guard.open_mut()?.finish_gc_timeline(&gc_layers);

            #[cfg(feature = "testing")]
@@ -7367,7 +7366,7 @@ impl TimelineWriter<'_> {
            .tl
            .get_layer_for_write(at, &self.write_guard, ctx)
            .await?;
-        let initial_size = layer.len();
+        let initial_size = layer.size().await?;

        let last_freeze_at = self.last_freeze_at.load();
        self.write_guard.replace(TimelineWriterState::new(
--- a/pgxn/Makefile
+++ b/pgxn/Makefile
@@ -1,28 +0,0 @@
-# This makefile assumes that 'pg_config' is in the path, or is passed in the
-# PG_CONFIG variable.
-#
-# This is used in two different ways:
-#
-# 1. The main makefile calls this, when you invoke the `make neon-pg-ext-%`
-#    target. It passes PG_CONFIG pointing to pg_install/%/bin/pg_config.
-#    This is a VPATH build; the current directory is build/pgxn-%, and
-#    the path to the Makefile is passed with the -f argument.
-#
-# 2. compute-node.Dockerfile invokes this to build the compute extensions
-#    for the specific Postgres version. It relies on pg_config already
-#    being in $(PATH).
-
-srcdir = $(dir $(firstword $(MAKEFILE_LIST)))
-
-PG_CONFIG = pg_config
-
-subdirs = neon neon_rmgr neon_walredo neon_utils neon_test_utils
-
-.PHONY: install install-compute install-storage $(subdirs)
-install: $(subdirs)
-install-compute: neon neon_utils neon_test_utils neon_rmgr
-install-storage: neon_rmgr neon_walredo
-
-$(subdirs): %:
-	mkdir -p $*
-	$(MAKE) PG_CONFIG=$(PG_CONFIG) -C $* -f $(abspath $(srcdir)/$@/Makefile) install
--- a/proxy/src/batch.rs
+++ b/proxy/src/batch.rs
@@ -1,146 +0,0 @@
-//! Batch processing system based on intrusive linked lists.
-//!
-//! Enqueuing a batch job requires no allocations, with
-//! direct support for cancelling jobs early.
-use std::collections::BTreeMap;
-use std::pin::pin;
-use std::sync::Mutex;
-
-use futures::future::Either;
-use scopeguard::ScopeGuard;
-use tokio::sync::oneshot::error::TryRecvError;
-
-use crate::ext::LockExt;
-
-pub trait QueueProcessing: Send + 'static {
-    type Req: Send + 'static;
-    type Res: Send;
-
-    /// Get the desired batch size.
-    fn batch_size(&self, queue_size: usize) -> usize;
-
-    /// This applies a full batch of events.
-    /// Must respond with a full batch of replies.
-    ///
-    /// If this apply can error, it's expected that errors be forwarded to each Self::Res.
-    ///
-    /// Batching does not need to happen atomically.
-    fn apply(&mut self, req: Vec<Self::Req>) -> impl Future<Output = Vec<Self::Res>> + Send;
-}
-
-pub struct BatchQueue<P: QueueProcessing> {
-    processor: tokio::sync::Mutex<P>,
-    inner: Mutex<BatchQueueInner<P>>,
-}
-
-struct BatchJob<P: QueueProcessing> {
-    req: P::Req,
-    res: tokio::sync::oneshot::Sender<P::Res>,
-}
-
-impl<P: QueueProcessing> BatchQueue<P> {
-    pub fn new(p: P) -> Self {
-        Self {
-            processor: tokio::sync::Mutex::new(p),
-            inner: Mutex::new(BatchQueueInner {
-                version: 0,
-                queue: BTreeMap::new(),
-            }),
-        }
-    }
-
-    pub async fn call(&self, req: P::Req) -> P::Res {
-        let (id, mut rx) = self.inner.lock_propagate_poison().register_job(req);
-        let guard = scopeguard::guard(id, move |id| {
-            let mut inner = self.inner.lock_propagate_poison();
-            if inner.queue.remove(&id).is_some() {
-                tracing::debug!("batched task cancelled before completion");
-            }
-        });
-
-        let resp = loop {
-            // try become the leader, or try wait for success.
-            let mut processor = match futures::future::select(rx, pin!(self.processor.lock())).await
-            {
-                // we got the resp.
-                Either::Left((resp, _)) => break resp.ok(),
-                // we are the leader.
-                Either::Right((p, rx_)) => {
-                    rx = rx_;
-                    p
-                }
-            };
-
-            let (reqs, resps) = self.inner.lock_propagate_poison().get_batch(&processor);
-
-            // apply a batch.
-            let values = processor.apply(reqs).await;
-
-            // send response values.
-            for (tx, value) in std::iter::zip(resps, values) {
-                // sender hung up but that's fine.
-                drop(tx.send(value));
-            }
-
-            match rx.try_recv() {
-                Ok(resp) => break Some(resp),
-                Err(TryRecvError::Closed) => break None,
-                // edge case - there was a race condition where
-                // we became the leader but were not in the batch.
-                //
-                // Example:
-                // thread 1: register job id=1
-                // thread 2: register job id=2
-                // thread 2: processor.lock().await
-                // thread 1: processor.lock().await
-                // thread 2: becomes leader, batch_size=1, jobs=[1].
-                Err(TryRecvError::Empty) => {}
-            }
-        };
-
-        // already removed.
-        ScopeGuard::into_inner(guard);
-
-        resp.expect("no response found. batch processer should not panic")
-    }
-}
-
-struct BatchQueueInner<P: QueueProcessing> {
-    version: u64,
-    queue: BTreeMap<u64, BatchJob<P>>,
-}
-
-impl<P: QueueProcessing> BatchQueueInner<P> {
-    fn register_job(&mut self, req: P::Req) -> (u64, tokio::sync::oneshot::Receiver<P::Res>) {
-        let (tx, rx) = tokio::sync::oneshot::channel();
-
-        let id = self.version;
-
-        // Overflow concern:
-        // This is a u64, and we might enqueue 2^16 tasks per second.
-        // This gives us 2^48 seconds (9 million years).
-        // Even if this does overflow, it will not break, but some
-        // jobs with the higher version might never get prioritised.
-        self.version += 1;
-
-        self.queue.insert(id, BatchJob { req, res: tx });
-
-        (id, rx)
-    }
-
-    fn get_batch(&mut self, p: &P) -> (Vec<P::Req>, Vec<tokio::sync::oneshot::Sender<P::Res>>) {
-        let batch_size = p.batch_size(self.queue.len());
-        let mut reqs = Vec::with_capacity(batch_size);
-        let mut resps = Vec::with_capacity(batch_size);
-
-        while reqs.len() < batch_size {
-            let Some((_, job)) = self.queue.pop_first() else {
-                break;
-            };
-            reqs.push(job.req);
-            resps.push(job.res);
-        }
-
-        (reqs, resps)
-    }
-}
--- a/proxy/src/binary/local_proxy.rs
+++ b/proxy/src/binary/local_proxy.rs
@@ -201,7 +201,7 @@ pub async fn run() -> anyhow::Result<()> {
        auth_backend,
        http_listener,
        shutdown.clone(),
-        Arc::new(CancellationHandler::new(&config.connect_to_compute)),
+        Arc::new(CancellationHandler::new(&config.connect_to_compute, None)),
        endpoint_rate_limiter,
    );

--- a/proxy/src/binary/proxy.rs
+++ b/proxy/src/binary/proxy.rs
@@ -23,8 +23,7 @@ use utils::{project_build_tag, project_git_version};

 use crate::auth::backend::jwt::JwkCache;
 use crate::auth::backend::{ConsoleRedirectBackend, MaybeOwned};
-use crate::batch::BatchQueue;
-use crate::cancellation::{CancellationHandler, CancellationProcessor};
+use crate::cancellation::{CancellationHandler, handle_cancel_messages};
 use crate::config::{
    self, AuthenticationConfig, CacheOptions, ComputeConfig, HttpConfig, ProjectInfoCacheOptions,
    ProxyConfig, ProxyProtocolV2, remote_storage_from_toml,
@@ -393,7 +392,13 @@ pub async fn run() -> anyhow::Result<()> {
        .as_ref()
        .map(|redis_publisher| RedisKVClient::new(redis_publisher.clone(), redis_rps_limit));

-    let cancellation_handler = Arc::new(CancellationHandler::new(&config.connect_to_compute));
+    // channel size should be higher than redis client limit to avoid blocking
+    let cancel_ch_size = args.cancellation_ch_size;
+    let (tx_cancel, rx_cancel) = tokio::sync::mpsc::channel(cancel_ch_size);
+    let cancellation_handler = Arc::new(CancellationHandler::new(
+        &config.connect_to_compute,
+        Some(tx_cancel),
+    ));

    let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards(
        RateBucketInfo::to_leaky_bucket(&args.endpoint_rps_limit)
@@ -525,11 +530,21 @@ pub async fn run() -> anyhow::Result<()> {
                    match redis_kv_client.try_connect().await {
                        Ok(()) => {
                            info!("Connected to Redis KV client");
-                            cancellation_handler.init_tx(BatchQueue::new(CancellationProcessor {
-                                client: redis_kv_client,
-                                batch_size: args.cancellation_batch_size,
-                            }));
+                            maintenance_tasks.spawn(async move {
+                                handle_cancel_messages(
+                                    &mut redis_kv_client,
+                                    rx_cancel,
+                                    args.cancellation_batch_size,
+                                )
+                                .await?;

+                                drop(redis_kv_client);
+
+                                // `handle_cancel_messages` was terminated due to the tx_cancel
+                                // being dropped. this is not worthy of an error, and this task can only return `Err`,
+                                // so let's wait forever instead.
+                                std::future::pending().await
+                            });
                            break;
                        }
                        Err(e) => {
--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -1,23 +1,19 @@
-use std::convert::Infallible;
 use std::net::{IpAddr, SocketAddr};
-use std::sync::{Arc, OnceLock};
-use std::time::Duration;
+use std::sync::Arc;

-use anyhow::anyhow;
-use futures::FutureExt;
+use anyhow::{Context, anyhow};
 use ipnet::{IpNet, Ipv4Net, Ipv6Net};
-use postgres_client::RawCancelToken;
+use postgres_client::CancelToken;
 use postgres_client::tls::MakeTlsConnect;
 use redis::{Cmd, FromRedisValue, Value};
 use serde::{Deserialize, Serialize};
 use thiserror::Error;
 use tokio::net::TcpStream;
-use tokio::time::timeout;
-use tracing::{debug, error, info};
+use tokio::sync::{mpsc, oneshot};
+use tracing::{debug, error, info, warn};

 use crate::auth::AuthError;
 use crate::auth::backend::ComputeUserInfo;
-use crate::batch::{BatchQueue, QueueProcessing};
 use crate::config::ComputeConfig;
 use crate::context::RequestContext;
 use crate::control_plane::ControlPlaneApi;
@@ -31,36 +27,46 @@ use crate::redis::kv_ops::RedisKVClient;

 type IpSubnetKey = IpNet;

-const CANCEL_KEY_TTL: std::time::Duration = std::time::Duration::from_secs(600);
-const CANCEL_KEY_REFRESH: std::time::Duration = std::time::Duration::from_secs(570);
+const CANCEL_KEY_TTL: i64 = 1_209_600; // cancellation key expiry time: 2 weeks, in seconds

 // Message types for sending through mpsc channel
 pub enum CancelKeyOp {
    StoreCancelKey {
-        key: CancelKeyData,
-        value: Box<str>,
-        expire: std::time::Duration,
+        key: String,
+        field: String,
+        value: String,
+        resp_tx: Option<oneshot::Sender<anyhow::Result<()>>>,
+        _guard: CancelChannelSizeGuard<'static>,
+        expire: i64, // TTL for key
    },
    GetCancelData {
-        key: CancelKeyData,
+        key: String,
+        resp_tx: oneshot::Sender<anyhow::Result<Vec<(String, String)>>>,
+        _guard: CancelChannelSizeGuard<'static>,
+    },
+    RemoveCancelKey {
+        key: String,
+        field: String,
+        resp_tx: Option<oneshot::Sender<anyhow::Result<()>>>,
+        _guard: CancelChannelSizeGuard<'static>,
    },
 }

 pub struct Pipeline {
    inner: redis::Pipeline,
-    replies: usize,
+    replies: Vec<CancelReplyOp>,
 }

 impl Pipeline {
    fn with_capacity(n: usize) -> Self {
        Self {
            inner: redis::Pipeline::with_capacity(n),
-            replies: 0,
+            replies: Vec::with_capacity(n),
        }
    }

-    async fn execute(self, client: &mut RedisKVClient) -> Vec<anyhow::Result<Value>> {
-        let responses = self.replies;
+    async fn execute(&mut self, client: &mut RedisKVClient) {
+        let responses = self.replies.len();
        let batch_size = self.inner.len();

        match client.query(&self.inner).await {
@@ -70,73 +76,176 @@ impl Pipeline {
                    batch_size,
                    responses, "successfully completed cancellation jobs",
                );
-                values.into_iter().map(Ok).collect()
+                for (value, reply) in std::iter::zip(values, self.replies.drain(..)) {
+                    reply.send_value(value);
+                }
            }
            Ok(value) => {
                error!(batch_size, ?value, "unexpected redis return value");
-                std::iter::repeat_with(|| Err(anyhow!("incorrect response type from redis")))
-                    .take(responses)
-                    .collect()
+                for reply in self.replies.drain(..) {
+                    reply.send_err(anyhow!("incorrect response type from redis"));
+                }
            }
            Err(err) => {
-                std::iter::repeat_with(|| Err(anyhow!("could not send cmd to redis: {err}")))
-                    .take(responses)
-                    .collect()
+                for reply in self.replies.drain(..) {
+                    reply.send_err(anyhow!("could not send cmd to redis: {err}"));
+                }
            }
        }
+
+        self.inner.clear();
+        self.replies.clear();
    }

-    fn add_command_with_reply(&mut self, cmd: Cmd) {
+    fn add_command_with_reply(&mut self, cmd: Cmd, reply: CancelReplyOp) {
        self.inner.add_command(cmd);
-        self.replies += 1;
+        self.replies.push(reply);
    }

    fn add_command_no_reply(&mut self, cmd: Cmd) {
        self.inner.add_command(cmd).ignore();
    }
+
+    fn add_command(&mut self, cmd: Cmd, reply: Option<CancelReplyOp>) {
+        match reply {
+            Some(reply) => self.add_command_with_reply(cmd, reply),
+            None => self.add_command_no_reply(cmd),
+        }
+    }
 }

 impl CancelKeyOp {
-    fn register(&self, pipe: &mut Pipeline) {
+    fn register(self, pipe: &mut Pipeline) {
        #[allow(clippy::used_underscore_binding)]
        match self {
-            CancelKeyOp::StoreCancelKey { key, value, expire } => {
-                let key = KeyPrefix::Cancel(*key).build_redis_key();
-                pipe.add_command_with_reply(Cmd::hset(&key, "data", &**value));
-                pipe.add_command_no_reply(Cmd::expire(&key, expire.as_secs() as i64));
+            CancelKeyOp::StoreCancelKey {
+                key,
+                field,
+                value,
+                resp_tx,
+                _guard,
+                expire,
+            } => {
+                let reply =
+                    resp_tx.map(|resp_tx| CancelReplyOp::StoreCancelKey { resp_tx, _guard });
+                pipe.add_command(Cmd::hset(&key, field, value), reply);
+                pipe.add_command_no_reply(Cmd::expire(key, expire));
            }
-            CancelKeyOp::GetCancelData { key } => {
-                let key = KeyPrefix::Cancel(*key).build_redis_key();
-                pipe.add_command_with_reply(Cmd::hget(key, "data"));
+            CancelKeyOp::GetCancelData {
+                key,
+                resp_tx,
+                _guard,
+            } => {
+                let reply = CancelReplyOp::GetCancelData { resp_tx, _guard };
+                pipe.add_command_with_reply(Cmd::hgetall(key), reply);
+            }
+            CancelKeyOp::RemoveCancelKey {
+                key,
+                field,
+                resp_tx,
+                _guard,
+            } => {
+                let reply =
+                    resp_tx.map(|resp_tx| CancelReplyOp::RemoveCancelKey { resp_tx, _guard });
+                pipe.add_command(Cmd::hdel(key, field), reply);
            }
        }
    }
 }

-pub struct CancellationProcessor {
-    pub client: RedisKVClient,
-    pub batch_size: usize,
+// Reply handles stored in the pipeline: each variant owns the oneshot sender used to answer one queued command
+pub enum CancelReplyOp {
+    StoreCancelKey {
+        resp_tx: oneshot::Sender<anyhow::Result<()>>,
+        _guard: CancelChannelSizeGuard<'static>,
+    },
+    GetCancelData {
+        resp_tx: oneshot::Sender<anyhow::Result<Vec<(String, String)>>>,
+        _guard: CancelChannelSizeGuard<'static>,
+    },
+    RemoveCancelKey {
+        resp_tx: oneshot::Sender<anyhow::Result<()>>,
+        _guard: CancelChannelSizeGuard<'static>,
+    },
 }

-impl QueueProcessing for CancellationProcessor {
-    type Req = (CancelChannelSizeGuard<'static>, CancelKeyOp);
-    type Res = anyhow::Result<redis::Value>;
-
-    fn batch_size(&self, _queue_size: usize) -> usize {
-        self.batch_size
+impl CancelReplyOp {
+    fn send_err(self, e: anyhow::Error) {
+        match self {
+            CancelReplyOp::StoreCancelKey { resp_tx, _guard } => {
+                resp_tx
+                    .send(Err(e))
+                    .inspect_err(|_| tracing::debug!("could not send reply"))
+                    .ok();
+            }
+            CancelReplyOp::GetCancelData { resp_tx, _guard } => {
+                resp_tx
+                    .send(Err(e))
+                    .inspect_err(|_| tracing::debug!("could not send reply"))
+                    .ok();
+            }
+            CancelReplyOp::RemoveCancelKey { resp_tx, _guard } => {
+                resp_tx
+                    .send(Err(e))
+                    .inspect_err(|_| tracing::debug!("could not send reply"))
+                    .ok();
+            }
+        }
    }

-    async fn apply(&mut self, batch: Vec<Self::Req>) -> Vec<Self::Res> {
-        let mut pipeline = Pipeline::with_capacity(batch.len());
+    fn send_value(self, v: redis::Value) {
+        match self {
+            CancelReplyOp::StoreCancelKey { resp_tx, _guard } => {
+                let send =
+                    FromRedisValue::from_owned_redis_value(v).context("could not parse value");
+                resp_tx
+                    .send(send)
+                    .inspect_err(|_| tracing::debug!("could not send reply"))
+                    .ok();
+            }
+            CancelReplyOp::GetCancelData { resp_tx, _guard } => {
+                let send =
+                    FromRedisValue::from_owned_redis_value(v).context("could not parse value");
+                resp_tx
+                    .send(send)
+                    .inspect_err(|_| tracing::debug!("could not send reply"))
+                    .ok();
+            }
+            CancelReplyOp::RemoveCancelKey { resp_tx, _guard } => {
+                let send =
+                    FromRedisValue::from_owned_redis_value(v).context("could not parse value");
+                resp_tx
+                    .send(send)
+                    .inspect_err(|_| tracing::debug!("could not send reply"))
+                    .ok();
+            }
+        }
+    }
+}
+
+// Runs as a separate task: receives ops from the rx channel in batches and executes each batch as one redis pipeline
+pub async fn handle_cancel_messages(
+    client: &mut RedisKVClient,
+    mut rx: mpsc::Receiver<CancelKeyOp>,
+    batch_size: usize,
+) -> anyhow::Result<()> {
+    let mut batch = Vec::with_capacity(batch_size);
+    let mut pipeline = Pipeline::with_capacity(batch_size);
+
+    loop {
+        if rx.recv_many(&mut batch, batch_size).await == 0 {
+            warn!("shutting down cancellation queue");
+            break Ok(());
+        }

        let batch_size = batch.len();
        debug!(batch_size, "running cancellation jobs");

-        for (_, op) in &batch {
-            op.register(&mut pipeline);
+        for msg in batch.drain(..) {
+            msg.register(&mut pipeline);
        }

-        pipeline.execute(&mut self.client).await
+        pipeline.execute(client).await;
    }
 }

@@ -147,7 +256,7 @@ pub struct CancellationHandler {
    compute_config: &'static ComputeConfig,
    // rate limiter of cancellation requests
    limiter: Arc<std::sync::Mutex<LeakyBucketRateLimiter<IpSubnetKey>>>,
-    tx: OnceLock<BatchQueue<CancellationProcessor>>, // send messages to the redis KV client task
+    tx: Option<mpsc::Sender<CancelKeyOp>>, // send messages to the redis KV client task
 }

 #[derive(Debug, Error)]
@@ -187,10 +296,13 @@ impl ReportableError for CancelError {
 }

 impl CancellationHandler {
-    pub fn new(compute_config: &'static ComputeConfig) -> Self {
+    pub fn new(
+        compute_config: &'static ComputeConfig,
+        tx: Option<mpsc::Sender<CancelKeyOp>>,
+    ) -> Self {
        Self {
            compute_config,
-            tx: OnceLock::new(),
+            tx,
            limiter: Arc::new(std::sync::Mutex::new(
                LeakyBucketRateLimiter::<IpSubnetKey>::new_with_shards(
                    LeakyBucketRateLimiter::<IpSubnetKey>::DEFAULT,
@@ -200,14 +312,7 @@ impl CancellationHandler {
        }
    }

-    pub fn init_tx(&self, queue: BatchQueue<CancellationProcessor>) {
-        self.tx
-            .set(queue)
-            .map_err(|_| {})
-            .expect("cancellation queue should be registered once");
-    }
-
-    pub(crate) fn get_key(self: Arc<Self>) -> Session {
+    pub(crate) fn get_key(self: &Arc<Self>) -> Session {
        // we intentionally generate a random "backend pid" and "secret key" here.
        // we use the corresponding u64 as an identifier for the
        // actual endpoint+pid+secret for postgres/pgbouncer.
@@ -217,10 +322,14 @@ impl CancellationHandler {

        let key: CancelKeyData = rand::random();

+        let prefix_key: KeyPrefix = KeyPrefix::Cancel(key);
+        let redis_key = prefix_key.build_redis_key();
+
        debug!("registered new query cancellation key {key}");
        Session {
            key,
-            cancellation_handler: self,
+            redis_key,
+            cancellation_handler: Arc::clone(self),
        }
    }

@@ -228,43 +337,62 @@ impl CancellationHandler {
        &self,
        key: CancelKeyData,
    ) -> Result<Option<CancelClosure>, CancelError> {
-        let guard = Metrics::get()
-            .proxy
-            .cancel_channel_size
-            .guard(RedisMsgKind::HGet);
-        let op = CancelKeyOp::GetCancelData { key };
+        let prefix_key: KeyPrefix = KeyPrefix::Cancel(key);
+        let redis_key = prefix_key.build_redis_key();

-        let Some(tx) = self.tx.get() else {
+        let (resp_tx, resp_rx) = tokio::sync::oneshot::channel();
+        let op = CancelKeyOp::GetCancelData {
+            key: redis_key,
+            resp_tx,
+            _guard: Metrics::get()
+                .proxy
+                .cancel_channel_size
+                .guard(RedisMsgKind::HGetAll),
+        };
+
+        let Some(tx) = &self.tx else {
            tracing::warn!("cancellation handler is not available");
            return Err(CancelError::InternalError);
        };

-        const TIMEOUT: Duration = Duration::from_secs(5);
-        let result = timeout(TIMEOUT, tx.call((guard, op)))
-            .await
-            .map_err(|_| {
-                tracing::warn!("timed out waiting to receive GetCancelData response");
-                CancelError::RateLimit
-            })?
+        tx.try_send(op)
            .map_err(|e| {
-                tracing::warn!("failed to receive GetCancelData response: {e}");
-                CancelError::InternalError
-            })?;
+                tracing::warn!("failed to send GetCancelData for {key}: {e}");
+            })
+            .map_err(|()| CancelError::InternalError)?;

-        let cancel_state_str = String::from_owned_redis_value(result).map_err(|e| {
+        let result = resp_rx.await.map_err(|e| {
            tracing::warn!("failed to receive GetCancelData response: {e}");
            CancelError::InternalError
        })?;

-        let cancel_closure: CancelClosure =
-            serde_json::from_str(&cancel_state_str).map_err(|e| {
-                tracing::warn!("failed to deserialize cancel state: {e}");
-                CancelError::InternalError
-            })?;
+        let cancel_state_str: Option<String> = match result {
+            Ok(mut state) => {
+                if state.len() == 1 {
+                    Some(state.remove(0).1)
+                } else {
+                    tracing::warn!("unexpected number of entries in cancel state: {state:?}");
+                    return Err(CancelError::InternalError);
+                }
+            }
+            Err(e) => {
+                tracing::warn!("failed to receive cancel state from redis: {e}");
+                return Err(CancelError::InternalError);
+            }
+        };

-        Ok(Some(cancel_closure))
+        let cancel_state: Option<CancelClosure> = match cancel_state_str {
+            Some(state) => {
+                let cancel_closure: CancelClosure = serde_json::from_str(&state).map_err(|e| {
+                    tracing::warn!("failed to deserialize cancel state: {e}");
+                    CancelError::InternalError
+                })?;
+                Some(cancel_closure)
+            }
+            None => None,
+        };
+        Ok(cancel_state)
    }
-
    /// Try to cancel a running query for the corresponding connection.
    /// If the cancellation key is not found, it will be published to Redis.
    /// check_allowed - if true, check if the IP is allowed to cancel the query.
@@ -339,10 +467,10 @@ impl CancellationHandler {
 /// This should've been a [`std::future::Future`], but
 /// it's impossible to name a type of an unboxed future
 /// (we'd need something like `#![feature(type_alias_impl_trait)]`).
-#[derive(Debug, Clone, Serialize, Deserialize)]
+#[derive(Clone, Serialize, Deserialize)]
 pub struct CancelClosure {
    socket_addr: SocketAddr,
-    cancel_token: RawCancelToken,
+    cancel_token: CancelToken,
    hostname: String, // for pg_sni router
    user_info: ComputeUserInfo,
 }
@@ -350,7 +478,7 @@ pub struct CancelClosure {
 impl CancelClosure {
    pub(crate) fn new(
        socket_addr: SocketAddr,
-        cancel_token: RawCancelToken,
+        cancel_token: CancelToken,
        hostname: String,
        user_info: ComputeUserInfo,
    ) -> Self {
@@ -363,7 +491,7 @@ impl CancelClosure {
    }
    /// Cancels the query running on user's compute node.
    pub(crate) async fn try_cancel_query(
-        &self,
+        self,
        compute_config: &ComputeConfig,
    ) -> Result<(), CancelError> {
        let socket = TcpStream::connect(self.socket_addr).await?;
@@ -384,6 +512,7 @@ impl CancelClosure {
 pub(crate) struct Session {
    /// The user-facing key identifying this session.
    key: CancelKeyData,
+    redis_key: String,
    cancellation_handler: Arc<CancellationHandler>,
 }

@@ -392,66 +521,60 @@ impl Session {
        &self.key
    }

-    /// Ensure the cancel key is continously refreshed,
-    /// but stop when the channel is dropped.
-    pub(crate) async fn maintain_cancel_key(
+    // Send the store key op to the cancellation handler and set TTL for the key
+    pub(crate) fn write_cancel_key(
        &self,
-        session_id: uuid::Uuid,
-        cancel: tokio::sync::oneshot::Receiver<Infallible>,
-        cancel_closure: &CancelClosure,
-        compute_config: &ComputeConfig,
-    ) {
-        futures::future::select(
-            std::pin::pin!(self.maintain_redis_cancel_key(cancel_closure)),
-            cancel,
-        )
-        .await;
-
-        if let Err(err) = cancel_closure
-            .try_cancel_query(compute_config)
-            .boxed()
-            .await
-        {
-            tracing::warn!(
-                ?session_id,
-                ?err,
-                "could not cancel the query in the database"
-            );
-        }
-    }
-
-    // Ensure the cancel key is continously refreshed.
-    async fn maintain_redis_cancel_key(&self, cancel_closure: &CancelClosure) -> ! {
-        let Some(tx) = self.cancellation_handler.tx.get() else {
+        cancel_closure: CancelClosure,
+    ) -> Result<(), CancelError> {
+        let Some(tx) = &self.cancellation_handler.tx else {
            tracing::warn!("cancellation handler is not available");
-            // don't exit, as we only want to exit if cancelled externally.
-            std::future::pending().await
+            return Err(CancelError::InternalError);
        };

-        let closure_json = serde_json::to_string(&cancel_closure)
-            .expect("serialising to json string should not fail")
-            .into_boxed_str();
+        let closure_json = serde_json::to_string(&cancel_closure).map_err(|e| {
+            tracing::warn!("failed to serialize cancel closure: {e}");
+            CancelError::InternalError
+        })?;

-        loop {
-            let guard = Metrics::get()
+        let op = CancelKeyOp::StoreCancelKey {
+            key: self.redis_key.clone(),
+            field: "data".to_string(),
+            value: closure_json,
+            resp_tx: None,
+            _guard: Metrics::get()
                .proxy
                .cancel_channel_size
-                .guard(RedisMsgKind::HSet);
-            let op = CancelKeyOp::StoreCancelKey {
-                key: self.key,
-                value: closure_json.clone(),
-                expire: CANCEL_KEY_TTL,
-            };
+                .guard(RedisMsgKind::HSet),
+            expire: CANCEL_KEY_TTL,
+        };

-            tracing::debug!(
-                src=%self.key,
-                dest=?cancel_closure.cancel_token,
-                "registering cancellation key"
-            );
+        let _ = tx.try_send(op).map_err(|e| {
+            let key = self.key;
+            tracing::warn!("failed to send StoreCancelKey for {key}: {e}");
+        });
+        Ok(())
+    }

-            if tx.call((guard, op)).await.is_ok() {
-                tokio::time::sleep(CANCEL_KEY_REFRESH).await;
-            }
-        }
+    pub(crate) fn remove_cancel_key(&self) -> Result<(), CancelError> {
+        let Some(tx) = &self.cancellation_handler.tx else {
+            tracing::warn!("cancellation handler is not available");
+            return Err(CancelError::InternalError);
+        };
+
+        let op = CancelKeyOp::RemoveCancelKey {
+            key: self.redis_key.clone(),
+            field: "data".to_string(),
+            resp_tx: None,
+            _guard: Metrics::get()
+                .proxy
+                .cancel_channel_size
+                .guard(RedisMsgKind::HDel),
+        };
+
+        let _ = tx.try_send(op).map_err(|e| {
+            let key = self.key;
+            tracing::warn!("failed to send RemoveCancelKey for {key}: {e}");
+        });
+        Ok(())
    }
 }
--- a/proxy/src/compute/mod.rs
+++ b/proxy/src/compute/mod.rs
@@ -9,7 +9,7 @@ use itertools::Itertools;
 use postgres_client::config::{AuthKeys, SslMode};
 use postgres_client::maybe_tls_stream::MaybeTlsStream;
 use postgres_client::tls::MakeTlsConnect;
-use postgres_client::{NoTls, RawCancelToken, RawConnection};
+use postgres_client::{CancelToken, NoTls, RawConnection};
 use postgres_protocol::message::backend::NoticeResponseBody;
 use thiserror::Error;
 use tokio::net::{TcpStream, lookup_host};
@@ -265,8 +265,7 @@ impl ConnectInfo {
    }
 }

-pub type RustlsStream = <ComputeConfig as MakeTlsConnect<tokio::net::TcpStream>>::Stream;
-pub type MaybeRustlsStream = MaybeTlsStream<tokio::net::TcpStream, RustlsStream>;
+type RustlsStream = <ComputeConfig as MakeTlsConnect<tokio::net::TcpStream>>::Stream;

 pub(crate) struct PostgresConnection {
    /// Socket connected to a compute node.
@@ -280,7 +279,7 @@ pub(crate) struct PostgresConnection {
    /// Notices received from compute after authenticating
    pub(crate) delayed_notice: Vec<NoticeResponseBody>,

-    pub(crate) guage: NumDbConnectionsGuard<'static>,
+    _guage: NumDbConnectionsGuard<'static>,
 }

 impl ConnectInfo {
@@ -328,7 +327,8 @@ impl ConnectInfo {
        // Yet another reason to rework the connection establishing code.
        let cancel_closure = CancelClosure::new(
            socket_addr,
-            RawCancelToken {
+            CancelToken {
+                socket_config: None,
                ssl_mode: self.ssl_mode,
                process_id,
                secret_key,
@@ -343,7 +343,7 @@ impl ConnectInfo {
            delayed_notice,
            cancel_closure,
            aux,
-            guage: Metrics::get().proxy.db_connections.guard(ctx.protocol()),
+            _guage: Metrics::get().proxy.db_connections.guard(ctx.protocol()),
        };

        Ok(connection)
--- a/proxy/src/console_redirect_proxy.rs
+++ b/proxy/src/console_redirect_proxy.rs
@@ -120,7 +120,7 @@ pub async fn task_main(
                Ok(Some(p)) => {
                    ctx.set_success();
                    let _disconnect = ctx.log_connect();
-                    match p.proxy_pass().await {
+                    match p.proxy_pass(&config.connect_to_compute).await {
                        Ok(()) => {}
                        Err(ErrorSource::Client(e)) => {
                            error!(
@@ -232,35 +232,22 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin + Send>(
    .or_else(|e| async { Err(stream.throw_error(e, Some(ctx)).await) })
    .await?;

-    let session = cancellation_handler.get_key();
+    let cancellation_handler_clone = Arc::clone(&cancellation_handler);
+    let session = cancellation_handler_clone.get_key();
+
+    session.write_cancel_key(node.cancel_closure.clone())?;

    prepare_client_connection(&node, *session.key(), &mut stream);
    let stream = stream.flush_and_into_inner().await?;

-    let session_id = ctx.session_id();
-    let (cancel_on_shutdown, cancel) = tokio::sync::oneshot::channel();
-    tokio::spawn(async move {
-        session
-            .maintain_cancel_key(
-                session_id,
-                cancel,
-                &node.cancel_closure,
-                &config.connect_to_compute,
-            )
-            .await;
-    });
-
    Ok(Some(ProxyPassthrough {
        client: stream,
-        compute: node.stream,
-
-        aux: node.aux,
+        aux: node.aux.clone(),
        private_link_id: None,
-
-        _cancel_on_shutdown: cancel_on_shutdown,
-
+        compute: node,
+        session_id: ctx.session_id(),
+        cancel: session,
        _req: request_gauge,
        _conn: conn_gauge,
-        _db_conn: node.guage,
    }))
 }
--- a/proxy/src/lib.rs
+++ b/proxy/src/lib.rs
@@ -75,7 +75,6 @@
 pub mod binary;

 mod auth;
-mod batch;
 mod cache;
 mod cancellation;
 mod compute;
--- a/proxy/src/pglb/passthrough.rs
+++ b/proxy/src/pglb/passthrough.rs
@@ -1,17 +1,15 @@
-use std::convert::Infallible;
-
+use futures::FutureExt;
 use smol_str::SmolStr;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::debug;
 use utils::measured_stream::MeasuredStream;

 use super::copy_bidirectional::ErrorSource;
-use crate::compute::MaybeRustlsStream;
+use crate::cancellation;
+use crate::compute::PostgresConnection;
+use crate::config::ComputeConfig;
 use crate::control_plane::messages::MetricsAuxInfo;
-use crate::metrics::{
-    Direction, Metrics, NumClientConnectionsGuard, NumConnectionRequestsGuard,
-    NumDbConnectionsGuard,
-};
+use crate::metrics::{Direction, Metrics, NumClientConnectionsGuard, NumConnectionRequestsGuard};
 use crate::stream::Stream;
 use crate::usage_metrics::{Ids, MetricCounterRecorder, USAGE_METRICS};

@@ -66,20 +64,40 @@ pub(crate) async fn proxy_pass(

 pub(crate) struct ProxyPassthrough<S> {
    pub(crate) client: Stream<S>,
-    pub(crate) compute: MaybeRustlsStream,
-
+    pub(crate) compute: PostgresConnection,
    pub(crate) aux: MetricsAuxInfo,
+    pub(crate) session_id: uuid::Uuid,
    pub(crate) private_link_id: Option<SmolStr>,
-
-    pub(crate) _cancel_on_shutdown: tokio::sync::oneshot::Sender<Infallible>,
+    pub(crate) cancel: cancellation::Session,

    pub(crate) _req: NumConnectionRequestsGuard<'static>,
    pub(crate) _conn: NumClientConnectionsGuard<'static>,
-    pub(crate) _db_conn: NumDbConnectionsGuard<'static>,
 }

 impl<S: AsyncRead + AsyncWrite + Unpin> ProxyPassthrough<S> {
-    pub(crate) async fn proxy_pass(self) -> Result<(), ErrorSource> {
-        proxy_pass(self.client, self.compute, self.aux, self.private_link_id).await
+    pub(crate) async fn proxy_pass(
+        self,
+        compute_config: &ComputeConfig,
+    ) -> Result<(), ErrorSource> {
+        let res = proxy_pass(
+            self.client,
+            self.compute.stream,
+            self.aux,
+            self.private_link_id,
+        )
+        .await;
+        if let Err(err) = self
+            .compute
+            .cancel_closure
+            .try_cancel_query(compute_config)
+            .boxed()
+            .await
+        {
+            tracing::warn!(session_id = ?self.session_id, ?err, "could not cancel the query in the database");
+        }
+
+        drop(self.cancel.remove_cancel_key()); // we don't need a result. If the queue is full, we just log the error
+
+        res
    }
 }
--- a/proxy/src/proxy/mod.rs
+++ b/proxy/src/proxy/mod.rs
@@ -155,7 +155,7 @@ pub async fn task_main(
                Ok(Some(p)) => {
                    ctx.set_success();
                    let _disconnect = ctx.log_connect();
-                    match p.proxy_pass().await {
+                    match p.proxy_pass(&config.connect_to_compute).await {
                        Ok(()) => {}
                        Err(ErrorSource::Client(e)) => {
                            warn!(
@@ -372,24 +372,13 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin + Send>(
        Err(e) => Err(stream.throw_error(e, Some(ctx)).await)?,
    };

-    let session = cancellation_handler.get_key();
+    let cancellation_handler_clone = Arc::clone(&cancellation_handler);
+    let session = cancellation_handler_clone.get_key();

+    session.write_cancel_key(node.cancel_closure.clone())?;
    prepare_client_connection(&node, *session.key(), &mut stream);
    let stream = stream.flush_and_into_inner().await?;

-    let session_id = ctx.session_id();
-    let (cancel_on_shutdown, cancel) = tokio::sync::oneshot::channel();
-    tokio::spawn(async move {
-        session
-            .maintain_cancel_key(
-                session_id,
-                cancel,
-                &node.cancel_closure,
-                &config.connect_to_compute,
-            )
-            .await;
-    });
-
    let private_link_id = match ctx.extra() {
        Some(ConnectionInfoExtra::Aws { vpce_id }) => Some(vpce_id.clone()),
        Some(ConnectionInfoExtra::Azure { link_id }) => Some(link_id.to_smolstr()),
@@ -398,16 +387,13 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin + Send>(

    Ok(Some(ProxyPassthrough {
        client: stream,
-        compute: node.stream,
-
-        aux: node.aux,
+        aux: node.aux.clone(),
        private_link_id,
-
-        _cancel_on_shutdown: cancel_on_shutdown,
-
+        compute: node,
+        session_id: ctx.session_id(),
+        cancel: session,
        _req: request_gauge,
        _conn: conn_gauge,
-        _db_conn: node.guage,
    }))
 }

--- a/proxy/src/redis/keys.rs
+++ b/proxy/src/redis/keys.rs
@@ -1,4 +1,8 @@
-use crate::pqproto::CancelKeyData;
+use std::io::ErrorKind;
+
+use anyhow::Ok;
+
+use crate::pqproto::{CancelKeyData, id_to_cancel_key};

 pub mod keyspace {
    pub const CANCEL_PREFIX: &str = "cancel";
@@ -19,12 +23,39 @@ impl KeyPrefix {
            }
        }
    }
+
+    #[allow(dead_code)]
+    pub(crate) fn as_str(&self) -> &'static str {
+        match self {
+            KeyPrefix::Cancel(_) => keyspace::CANCEL_PREFIX,
+        }
+    }
+}
+
+#[allow(dead_code)]
+pub(crate) fn parse_redis_key(key: &str) -> anyhow::Result<KeyPrefix> {
+    let (prefix, key_str) = key.split_once(':').ok_or_else(|| {
+        anyhow::anyhow!(std::io::Error::new(
+            ErrorKind::InvalidData,
+            "missing prefix"
+        ))
+    })?;
+
+    match prefix {
+        keyspace::CANCEL_PREFIX => {
+            let id = u64::from_str_radix(key_str, 16)?;
+
+            Ok(KeyPrefix::Cancel(id_to_cancel_key(id)))
+        }
+        _ => Err(anyhow::anyhow!(std::io::Error::new(
+            ErrorKind::InvalidData,
+            "unknown prefix"
+        ))),
+    }
 }

 #[cfg(test)]
 mod tests {
-    use crate::pqproto::id_to_cancel_key;
-
    use super::*;

    #[test]
@@ -34,4 +65,16 @@ mod tests {
        let redis_key = cancel_key.build_redis_key();
        assert_eq!(redis_key, "cancel:30390000d431");
    }
+
+    #[test]
+    fn test_parse_redis_key() {
+        let redis_key = "cancel:30390000d431";
+        let key: KeyPrefix = parse_redis_key(redis_key).expect("Failed to parse key");
+
+        let ref_key = id_to_cancel_key(12345 << 32 | 54321);
+
+        assert_eq!(key.as_str(), KeyPrefix::Cancel(ref_key).as_str());
+        let KeyPrefix::Cancel(cancel_key) = key;
+        assert_eq!(ref_key, cancel_key);
+    }
 }
--- a/proxy/src/redis/kv_ops.rs
+++ b/proxy/src/redis/kv_ops.rs
@@ -1,6 +1,3 @@
-use std::time::Duration;
-
-use futures::FutureExt;
 use redis::aio::ConnectionLike;
 use redis::{Cmd, FromRedisValue, Pipeline, RedisResult};

@@ -38,11 +35,14 @@ impl RedisKVClient {
    }

    pub async fn try_connect(&mut self) -> anyhow::Result<()> {
-        self.client
-            .connect()
-            .boxed()
-            .await
-            .inspect_err(|e| tracing::error!("failed to connect to redis: {e}"))
+        match self.client.connect().await {
+            Ok(()) => {}
+            Err(e) => {
+                tracing::error!("failed to connect to redis: {e}");
+                return Err(e);
+            }
+        }
+        Ok(())
    }

    pub(crate) async fn query<T: FromRedisValue>(
@@ -54,25 +54,15 @@ impl RedisKVClient {
            return Err(anyhow::anyhow!("Rate limit exceeded"));
        }

-        let e = match q.query(&mut self.client).await {
+        match q.query(&mut self.client).await {
            Ok(t) => return Ok(t),
-            Err(e) => e,
-        };
-
-        tracing::error!("failed to run query: {e}");
-        match e.retry_method() {
-            redis::RetryMethod::Reconnect => {
-                tracing::info!("Redis client is disconnected. Reconnecting...");
-                self.try_connect().await?;
+            Err(e) => {
+                tracing::error!("failed to run query: {e}");
            }
-            redis::RetryMethod::RetryImmediately => {}
-            redis::RetryMethod::WaitAndRetry => {
-                // somewhat arbitrary.
-                tokio::time::sleep(Duration::from_millis(100)).await;
-            }
-            _ => Err(e)?,
        }

+        tracing::info!("Redis client is disconnected. Reconnecting...");
+        self.try_connect().await?;
        Ok(q.query(&mut self.client).await?)
    }
 }
--- a/proxy/src/serverless/websocket.rs
+++ b/proxy/src/serverless/websocket.rs
@@ -167,7 +167,7 @@ pub(crate) async fn serve_websocket(
        Ok(Some(p)) => {
            ctx.set_success();
            ctx.log_connect();
-            match p.proxy_pass().await {
+            match p.proxy_pass(&config.connect_to_compute).await {
                Ok(()) => Ok(()),
                Err(ErrorSource::Client(err)) => Err(err).context("client"),
                Err(ErrorSource::Compute(err)) => Err(err).context("compute"),
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -5,9 +5,6 @@ use std::time::Duration;

 use anyhow::{Context, anyhow};
 use camino::Utf8PathBuf;
-
-#[cfg(feature = "testing")]
-use clap::ArgAction;
 use clap::Parser;
 use futures::future::OptionFuture;
 use http_utils::tls_certs::ReloadingCertificateResolver;
@@ -210,19 +207,6 @@ struct Cli {
    /// the compute notification directly (instead of via control plane).
    #[arg(long, default_value = "false")]
    use_local_compute_notifications: bool,
-
-    /// Number of safekeepers to choose for a timeline when creating it.
-    /// Safekeepers will be choosen from different availability zones.
-    /// This option exists primarily for testing purposes.
-    #[arg(long, default_value = "3", value_parser = clap::value_parser!(i64).range(1..))]
-    timeline_safekeeper_count: i64,
-
-    /// When set, actively checks and initiates heatmap downloads/uploads during reconciliation.
-    /// This speed up migrations by avoiding the default wait for the heatmap download interval.
-    /// Primarily useful for testing to reduce test execution time.
-    #[cfg(feature = "testing")]
-    #[arg(long, default_value = "true", action=ArgAction::Set)]
-    kick_secondary_downloads: bool,
 }

 enum StrictMode {
@@ -387,11 +371,6 @@ async fn async_main() -> anyhow::Result<()> {
        StrictMode::Strict if args.use_local_compute_notifications => {
            anyhow::bail!("`--use-local-compute-notifications` is only permitted in `--dev` mode");
        }
-        StrictMode::Strict if args.timeline_safekeeper_count < 3 => {
-            anyhow::bail!(
-                "Running with less than 3 safekeepers per timeline is only permitted in `--dev` mode"
-            );
-        }
        StrictMode::Strict => {
            tracing::info!("Starting in strict mode: configuration is OK.")
        }
@@ -454,9 +433,6 @@ async fn async_main() -> anyhow::Result<()> {
        ssl_ca_certs,
        timelines_onto_safekeepers: args.timelines_onto_safekeepers,
        use_local_compute_notifications: args.use_local_compute_notifications,
-        timeline_safekeeper_count: args.timeline_safekeeper_count,
-        #[cfg(feature = "testing")]
-        kick_secondary_downloads: args.kick_secondary_downloads,
    };

    // Validate that we can connect to the database
--- a/storage_controller/src/reconciler.rs
+++ b/storage_controller/src/reconciler.rs
@@ -856,7 +856,6 @@ impl Reconciler {
                &self.shard,
                &self.config,
                &self.placement_policy,
-                self.intent.secondary.len(),
            );
            match self.observed.locations.get(&node.get_id()) {
                Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
@@ -1236,11 +1235,11 @@ pub(crate) fn attached_location_conf(
    shard: &ShardIdentity,
    config: &TenantConfig,
    policy: &PlacementPolicy,
-    secondary_count: usize,
 ) -> LocationConfig {
    let has_secondaries = match policy {
-        PlacementPolicy::Detached | PlacementPolicy::Secondary => false,
-        PlacementPolicy::Attached(0) => secondary_count > 0,
+        PlacementPolicy::Attached(0) | PlacementPolicy::Detached | PlacementPolicy::Secondary => {
+            false
+        }
        PlacementPolicy::Attached(_) => true,
    };

--- a/storage_controller/src/scheduler.rs
+++ b/storage_controller/src/scheduler.rs
@@ -825,7 +825,6 @@ impl Scheduler {
        struct AzScore {
            home_shard_count: usize,
            scheduleable: bool,
-            node_count: usize,
        }

        let mut azs: HashMap<&AvailabilityZone, AzScore> = HashMap::new();
@@ -833,7 +832,6 @@ impl Scheduler {
            let az = azs.entry(&node.az).or_default();
            az.home_shard_count += node.home_shard_count;
            az.scheduleable |= matches!(node.may_schedule, MaySchedule::Yes(_));
-            az.node_count += 1;
        }

        // If any AZs are schedulable, then filter out the non-schedulable ones (i.e. AZs where
@@ -842,20 +840,10 @@ impl Scheduler {
            azs.retain(|_, i| i.scheduleable);
        }

-        // We will multiply up shard counts by the max node count for scoring, before dividing
-        // by per-node max node count, to get a normalized score that doesn't collapse to zero
-        // when the absolute shard count is less than the node count.
-        let max_node_count = azs.values().map(|i| i.node_count).max().unwrap_or(0);
-
        // Find the AZ with the lowest number of shards currently allocated
        Some(
            azs.into_iter()
-                .min_by_key(|i| {
-                    (
-                        (i.1.home_shard_count * max_node_count) / i.1.node_count,
-                        i.0,
-                    )
-                })
+                .min_by_key(|i| (i.1.home_shard_count, i.0))
                .unwrap()
                .0
                .clone(),
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -466,13 +466,6 @@ pub struct Config {
    pub timelines_onto_safekeepers: bool,

    pub use_local_compute_notifications: bool,
-
-    /// Number of safekeepers to choose for a timeline when creating it.
-    /// Safekeepers will be choosen from different availability zones.
-    pub timeline_safekeeper_count: i64,
-
-    #[cfg(feature = "testing")]
-    pub kick_secondary_downloads: bool,
 }

 impl From<DatabaseError> for ApiError {
@@ -2067,7 +2060,6 @@ impl Service {
                            &tenant_shard.shard,
                            &tenant_shard.config,
                            &PlacementPolicy::Attached(0),
-                            tenant_shard.intent.get_secondary().len(),
                        )),
                    },
                )]);
@@ -5609,15 +5601,7 @@ impl Service {
            for parent_id in parent_ids {
                let child_ids = parent_id.split(new_shard_count);

-                let (
-                    pageserver,
-                    generation,
-                    policy,
-                    parent_ident,
-                    config,
-                    preferred_az,
-                    secondary_count,
-                ) = {
+                let (pageserver, generation, policy, parent_ident, config, preferred_az) = {
                    let mut old_state = tenants
                        .remove(&parent_id)
                        .expect("It was present, we just split it");
@@ -5637,7 +5621,6 @@ impl Service {
                        old_state.shard,
                        old_state.config.clone(),
                        old_state.preferred_az().cloned(),
-                        old_state.intent.get_secondary().len(),
                    )
                };

@@ -5659,7 +5642,6 @@ impl Service {
                                &child_shard,
                                &config,
                                &policy,
-                                secondary_count,
                            )),
                        },
                    );
@@ -8387,11 +8369,6 @@ impl Service {
    /// we have this helper to move things along faster.
    #[cfg(feature = "testing")]
    async fn kick_secondary_download(&self, tenant_shard_id: TenantShardId) {
-        if !self.config.kick_secondary_downloads {
-            // No-op if kick_secondary_downloads functionaliuty is not configured
-            return;
-        }
-
        let (attached_node, secondaries) = {
            let locked = self.inner.read().unwrap();
            let Some(shard) = locked.tenants.get(&tenant_shard_id) else {
--- a/storage_controller/src/service/safekeeper_service.rs
+++ b/storage_controller/src/service/safekeeper_service.rs
@@ -1,4 +1,3 @@
-use std::cmp::max;
 use std::collections::HashSet;
 use std::str::FromStr;
 use std::sync::Arc;
@@ -609,8 +608,7 @@ impl Service {
        Ok(())
    }

-    /// Choose safekeepers for the new timeline in different azs.
-    /// 3 are choosen by default, but may be configured via config (for testing).
+    /// Choose safekeepers for the new timeline: 3 in different azs.
    pub(crate) async fn safekeepers_for_new_timeline(
        &self,
    ) -> Result<Vec<SafekeeperInfo>, ApiError> {
@@ -653,14 +651,18 @@ impl Service {
            )
        });
        // Number of safekeepers in different AZs we are looking for
-        let mut wanted_count = self.config.timeline_safekeeper_count as usize;
-        // TODO(diko): remove this when `timeline_safekeeper_count` option is in the release
-        // branch and is specified in tests/neon_local config.
-        if cfg!(feature = "testing") && all_safekeepers.len() < wanted_count {
-            // In testing mode, we can have less safekeepers than the config says
-            wanted_count = max(all_safekeepers.len(), 1);
-        }
-
+        let wanted_count = match all_safekeepers.len() {
+            0 => {
+                return Err(ApiError::InternalServerError(anyhow::anyhow!(
+                    "couldn't find any active safekeeper for new timeline",
+                )));
+            }
+            // Have laxer requirements in testing mode, as we don't want to
+            // spin up three safekeepers for every single test
+            #[cfg(feature = "testing")]
+            1 | 2 => all_safekeepers.len(),
+            _ => 3,
+        };
        let mut sks = Vec::new();
        let mut azs = HashSet::new();
        for (_sk_util, sk_info, az_id) in all_safekeepers.iter() {
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -1381,13 +1381,8 @@ impl TenantShard {
                .generation
                .expect("Attempted to enter attached state without a generation");

-            let wanted_conf = attached_location_conf(
-                generation,
-                &self.shard,
-                &self.config,
-                &self.policy,
-                self.intent.get_secondary().len(),
-            );
+            let wanted_conf =
+                attached_location_conf(generation, &self.shard, &self.config, &self.policy);
            match self.observed.locations.get(&node_id) {
                Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {}
                Some(_) | None => {
--- a/test_runner/fixtures/endpoint/http.py
+++ b/test_runner/fixtures/endpoint/http.py
@@ -24,7 +24,7 @@ The value to place in the `aud` claim.

@final
 class ComputeClaimsScope(StrEnum):
-    ADMIN = "compute_ctl:admin"
+    ADMIN = "admin"


@final
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -453,7 +453,6 @@ class NeonEnvBuilder:
        pageserver_get_vectored_concurrent_io: str | None = None,
        pageserver_tracing_config: PageserverTracingConfig | None = None,
        pageserver_import_config: PageserverImportConfig | None = None,
-        storcon_kick_secondary_downloads: bool | None = None,
    ):
        self.repo_dir = repo_dir
        self.rust_log_override = rust_log_override
@@ -515,8 +514,6 @@ class NeonEnvBuilder:
        self.pageserver_tracing_config = pageserver_tracing_config
        self.pageserver_import_config = pageserver_import_config

-        self.storcon_kick_secondary_downloads = storcon_kick_secondary_downloads
-
        self.pageserver_default_tenant_config_compaction_algorithm: dict[str, Any] | None = (
            pageserver_default_tenant_config_compaction_algorithm
        )
@@ -1224,14 +1221,6 @@ class NeonEnv:
            else:
                cfg["storage_controller"] = {"use_local_compute_notifications": False}

-        if config.storcon_kick_secondary_downloads is not None:
-            # Configure whether storage controller should actively kick off secondary downloads
-            if "storage_controller" not in cfg:
-                cfg["storage_controller"] = {}
-            cfg["storage_controller"]["kick_secondary_downloads"] = (
-                config.storcon_kick_secondary_downloads
-            )
-
        # Create config for pageserver
        http_auth_type = "NeonJWT" if config.auth_enabled else "Trust"
        pg_auth_type = "NeonJWT" if config.auth_enabled else "Trust"
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -1219,31 +1219,3 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
        )
        self.verbose_error(res)
        return res.json()
-
-    def force_override_feature_flag(self, flag: str, value: str | None = None):
-        if value is None:
-            res = self.delete(
-                f"http://localhost:{self.port}/v1/feature_flag/{flag}",
-            )
-        else:
-            res = self.put(
-                f"http://localhost:{self.port}/v1/feature_flag/{flag}",
-                params={"value": value},
-            )
-        self.verbose_error(res)
-
-    def evaluate_feature_flag_boolean(self, tenant_id: TenantId, flag: str) -> Any:
-        res = self.get(
-            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/feature_flag/{flag}",
-            params={"as": "boolean"},
-        )
-        self.verbose_error(res)
-        return res.json()
-
-    def evaluate_feature_flag_multivariate(self, tenant_id: TenantId, flag: str) -> Any:
-        res = self.get(
-            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/feature_flag/{flag}",
-            params={"as": "multivariate"},
-        )
-        self.verbose_error(res)
-        return res.json()
--- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
+++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
@@ -146,6 +146,8 @@ def run_benchmark(env: NeonEnv, pg_bin: PgBin, record, duration_secs: int):
        ps_http.base_url,
        "--page-service-connstring",
        env.pageserver.connstr(password=None),
+        "--gzip-probability",
+        "1",
        "--runtime",
        f"{duration_secs}s",
        # don't specify the targets explicitly, let pagebench auto-discover them
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -184,7 +184,7 @@ def test_fully_custom_config(positive_env: NeonEnv):
        "timeline_offloading": False,
        "rel_size_v2_enabled": True,
        "relsize_snapshot_cache_capacity": 10000,
-        "gc_compaction_enabled": False,
+        "gc_compaction_enabled": True,
        "gc_compaction_verification": False,
        "gc_compaction_initial_threshold_kb": 1024000,
        "gc_compaction_ratio_percent": 200,
--- a/test_runner/regress/test_feature_flag.py
+++ b/test_runner/regress/test_feature_flag.py
@@ -1,51 +0,0 @@
-from __future__ import annotations
-
-from typing import TYPE_CHECKING
-
-from fixtures.utils import run_only_on_default_postgres
-
-if TYPE_CHECKING:
-    from fixtures.neon_fixtures import NeonEnvBuilder
-
-
-@run_only_on_default_postgres("Pageserver-only test only needs to run on one version")
-def test_feature_flag(neon_env_builder: NeonEnvBuilder):
-    env = neon_env_builder.init_start()
-    env.pageserver.http_client().force_override_feature_flag("test-feature-flag", "true")
-    assert env.pageserver.http_client().evaluate_feature_flag_boolean(
-        env.initial_tenant, "test-feature-flag"
-    )["result"]["Ok"]
-    assert (
-        env.pageserver.http_client().evaluate_feature_flag_multivariate(
-            env.initial_tenant, "test-feature-flag"
-        )["result"]["Ok"]
-        == "true"
-    )
-
-    env.pageserver.http_client().force_override_feature_flag("test-feature-flag", "false")
-    assert (
-        env.pageserver.http_client().evaluate_feature_flag_boolean(
-            env.initial_tenant, "test-feature-flag"
-        )["result"]["Err"]
-        == "No condition group is matched"
-    )
-    assert (
-        env.pageserver.http_client().evaluate_feature_flag_multivariate(
-            env.initial_tenant, "test-feature-flag"
-        )["result"]["Ok"]
-        == "false"
-    )
-
-    env.pageserver.http_client().force_override_feature_flag("test-feature-flag", None)
-    assert (
-        "Err"
-        in env.pageserver.http_client().evaluate_feature_flag_boolean(
-            env.initial_tenant, "test-feature-flag"
-        )["result"]
-    )
-    assert (
-        "Err"
-        in env.pageserver.http_client().evaluate_feature_flag_multivariate(
-            env.initial_tenant, "test-feature-flag"
-        )["result"]
-    )
--- a/test_runner/regress/test_pg_regress.py
+++ b/test_runner/regress/test_pg_regress.py
@@ -173,11 +173,7 @@ def test_pg_regress(
    (runpath / "testtablespace").mkdir(parents=True)

    # Compute all the file locations that pg_regress will need.
-    #
-    # XXX: We assume that the `build` directory is a sibling of the
-    # pg_distrib_dir.  That is the default when you check out the
-    # repository; `build` and `pg_install` are created side by side.
-    build_path = pg_distrib_dir / f"../build/{env.pg_version.v_prefixed}/src/test/regress"
+    build_path = pg_distrib_dir / f"build/{env.pg_version.v_prefixed}/src/test/regress"
    src_path = base_dir / f"vendor/postgres-{env.pg_version.v_prefixed}/src/test/regress"
    bindir = pg_distrib_dir / f"v{env.pg_version}/bin"
    schedule = src_path / "parallel_schedule"
@@ -254,11 +250,7 @@ def test_isolation(
    (runpath / "testtablespace").mkdir(parents=True)

    # Compute all the file locations that pg_isolation_regress will need.
-    #
-    # XXX: We assume that the `build` directory is a sibling of the
-    # pg_distrib_dir.  That is the default when you check out the
-    # repository; `build` and `pg_install` are created side by side.
-    build_path = pg_distrib_dir / f"../build/{env.pg_version.v_prefixed}/src/test/isolation"
+    build_path = pg_distrib_dir / f"build/{env.pg_version.v_prefixed}/src/test/isolation"
    src_path = base_dir / f"vendor/postgres-{env.pg_version.v_prefixed}/src/test/isolation"
    bindir = pg_distrib_dir / f"v{env.pg_version}/bin"
    schedule = src_path / "isolation_schedule"
@@ -322,11 +314,8 @@ def test_sql_regress(
    (runpath / "testtablespace").mkdir(parents=True)

    # Compute all the file locations that pg_regress will need.
-    #
-    # XXX: We assume that the `build` directory is a sibling of the
-    # pg_distrib_dir.  That is the default when you check out the
-    # repository; `build` and `pg_install` are created side by side.
-    build_path = pg_distrib_dir / f"../build/{env.pg_version.v_prefixed}/src/test/regress"
+    # This test runs neon specific tests
+    build_path = pg_distrib_dir / f"build/v{env.pg_version}/src/test/regress"
    src_path = base_dir / "test_runner/sql_regress"
    bindir = pg_distrib_dir / f"v{env.pg_version}/bin"
    schedule = src_path / "parallel_schedule"
--- a/test_runner/regress/test_s3_restore.py
+++ b/test_runner/regress/test_s3_restore.py
@@ -74,7 +74,7 @@ def test_tenant_s3_restore(
            last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
            last_flush_lsns.append(last_flush_lsn)
        ps_http.timeline_checkpoint(tenant_id, timeline_id)
-        wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn, timeout=60)
+        wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn)
        log.info(f"{timeline} timeline {timeline_id} {last_flush_lsn=}")
        parent = timeline

--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -3642,9 +3642,7 @@ def test_timeline_delete_mid_live_migration(neon_env_builder: NeonEnvBuilder, mi
    env.start()

    for ps in env.pageservers:
-        ps.allowed_errors.extend(
-            [".*Timeline.* has been deleted.*", ".*Timeline.*was cancelled and cannot be used"]
-        )
+        ps.allowed_errors.append(".*Timeline.* has been deleted.*")

    tenant_id = TenantId.generate()
    timeline_id = TimelineId.generate()
@@ -4436,53 +4434,6 @@ def test_storage_controller_graceful_migration(neon_env_builder: NeonEnvBuilder,
        assert initial_ps.http_client().tenant_list_locations()["tenant_shards"] == []


-def test_attached_0_graceful_migration(neon_env_builder: NeonEnvBuilder):
-    neon_env_builder.num_pageservers = 4
-    neon_env_builder.num_azs = 2
-
-    neon_env_builder.storcon_kick_secondary_downloads = False
-
-    env = neon_env_builder.init_start()
-
-    # It is default, but we want to ensure that there are no secondary locations requested
-    env.storage_controller.tenant_policy_update(env.initial_tenant, {"placement": {"Attached": 0}})
-    env.storage_controller.reconcile_until_idle()
-
-    desc = env.storage_controller.tenant_describe(env.initial_tenant)["shards"][0]
-    src_ps_id = desc["node_attached"]
-    src_ps = env.get_pageserver(src_ps_id)
-    src_az = desc["preferred_az_id"]
-
-    # There must be no secondary locations with Attached(0) placement policy
-    assert len(desc["node_secondary"]) == 0
-
-    # Migrate tenant shard to the same AZ node
-    dst_ps = [ps for ps in env.pageservers if ps.id != src_ps_id and ps.az_id == src_az][0]
-
-    env.storage_controller.tenant_shard_migrate(
-        TenantShardId(env.initial_tenant, 0, 0),
-        dst_ps.id,
-        config=StorageControllerMigrationConfig(prewarm=True),
-    )
-
-    def tenant_shard_migrated():
-        src_locations = src_ps.http_client().tenant_list_locations()["tenant_shards"]
-        assert len(src_locations) == 0
-        log.info(f"Tenant shard migrated from {src_ps.id}")
-        dst_locations = dst_ps.http_client().tenant_list_locations()["tenant_shards"]
-        assert len(dst_locations) == 1
-        assert dst_locations[0][1]["mode"] == "AttachedSingle"
-        log.info(f"Tenant shard migrated to {dst_ps.id}")
-
-    # After all we expect that tenant shard exists only on dst node.
-    # We wait so long because [`DEFAULT_HEATMAP_PERIOD`] and [`DEFAULT_DOWNLOAD_INTERVAL`]
-    # are set to 60 seconds by default.
-    #
-    # TODO: we should consider making these configurable, so the test can run faster.
-    wait_until(tenant_shard_migrated, timeout=180, interval=5, status_interval=10)
-    log.info("Tenant shard migrated successfully")
-
-
@run_only_on_default_postgres("this is like a 'unit test' against storcon db")
 def test_storage_controller_migrate_with_pageserver_restart(
    neon_env_builder: NeonEnvBuilder, make_httpserver
--- a/test_runner/regress/test_timeline_detach_ancestor.py
+++ b/test_runner/regress/test_timeline_detach_ancestor.py
@@ -1099,9 +1099,7 @@ def test_timeline_detach_ancestor_interrupted_by_deletion(

    for ps in env.pageservers:
        ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)
-        ps.allowed_errors.extend(
-            [".*Timeline.* has been deleted.*", ".*Timeline.*was cancelled and cannot be used"]
-        )
+        ps.allowed_errors.append(".*Timeline.* has been deleted.*")

    pageservers = dict((int(p.id), p) for p in env.pageservers)

@@ -1223,9 +1221,7 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv

    for ps in env.pageservers:
        ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)
-        ps.allowed_errors.extend(
-            [".*Timeline.* has been deleted.*", ".*Timeline.*was cancelled and cannot be used"]
-        )
+        ps.allowed_errors.append(".*Timeline.* has been deleted.*")

    pageservers = dict((int(p.id), p) for p in env.pageservers)

--- a/test_runner/regress/test_timeline_gc_blocking.py
+++ b/test_runner/regress/test_timeline_gc_blocking.py
@@ -25,9 +25,7 @@ def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder, sharded: bool
        initial_tenant_shard_count=2 if sharded else None,
    )
    for ps in env.pageservers:
-        ps.allowed_errors.extend(
-            [".*Timeline.* has been deleted.*", ".*Timeline.*was cancelled and cannot be used"]
-        )
+        ps.allowed_errors.append(".*Timeline.* has been deleted.*")

    if sharded:
        http = env.storage_controller.pageserver_api()
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -13,6 +13,6 @@
  ],
  "v14": [
    "14.18",
-    "9085654ee8022d5cc4ca719380a1dc53e5e3246f"
+    "6770bc251301ef40c66f7ecb731741dc435b5051"
  ]
 }