Introduce flag for deletion API

Fix keep-failing reconciles test & add logs (#12497)
## Problem

Test is flaky due to the following warning in the logs:

```
Keeping extra secondaries: can't determine which of [NodeId(1), NodeId(2)] to remove (some nodes offline?)
```

Some nodes being offline is expected behavior in this test.

## Summary of changes

- Added `Keeping extra secondaries` to the list of allowed errors
- Improved logging for better debugging experience

Co-authored-by: Aleksandr Sarantsev <aleksandr.sarantsev@databricks.com>
2026-07-08 06:30:37 +00:00 · 2025-07-08 17:20:15 +04:00 · 2025-07-08 08:51:50 +00:00 · 2025-07-07 17:46:33 +00:00 · 2025-07-07 15:12:02 +00:00 · 2025-07-07 12:24:06 +00:00
163 changed files with 6559 additions and 2597 deletions
--- a/.config/hakari.toml
+++ b/.config/hakari.toml
@@ -33,6 +33,7 @@ workspace-members = [
    "compute_api",
    "consumption_metrics",
    "desim",
+    "json",
    "metrics",
    "pageserver_api",
    "postgres_backend",
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -7,6 +7,7 @@ self-hosted-runner:
    - small-metal
    - small-arm64
    - unit-perf
+    - unit-perf-aws-arm
    - us-east-2
 config-variables:
  - AWS_ECR_REGION
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -391,6 +391,10 @@ jobs:
          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
          USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }}

+      # Temporarily disable this step until we figure out why it's so flaky
+      # Ref https://github.com/neondatabase/neon/issues/4540
      - name: Merge and upload coverage data
-        if: inputs.build-type == 'debug'
+        if: |
+          false &&
+          inputs.build-type == 'debug' && matrix.pg_version == 'v16'
        uses: ./.github/actions/save-coverage-data
--- a/.github/workflows/build-macos.yml
+++ b/.github/workflows/build-macos.yml
@@ -32,162 +32,14 @@ permissions:
  contents: read

 jobs:
-  build-pgxn:
-    if: |
-      inputs.pg_versions != '[]' || inputs.rebuild_everything ||
-      contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos')  ||
-      contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
-      github.ref_name == 'main'
-    timeout-minutes: 30
-    runs-on: macos-15
-    strategy:
-      matrix:
-        postgres-version: ${{ inputs.rebuild_everything && fromJSON('["v14", "v15", "v16", "v17"]') || fromJSON(inputs.pg_versions) }}
-    env:
-      # Use release build only, to have less debug info around
-      # Hence keeping target/ (and general cache size) smaller
-      BUILD_TYPE: release
-    steps:
-      - name: Harden the runner (Audit all outbound calls)
-        uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
-        with:
-          egress-policy: audit
-
-      - name: Checkout main repo
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-
-      - name: Set pg ${{ matrix.postgres-version }} for caching
-        id: pg_rev
-        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-${{ matrix.postgres-version }}) | tee -a "${GITHUB_OUTPUT}"
-
-      - name: Cache postgres ${{ matrix.postgres-version }} build
-        id: cache_pg
-        uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
-        with:
-          path: pg_install/${{ matrix.postgres-version }}
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ matrix.postgres-version }}-${{ steps.pg_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
-
-      - name: Checkout submodule vendor/postgres-${{ matrix.postgres-version }}
-        if: steps.cache_pg.outputs.cache-hit != 'true'
-        run: |
-          git submodule init vendor/postgres-${{ matrix.postgres-version }}
-          git submodule update --depth 1 --recursive
-
-      - name: Install build dependencies
-        if: steps.cache_pg.outputs.cache-hit != 'true'
-        run: |
-          brew install flex bison openssl protobuf icu4c
-
-      - name: Set extra env for macOS
-        if: steps.cache_pg.outputs.cache-hit != 'true'
-        run: |
-          echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV
-          echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV
-
-      - name: Build Postgres ${{ matrix.postgres-version }}
-        if: steps.cache_pg.outputs.cache-hit != 'true'
-        run: |
-          make postgres-${{ matrix.postgres-version }} -j$(sysctl -n hw.ncpu)
-
-      - name: Build Neon Pg Ext ${{ matrix.postgres-version }}
-        if: steps.cache_pg.outputs.cache-hit != 'true'
-        run: |
-          make "neon-pg-ext-${{ matrix.postgres-version }}" -j$(sysctl -n hw.ncpu)
-
-      - name: Upload "pg_install/${{ matrix.postgres-version }}" artifact
-        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
-        with:
-          name: pg_install--${{ matrix.postgres-version }}
-          path: pg_install/${{ matrix.postgres-version }}
-          # The artifact is supposed to be used by the next job in the same workflow,
-          # so there’s no need to store it for too long.
-          retention-days: 1
-
-  build-walproposer-lib:
-    if: |
-      contains(inputs.pg_versions, 'v17') || inputs.rebuild_everything ||
-      contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos')  ||
-      contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
-      github.ref_name == 'main'
-    timeout-minutes: 30
-    runs-on: macos-15
-    needs: [build-pgxn]
-    env:
-      # Use release build only, to have less debug info around
-      # Hence keeping target/ (and general cache size) smaller
-      BUILD_TYPE: release
-    steps:
-      - name: Harden the runner (Audit all outbound calls)
-        uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
-        with:
-          egress-policy: audit
-
-      - name: Checkout main repo
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-
-      - name: Set pg v17 for caching
-        id: pg_rev
-        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v17) | tee -a "${GITHUB_OUTPUT}"
-
-      - name: Download "pg_install/v17" artifact
-        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
-        with:
-          name: pg_install--v17
-          path: pg_install/v17
-
-      # `actions/download-artifact` doesn't preserve permissions:
-      # https://github.com/actions/download-artifact?tab=readme-ov-file#permission-loss
-      - name: Make pg_install/v*/bin/* executable
-        run: |
-          chmod +x pg_install/v*/bin/*
-
-      - name: Cache walproposer-lib
-        id: cache_walproposer_lib
-        uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
-        with:
-          path: build/walproposer-lib
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-walproposer_lib-v17-${{ steps.pg_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
-
-      - name: Checkout submodule vendor/postgres-v17
-        if: steps.cache_walproposer_lib.outputs.cache-hit != 'true'
-        run: |
-          git submodule init vendor/postgres-v17
-          git submodule update --depth 1 --recursive
-
-      - name: Install build dependencies
-        if: steps.cache_walproposer_lib.outputs.cache-hit != 'true'
-        run: |
-          brew install flex bison openssl protobuf icu4c
-
-      - name: Set extra env for macOS
-        if: steps.cache_walproposer_lib.outputs.cache-hit != 'true'
-        run: |
-          echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV
-          echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV
-
-      - name: Build walproposer-lib (only for v17)
-        if: steps.cache_walproposer_lib.outputs.cache-hit != 'true'
-        run:
-          make walproposer-lib -j$(sysctl -n hw.ncpu) PG_INSTALL_CACHED=1
-
-      - name: Upload "build/walproposer-lib" artifact
-        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
-        with:
-          name: build--walproposer-lib
-          path: build/walproposer-lib
-          # The artifact is supposed to be used by the next job in the same workflow,
-          # so there’s no need to store it for too long.
-          retention-days: 1
-
-  cargo-build:
+  make-all:
    if: |
      inputs.pg_versions != '[]' || inputs.rebuild_rust_code || inputs.rebuild_everything ||
      contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') ||
      contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
      github.ref_name == 'main'
-    timeout-minutes: 30
+    timeout-minutes: 60
    runs-on: macos-15
-    needs: [build-pgxn, build-walproposer-lib]
    env:
      # Use release build only, to have less debug info around
      # Hence keeping target/ (and general cache size) smaller
@@ -203,41 +55,53 @@ jobs:
        with:
          submodules: true

-      - name: Download "pg_install/v14" artifact
-        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
-        with:
-          name: pg_install--v14
-          path: pg_install/v14
-
-      - name: Download "pg_install/v15" artifact
-        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
-        with:
-          name: pg_install--v15
-          path: pg_install/v15
-
-      - name: Download "pg_install/v16" artifact
-        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
-        with:
-          name: pg_install--v16
-          path: pg_install/v16
-
-      - name: Download "pg_install/v17" artifact
-        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
-        with:
-          name: pg_install--v17
-          path: pg_install/v17
-
-      - name: Download "build/walproposer-lib" artifact
-        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
-        with:
-          name: build--walproposer-lib
-          path: build/walproposer-lib
-
-      # `actions/download-artifact` doesn't preserve permissions:
-      # https://github.com/actions/download-artifact?tab=readme-ov-file#permission-loss
-      - name: Make pg_install/v*/bin/* executable
+      - name: Install build dependencies
        run: |
-          chmod +x pg_install/v*/bin/*
+          brew install flex bison openssl protobuf icu4c
+
+      - name: Set extra env for macOS
+        run: |
+          echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV
+          echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV
+
+      - name: Restore "pg_install/" cache
+        id: cache_pg
+        uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
+        with:
+          path: pg_install
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-install-v14-${{ hashFiles('Makefile', 'postgres.mk', 'vendor/revisions.json') }}
+
+      - name: Checkout vendor/postgres submodules
+        if: steps.cache_pg.outputs.cache-hit != 'true'
+        run: |
+          git submodule init
+          git submodule update --depth 1 --recursive
+
+      - name: Build Postgres
+        if: steps.cache_pg.outputs.cache-hit != 'true'
+        run: |
+          make postgres -j$(sysctl -n hw.ncpu)
+
+      # This isn't strictly necessary, but it makes the cached and non-cached builds more similar.
+      # When pg_install is restored from cache, there is no 'build/' directory. By removing it
+      # in a non-cached build too, we enforce that the rest of the steps don't depend on it,
+      # so that we notice any build caching bugs earlier.
+      - name: Remove build artifacts
+        if: steps.cache_pg.outputs.cache-hit != 'true'
+        run: |
+          rm -rf build
+
+      # Explicitly update the rust toolchain before running 'make'. The parallel make build can
+      # invoke 'cargo build' more than once in parallel, for different crates.  That's OK, 'cargo'
+      # does its own locking to prevent concurrent builds from stepping on each other's
+      # toes. However, it will first try to update the toolchain, and that step is not locked the
+      # same way. To avoid two toolchain updates running in parallel and stepping on each other's
+      # toes, ensure that the toolchain is up-to-date beforehand.
+      - name: Update rust toolchain
+        run: |
+          rustup --version &&
+          rustup update &&
+          rustup show

      - name: Cache cargo deps
        uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
@@ -249,17 +113,12 @@ jobs:
            target
          key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust

-      - name: Install build dependencies
-        run: |
-          brew install flex bison openssl protobuf icu4c
-
-      - name: Set extra env for macOS
-        run: |
-          echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV
-          echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV
-
-      - name: Run cargo build
-        run: cargo build --all --release -j$(sysctl -n hw.ncpu)
+      # Build the neon-specific postgres extensions, and all the Rust bits.
+      #
+      # Pass PG_INSTALL_CACHED=1 because PostgreSQL was already built and cached
+      # separately.
+      - name: Build all
+        run: PG_INSTALL_CACHED=1 BUILD_TYPE=release make -j$(sysctl -n hw.ncpu) all

      - name: Check that no warnings are produced
        run: ./run_clippy.sh
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -306,14 +306,14 @@ jobs:
      statuses: write
      contents: write
      pull-requests: write
-    runs-on: [ self-hosted, unit-perf ]
+    runs-on: [ self-hosted, unit-perf-aws-arm ]
    container:
      image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
      credentials:
        username: ${{ github.actor }}
        password: ${{ secrets.GITHUB_TOKEN }}
      # for changed limits, see comments on `options:` earlier in this file
-      options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
+      options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 --ulimit nofile=65536:65536 --security-opt seccomp=unconfined
    strategy:
      fail-fast: false
      matrix:
@@ -484,6 +484,21 @@ jobs:
      - name: Merge coverage data
        run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge

+      - name: Build coverage report
+        env:
+          COMMIT_URL: ${{ github.server_url }}/${{ github.repository }}/commit/${{ github.event.pull_request.head.sha || github.sha }}
+        run: |
+          scripts/coverage --dir=/tmp/coverage \
+            report \
+            --input-objects=/tmp/coverage/binaries.list \
+            --commit-url=${COMMIT_URL} \
+            --format=github
+
+          scripts/coverage --dir=/tmp/coverage \
+            report \
+            --input-objects=/tmp/coverage/binaries.list \
+            --format=lcov
+
      - name: Build coverage report NEW
        id: upload-coverage-report-new
        env:
@@ -496,13 +511,6 @@ jobs:
          CURRENT="${COMMIT_SHA}"
          BASELINE="$(git merge-base $BASE_SHA $CURRENT)"

-          mkdir /tmp/coverage/report
-
-          scripts/coverage --dir=/tmp/coverage \
-            report \
-            --input-objects=/tmp/coverage/binaries.list \
-            --format=lcov
-
          cp /tmp/coverage/report/lcov.info ./${CURRENT}.info

          GENHTML_ARGS="--ignore-errors path,unmapped,empty --synthesize-missing --demangle-cpp rustfilt --output-directory lcov-html ${CURRENT}.info"
--- a/.github/workflows/periodic_pagebench.yml
+++ b/.github/workflows/periodic_pagebench.yml
@@ -1,4 +1,4 @@
-name: Periodic pagebench performance test on unit-perf hetzner runner
+name: Periodic pagebench performance test on unit-perf-aws-arm runners

 on:
  schedule:
@@ -40,7 +40,7 @@ jobs:
      statuses: write
      contents: write
      pull-requests: write
-    runs-on: [ self-hosted, unit-perf ]
+    runs-on: [ self-hosted, unit-perf-aws-arm ]
    container:
      image: ghcr.io/neondatabase/build-tools:pinned-bookworm
      credentials:
--- a/.github/workflows/proxy-benchmark.yml
+++ b/.github/workflows/proxy-benchmark.yml
@@ -1,4 +1,4 @@
-name: Periodic proxy performance test on unit-perf hetzner runner
+name: Periodic proxy performance test on unit-perf-aws-arm runners

 on:
  push: # TODO: remove after testing
@@ -32,7 +32,7 @@ jobs:
      statuses: write
      contents: write
      pull-requests: write
-    runs-on: [self-hosted, unit-perf]
+    runs-on: [self-hosted, unit-perf-aws-arm]
    timeout-minutes: 60  # 1h timeout
    container:
      image: ghcr.io/neondatabase/build-tools:pinned-bookworm
--- a/.gitignore
+++ b/.gitignore
@@ -6,6 +6,7 @@
 /tmp_check_cli
 __pycache__/
 test_output/
+neon_previous/
 .vscode
 .idea
 *.swp
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1083,6 +1083,25 @@ version = "0.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"

+[[package]]
+name = "cbindgen"
+version = "0.29.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "975982cdb7ad6a142be15bdf84aea7ec6a9e5d4d797c004d43185b24cfe4e684"
+dependencies = [
+ "clap",
+ "heck",
+ "indexmap 2.9.0",
+ "log",
+ "proc-macro2",
+ "quote",
+ "serde",
+ "serde_json",
+ "syn 2.0.100",
+ "tempfile",
+ "toml",
+]
+
 [[package]]
 name = "cc"
 version = "1.2.16"
@@ -1267,6 +1286,15 @@ dependencies = [
 "unicode-width",
 ]

+[[package]]
+name = "communicator"
+version = "0.1.0"
+dependencies = [
+ "cbindgen",
+ "neon-shmem",
+ "workspace_hack",
+]
+
 [[package]]
 name = "compute_api"
 version = "0.1.0"
@@ -1305,6 +1333,7 @@ dependencies = [
 "fail",
 "flate2",
 "futures",
+ "hostname-validator",
 "http 1.1.0",
 "indexmap 2.9.0",
 "itertools 0.10.5",
@@ -2771,6 +2800,12 @@ dependencies = [
 "windows",
 ]

+[[package]]
+name = "hostname-validator"
+version = "1.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f558a64ac9af88b5ba400d99b579451af0d39c6d360980045b91aac966d705e2"
+
 [[package]]
 name = "http"
 version = "0.2.9"
@@ -3454,6 +3489,15 @@ dependencies = [
 "wasm-bindgen",
 ]

+[[package]]
+name = "json"
+version = "0.1.0"
+dependencies = [
+ "futures",
+ "itoa",
+ "ryu",
+]
+
 [[package]]
 name = "json-structural-diff"
 version = "0.2.0"
@@ -8658,8 +8702,10 @@ dependencies = [
 "fail",
 "form_urlencoded",
 "futures-channel",
+ "futures-core",
 "futures-executor",
 "futures-io",
+ "futures-sink",
 "futures-util",
 "generic-array",
 "getrandom 0.2.11",
@@ -8686,6 +8732,7 @@ dependencies = [
 "num-iter",
 "num-rational",
 "num-traits",
+ "once_cell",
 "p256 0.13.2",
 "parquet",
 "prettyplease",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -42,10 +42,12 @@ members = [
    "libs/walproposer",
    "libs/wal_decoder",
    "libs/postgres_initdb",
+    "libs/proxy/json",
    "libs/proxy/postgres-protocol2",
    "libs/proxy/postgres-types2",
    "libs/proxy/tokio-postgres2",
    "endpoint_storage",
+    "pgxn/neon/communicator",
 ]

 [workspace.package]
@@ -255,6 +257,7 @@ desim = { version = "0.1", path = "./libs/desim" }
 endpoint_storage = { version = "0.0.1", path = "./endpoint_storage/" }
 http-utils = { version = "0.1", path = "./libs/http-utils/" }
 metrics = { version = "0.1", path = "./libs/metrics/" }
+neon-shmem = { version = "0.1", path = "./libs/neon-shmem/" }
 pageserver = { path = "./pageserver" }
 pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
 pageserver_client = { path = "./pageserver/client" }
@@ -284,6 +287,7 @@ walproposer = { version = "0.1", path = "./libs/walproposer/" }
 workspace_hack = { version = "0.1", path = "./workspace_hack/" }

 ## Build dependencies
+cbindgen = "0.29.0"
 criterion = "0.5.1"
 rcgen = "0.13"
 rstest = "0.18"
--- a/53
+++ b/53
@@ -30,7 +30,18 @@ ARG BASE_IMAGE_SHA=debian:${DEBIAN_FLAVOR}
 ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bookworm-slim/debian@$BOOKWORM_SLIM_SHA}
 ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bullseye-slim/debian@$BULLSEYE_SLIM_SHA}

-# Build Postgres
+# Naive way:
+#
+# 1. COPY . .
+# 2. make neon-pg-ext
+# 3. cargo build <storage binaries>
+#
+# But to enable docker to cache intermediate layers, we perform a few preparatory steps:
+#
+# - Build all postgres versions, depending on just the contents of vendor/
+# - Use cargo chef to build all rust dependencies
+
+# 1. Build all postgres versions
 FROM $REPOSITORY/$IMAGE:$TAG AS pg-build
 WORKDIR /home/nonroot

@@ -38,17 +49,15 @@ COPY --chown=nonroot vendor/postgres-v14 vendor/postgres-v14
 COPY --chown=nonroot vendor/postgres-v15 vendor/postgres-v15
 COPY --chown=nonroot vendor/postgres-v16 vendor/postgres-v16
 COPY --chown=nonroot vendor/postgres-v17 vendor/postgres-v17
-COPY --chown=nonroot pgxn pgxn
 COPY --chown=nonroot Makefile Makefile
 COPY --chown=nonroot postgres.mk postgres.mk
 COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh

 ENV BUILD_TYPE=release
 RUN set -e \
-    && mold -run make -j $(nproc) -s neon-pg-ext \
-    && tar -C pg_install -czf /home/nonroot/postgres_install.tar.gz .
+    && mold -run make -j $(nproc) -s postgres

-# Prepare cargo-chef recipe
+# 2. Prepare cargo-chef recipe
 FROM $REPOSITORY/$IMAGE:$TAG AS plan
 WORKDIR /home/nonroot

@@ -56,23 +65,22 @@ COPY --chown=nonroot . .

 RUN cargo chef prepare --recipe-path recipe.json

-# Build neon binaries
+# Main build image
 FROM $REPOSITORY/$IMAGE:$TAG AS build
 WORKDIR /home/nonroot
 ARG GIT_VERSION=local
 ARG BUILD_TAG
-
-COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server
-COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server
-COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server
-COPY --from=pg-build /home/nonroot/pg_install/v17/include/postgresql/server pg_install/v17/include/postgresql/server
-COPY --from=plan     /home/nonroot/recipe.json                              recipe.json
-
 ARG ADDITIONAL_RUSTFLAGS=""

+# 3. Build cargo dependencies. Note that this step doesn't depend on anything other than
+# `recipe.json`, so the layer can be reused as long as none of the dependencies change.
+COPY --from=plan     /home/nonroot/recipe.json                              recipe.json
 RUN set -e \
    && RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo chef cook --locked --release --recipe-path recipe.json

+# Perform the main build. We reuse the Postgres build artifacts from the intermediate 'pg-build'
+# layer, and the cargo dependencies built in the previous step.
+COPY --chown=nonroot --from=pg-build /home/nonroot/pg_install/ pg_install
 COPY --chown=nonroot . .

 RUN set -e \
@@ -87,10 +95,10 @@ RUN set -e \
      --bin endpoint_storage \
      --bin neon_local \
      --bin storage_scrubber \
-      --locked --release
+      --locked --release \
+    && mold -run make -j $(nproc) -s neon-pg-ext

-# Build final image
-#
+# Assemble the final image
 FROM $BASE_IMAGE_SHA
 WORKDIR /data

@@ -130,12 +138,15 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/endpoint_storage    /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local          /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_scrubber    /usr/local/bin
+COPY --from=build /home/nonroot/pg_install/v14 /usr/local/v14/
+COPY --from=build /home/nonroot/pg_install/v15 /usr/local/v15/
+COPY --from=build /home/nonroot/pg_install/v16 /usr/local/v16/
+COPY --from=build /home/nonroot/pg_install/v17 /usr/local/v17/

-COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
-COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
-COPY --from=pg-build /home/nonroot/pg_install/v16 /usr/local/v16/
-COPY --from=pg-build /home/nonroot/pg_install/v17 /usr/local/v17/
-COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/
+# Deprecated: Old deployment scripts use this tarball which contains all the Postgres binaries.
+# That's obsolete, since all the same files are also present under /usr/local/v*. But to keep the
+# old scripts working for now, create the tarball.
+RUN tar -C /usr/local -cvzf /data/postgres_install.tar.gz v14 v15 v16 v17

 # By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config.
 # Now, when `docker run ... pageserver` is run, it can start without errors, yet will have some default dummy values.
--- a/16
+++ b/16
@@ -30,11 +30,18 @@ ifeq ($(BUILD_TYPE),release)
 	PG_CFLAGS += -O2 -g3 $(CFLAGS)
 	PG_LDFLAGS = $(LDFLAGS)
 	CARGO_PROFILE ?= --profile=release
+	# NEON_CARGO_ARTIFACT_TARGET_DIR is the directory where `cargo build` places
+	# the final build artifacts. There is unfortunately no easy way of changing
+	# it to a fully predictable path, nor to extract the path with a simple
+	# command. See https://github.com/rust-lang/cargo/issues/9661 and
+	# https://github.com/rust-lang/cargo/issues/6790.
+	NEON_CARGO_ARTIFACT_TARGET_DIR = $(ROOT_PROJECT_DIR)/target/release
 else ifeq ($(BUILD_TYPE),debug)
 	PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend
 	PG_CFLAGS += -O0 -g3 $(CFLAGS)
 	PG_LDFLAGS = $(LDFLAGS)
 	CARGO_PROFILE ?= --profile=dev
+	NEON_CARGO_ARTIFACT_TARGET_DIR = $(ROOT_PROJECT_DIR)/target/debug
 else
 	$(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
 endif
@@ -102,7 +109,7 @@ all: neon postgres-install neon-pg-ext

 ### Neon Rust bits
 #
-# The 'postgres_ffi' depends on the Postgres headers.
+# The 'postgres_ffi' crate depends on the Postgres headers.
 .PHONY: neon
 neon: postgres-headers-install walproposer-lib cargo-target-dir
 	+@echo "Compiling Neon"
@@ -115,10 +122,13 @@ cargo-target-dir:
 	test -e target/CACHEDIR.TAG || echo "$(CACHEDIR_TAG_CONTENTS)" > target/CACHEDIR.TAG

 .PHONY: neon-pg-ext-%
-neon-pg-ext-%: postgres-install-%
+neon-pg-ext-%: postgres-install-% cargo-target-dir
 	+@echo "Compiling neon-specific Postgres extensions for $*"
 	mkdir -p $(BUILD_DIR)/pgxn-$*
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \
+	$(MAKE) PG_CONFIG="$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config" COPT='$(COPT)' \
+		NEON_CARGO_ARTIFACT_TARGET_DIR="$(NEON_CARGO_ARTIFACT_TARGET_DIR)" \
+		CARGO_BUILD_FLAGS="$(CARGO_BUILD_FLAGS)" \
+		CARGO_PROFILE="$(CARGO_PROFILE)" \
 		-C $(BUILD_DIR)/pgxn-$*\
 		-f $(ROOT_PROJECT_DIR)/pgxn/Makefile  install

--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -1572,6 +1572,7 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) && \
 FROM build-deps AS pgaudit-src
 ARG PG_VERSION
 WORKDIR /ext-src
+COPY "compute/patches/pgaudit-parallel_workers-${PG_VERSION}.patch" .
 RUN case "${PG_VERSION}" in \
    "v14") \
    export PGAUDIT_VERSION=1.6.3 \
@@ -1594,7 +1595,8 @@ RUN case "${PG_VERSION}" in \
    esac && \
    wget https://github.com/pgaudit/pgaudit/archive/refs/tags/${PGAUDIT_VERSION}.tar.gz -O pgaudit.tar.gz && \
    echo "${PGAUDIT_CHECKSUM} pgaudit.tar.gz" | sha256sum --check && \
-    mkdir pgaudit-src && cd pgaudit-src && tar xzf ../pgaudit.tar.gz --strip-components=1 -C .
+    mkdir pgaudit-src && cd pgaudit-src && tar xzf ../pgaudit.tar.gz --strip-components=1 -C . && \
+    patch -p1 < "/ext-src/pgaudit-parallel_workers-${PG_VERSION}.patch"

 FROM pg-build AS pgaudit-build
 COPY --from=pgaudit-src /ext-src/ /ext-src/
@@ -1634,11 +1636,14 @@ RUN make install USE_PGXS=1 -j $(getconf _NPROCESSORS_ONLN)
 # compile neon extensions
 #
 #########################################################################################
-FROM pg-build AS neon-ext-build
+FROM pg-build-with-cargo AS neon-ext-build
 ARG PG_VERSION

-COPY pgxn/ pgxn/
-RUN make -j $(getconf _NPROCESSORS_ONLN) -C pgxn -s install-compute
+USER root
+COPY . .
+
+RUN make -j $(getconf _NPROCESSORS_ONLN) -C pgxn -s install-compute \
+      BUILD_TYPE=release CARGO_BUILD_FLAGS="--locked --release" NEON_CARGO_ARTIFACT_TARGET_DIR="$(pwd)/target/release"

 #########################################################################################
 #
@@ -1983,7 +1988,7 @@ RUN apt update && \
        locales \
        lsof \
        procps \
-        rsyslog \
+        rsyslog-gnutls \
        screen \
        tcpdump \
        $VERSION_INSTALLS && \
--- a/compute/etc/neon_collector.jsonnet
+++ b/compute/etc/neon_collector.jsonnet
@@ -8,6 +8,8 @@
    import 'sql_exporter/compute_logical_snapshot_files.libsonnet',
    import 'sql_exporter/compute_logical_snapshots_bytes.libsonnet',
    import 'sql_exporter/compute_max_connections.libsonnet',
+    import 'sql_exporter/compute_pg_oldest_frozen_xid_age.libsonnet',
+    import 'sql_exporter/compute_pg_oldest_mxid_age.libsonnet',
    import 'sql_exporter/compute_receive_lsn.libsonnet',
    import 'sql_exporter/compute_subscriptions_count.libsonnet',
    import 'sql_exporter/connection_counts.libsonnet',
--- a/compute/etc/sql_exporter/compute_pg_oldest_frozen_xid_age.libsonnet
+++ b/compute/etc/sql_exporter/compute_pg_oldest_frozen_xid_age.libsonnet
@@ -0,0 +1,13 @@
+{
+  metric_name: 'compute_pg_oldest_frozen_xid_age',
+  type: 'gauge',
+  help: 'Age of oldest XIDs that have not been frozen by VACUUM. An indicator of how long it has been since VACUUM last ran.',
+  key_labels: [
+    'database_name',
+  ],
+  value_label: 'metric',
+  values: [
+    'frozen_xid_age',
+  ],
+  query: importstr 'sql_exporter/compute_pg_oldest_frozen_xid_age.sql',
+}
--- a/compute/etc/sql_exporter/compute_pg_oldest_frozen_xid_age.sql
+++ b/compute/etc/sql_exporter/compute_pg_oldest_frozen_xid_age.sql
@@ -0,0 +1,4 @@
+SELECT datname database_name,
+  age(datfrozenxid) frozen_xid_age
+FROM pg_database
+ORDER BY frozen_xid_age DESC LIMIT 10;
--- a/compute/etc/sql_exporter/compute_pg_oldest_mxid_age.libsonnet
+++ b/compute/etc/sql_exporter/compute_pg_oldest_mxid_age.libsonnet
@@ -0,0 +1,13 @@
+{
+  metric_name: 'compute_pg_oldest_mxid_age',
+  type: 'gauge',
+  help: 'Age of oldest MXIDs that have not been replaced by VACUUM. An indicator of how long it has been since VACUUM last ran.',
+  key_labels: [
+    'database_name',
+  ],
+  value_label: 'metric',
+  values: [
+    'min_mxid_age',
+  ],
+  query: importstr 'sql_exporter/compute_pg_oldest_mxid_age.sql',
+}
--- a/compute/etc/sql_exporter/compute_pg_oldest_mxid_age.sql
+++ b/compute/etc/sql_exporter/compute_pg_oldest_mxid_age.sql
@@ -0,0 +1,4 @@
+SELECT datname database_name,
+  mxid_age(datminmxid) min_mxid_age
+FROM pg_database
+ORDER BY min_mxid_age DESC LIMIT 10;
--- a/compute/patches/anon_v2.patch
+++ b/compute/patches/anon_v2.patch
@@ -1,8 +1,8 @@
 diff --git a/sql/anon.sql b/sql/anon.sql
-index 0cdc769..f6cc950 100644
+index 0cdc769..b450327 100644
 --- a/sql/anon.sql
 +++ b/sql/anon.sql
-@@ -1141,3 +1141,8 @@ $$
+@@ -1141,3 +1141,15 @@ $$
 -- TODO : https://en.wikipedia.org/wiki/L-diversity
 
 -- TODO : https://en.wikipedia.org/wiki/T-closeness
@@ -11,6 +11,13 @@ index 0cdc769..f6cc950 100644
 +
 +GRANT ALL ON SCHEMA anon to neon_superuser;
 +GRANT ALL ON ALL TABLES IN SCHEMA anon TO neon_superuser;
+
+DO $$
+BEGIN
+    IF current_setting('server_version_num')::int >= 150000 THEN
+        GRANT SET ON PARAMETER anon.transparent_dynamic_masking TO neon_superuser;
+    END IF;
+END $$;
 diff --git a/sql/init.sql b/sql/init.sql
 index 7da6553..9b6164b 100644
 --- a/sql/init.sql
--- a/compute/patches/pgaudit-parallel_workers-v14.patch
+++ b/compute/patches/pgaudit-parallel_workers-v14.patch
@@ -0,0 +1,143 @@
+commit 7220bb3a3f23fa27207d77562dcc286f9a123313
+Author: Tristan Partin <tristan.partin@databricks.com>
+Date:   2025-06-23 02:09:31 +0000
+
+    Disable logging in parallel workers
+    
+    When a query uses parallel workers, pgaudit will log the same query for
+    every parallel worker. This is undesirable since it can result in log
+    amplification for queries that use parallel workers.
+    
+    Signed-off-by: Tristan Partin <tristan.partin@databricks.com>
+
+diff --git a/expected/pgaudit.out b/expected/pgaudit.out
+index baa8011..a601375 100644
+--- a/expected/pgaudit.out
+++ b/expected/pgaudit.out
+@@ -2563,6 +2563,37 @@ COMMIT;
+ NOTICE:  AUDIT: SESSION,12,4,MISC,COMMIT,,,COMMIT;,<not logged>
+ DROP TABLE part_test;
+ NOTICE:  AUDIT: SESSION,13,1,DDL,DROP TABLE,,,DROP TABLE part_test;,<not logged>
+--
+-- Test logging in parallel workers
+SET pgaudit.log = 'read';
+SET pgaudit.log_client = on;
+SET pgaudit.log_level = 'notice';
+-- Force parallel execution for testing
+SET max_parallel_workers_per_gather = 2;
+SET parallel_tuple_cost = 0;
+SET parallel_setup_cost = 0;
+SET min_parallel_table_scan_size = 0;
+SET min_parallel_index_scan_size = 0;
+-- Create table with enough data to trigger parallel execution
+CREATE TABLE parallel_test (id int, data text);
+INSERT INTO parallel_test SELECT generate_series(1, 1000), 'test data';
+SELECT count(*) FROM parallel_test;
+NOTICE:  AUDIT: SESSION,14,1,READ,SELECT,,,SELECT count(*) FROM parallel_test;,<not logged>
+ count 
+-------
+  1000
+(1 row)
+
+-- Cleanup parallel test
+DROP TABLE parallel_test;
+RESET max_parallel_workers_per_gather;
+RESET parallel_tuple_cost;
+RESET parallel_setup_cost;
+RESET min_parallel_table_scan_size;
+RESET min_parallel_index_scan_size;
+RESET pgaudit.log;
+RESET pgaudit.log_client;
+RESET pgaudit.log_level;
+ -- Cleanup
+ -- Set client_min_messages up to warning to avoid noise
+ SET client_min_messages = 'warning';
+diff --git a/pgaudit.c b/pgaudit.c
+index 5e6fd38..ac9ded2 100644
+--- a/pgaudit.c
+++ b/pgaudit.c
+@@ -11,6 +11,7 @@
+ #include "postgres.h"
+ 
+ #include "access/htup_details.h"
+#include "access/parallel.h"
+ #include "access/sysattr.h"
+ #include "access/xact.h"
+ #include "access/relation.h"
+@@ -1303,7 +1304,7 @@ pgaudit_ExecutorStart_hook(QueryDesc *queryDesc, int eflags)
+ {
+     AuditEventStackItem *stackItem = NULL;
+ 
+-    if (!internalStatement)
+    if (!internalStatement && !IsParallelWorker())
+     {
+         /* Push the audit even onto the stack */
+         stackItem = stack_push();
+@@ -1384,7 +1385,7 @@ pgaudit_ExecutorCheckPerms_hook(List *rangeTabls, bool abort)
+ 
+     /* Log DML if the audit role is valid or session logging is enabled */
+     if ((auditOid != InvalidOid || auditLogBitmap != 0) &&
+-        !IsAbortedTransactionBlockState())
+        !IsAbortedTransactionBlockState() && !IsParallelWorker())
+     {
+         /* If auditLogRows is on, wait for rows processed to be set */
+         if (auditLogRows && auditEventStack != NULL)
+@@ -1438,7 +1439,7 @@ pgaudit_ExecutorRun_hook(QueryDesc *queryDesc, ScanDirection direction, uint64 c
+     else
+         standard_ExecutorRun(queryDesc, direction, count, execute_once);
+ 
+-    if (auditLogRows && !internalStatement)
+    if (auditLogRows && !internalStatement && !IsParallelWorker())
+     {
+         /* Find an item from the stack by the query memory context */
+         stackItem = stack_find_context(queryDesc->estate->es_query_cxt);
+@@ -1458,7 +1459,7 @@ pgaudit_ExecutorEnd_hook(QueryDesc *queryDesc)
+     AuditEventStackItem *stackItem = NULL;
+     AuditEventStackItem *auditEventStackFull = NULL;
+ 
+-    if (auditLogRows && !internalStatement)
+    if (auditLogRows && !internalStatement && !IsParallelWorker())
+     {
+         /* Find an item from the stack by the query memory context */
+         stackItem = stack_find_context(queryDesc->estate->es_query_cxt);
+diff --git a/sql/pgaudit.sql b/sql/pgaudit.sql
+index cc1374a..1870a60 100644
+--- a/sql/pgaudit.sql
+++ b/sql/pgaudit.sql
+@@ -1612,6 +1612,36 @@ COMMIT;
+ 
+ DROP TABLE part_test;
+ 
+--
+-- Test logging in parallel workers
+SET pgaudit.log = 'read';
+SET pgaudit.log_client = on;
+SET pgaudit.log_level = 'notice';
+
+-- Force parallel execution for testing
+SET max_parallel_workers_per_gather = 2;
+SET parallel_tuple_cost = 0;
+SET parallel_setup_cost = 0;
+SET min_parallel_table_scan_size = 0;
+SET min_parallel_index_scan_size = 0;
+
+-- Create table with enough data to trigger parallel execution
+CREATE TABLE parallel_test (id int, data text);
+INSERT INTO parallel_test SELECT generate_series(1, 1000), 'test data';
+
+SELECT count(*) FROM parallel_test;
+
+-- Cleanup parallel test
+DROP TABLE parallel_test;
+RESET max_parallel_workers_per_gather;
+RESET parallel_tuple_cost;
+RESET parallel_setup_cost;
+RESET min_parallel_table_scan_size;
+RESET min_parallel_index_scan_size;
+RESET pgaudit.log;
+RESET pgaudit.log_client;
+RESET pgaudit.log_level;
+
+ -- Cleanup
+ -- Set client_min_messages up to warning to avoid noise
+ SET client_min_messages = 'warning';
--- a/compute/patches/pgaudit-parallel_workers-v15.patch
+++ b/compute/patches/pgaudit-parallel_workers-v15.patch
@@ -0,0 +1,143 @@
+commit 29dc2847f6255541992f18faf8a815dfab79631a
+Author: Tristan Partin <tristan.partin@databricks.com>
+Date:   2025-06-23 02:09:31 +0000
+
+    Disable logging in parallel workers
+    
+    When a query uses parallel workers, pgaudit will log the same query for
+    every parallel worker. This is undesirable since it can result in log
+    amplification for queries that use parallel workers.
+    
+    Signed-off-by: Tristan Partin <tristan.partin@databricks.com>
+
+diff --git a/expected/pgaudit.out b/expected/pgaudit.out
+index b22560b..73f0327 100644
+--- a/expected/pgaudit.out
+++ b/expected/pgaudit.out
+@@ -2563,6 +2563,37 @@ COMMIT;
+ NOTICE:  AUDIT: SESSION,12,4,MISC,COMMIT,,,COMMIT;,<not logged>
+ DROP TABLE part_test;
+ NOTICE:  AUDIT: SESSION,13,1,DDL,DROP TABLE,,,DROP TABLE part_test;,<not logged>
+--
+-- Test logging in parallel workers
+SET pgaudit.log = 'read';
+SET pgaudit.log_client = on;
+SET pgaudit.log_level = 'notice';
+-- Force parallel execution for testing
+SET max_parallel_workers_per_gather = 2;
+SET parallel_tuple_cost = 0;
+SET parallel_setup_cost = 0;
+SET min_parallel_table_scan_size = 0;
+SET min_parallel_index_scan_size = 0;
+-- Create table with enough data to trigger parallel execution
+CREATE TABLE parallel_test (id int, data text);
+INSERT INTO parallel_test SELECT generate_series(1, 1000), 'test data';
+SELECT count(*) FROM parallel_test;
+NOTICE:  AUDIT: SESSION,14,1,READ,SELECT,,,SELECT count(*) FROM parallel_test;,<not logged>
+ count 
+-------
+  1000
+(1 row)
+
+-- Cleanup parallel test
+DROP TABLE parallel_test;
+RESET max_parallel_workers_per_gather;
+RESET parallel_tuple_cost;
+RESET parallel_setup_cost;
+RESET min_parallel_table_scan_size;
+RESET min_parallel_index_scan_size;
+RESET pgaudit.log;
+RESET pgaudit.log_client;
+RESET pgaudit.log_level;
+ -- Cleanup
+ -- Set client_min_messages up to warning to avoid noise
+ SET client_min_messages = 'warning';
+diff --git a/pgaudit.c b/pgaudit.c
+index 5e6fd38..ac9ded2 100644
+--- a/pgaudit.c
+++ b/pgaudit.c
+@@ -11,6 +11,7 @@
+ #include "postgres.h"
+ 
+ #include "access/htup_details.h"
+#include "access/parallel.h"
+ #include "access/sysattr.h"
+ #include "access/xact.h"
+ #include "access/relation.h"
+@@ -1303,7 +1304,7 @@ pgaudit_ExecutorStart_hook(QueryDesc *queryDesc, int eflags)
+ {
+     AuditEventStackItem *stackItem = NULL;
+ 
+-    if (!internalStatement)
+    if (!internalStatement && !IsParallelWorker())
+     {
+         /* Push the audit even onto the stack */
+         stackItem = stack_push();
+@@ -1384,7 +1385,7 @@ pgaudit_ExecutorCheckPerms_hook(List *rangeTabls, bool abort)
+ 
+     /* Log DML if the audit role is valid or session logging is enabled */
+     if ((auditOid != InvalidOid || auditLogBitmap != 0) &&
+-        !IsAbortedTransactionBlockState())
+        !IsAbortedTransactionBlockState() && !IsParallelWorker())
+     {
+         /* If auditLogRows is on, wait for rows processed to be set */
+         if (auditLogRows && auditEventStack != NULL)
+@@ -1438,7 +1439,7 @@ pgaudit_ExecutorRun_hook(QueryDesc *queryDesc, ScanDirection direction, uint64 c
+     else
+         standard_ExecutorRun(queryDesc, direction, count, execute_once);
+ 
+-    if (auditLogRows && !internalStatement)
+    if (auditLogRows && !internalStatement && !IsParallelWorker())
+     {
+         /* Find an item from the stack by the query memory context */
+         stackItem = stack_find_context(queryDesc->estate->es_query_cxt);
+@@ -1458,7 +1459,7 @@ pgaudit_ExecutorEnd_hook(QueryDesc *queryDesc)
+     AuditEventStackItem *stackItem = NULL;
+     AuditEventStackItem *auditEventStackFull = NULL;
+ 
+-    if (auditLogRows && !internalStatement)
+    if (auditLogRows && !internalStatement && !IsParallelWorker())
+     {
+         /* Find an item from the stack by the query memory context */
+         stackItem = stack_find_context(queryDesc->estate->es_query_cxt);
+diff --git a/sql/pgaudit.sql b/sql/pgaudit.sql
+index 8052426..7f0667b 100644
+--- a/sql/pgaudit.sql
+++ b/sql/pgaudit.sql
+@@ -1612,6 +1612,36 @@ COMMIT;
+ 
+ DROP TABLE part_test;
+ 
+--
+-- Test logging in parallel workers
+SET pgaudit.log = 'read';
+SET pgaudit.log_client = on;
+SET pgaudit.log_level = 'notice';
+
+-- Force parallel execution for testing
+SET max_parallel_workers_per_gather = 2;
+SET parallel_tuple_cost = 0;
+SET parallel_setup_cost = 0;
+SET min_parallel_table_scan_size = 0;
+SET min_parallel_index_scan_size = 0;
+
+-- Create table with enough data to trigger parallel execution
+CREATE TABLE parallel_test (id int, data text);
+INSERT INTO parallel_test SELECT generate_series(1, 1000), 'test data';
+
+SELECT count(*) FROM parallel_test;
+
+-- Cleanup parallel test
+DROP TABLE parallel_test;
+RESET max_parallel_workers_per_gather;
+RESET parallel_tuple_cost;
+RESET parallel_setup_cost;
+RESET min_parallel_table_scan_size;
+RESET min_parallel_index_scan_size;
+RESET pgaudit.log;
+RESET pgaudit.log_client;
+RESET pgaudit.log_level;
+
+ -- Cleanup
+ -- Set client_min_messages up to warning to avoid noise
+ SET client_min_messages = 'warning';
--- a/compute/patches/pgaudit-parallel_workers-v16.patch
+++ b/compute/patches/pgaudit-parallel_workers-v16.patch
@@ -0,0 +1,143 @@
+commit cc708dde7ef2af2a8120d757102d2e34c0463a0f
+Author: Tristan Partin <tristan.partin@databricks.com>
+Date:   2025-06-23 02:09:31 +0000
+
+    Disable logging in parallel workers
+    
+    When a query uses parallel workers, pgaudit will log the same query for
+    every parallel worker. This is undesirable since it can result in log
+    amplification for queries that use parallel workers.
+    
+    Signed-off-by: Tristan Partin <tristan.partin@databricks.com>
+
+diff --git a/expected/pgaudit.out b/expected/pgaudit.out
+index 8772054..9b66ac6 100644
+--- a/expected/pgaudit.out
+++ b/expected/pgaudit.out
+@@ -2556,6 +2556,37 @@ DROP SERVER fdw_server;
+ NOTICE:  AUDIT: SESSION,11,1,DDL,DROP SERVER,,,DROP SERVER fdw_server;,<not logged>
+ DROP EXTENSION postgres_fdw;
+ NOTICE:  AUDIT: SESSION,12,1,DDL,DROP EXTENSION,,,DROP EXTENSION postgres_fdw;,<not logged>
+--
+-- Test logging in parallel workers
+SET pgaudit.log = 'read';
+SET pgaudit.log_client = on;
+SET pgaudit.log_level = 'notice';
+-- Force parallel execution for testing
+SET max_parallel_workers_per_gather = 2;
+SET parallel_tuple_cost = 0;
+SET parallel_setup_cost = 0;
+SET min_parallel_table_scan_size = 0;
+SET min_parallel_index_scan_size = 0;
+-- Create table with enough data to trigger parallel execution
+CREATE TABLE parallel_test (id int, data text);
+INSERT INTO parallel_test SELECT generate_series(1, 1000), 'test data';
+SELECT count(*) FROM parallel_test;
+NOTICE:  AUDIT: SESSION,13,1,READ,SELECT,,,SELECT count(*) FROM parallel_test;,<not logged>
+ count 
+-------
+  1000
+(1 row)
+
+-- Cleanup parallel test
+DROP TABLE parallel_test;
+RESET max_parallel_workers_per_gather;
+RESET parallel_tuple_cost;
+RESET parallel_setup_cost;
+RESET min_parallel_table_scan_size;
+RESET min_parallel_index_scan_size;
+RESET pgaudit.log;
+RESET pgaudit.log_client;
+RESET pgaudit.log_level;
+ -- Cleanup
+ -- Set client_min_messages up to warning to avoid noise
+ SET client_min_messages = 'warning';
+diff --git a/pgaudit.c b/pgaudit.c
+index 004d1f9..f061164 100644
+--- a/pgaudit.c
+++ b/pgaudit.c
+@@ -11,6 +11,7 @@
+ #include "postgres.h"
+ 
+ #include "access/htup_details.h"
+#include "access/parallel.h"
+ #include "access/sysattr.h"
+ #include "access/xact.h"
+ #include "access/relation.h"
+@@ -1339,7 +1340,7 @@ pgaudit_ExecutorStart_hook(QueryDesc *queryDesc, int eflags)
+ {
+     AuditEventStackItem *stackItem = NULL;
+ 
+-    if (!internalStatement)
+    if (!internalStatement && !IsParallelWorker())
+     {
+         /* Push the audit even onto the stack */
+         stackItem = stack_push();
+@@ -1420,7 +1421,7 @@ pgaudit_ExecutorCheckPerms_hook(List *rangeTabls, List *permInfos, bool abort)
+ 
+     /* Log DML if the audit role is valid or session logging is enabled */
+     if ((auditOid != InvalidOid || auditLogBitmap != 0) &&
+-        !IsAbortedTransactionBlockState())
+        !IsAbortedTransactionBlockState() && !IsParallelWorker())
+     {
+         /* If auditLogRows is on, wait for rows processed to be set */
+         if (auditLogRows && auditEventStack != NULL)
+@@ -1475,7 +1476,7 @@ pgaudit_ExecutorRun_hook(QueryDesc *queryDesc, ScanDirection direction, uint64 c
+     else
+         standard_ExecutorRun(queryDesc, direction, count, execute_once);
+ 
+-    if (auditLogRows && !internalStatement)
+    if (auditLogRows && !internalStatement && !IsParallelWorker())
+     {
+         /* Find an item from the stack by the query memory context */
+         stackItem = stack_find_context(queryDesc->estate->es_query_cxt);
+@@ -1495,7 +1496,7 @@ pgaudit_ExecutorEnd_hook(QueryDesc *queryDesc)
+     AuditEventStackItem *stackItem = NULL;
+     AuditEventStackItem *auditEventStackFull = NULL;
+ 
+-    if (auditLogRows && !internalStatement)
+    if (auditLogRows && !internalStatement && !IsParallelWorker())
+     {
+         /* Find an item from the stack by the query memory context */
+         stackItem = stack_find_context(queryDesc->estate->es_query_cxt);
+diff --git a/sql/pgaudit.sql b/sql/pgaudit.sql
+index 6aae88b..de6d7fd 100644
+--- a/sql/pgaudit.sql
+++ b/sql/pgaudit.sql
+@@ -1631,6 +1631,36 @@ DROP USER MAPPING FOR regress_user1 SERVER fdw_server;
+ DROP SERVER fdw_server;
+ DROP EXTENSION postgres_fdw;
+ 
+--
+-- Test logging in parallel workers
+SET pgaudit.log = 'read';
+SET pgaudit.log_client = on;
+SET pgaudit.log_level = 'notice';
+
+-- Force parallel execution for testing
+SET max_parallel_workers_per_gather = 2;
+SET parallel_tuple_cost = 0;
+SET parallel_setup_cost = 0;
+SET min_parallel_table_scan_size = 0;
+SET min_parallel_index_scan_size = 0;
+
+-- Create table with enough data to trigger parallel execution
+CREATE TABLE parallel_test (id int, data text);
+INSERT INTO parallel_test SELECT generate_series(1, 1000), 'test data';
+
+SELECT count(*) FROM parallel_test;
+
+-- Cleanup parallel test
+DROP TABLE parallel_test;
+RESET max_parallel_workers_per_gather;
+RESET parallel_tuple_cost;
+RESET parallel_setup_cost;
+RESET min_parallel_table_scan_size;
+RESET min_parallel_index_scan_size;
+RESET pgaudit.log;
+RESET pgaudit.log_client;
+RESET pgaudit.log_level;
+
+ -- Cleanup
+ -- Set client_min_messages up to warning to avoid noise
+ SET client_min_messages = 'warning';
--- a/compute/patches/pgaudit-parallel_workers-v17.patch
+++ b/compute/patches/pgaudit-parallel_workers-v17.patch
@@ -0,0 +1,143 @@
+commit 8d02e4c6c5e1e8676251b0717a46054267091cb4
+Author: Tristan Partin <tristan.partin@databricks.com>
+Date:   2025-06-23 02:09:31 +0000
+
+    Disable logging in parallel workers
+    
+    When a query uses parallel workers, pgaudit will log the same query for
+    every parallel worker. This is undesirable since it can result in log
+    amplification for queries that use parallel workers.
+    
+    Signed-off-by: Tristan Partin <tristan.partin@databricks.com>
+
+diff --git a/expected/pgaudit.out b/expected/pgaudit.out
+index d696287..4b1059a 100644
+--- a/expected/pgaudit.out
+++ b/expected/pgaudit.out
+@@ -2568,6 +2568,37 @@ DROP SERVER fdw_server;
+ NOTICE:  AUDIT: SESSION,11,1,DDL,DROP SERVER,,,DROP SERVER fdw_server,<not logged>
+ DROP EXTENSION postgres_fdw;
+ NOTICE:  AUDIT: SESSION,12,1,DDL,DROP EXTENSION,,,DROP EXTENSION postgres_fdw,<not logged>
+--
+-- Test logging in parallel workers
+SET pgaudit.log = 'read';
+SET pgaudit.log_client = on;
+SET pgaudit.log_level = 'notice';
+-- Force parallel execution for testing
+SET max_parallel_workers_per_gather = 2;
+SET parallel_tuple_cost = 0;
+SET parallel_setup_cost = 0;
+SET min_parallel_table_scan_size = 0;
+SET min_parallel_index_scan_size = 0;
+-- Create table with enough data to trigger parallel execution
+CREATE TABLE parallel_test (id int, data text);
+INSERT INTO parallel_test SELECT generate_series(1, 1000), 'test data';
+SELECT count(*) FROM parallel_test;
+NOTICE:  AUDIT: SESSION,13,1,READ,SELECT,,,SELECT count(*) FROM parallel_test,<not logged>
+ count 
+-------
+  1000
+(1 row)
+
+-- Cleanup parallel test
+DROP TABLE parallel_test;
+RESET max_parallel_workers_per_gather;
+RESET parallel_tuple_cost;
+RESET parallel_setup_cost;
+RESET min_parallel_table_scan_size;
+RESET min_parallel_index_scan_size;
+RESET pgaudit.log;
+RESET pgaudit.log_client;
+RESET pgaudit.log_level;
+ -- Cleanup
+ -- Set client_min_messages up to warning to avoid noise
+ SET client_min_messages = 'warning';
+diff --git a/pgaudit.c b/pgaudit.c
+index 1764af1..0e48875 100644
+--- a/pgaudit.c
+++ b/pgaudit.c
+@@ -11,6 +11,7 @@
+ #include "postgres.h"
+ 
+ #include "access/htup_details.h"
+#include "access/parallel.h"
+ #include "access/sysattr.h"
+ #include "access/xact.h"
+ #include "access/relation.h"
+@@ -1406,7 +1407,7 @@ pgaudit_ExecutorStart_hook(QueryDesc *queryDesc, int eflags)
+ {
+     AuditEventStackItem *stackItem = NULL;
+ 
+-    if (!internalStatement)
+    if (!internalStatement && !IsParallelWorker())
+     {
+         /* Push the audit event onto the stack */
+         stackItem = stack_push();
+@@ -1489,7 +1490,7 @@ pgaudit_ExecutorCheckPerms_hook(List *rangeTabls, List *permInfos, bool abort)
+ 
+     /* Log DML if the audit role is valid or session logging is enabled */
+     if ((auditOid != InvalidOid || auditLogBitmap != 0) &&
+-        !IsAbortedTransactionBlockState())
+        !IsAbortedTransactionBlockState() && !IsParallelWorker())
+     {
+         /* If auditLogRows is on, wait for rows processed to be set */
+         if (auditLogRows && auditEventStack != NULL)
+@@ -1544,7 +1545,7 @@ pgaudit_ExecutorRun_hook(QueryDesc *queryDesc, ScanDirection direction, uint64 c
+     else
+         standard_ExecutorRun(queryDesc, direction, count, execute_once);
+ 
+-    if (auditLogRows && !internalStatement)
+    if (auditLogRows && !internalStatement && !IsParallelWorker())
+     {
+         /* Find an item from the stack by the query memory context */
+         stackItem = stack_find_context(queryDesc->estate->es_query_cxt);
+@@ -1564,7 +1565,7 @@ pgaudit_ExecutorEnd_hook(QueryDesc *queryDesc)
+     AuditEventStackItem *stackItem = NULL;
+     AuditEventStackItem *auditEventStackFull = NULL;
+ 
+-    if (auditLogRows && !internalStatement)
+    if (auditLogRows && !internalStatement && !IsParallelWorker())
+     {
+         /* Find an item from the stack by the query memory context */
+         stackItem = stack_find_context(queryDesc->estate->es_query_cxt);
+diff --git a/sql/pgaudit.sql b/sql/pgaudit.sql
+index e161f01..c873098 100644
+--- a/sql/pgaudit.sql
+++ b/sql/pgaudit.sql
+@@ -1637,6 +1637,36 @@ DROP USER MAPPING FOR regress_user1 SERVER fdw_server;
+ DROP SERVER fdw_server;
+ DROP EXTENSION postgres_fdw;
+ 
+--
+-- Test logging in parallel workers
+SET pgaudit.log = 'read';
+SET pgaudit.log_client = on;
+SET pgaudit.log_level = 'notice';
+
+-- Force parallel execution for testing
+SET max_parallel_workers_per_gather = 2;
+SET parallel_tuple_cost = 0;
+SET parallel_setup_cost = 0;
+SET min_parallel_table_scan_size = 0;
+SET min_parallel_index_scan_size = 0;
+
+-- Create table with enough data to trigger parallel execution
+CREATE TABLE parallel_test (id int, data text);
+INSERT INTO parallel_test SELECT generate_series(1, 1000), 'test data';
+
+SELECT count(*) FROM parallel_test;
+
+-- Cleanup parallel test
+DROP TABLE parallel_test;
+RESET max_parallel_workers_per_gather;
+RESET parallel_tuple_cost;
+RESET parallel_setup_cost;
+RESET min_parallel_table_scan_size;
+RESET min_parallel_index_scan_size;
+RESET pgaudit.log;
+RESET pgaudit.log_client;
+RESET pgaudit.log_level;
+
+ -- Cleanup
+ -- Set client_min_messages up to warning to avoid noise
+ SET client_min_messages = 'warning';
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -27,6 +27,7 @@ fail.workspace = true
 flate2.workspace = true
 futures.workspace = true
 http.workspace = true
+hostname-validator = "1.1"
 indexmap.workspace = true
 itertools.workspace = true
 jsonwebtoken.workspace = true
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -29,7 +29,8 @@ use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
 use std::sync::{Arc, Condvar, Mutex, RwLock};
 use std::time::{Duration, Instant};
 use std::{env, fs};
-use tokio::spawn;
+use tokio::task::JoinHandle;
+use tokio::{spawn, time};
 use tracing::{Instrument, debug, error, info, instrument, warn};
 use url::Url;
 use utils::id::{TenantId, TimelineId};
@@ -107,6 +108,8 @@ pub struct ComputeNodeParams {
    pub installed_extensions_collection_interval: Arc<AtomicU64>,
 }

+type TaskHandle = Mutex<Option<JoinHandle<()>>>;
+
 /// Compute node info shared across several `compute_ctl` threads.
 pub struct ComputeNode {
    pub params: ComputeNodeParams,
@@ -129,7 +132,8 @@ pub struct ComputeNode {
    pub compute_ctl_config: ComputeCtlConfig,

    /// Handle to the extension stats collection task
-    extension_stats_task: Mutex<Option<tokio::task::JoinHandle<()>>>,
+    extension_stats_task: TaskHandle,
+    lfc_offload_task: TaskHandle,
 }

 // store some metrics about download size that might impact startup time
@@ -368,7 +372,7 @@ fn maybe_cgexec(cmd: &str) -> Command {

 struct PostgresHandle {
    postgres: std::process::Child,
-    log_collector: tokio::task::JoinHandle<Result<()>>,
+    log_collector: JoinHandle<Result<()>>,
 }

 impl PostgresHandle {
@@ -382,7 +386,7 @@ struct StartVmMonitorResult {
    #[cfg(target_os = "linux")]
    token: tokio_util::sync::CancellationToken,
    #[cfg(target_os = "linux")]
-    vm_monitor: Option<tokio::task::JoinHandle<Result<()>>>,
+    vm_monitor: Option<JoinHandle<Result<()>>>,
 }

 impl ComputeNode {
@@ -433,6 +437,7 @@ impl ComputeNode {
            ext_download_progress: RwLock::new(HashMap::new()),
            compute_ctl_config: config.compute_ctl_config,
            extension_stats_task: Mutex::new(None),
+            lfc_offload_task: Mutex::new(None),
        })
    }

@@ -520,8 +525,8 @@ impl ComputeNode {
            None
        };

-        // Terminate the extension stats collection task
        this.terminate_extension_stats_task();
+        this.terminate_lfc_offload_task();

        // Terminate the vm_monitor so it releases the file watcher on
        // /sys/fs/cgroup/neon-postgres.
@@ -759,10 +764,15 @@ impl ComputeNode {
        // Configure and start rsyslog for compliance audit logging
        match pspec.spec.audit_log_level {
            ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => {
-                let remote_endpoint =
+                let remote_tls_endpoint =
+                    std::env::var("AUDIT_LOGGING_TLS_ENDPOINT").unwrap_or("".to_string());
+                let remote_plain_endpoint =
                    std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string());
-                if remote_endpoint.is_empty() {
-                    anyhow::bail!("AUDIT_LOGGING_ENDPOINT is empty");
+
+                if remote_plain_endpoint.is_empty() && remote_tls_endpoint.is_empty() {
+                    anyhow::bail!(
+                        "AUDIT_LOGGING_ENDPOINT and AUDIT_LOGGING_TLS_ENDPOINT are both empty"
+                    );
                }

                let log_directory_path = Path::new(&self.params.pgdata).join("log");
@@ -778,7 +788,8 @@ impl ComputeNode {
                    log_directory_path.clone(),
                    endpoint_id,
                    project_id,
-                    &remote_endpoint,
+                    &remote_plain_endpoint,
+                    &remote_tls_endpoint,
                )?;

                // Launch a background task to clean up the audit logs
@@ -845,12 +856,15 @@ impl ComputeNode {
        // Log metrics so that we can search for slow operations in logs
        info!(?metrics, postmaster_pid = %postmaster_pid, "compute start finished");

-        // Spawn the extension stats background task
        self.spawn_extension_stats_task();

        if pspec.spec.autoprewarm {
+            info!("autoprewarming on startup as requested");
            self.prewarm_lfc(None);
        }
+        if let Some(seconds) = pspec.spec.offload_lfc_interval_seconds {
+            self.spawn_lfc_offload_task(Duration::from_secs(seconds.into()));
+        };
        Ok(())
    }

@@ -2351,10 +2365,7 @@ LIMIT 100",
    }

    pub fn spawn_extension_stats_task(&self) {
-        // Cancel any existing task
-        if let Some(handle) = self.extension_stats_task.lock().unwrap().take() {
-            handle.abort();
-        }
+        self.terminate_extension_stats_task();

        let conf = self.tokio_conn_conf.clone();
        let atomic_interval = self.params.installed_extensions_collection_interval.clone();
@@ -2365,24 +2376,23 @@ LIMIT 100",
            installed_extensions_collection_interval
        );
        let handle = tokio::spawn(async move {
-            // An initial sleep is added to ensure that two collections don't happen at the same time.
-            // The first collection happens during compute startup.
-            tokio::time::sleep(tokio::time::Duration::from_secs(
-                installed_extensions_collection_interval,
-            ))
-            .await;
-            let mut interval = tokio::time::interval(tokio::time::Duration::from_secs(
-                installed_extensions_collection_interval,
-            ));
            loop {
-                interval.tick().await;
+                info!(
+                    "[NEON_EXT_INT_SLEEP]: Interval: {}",
+                    installed_extensions_collection_interval
+                );
+                // Sleep at the start of the loop to ensure that two collections don't happen at the same time.
+                // The first collection happens during compute startup.
+                tokio::time::sleep(tokio::time::Duration::from_secs(
+                    installed_extensions_collection_interval,
+                ))
+                .await;
                let _ = installed_extensions(conf.clone()).await;
                // Acquire a read lock on the compute spec and then update the interval if necessary
-                interval = tokio::time::interval(tokio::time::Duration::from_secs(std::cmp::max(
+                installed_extensions_collection_interval = std::cmp::max(
                    installed_extensions_collection_interval,
                    2 * atomic_interval.load(std::sync::atomic::Ordering::SeqCst),
-                )));
-                installed_extensions_collection_interval = interval.period().as_secs();
+                );
            }
        });

@@ -2391,8 +2401,30 @@ LIMIT 100",
    }

    fn terminate_extension_stats_task(&self) {
-        if let Some(handle) = self.extension_stats_task.lock().unwrap().take() {
-            handle.abort();
+        if let Some(h) = self.extension_stats_task.lock().unwrap().take() {
+            h.abort()
+        }
+    }
+
+    pub fn spawn_lfc_offload_task(self: &Arc<Self>, interval: Duration) {
+        self.terminate_lfc_offload_task();
+        let secs = interval.as_secs();
+        info!("spawning lfc offload worker with {secs}s interval");
+        let this = self.clone();
+        let handle = spawn(async move {
+            let mut interval = time::interval(interval);
+            interval.tick().await; // returns immediately
+            loop {
+                interval.tick().await;
+                this.offload_lfc_async().await;
+            }
+        });
+        *self.lfc_offload_task.lock().unwrap() = Some(handle);
+    }
+
+    fn terminate_lfc_offload_task(&self) {
+        if let Some(h) = self.lfc_offload_task.lock().unwrap().take() {
+            h.abort()
        }
    }

--- a/compute_tools/src/compute_prewarm.rs
+++ b/compute_tools/src/compute_prewarm.rs
@@ -5,6 +5,7 @@ use compute_api::responses::LfcOffloadState;
 use compute_api::responses::LfcPrewarmState;
 use http::StatusCode;
 use reqwest::Client;
+use std::mem::replace;
 use std::sync::Arc;
 use tokio::{io::AsyncReadExt, spawn};
 use tracing::{error, info};
@@ -88,17 +89,15 @@ impl ComputeNode {
        self.state.lock().unwrap().lfc_offload_state.clone()
    }

-    /// Returns false if there is a prewarm request ongoing, true otherwise
+    /// Returns `false` if a prewarm request is already ongoing, `true` otherwise
    pub fn prewarm_lfc(self: &Arc<Self>, from_endpoint: Option<String>) -> bool {
-        crate::metrics::LFC_PREWARM_REQUESTS.inc();
        {
            let state = &mut self.state.lock().unwrap().lfc_prewarm_state;
-            if let LfcPrewarmState::Prewarming =
-                std::mem::replace(state, LfcPrewarmState::Prewarming)
-            {
+            if let LfcPrewarmState::Prewarming = replace(state, LfcPrewarmState::Prewarming) {
                return false;
            }
        }
+        crate::metrics::LFC_PREWARMS.inc();

        let cloned = self.clone();
        spawn(async move {
@@ -152,32 +151,41 @@ impl ComputeNode {
            .map(|_| ())
    }

-    /// Returns false if there is an offload request ongoing, true otherwise
+    /// Returns `false` if an offload request is already ongoing; otherwise spawns the offload and returns `true`
    pub fn offload_lfc(self: &Arc<Self>) -> bool {
-        crate::metrics::LFC_OFFLOAD_REQUESTS.inc();
        {
            let state = &mut self.state.lock().unwrap().lfc_offload_state;
-            if let LfcOffloadState::Offloading =
-                std::mem::replace(state, LfcOffloadState::Offloading)
-            {
+            if replace(state, LfcOffloadState::Offloading) == LfcOffloadState::Offloading {
                return false;
            }
        }
-
        let cloned = self.clone();
-        spawn(async move {
-            let Err(err) = cloned.offload_lfc_impl().await else {
-                cloned.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Completed;
-                return;
-            };
-            error!(%err);
-            cloned.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Failed {
-                error: err.to_string(),
-            };
-        });
+        spawn(async move { cloned.offload_lfc_with_state_update().await });
        true
    }

+    pub async fn offload_lfc_async(self: &Arc<Self>) {
+        {
+            let state = &mut self.state.lock().unwrap().lfc_offload_state;
+            if replace(state, LfcOffloadState::Offloading) == LfcOffloadState::Offloading {
+                return;
+            }
+        }
+        self.offload_lfc_with_state_update().await
+    }
+
+    async fn offload_lfc_with_state_update(&self) {
+        crate::metrics::LFC_OFFLOADS.inc();
+        let Err(err) = self.offload_lfc_impl().await else {
+            self.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Completed;
+            return;
+        };
+        error!(%err);
+        self.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Failed {
+            error: err.to_string(),
+        };
+    }
+
    async fn offload_lfc_impl(&self) -> Result<()> {
        let EndpointStoragePair { url, token } = self.endpoint_storage_pair(None)?;
        info!(%url, "requesting LFC state from postgres");
--- a/compute_tools/src/config_template/compute_audit_rsyslog_template.conf
+++ b/compute_tools/src/config_template/compute_audit_rsyslog_template.conf
@@ -10,7 +10,13 @@ input(type="imfile" File="{log_directory}/*.log"
  startmsg.regex="^[[:digit:]]{{4}}-[[:digit:]]{{2}}-[[:digit:]]{{2}} [[:digit:]]{{2}}:[[:digit:]]{{2}}:[[:digit:]]{{2}}.[[:digit:]]{{3}} GMT,")

 # the directory to store rsyslog state files
-global(workDirectory="/var/log/rsyslog")
+global(
+  workDirectory="/var/log/rsyslog"
+  DefaultNetstreamDriverCAFile="/etc/ssl/certs/ca-certificates.crt"
+)
+
+# Whether the remote syslog receiver uses tls
+set $.remote_syslog_tls = "{remote_syslog_tls}";

 # Construct json, endpoint_id and project_id as additional metadata
 set $.json_log!endpoint_id = "{endpoint_id}";
@@ -21,5 +27,29 @@ set $.json_log!msg = $msg;
 template(name="PgAuditLog" type="string"
    string="<%PRI%>1 %TIMESTAMP:::date-rfc3339% %HOSTNAME% - - - - %$.json_log%")

-# Forward to remote syslog receiver (@@<hostname>:<port>;format
-local5.info @@{remote_endpoint};PgAuditLog
+# Forward to remote syslog receiver (over TLS if enabled, plain TCP otherwise)
+if ( $syslogtag == 'pgaudit_log' ) then {{
+  if ( $.remote_syslog_tls == 'true' ) then {{
+    action(type="omfwd" target="{remote_syslog_host}" port="{remote_syslog_port}" protocol="tcp"
+      template="PgAuditLog"
+      queue.type="linkedList"
+      queue.size="1000"
+      action.ResumeRetryCount="10"
+      StreamDriver="gtls"
+      StreamDriverMode="1"
+      StreamDriverAuthMode="x509/name"
+      StreamDriverPermittedPeers="{remote_syslog_host}"
+      StreamDriver.CheckExtendedKeyPurpose="on"
+      StreamDriver.PermitExpiredCerts="off"
+    )
+    stop
+  }} else {{
+    action(type="omfwd" target="{remote_syslog_host}" port="{remote_syslog_port}" protocol="tcp"
+      template="PgAuditLog"
+      queue.type="linkedList"
+      queue.size="1000"
+      action.ResumeRetryCount="10"
+    )
+    stop
+  }}
+}}
--- a/compute_tools/src/metrics.rs
+++ b/compute_tools/src/metrics.rs
@@ -97,20 +97,18 @@ pub(crate) static PG_TOTAL_DOWNTIME_MS: Lazy<GenericCounter<AtomicU64>> = Lazy::
    .expect("failed to define a metric")
 });

-/// Needed as neon.file_cache_prewarm_batch == 0 doesn't mean we never tried to prewarm.
-/// On the other hand, LFC_PREWARMED_PAGES is excessive as we can GET /lfc/prewarm
-pub(crate) static LFC_PREWARM_REQUESTS: Lazy<IntCounter> = Lazy::new(|| {
+pub(crate) static LFC_PREWARMS: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
-        "compute_ctl_lfc_prewarm_requests_total",
-        "Total number of LFC prewarm requests made by compute_ctl",
+        "compute_ctl_lfc_prewarms_total",
+        "Total number of LFC prewarms requested by compute_ctl or autoprewarm option",
    )
    .expect("failed to define a metric")
 });

-pub(crate) static LFC_OFFLOAD_REQUESTS: Lazy<IntCounter> = Lazy::new(|| {
+pub(crate) static LFC_OFFLOADS: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
-        "compute_ctl_lfc_offload_requests_total",
-        "Total number of LFC offload requests made by compute_ctl",
+        "compute_ctl_lfc_offloads_total",
+        "Total number of LFC offloads requested by compute_ctl or lfc_offload_period_seconds option",
    )
    .expect("failed to define a metric")
 });
@@ -124,7 +122,7 @@ pub fn collect() -> Vec<MetricFamily> {
    metrics.extend(AUDIT_LOG_DIR_SIZE.collect());
    metrics.extend(PG_CURR_DOWNTIME_MS.collect());
    metrics.extend(PG_TOTAL_DOWNTIME_MS.collect());
-    metrics.extend(LFC_PREWARM_REQUESTS.collect());
-    metrics.extend(LFC_OFFLOAD_REQUESTS.collect());
+    metrics.extend(LFC_PREWARMS.collect());
+    metrics.extend(LFC_OFFLOADS.collect());
    metrics
 }
--- a/compute_tools/src/rsyslog.rs
+++ b/compute_tools/src/rsyslog.rs
@@ -4,8 +4,10 @@ use std::path::Path;
 use std::process::Command;
 use std::time::Duration;
 use std::{fs::OpenOptions, io::Write};
+use url::{Host, Url};

 use anyhow::{Context, Result, anyhow};
+use hostname_validator;
 use tracing::{error, info, instrument, warn};

 const POSTGRES_LOGS_CONF_PATH: &str = "/etc/rsyslog.d/postgres_logs.conf";
@@ -82,18 +84,84 @@ fn restart_rsyslog() -> Result<()> {
    Ok(())
 }

+fn parse_audit_syslog_address(
+    remote_plain_endpoint: &str,
+    remote_tls_endpoint: &str,
+) -> Result<(String, u16, String)> {
+    let tls;
+    let remote_endpoint = if !remote_tls_endpoint.is_empty() {
+        tls = "true".to_string();
+        remote_tls_endpoint
+    } else {
+        tls = "false".to_string();
+        remote_plain_endpoint
+    };
+    // Urlify the remote_endpoint, so parsing can be done with url::Url.
+    let url_str = format!("http://{remote_endpoint}");
+    let url = Url::parse(&url_str).map_err(|err| {
+        anyhow!("Error parsing {remote_endpoint}, expected host:port, got {err:?}")
+    })?;
+
+    let is_valid = url.scheme() == "http"
+        && url.path() == "/"
+        && url.query().is_none()
+        && url.fragment().is_none()
+        && url.username() == ""
+        && url.password().is_none();
+
+    if !is_valid {
+        return Err(anyhow!(
+            "Invalid address format {remote_endpoint}, expected host:port"
+        ));
+    }
+    let host = match url.host() {
+        Some(Host::Domain(h)) if hostname_validator::is_valid(h) => h.to_string(),
+        Some(Host::Ipv4(ip4)) => ip4.to_string(),
+        Some(Host::Ipv6(ip6)) => ip6.to_string(),
+        _ => return Err(anyhow!("Invalid host")),
+    };
+    let port = url
+        .port()
+        .ok_or_else(|| anyhow!("Invalid port in {remote_endpoint}"))?;
+
+    Ok((host, port, tls))
+}
+
+fn generate_audit_rsyslog_config(
+    log_directory: String,
+    endpoint_id: &str,
+    project_id: &str,
+    remote_syslog_host: &str,
+    remote_syslog_port: u16,
+    remote_syslog_tls: &str,
+) -> String {
+    format!(
+        include_str!("config_template/compute_audit_rsyslog_template.conf"),
+        log_directory = log_directory,
+        endpoint_id = endpoint_id,
+        project_id = project_id,
+        remote_syslog_host = remote_syslog_host,
+        remote_syslog_port = remote_syslog_port,
+        remote_syslog_tls = remote_syslog_tls
+    )
+}
+
 pub fn configure_audit_rsyslog(
    log_directory: String,
    endpoint_id: &str,
    project_id: &str,
    remote_endpoint: &str,
+    remote_tls_endpoint: &str,
 ) -> Result<()> {
-    let config_content: String = format!(
-        include_str!("config_template/compute_audit_rsyslog_template.conf"),
-        log_directory = log_directory,
-        endpoint_id = endpoint_id,
-        project_id = project_id,
-        remote_endpoint = remote_endpoint
+    let (remote_syslog_host, remote_syslog_port, remote_syslog_tls) =
+        parse_audit_syslog_address(remote_endpoint, remote_tls_endpoint)?;
+    let config_content = generate_audit_rsyslog_config(
+        log_directory,
+        endpoint_id,
+        project_id,
+        &remote_syslog_host,
+        remote_syslog_port,
+        &remote_syslog_tls,
     );

    info!("rsyslog config_content: {}", config_content);
@@ -258,6 +326,8 @@ pub fn launch_pgaudit_gc(log_directory: String) {
 mod tests {
    use crate::rsyslog::PostgresLogsRsyslogConfig;

+    use super::{generate_audit_rsyslog_config, parse_audit_syslog_address};
+
    #[test]
    fn test_postgres_logs_config() {
        {
@@ -287,4 +357,146 @@ mod tests {
            assert!(res.is_err());
        }
    }
+
+    #[test]
+    fn test_parse_audit_syslog_address() {
+        {
+            // host:port format (plaintext)
+            let parsed = parse_audit_syslog_address("collector.host.tld:5555", "");
+            assert!(parsed.is_ok());
+            assert_eq!(
+                parsed.unwrap(),
+                (
+                    String::from("collector.host.tld"),
+                    5555,
+                    String::from("false")
+                )
+            );
+        }
+
+        {
+            // host:port format with ipv4 ip address (plaintext)
+            let parsed = parse_audit_syslog_address("10.0.0.1:5555", "");
+            assert!(parsed.is_ok());
+            assert_eq!(
+                parsed.unwrap(),
+                (String::from("10.0.0.1"), 5555, String::from("false"))
+            );
+        }
+
+        {
+            // host:port format with ipv6 ip address (plaintext)
+            let parsed =
+                parse_audit_syslog_address("[7e60:82ed:cb2e:d617:f904:f395:aaca:e252]:5555", "");
+            assert_eq!(
+                parsed.unwrap(),
+                (
+                    String::from("7e60:82ed:cb2e:d617:f904:f395:aaca:e252"),
+                    5555,
+                    String::from("false")
+                )
+            );
+        }
+
+        {
+            // Only TLS host:port defined
+            let parsed = parse_audit_syslog_address("", "tls.host.tld:5556");
+            assert_eq!(
+                parsed.unwrap(),
+                (String::from("tls.host.tld"), 5556, String::from("true"))
+            );
+        }
+
+        {
+            // tls host should take precedence, when both defined
+            let parsed = parse_audit_syslog_address("plaintext.host.tld:5555", "tls.host.tld:5556");
+            assert_eq!(
+                parsed.unwrap(),
+                (String::from("tls.host.tld"), 5556, String::from("true"))
+            );
+        }
+
+        {
+            // host without port (plaintext)
+            let parsed = parse_audit_syslog_address("collector.host.tld", "");
+            assert!(parsed.is_err());
+        }
+
+        {
+            // port without host
+            let parsed = parse_audit_syslog_address(":5555", "");
+            assert!(parsed.is_err());
+        }
+
+        {
+            // valid host with invalid port
+            let parsed = parse_audit_syslog_address("collector.host.tld:90001", "");
+            assert!(parsed.is_err());
+        }
+
+        {
+            // invalid hostname with valid port
+            let parsed = parse_audit_syslog_address("-collector.host.tld:5555", "");
+            assert!(parsed.is_err());
+        }
+
+        {
+            // parse error
+            let parsed = parse_audit_syslog_address("collector.host.tld:::5555", "");
+            assert!(parsed.is_err());
+        }
+    }
+
+    #[test]
+    fn test_generate_audit_rsyslog_config() {
+        {
+            // plaintext version
+            let log_directory = "/tmp/log".to_string();
+            let endpoint_id = "ep-test-endpoint-id";
+            let project_id = "test-project-id";
+            let remote_syslog_host = "collector.host.tld";
+            let remote_syslog_port = 5555;
+            let remote_syslog_tls = "false";
+
+            let conf_str = generate_audit_rsyslog_config(
+                log_directory,
+                endpoint_id,
+                project_id,
+                remote_syslog_host,
+                remote_syslog_port,
+                remote_syslog_tls,
+            );
+
+            assert!(conf_str.contains(r#"set $.remote_syslog_tls = "false";"#));
+            assert!(conf_str.contains(r#"type="omfwd""#));
+            assert!(conf_str.contains(r#"target="collector.host.tld""#));
+            assert!(conf_str.contains(r#"port="5555""#));
+            assert!(conf_str.contains(r#"StreamDriverPermittedPeers="collector.host.tld""#));
+        }
+
+        {
+            // TLS version
+            let log_directory = "/tmp/log".to_string();
+            let endpoint_id = "ep-test-endpoint-id";
+            let project_id = "test-project-id";
+            let remote_syslog_host = "collector.host.tld";
+            let remote_syslog_port = 5556;
+            let remote_syslog_tls = "true";
+
+            let conf_str = generate_audit_rsyslog_config(
+                log_directory,
+                endpoint_id,
+                project_id,
+                remote_syslog_host,
+                remote_syslog_port,
+                remote_syslog_tls,
+            );
+
+            assert!(conf_str.contains(r#"set $.remote_syslog_tls = "true";"#));
+            assert!(conf_str.contains(r#"type="omfwd""#));
+            assert!(conf_str.contains(r#"target="collector.host.tld""#));
+            assert!(conf_str.contains(r#"port="5556""#));
+            assert!(conf_str.contains(r#"StreamDriverPermittedPeers="collector.host.tld""#));
+        }
+    }
 }
--- a/compute_tools/tests/pg_helpers_tests.rs
+++ b/compute_tools/tests/pg_helpers_tests.rs
@@ -31,6 +31,7 @@ mod pg_helpers_tests {
 wal_level = logical
 hot_standby = on
 autoprewarm = off
+offload_lfc_interval_seconds = 20
 neon.safekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'
 wal_log_hints = on
 log_connections = on
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -64,7 +64,9 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1);
 const DEFAULT_BRANCH_NAME: &str = "main";
 project_git_version!(GIT_VERSION);

+#[allow(dead_code)]
 const DEFAULT_PG_VERSION: PgMajorVersion = PgMajorVersion::PG17;
+const DEFAULT_PG_VERSION_NUM: &str = "17";

 const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/";

@@ -167,7 +169,7 @@ struct TenantCreateCmdArgs {
    #[clap(short = 'c')]
    config: Vec<String>,

-    #[arg(default_value_t = DEFAULT_PG_VERSION)]
+    #[arg(default_value = DEFAULT_PG_VERSION_NUM)]
    #[clap(long, help = "Postgres version to use for the initial timeline")]
    pg_version: PgMajorVersion,

@@ -290,7 +292,7 @@ struct TimelineCreateCmdArgs {
    #[clap(long, help = "Human-readable alias for the new timeline")]
    branch_name: String,

-    #[arg(default_value_t = DEFAULT_PG_VERSION)]
+    #[arg(default_value = DEFAULT_PG_VERSION_NUM)]
    #[clap(long, help = "Postgres version")]
    pg_version: PgMajorVersion,
 }
@@ -322,7 +324,7 @@ struct TimelineImportCmdArgs {
    #[clap(long, help = "Lsn the basebackup ends at")]
    end_lsn: Option<Lsn>,

-    #[arg(default_value_t = DEFAULT_PG_VERSION)]
+    #[arg(default_value = DEFAULT_PG_VERSION_NUM)]
    #[clap(long, help = "Postgres version of the backup being imported")]
    pg_version: PgMajorVersion,
 }
@@ -601,7 +603,7 @@ struct EndpointCreateCmdArgs {
    )]
    config_only: bool,

-    #[arg(default_value_t = DEFAULT_PG_VERSION)]
+    #[arg(default_value = DEFAULT_PG_VERSION_NUM)]
    #[clap(long, help = "Postgres version")]
    pg_version: PgMajorVersion,

@@ -673,6 +675,16 @@ struct EndpointStartCmdArgs {
    #[arg(default_value = "90s")]
    start_timeout: Duration,

+    #[clap(
+        long,
+        help = "Download LFC cache from endpoint storage on endpoint startup",
+        default_value = "false"
+    )]
+    autoprewarm: bool,
+
+    #[clap(long, help = "Upload LFC cache to endpoint storage periodically")]
+    offload_lfc_interval_seconds: Option<std::num::NonZeroU64>,
+
    #[clap(
        long,
        help = "Run in development mode, skipping VM-specific operations like process termination",
@@ -1583,22 +1595,24 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
            let endpoint_storage_token = env.generate_auth_token(&claims)?;
            let endpoint_storage_addr = env.endpoint_storage.listen_addr.to_string();

+            let args = control_plane::endpoint::EndpointStartArgs {
+                auth_token,
+                endpoint_storage_token,
+                endpoint_storage_addr,
+                safekeepers_generation,
+                safekeepers,
+                pageservers,
+                remote_ext_base_url: remote_ext_base_url.clone(),
+                shard_stripe_size: stripe_size.0 as usize,
+                create_test_user: args.create_test_user,
+                start_timeout: args.start_timeout,
+                autoprewarm: args.autoprewarm,
+                offload_lfc_interval_seconds: args.offload_lfc_interval_seconds,
+                dev: args.dev,
+            };
+
            println!("Starting existing endpoint {endpoint_id}...");
-            endpoint
-                .start(
-                    &auth_token,
-                    endpoint_storage_token,
-                    endpoint_storage_addr,
-                    safekeepers_generation,
-                    safekeepers,
-                    pageservers,
-                    remote_ext_base_url.as_ref(),
-                    stripe_size.0 as usize,
-                    args.create_test_user,
-                    args.start_timeout,
-                    args.dev,
-                )
-                .await?;
+            endpoint.start(args).await?;
        }
        EndpointCmd::Reconfigure(args) => {
            let endpoint_id = &args.endpoint_id;
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -373,6 +373,22 @@ impl std::fmt::Display for EndpointTerminateMode {
    }
 }

+pub struct EndpointStartArgs {
+    pub auth_token: Option<String>,
+    pub endpoint_storage_token: String,
+    pub endpoint_storage_addr: String,
+    pub safekeepers_generation: Option<SafekeeperGeneration>,
+    pub safekeepers: Vec<NodeId>,
+    pub pageservers: Vec<(PageserverProtocol, Host, u16)>,
+    pub remote_ext_base_url: Option<String>,
+    pub shard_stripe_size: usize,
+    pub create_test_user: bool,
+    pub start_timeout: Duration,
+    pub autoprewarm: bool,
+    pub offload_lfc_interval_seconds: Option<std::num::NonZeroU64>,
+    pub dev: bool,
+}
+
 impl Endpoint {
    fn from_dir_entry(entry: std::fs::DirEntry, env: &LocalEnv) -> Result<Endpoint> {
        if !entry.file_type()?.is_dir() {
@@ -677,21 +693,7 @@ impl Endpoint {
        })
    }

-    #[allow(clippy::too_many_arguments)]
-    pub async fn start(
-        &self,
-        auth_token: &Option<String>,
-        endpoint_storage_token: String,
-        endpoint_storage_addr: String,
-        safekeepers_generation: Option<SafekeeperGeneration>,
-        safekeepers: Vec<NodeId>,
-        pageservers: Vec<(PageserverProtocol, Host, u16)>,
-        remote_ext_base_url: Option<&String>,
-        shard_stripe_size: usize,
-        create_test_user: bool,
-        start_timeout: Duration,
-        dev: bool,
-    ) -> Result<()> {
+    pub async fn start(&self, args: EndpointStartArgs) -> Result<()> {
        if self.status() == EndpointStatus::Running {
            anyhow::bail!("The endpoint is already running");
        }
@@ -704,10 +706,10 @@ impl Endpoint {
            std::fs::remove_dir_all(self.pgdata())?;
        }

-        let pageserver_connstring = Self::build_pageserver_connstr(&pageservers);
+        let pageserver_connstring = Self::build_pageserver_connstr(&args.pageservers);
        assert!(!pageserver_connstring.is_empty());

-        let safekeeper_connstrings = self.build_safekeepers_connstrs(safekeepers)?;
+        let safekeeper_connstrings = self.build_safekeepers_connstrs(args.safekeepers)?;

        // check for file remote_extensions_spec.json
        // if it is present, read it and pass to compute_ctl
@@ -735,7 +737,7 @@ impl Endpoint {
                    cluster_id: None, // project ID: not used
                    name: None,       // project name: not used
                    state: None,
-                    roles: if create_test_user {
+                    roles: if args.create_test_user {
                        vec![Role {
                            name: PgIdent::from_str("test").unwrap(),
                            encrypted_password: None,
@@ -744,7 +746,7 @@ impl Endpoint {
                    } else {
                        Vec::new()
                    },
-                    databases: if create_test_user {
+                    databases: if args.create_test_user {
                        vec![Database {
                            name: PgIdent::from_str("neondb").unwrap(),
                            owner: PgIdent::from_str("test").unwrap(),
@@ -766,20 +768,21 @@ impl Endpoint {
                endpoint_id: Some(self.endpoint_id.clone()),
                mode: self.mode,
                pageserver_connstring: Some(pageserver_connstring),
-                safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()),
+                safekeepers_generation: args.safekeepers_generation.map(|g| g.into_inner()),
                safekeeper_connstrings,
-                storage_auth_token: auth_token.clone(),
+                storage_auth_token: args.auth_token.clone(),
                remote_extensions,
                pgbouncer_settings: None,
-                shard_stripe_size: Some(shard_stripe_size),
+                shard_stripe_size: Some(args.shard_stripe_size),
                local_proxy_config: None,
                reconfigure_concurrency: self.reconfigure_concurrency,
                drop_subscriptions_before_start: self.drop_subscriptions_before_start,
                audit_log_level: ComputeAudit::Disabled,
                logs_export_host: None::<String>,
-                endpoint_storage_addr: Some(endpoint_storage_addr),
-                endpoint_storage_token: Some(endpoint_storage_token),
-                autoprewarm: false,
+                endpoint_storage_addr: Some(args.endpoint_storage_addr),
+                endpoint_storage_token: Some(args.endpoint_storage_token),
+                autoprewarm: args.autoprewarm,
+                offload_lfc_interval_seconds: args.offload_lfc_interval_seconds,
                suspend_timeout_seconds: -1, // Only used in neon_local.
            };

@@ -791,7 +794,7 @@ impl Endpoint {
                debug!("spec.cluster {:?}", spec.cluster);

                // fill missing fields again
-                if create_test_user {
+                if args.create_test_user {
                    spec.cluster.roles.push(Role {
                        name: PgIdent::from_str("test").unwrap(),
                        encrypted_password: None,
@@ -826,7 +829,7 @@ impl Endpoint {
        // Launch compute_ctl
        let conn_str = self.connstr("cloud_admin", "postgres");
        println!("Starting postgres node at '{conn_str}'");
-        if create_test_user {
+        if args.create_test_user {
            let conn_str = self.connstr("test", "neondb");
            println!("Also at '{conn_str}'");
        }
@@ -858,11 +861,11 @@ impl Endpoint {
        .stderr(logfile.try_clone()?)
        .stdout(logfile);

-        if let Some(remote_ext_base_url) = remote_ext_base_url {
-            cmd.args(["--remote-ext-base-url", remote_ext_base_url]);
+        if let Some(remote_ext_base_url) = args.remote_ext_base_url {
+            cmd.args(["--remote-ext-base-url", &remote_ext_base_url]);
        }

-        if dev {
+        if args.dev {
            cmd.arg("--dev");
        }

@@ -894,10 +897,11 @@ impl Endpoint {
                Ok(state) => {
                    match state.status {
                        ComputeStatus::Init => {
-                            if Instant::now().duration_since(start_at) > start_timeout {
+                            let timeout = args.start_timeout;
+                            if Instant::now().duration_since(start_at) > timeout {
                                bail!(
                                    "compute startup timed out {:?}; still in Init state",
-                                    start_timeout
+                                    timeout
                                );
                            }
                            // keep retrying
@@ -925,9 +929,10 @@ impl Endpoint {
                    }
                }
                Err(e) => {
-                    if Instant::now().duration_since(start_at) > start_timeout {
+                    if Instant::now().duration_since(start_at) > args.start_timeout {
                        return Err(e).context(format!(
-                            "timed out {start_timeout:?} waiting to connect to compute_ctl HTTP",
+                            "timed out {:?} waiting to connect to compute_ctl HTTP",
+                            args.start_timeout
                        ));
                    }
                }
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -65,12 +65,33 @@ enum Command {
        #[arg(long)]
        scheduling: Option<NodeSchedulingPolicy>,
    },
-    // Set a node status as deleted.
+    /// Exists for backup usage and will be removed in future.
+    /// Use [`Command::NodeStartDelete`] instead, if possible.
    NodeDelete {
        #[arg(long)]
        node_id: NodeId,
    },
+    /// Start deletion of the specified pageserver.
+    NodeStartDelete {
+        #[arg(long)]
+        node_id: NodeId,
+        /// When `force` is true, skip waiting for shards to prewarm during migration.
+        /// This can significantly speed up node deletion since prewarming all shards
+        /// can take considerable time, but may result in slower initial access to
+        /// migrated shards until they warm up naturally.
+        #[arg(long)]
+        force: bool,
+    },
+    /// Cancel deletion of the specified pageserver and wait for `timeout`
+    /// for the operation to be canceled. May be retried.
+    NodeCancelDelete {
+        #[arg(long)]
+        node_id: NodeId,
+        #[arg(long)]
+        timeout: humantime::Duration,
+    },
    /// Delete a tombstone of node from the storage controller.
+    /// This is used when we want to allow the node to be re-registered.
    NodeDeleteTombstone {
        #[arg(long)]
        node_id: NodeId,
@@ -912,10 +933,44 @@ async fn main() -> anyhow::Result<()> {
                .await?;
        }
        Command::NodeDelete { node_id } => {
+            eprintln!("Warning: This command is obsolete and will be removed in a future version");
+            eprintln!("Use `NodeStartDelete` instead, if possible");
            storcon_client
                .dispatch::<(), ()>(Method::DELETE, format!("control/v1/node/{node_id}"), None)
                .await?;
        }
+        Command::NodeStartDelete { node_id, force } => {
+            let query = if force {
+                format!("control/v1/node/{node_id}/delete?force=true")
+            } else {
+                format!("control/v1/node/{node_id}/delete")
+            };
+            storcon_client
+                .dispatch::<(), ()>(Method::PUT, query, None)
+                .await?;
+            println!("Delete started for {node_id}");
+        }
+        Command::NodeCancelDelete { node_id, timeout } => {
+            storcon_client
+                .dispatch::<(), ()>(
+                    Method::DELETE,
+                    format!("control/v1/node/{node_id}/delete"),
+                    None,
+                )
+                .await?;
+
+            println!("Waiting for node {node_id} to quiesce on scheduling policy ...");
+
+            let final_policy =
+                wait_for_scheduling_policy(storcon_client, node_id, *timeout, |sched| {
+                    !matches!(sched, NodeSchedulingPolicy::Deleting)
+                })
+                .await?;
+
+            println!(
+                "Delete was cancelled for node {node_id}. Scheduling policy is now {final_policy:?}"
+            );
+        }
        Command::NodeDeleteTombstone { node_id } => {
            storcon_client
                .dispatch::<(), ()>(
--- a/docs/rfcs/035-safekeeper-dynamic-membership-change.md
+++ b/docs/rfcs/035-safekeeper-dynamic-membership-change.md
@@ -20,7 +20,7 @@ In our case consensus leader is compute (walproposer), and we don't want to wake
 up all computes for the change. Neither we want to fully reimplement the leader
 logic second time outside compute. Because of that the proposed algorithm relies
 for issuing configurations on the external fault tolerant (distributed) strongly
-consisent storage with simple API: CAS (compare-and-swap) on the single key.
+consistent storage with simple API: CAS (compare-and-swap) on the single key.
 Properly configured postgres suits this.

 In the system consensus is implemented at the timeline level, so algorithm below
@@ -34,7 +34,7 @@ A configuration is

 ```
 struct Configuration {
-    generation: Generation, // a number uniquely identifying configuration
+    generation: SafekeeperGeneration, // a number uniquely identifying configuration
    sk_set: Vec<NodeId>, // current safekeeper set
    new_sk_set: Optional<Vec<NodeId>>,
 }
@@ -81,11 +81,11 @@ configuration generation in them is less than its current one. Namely, it
 refuses to vote, to truncate WAL in `handle_elected` and to accept WAL. In
 response it sends its current configuration generation to let walproposer know.

-Safekeeper gets `PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configuration`
-accepting `Configuration`. Safekeeper switches to the given conf it is higher than its
+Safekeeper gets `PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/membership`
+accepting `Configuration`. Safekeeper switches to the given conf if it is higher than its
 current one and ignores it otherwise. In any case it replies with
 ```
-struct ConfigurationSwitchResponse {
+struct TimelineMembershipSwitchResponse {
    conf: Configuration,
    term: Term,
    last_log_term: Term,
@@ -108,7 +108,7 @@ establishes this configuration as its own and moves to voting.
 It should stop talking to safekeepers not listed in the configuration at this
 point, though it is not unsafe to continue doing so.

-To be elected it must receive votes from both majorites if `new_sk_set` is present.
+To be elected it must receive votes from both majorities if `new_sk_set` is present.
 Similarly, to commit WAL it must receive flush acknowledge from both majorities.

 If walproposer hears from safekeeper configuration higher than his own (i.e.
@@ -130,7 +130,7 @@ storage are reachable.
 1) Fetch current timeline configuration from the configuration storage.
 2) If it is already joint one and `new_set` is different from `desired_set`
   refuse to change. However, assign join conf to (in memory) var
-   `join_conf` and proceed to step 4 to finish the ongoing change.
+   `joint_conf` and proceed to step 4 to finish the ongoing change.
 3) Else, create joint `joint_conf: Configuration`: increment current conf number
   `n` and put `desired_set` to `new_sk_set`. Persist it in the configuration
   storage by doing CAS on the current generation: change happens only if
@@ -161,11 +161,11 @@ storage are reachable.
   because `pull_timeline` already includes it and plus additionally would be
   broadcast by compute. More importantly, we may proceed to the next step
   only when `<last_log_term, flush_lsn>` on the majority of the new set reached
-   `sync_position`. Similarly, on the happy path no waiting is not needed because
+   `sync_position`. Similarly, on the happy path no waiting is needed because
   `pull_timeline` already includes it. However, we should double
    check to be safe. For example, timeline could have been created earlier e.g.
    manually or after try-to-migrate, abort, try-to-migrate-again sequence.
-7) Create `new_conf: Configuration` incrementing `join_conf` generation and having new
+7) Create `new_conf: Configuration` incrementing `joint_conf` generation and having new
   safekeeper set as `sk_set` and None `new_sk_set`. Write it to configuration
   storage under one more CAS.
 8) Call `PUT` `configuration` on safekeepers from the new set,
@@ -178,12 +178,12 @@ spec of it.

 Description above focuses on safety. To make the flow practical and live, here a few more
 considerations.
-1) It makes sense to ping new set to ensure it we are migrating to live node(s) before
+1) It makes sense to ping new set to ensure we are migrating to live node(s) before
  step 3.
 2) If e.g. accidentally wrong new sk set has been specified, before CAS in step `6` is completed
   it is safe to rollback to the old conf with one more CAS.
 3) On step 4 timeline might be already created on members of the new set for various reasons;
-   the simplest is the procedure restart. There are more complicated scenarious like mentioned
+   the simplest is the procedure restart. There are more complicated scenarios, like the one mentioned
   in step 5. Deleting and re-doing `pull_timeline` is generally unsafe without involving
   generations, so seems simpler to treat existing timeline as success. However, this also
   has a disadvantage: you might imagine an surpassingly unlikely schedule where condition in
@@ -192,7 +192,7 @@ considerations.
 4) In the end timeline should be locally deleted on the safekeeper(s) which are
   in the old set but not in the new one, unless they are unreachable. To be
   safe this also should be done under generation number (deletion proceeds only if
-   current configuration is <= than one in request and safekeeper is not memeber of it).
+   current configuration is <= than one in request and safekeeper is not member of it).
 5) If current conf fetched on step 1 is already not joint and members equal to `desired_set`,
   jump to step 7, using it as `new_conf`.

@@ -261,14 +261,14 @@ Timeline (branch) creation in cplane should call storage_controller POST
 Response should be augmented with `safekeepers_generation` and `safekeepers`
 fields like described in `/notify-safekeepers` above. Initially (currently)
 these fields may be absent; in this case cplane chooses safekeepers on its own
-like it currently does. The call should be retried until succeeds.
+like it currently does. The call should be retried until it succeeds.

 Timeline deletion and tenant deletion in cplane should call appropriate
 storage_controller endpoints like it currently does for sharded tenants. The
 calls should be retried until they succeed.

-When compute receives safekeepers list from control plane it needs to know the
-generation to checked whether it should be updated (note that compute may get
+When compute receives safekeeper list from control plane it needs to know the
+generation to check whether it should be updated (note that compute may get
 safekeeper list from either cplane or safekeepers). Currently `neon.safekeepers`
 GUC is just a comma separates list of `host:port`. Let's prefix it with
 `g#<generation>:` to this end, so it will look like
@@ -305,8 +305,8 @@ enum MigrationRequest {
 ```

 `FinishPending` requests to run the procedure to ensure state is clean: current
-configuration is not joint and majority of safekeepers are aware of it, but do
-not attempt to migrate anywhere. If current configuration fetched on step 1 is
+configuration is not joint and the majority of safekeepers are aware of it, but do
+not attempt to migrate anywhere. If the current configuration fetched on step 1 is
 not joint it jumps to step 7. It should be run at startup for all timelines (but
 similarly, in the first version it is ok to trigger it manually).

@@ -315,7 +315,7 @@ similarly, in the first version it is ok to trigger it manually).
 `safekeepers` table mirroring current `nodes` should be added, except that for
 `scheduling_policy`: it is enough to have at least in the beginning only 3
 fields: 1) `active` 2) `paused` (initially means only not assign new tlis there
-3) `decomissioned` (node is removed).
+3) `decommissioned` (node is removed).

 `timelines` table:
 ```
@@ -326,9 +326,10 @@ table! {
        tenant_id -> Varchar,
        start_lsn -> pg_lsn,
        generation -> Int4,
-        sk_set -> Array<Int4>, // list of safekeeper ids
+        sk_set -> Array<Int8>, // list of safekeeper ids
        new_sk_set -> Nullable<Array<Int8>>, // list of safekeeper ids, null if not joint conf
        cplane_notified_generation -> Int4,
+        sk_set_notified_generation -> Int4, // the generation a quorum of sk_set knows about
        deleted_at -> Nullable<Timestamptz>,
    }
 }
@@ -338,13 +339,23 @@ table! {
 might also want to add ancestor_timeline_id to preserve the hierarchy, but for
 this RFC it is not needed.

+`cplane_notified_generation` and `sk_set_notified_generation` fields are used to
+track the last stage of the algorithm, when we need to notify safekeeper set and cplane
+with the final configuration after it's already committed to DB.
+
+The timeline is up-to-date (no migration in progress) if `new_sk_set` is null and
+`*_notified_generation` fields are up to date with `generation`. 
+
+It's possible to replace `*_notified_generation` with one boolean field `migration_completed`,
+but for better observability it's nice to have them separately.
+
 #### API

 Node management is similar to pageserver:
-1) POST `/control/v1/safekeepers` inserts safekeeper.
-2) GET `/control/v1/safekeepers` lists safekeepers.
-3) GET `/control/v1/safekeepers/:node_id` gets safekeeper.
-4) PUT `/control/v1/safekepers/:node_id/status` changes status to e.g.
+1) POST `/control/v1/safekeeper` inserts safekeeper.
+2) GET `/control/v1/safekeeper` lists safekeepers.
+3) GET `/control/v1/safekeeper/:node_id` gets safekeeper.
+4) PUT `/control/v1/safekeeper/:node_id/scheduling_policy` changes status to e.g.
   `offline` or `decomissioned`. Initially it is simpler not to schedule any
    migrations here.

@@ -368,8 +379,8 @@ Migration API: the first version is the simplest and the most imperative:
 all timelines from one safekeeper to another. It accepts json
 ```
 {
-    "src_sk": u32,
-    "dst_sk": u32,
+    "src_sk": NodeId,
+    "dst_sk": NodeId,
    "limit": Optional<u32>,
 }
 ```
@@ -379,12 +390,15 @@ Returns list of scheduled requests.
 2) PUT `/control/v1/tenant/:tenant_id/timeline/:timeline_id/safekeeper_migrate` schedules `MigrationRequest`
   to move single timeline to given set of safekeepers:
 ```
-{
-    "desired_set": Vec<u32>,
+struct TimelineSafekeeperMigrateRequest {
+    "new_sk_set": Vec<NodeId>,
 }
 ```

-Returns scheduled request.
+In the first version the handler migrates the timeline to `new_sk_set` synchronously.
+Should be retried until success.
+
+In the future we might change it to asynchronous API and return scheduled request.

 Similar call should be added for the tenant.

@@ -434,6 +448,9 @@ table! {
 }
 ```

+We load all pending ops from the table into memory on startup.
+The table is needed only to preserve the state between restarts.
+
 `op_type` can be `include` (seed from peers and ensure generation is up to
 date), `exclude` (remove locally) and `delete`. Field is actually not strictly
 needed as it can be computed from current configuration, but gives more explicit
@@ -474,7 +491,7 @@ actions must be idempotent. Now, a tricky point here is timeline start LSN. For
 the initial (tenant creation) call cplane doesn't know it. However, setting
 start_lsn on safekeepers during creation is a good thing -- it provides a
 guarantee that walproposer can always find a common point in WAL histories of
-safekeeper and its own, and so absense of it would be a clear sign of
+safekeeper and its own, and so absence of it would be a clear sign of
 corruption. The following sequence works:
 1) Create timeline (or observe that it exists) on pageserver,
   figuring out last_record_lsn in response.
@@ -497,11 +514,9 @@ corruption. The following sequence works:
   retries the call until 200 response.

   There is a small question how request handler (timeline creation in this
-   case) would interact with per sk reconciler. As always I prefer to do the
-   simplest possible thing and here it seems to be just waking it up so it
-   re-reads the db for work to do. Passing work in memory is faster, but
-   that shouldn't matter, and path to scan db for work will exist anyway, 
-   simpler to reuse it.
+   case) would interact with per sk reconciler. In the current implementation
+   we first persist the request in the DB, and then send an in-memory request
+   to each safekeeper reconciler to process it.

 For pg version / wal segment size: while we may persist them in `timelines`
 table, it is not necessary as initial creation at step 3 can take them from
@@ -509,30 +524,40 @@ pageserver or cplane creation call and later pull_timeline will carry them
 around.

 Timeline migration.
-1) CAS to the db to create joint conf, and in the same transaction create
-   `safekeeper_timeline_pending_ops` `include` entries to initialize new members
-   as well as deliver this conf to current ones; poke per sk reconcilers to work
-   on it. Also any conf change should also poke cplane notifier task(s).
-2) Once it becomes possible per alg description above, get out of joint conf
-   with another CAS. Task should get wakeups from per sk reconcilers because 
-   conf switch is required for advancement; however retries should be sleep
-   based as well as LSN advancement might be needed, though in happy path 
-   it isn't. To see whether further transition is possible on wakup migration
-   executor polls safekeepers per the algorithm. CAS creating new conf with only
-   new members should again insert entries to `safekeeper_timeline_pending_ops`
-   to switch them there, as well as `exclude` rows to remove timeline from 
-   old members.
+1) CAS to the db to create joint conf. From this moment on the migration is considered to be
+   "in progress". We can detect all "in-progress" migrations looking into the database.
+2) Do steps 4-6 from the algorithm, including `pull_timeline` onto `new_sk_set`, update membership
+   configuration on all safekeepers, notify cplane, etc. All operations are idempotent,
+   so we don't need to persist anything in the database at this stage. If any errors occur,
+   it's safe to retry or abort the migration.
+3) Once it becomes possible per alg description above, get out of joint conf
+   with another CAS. Also should insert `exclude` entries into `safekeeper_timeline_pending_ops`
+   in the same DB transaction. Adding `exclude` entries atomically is necessary because after
+   CAS we don't have the list of excluded safekeepers in the `timelines` table anymore, but we
+   need to have them persisted somewhere in case the migration is interrupted right after the CAS.
+4) Finish the migration. The final membership configuration is committed to the DB at this stage.
+   So, the migration can not be aborted anymore. But it can still be retried if the migration fails
+   past stage 3. To finish the migration we need to send the new membership configuration to
+   a new quorum of safekeepers, notify cplane with the new safekeeper list and schedule the `exclude`
+   requests to the in-memory queue for the safekeeper reconciler. If the algorithm is retried, it's
+   possible that we have already committed `exclude` requests to DB, but didn't send them to
+   the in-memory queue. In this case we need to read them from `safekeeper_timeline_pending_ops`
+   because it's the only place where they are persistent. The fields `sk_set_notified_generation`
+   and `cplane_notified_generation` are updated after each step. The migration is considered
+   fully completed when they match the `generation` field.
+
+In practice, we can report "success" after stage 3 and do the "finish" step in per-timeline
+reconciler (if we implement it). But it's wise to at least try to finish them synchronously,
+so the timeline is always in a "good state" and doesn't require an old quorum to commit
+WAL after the migration reported "success".

 Timeline deletion: just set `deleted_at` on the timeline row and insert
 `safekeeper_timeline_pending_ops` entries in the same xact, the rest is done by
 per sk reconcilers.

-When node is removed (set to `decomissioned`), `safekeeper_timeline_pending_ops`
+When node is removed (set to `decommissioned`), `safekeeper_timeline_pending_ops`
 for it must be cleared in the same transaction.

-One more task pool should infinitely retry notifying control plane about changed
-safekeeper sets (trying making `cplane_notified_generation` equal `generation`).
-
 #### Dealing with multiple instances of storage_controller

 Operations described above executed concurrently might create some errors but do
@@ -541,7 +566,7 @@ of storage_controller it is fine to have it temporarily, e.g. during redeploy.

 To harden against some controller instance creating some work in
 `safekeeper_timeline_pending_ops` and then disappearing without anyone pickup up
-the job per sk reconcilers apart from explicit wakups should scan for work
+the job per sk reconcilers apart from explicit wakeups should scan for work
 periodically. It is possible to remove that though if all db updates are
 protected with leadership token/term -- then such scans are needed only after
 leadership is acquired.
@@ -563,7 +588,7 @@ There should be following layers of tests:
   safekeeper communication and pull_timeline need to be mocked and main switch
   procedure wrapped to as a node (thread) in simulation tests, using these
   mocks. Test would inject migrations like it currently injects
-   safekeeper/walproposer restars. Main assert is the same -- committed WAL must
+   safekeeper/walproposer restarts. Main assert is the same -- committed WAL must
   not be lost.

 3) Since simulation testing injects at relatively high level points (not
@@ -613,7 +638,7 @@ Let's have the following implementation bits for gradual rollout:
  `notify-safekeepers`.

 Then the rollout for a region would be:
- Current situation: safekeepers are choosen by control_plane.
+- Current situation: safekeepers are chosen by control_plane.
 - We manually migrate some timelines, test moving them around.
 - Then we enable `--set-safekeepers` so that all new timelines
  are on storage controller.
--- a/docs/rfcs/044-feature-flag.md
+++ b/docs/rfcs/044-feature-flag.md
@@ -0,0 +1,179 @@
+# Storage Feature Flags
+
+In this RFC, we will describe how we will implement per-tenant feature flags.
+
+## PostHog as Feature Flag Service
+
+Before we start, let's talk about how current feature flag services work. PostHog is the feature flag service we are currently using across multiple user-facing components in the company. PostHog has two modes of operation: HTTP evaluation and server-side local evaluation.
+
+Let's assume we have a storage feature flag called gc-compaction and we want to roll it out to scale-tier users with resident size >= 10GB and <= 100GB.
+
+### Define User Profiles
+
+The first step is to synchronize our user profiles to the PostHog service. We can simply assume that each tenant is a user in PostHog. Each user profile has some properties associated with it. In our case, it will be: plan type (free, scale, enterprise, etc); resident size (in bytes); primary pageserver (string); region (string).
+
+### Define Feature Flags
+
+We would create a feature flag called gc-compaction in PostHog with 4 variants: disabled, stage-1, stage-2, fully-enabled. We will flip the feature flags from disabled to fully-enabled stage by stage for some percentage of our users.
+
+### Option 1: HTTP Evaluation Mode
+
+When using PostHog's HTTP evaluation mode, the client will make a request to the PostHog service, asking for the value of a feature flag for a specific user.
+
+* Control plane will report the plan type to PostHog each time it attaches a tenant to the storcon or when the user upgrades/downgrades. It calls the PostHog profile API to associate tenant ID with the plan type. Assume we have X active tenants and such attach or plan change event happens each week, that would be 4X profile update requests per month.
+* Pageservers will report the resident size and the primary pageserver to the PostHog service. Assume we report resident size every 24 hours, that would be 30X requests per month.
+* Each tenant will request the state of the feature flag every 1 hour, that's 720X requests per month.
+* The Rust client would be easy to implement as we only need to call the `/decide` API on PostHog.
+
+Using the HTTP evaluation mode we will issue 754X requests a month.
+
+### Option 2: Local Evaluation Mode
+
+When using PostHog's local evaluation mode, the client (usually the server in a browser/server architecture) will poll the feature flag configuration every 30s (default in the Python client) from PostHog. Such configuration contains data like:
+
+<details>
+
+<summary>Example JSON response from the PostHog local evaluation API</summary>
+
+```
+[
+    {
+        "id": 1,
+        "name": "Beta Feature",
+        "key": "person-flag",
+        "is_simple_flag": True,
+        "active": True,
+        "filters": {
+            "groups": [
+                {
+                    "properties": [
+                        {
+                            "key": "location",
+                            "operator": "exact",
+                            "value": ["Straße"],
+                            "type": "person",
+                        }
+                    ],
+                    "rollout_percentage": 100,
+                },
+                {
+                    "properties": [
+                        {
+                            "key": "star",
+                            "operator": "exact",
+                            "value": ["ſun"],
+                            "type": "person",
+                        }
+                    ],
+                    "rollout_percentage": 100,
+                },
+            ],
+        },
+    }
+]
+```
+
+</details>
+
+Note that the API only contains information like "under what condition => rollout percentage". The user is responsible for providing the required properties to the client for local evaluation, and the PostHog service (web UI) cannot know if a feature is enabled for the tenant or not until the client uses the `capture` API to report the result back. To control the rollout percentage, the user ID gets mapped to a float number in `[0, 1)` on a consistent hash ring. All values <= the percentage will get the feature enabled or set to the desired value.
+
+To use the local evaluation mode, the system needs:
+
+* Assume each pageserver will poll PostHog for the local evaluation JSON every 5 minutes (instead of the 30s default as it's too frequent). That's 8640Y per month, Y is the number of pageservers. Local evaluation requests cost 10x more than the normal decide request, so that's 86400Y request units to bill.
+* Storcon needs to store the plan type in the database and pass that information to the pageserver when attaching the tenant.
+* Storcon also needs to update PostHog with the active tenants, for example, when the tenant gets detached/attached. Assume each active tenant gets detached/attached every week, that would be 4X requests per month.
+* We do not need to update bill type or resident size to PostHog as all these are evaluated locally.
+* After each local evaluation of the feature flag, we need to call PostHog's capture event API to update the result of the evaluation that the feature is enabled. We can do this when the flag gets changed compared with the last cached state in memory. That would be at least 4X (assume we do deployment every week so the cache gets cleared) and maybe an additional multiplier of 10, assuming we have 10 active features.
+
+In this case, we will issue 86400Y + 40X requests per month.
+
+Assume X = 1,000,000 and Y = 100,
+
+|   | HTTP Evaluation  | Local Evaluation  |
+|---|---|---|
+| Latency of propagating the conditions/properties for feature flag  | 24 hours  | available locally  |
+| Latency of applying the feature flag  | 1 hour  | 5 minutes  |
+| Can properties be reported from different services |  Yes |  No  |
+| Do we need to sync billing info etc to pageserver |  No |  Yes  |
+| Cost | 75400$ / month | 4864$ / month |
+
+# Our Solution
+
+We will use PostHog _only_ as a UI to configure the feature flags. Whether a feature is enabled or not can only be queried through storcon/pageserver instead of using the PostHog UI. (We could report it back to PostHog via `capture_event` but it costs $$$.) This allows us to ramp up the feature flag functionality fast at first. At the same time, it would also give us the option to migrate to our own solution once we want to have more properties and more complex evaluation rules in our system.
+
+* We will create several fake users (tenants) in PostHog that contain all the properties we will use for evaluating a feature flag (i.e., resident size, billing type, pageserver id, etc.)
+* We will use PostHog's local evaluation API to poll the configuration of the feature flags and evaluate them locally on each of the pageservers.
+* The evaluation result will not be reported back to PostHog.
+* Storcon needs to pull some information from cplane database.
+* To know if a feature is currently enabled or not, we need to call the storcon/pageserver API; and we won't be able to know if a feature has been enabled on a tenant before easily: we need to look at the Grafana logs.
+
+We only need to pay for the 86400Y local evaluation requests (that would be setting X=0 in Option 2 => $864/month, and even less if we proxy it through storcon).
+
+## Implementation
+
+* Pageserver: implement a PostHog local evaluation client. The client will be shared across all tenants on the pageserver with a single API: `evaluate(tenant_id, feature_flag, properties) -> json`.
+* Storcon: if we need plan type as the evaluation condition, pull it from cplane database.
+* Storcon/Pageserver: implement an HTTP API `:tenant_id/feature/:feature` to retrieve the current feature flag status.
+* Storcon/Pageserver: a loop to update the feature flag spec on both storcon and pageserver. Pageserver loop will only be activated if storcon does not push the specs to the pageserver.
+
+## Difference from Tenant Config
+
+* Feature flags can be modified by percentage, and the default config for each feature flag can be modified in UI without going through the release process.
+* Feature flags are more flexible and won't be persisted anywhere and will be passed as plain JSON over the wire so that we do not need to handle backward/forward compatibility as in tenant config.
+* The expectation of tenant config is that once we add a flag we cannot remove it (or it will be hard to remove), but feature flags are more flexible.
+
+# Final Implementation
+
+* We added a new crate `posthog_lite_client` that supports local feature evaluations.
+* We set up two projects "Storage (staging)" and "Storage (production)" in the PostHog console.
+* Each pageserver reports 10 fake tenants to PostHog so that we can get all combinations of regions (and other properties) in the PostHog UI.
+* Supported properties: AZ, neon_region, pageserver, tenant_id.
+* You may use "Pageserver Feature Flags" dashboard to see the evaluation status.
+* The feature flag spec is polled on storcon every 30s (in each of the regions) and storcon will propagate the spec to the pageservers.
+* The pageserver housekeeping loop updates the tenant-specific properties (e.g., remote size) for evaluation.
+
+Each tenant has a `feature_resolver` object. After you add a feature flag in the PostHog console, you can retrieve it with:
+
+```rust
+// Boolean flag
+self
+    .feature_resolver
+    .evaluate_boolean("flag")
+    .is_ok()
+// Multivariate flag
+self
+    .feature_resolver
+    .evaluate_multivariate("gc-compaction-strategy")
+    .ok();
+```
+
+The user needs to handle the case where the evaluation result is an error. This can occur in a variety of cases:
+
+* During the pageserver start, the feature flag spec has not been retrieved.
+* No condition group is matched.
+* The feature flag spec contains an operand/operation not supported by the lite PostHog library.
+
+For boolean flags, the return value is `Result<(), Error>`. `Ok(())` means the flag is evaluated to true. Otherwise,
+there is either an error in evaluation or it does not match any groups.
+
+For multivariate flags, the return value is `Result<String, Error>`. `Ok(variant)` indicates the flag is evaluated
+to a variant. Otherwise, there is either an error in evaluation or it does not match any groups.
+
+The evaluation logic is documented in the PostHog lite library. It compares the consistent hash of a flag key + tenant_id
+with the rollout percentage and determines which tenant to roll out a specific feature.
+
+Users can use the feature flag evaluation API to get the flag evaluation result of a specific tenant for debugging purposes.
+
+```
+curl "http://localhost:9898/v1/tenant/:tenant_id/feature_flag?flag=:key&as=multivariate/boolean"
+```
+
+By default, the storcon pushes the feature flag specs to the pageservers every 30 seconds, which means that a change in feature flag in the
+PostHog UI will propagate to the pageservers within 30 seconds.
+
+# Future Works
+
+* Support dynamic tenant properties like logical size as the evaluation condition.
+* Support properties like `plan_type` (needs cplane to pass it down).
+* Report feature flag evaluation result back to PostHog (if the cost is okay).
+* Fast feature flag evaluation cache on critical paths (e.g., cache a feature flag result in `AtomicBool` and use it on the read path).
--- a/docs/rfcs/2025-03-17-compute-prewarm.md
+++ b/docs/rfcs/2025-03-17-compute-prewarm.md
@@ -0,0 +1,399 @@
+# Compute rolling restart with prewarm
+
+Created on 2025-03-17
+Implemented on _TBD_
+Author: Alexey Kondratov (@ololobus)
+
+## Summary
+
+This RFC describes an approach to reduce performance degradation due to missing caches after compute node restart, i.e.:
+
+1. Rolling restart of the running instance via 'warm' replica.
+2. Auto-prewarm compute caches after unplanned restart or scale-to-zero.
+
+## Motivation
+
+Neon currently implements several features that guarantee high uptime of compute nodes:
+
+1. Storage high-availability (HA), i.e. each tenant shard has a secondary pageserver location, so we can quickly switch over compute to it in case of primary pageserver failure.
+2. Fast compute provisioning, i.e. we have a fleet of pre-created empty computes, that are ready to serve workload, so restarting unresponsive compute is very fast.
+3. Preemptive NeonVM compute provisioning in case of k8s node unavailability.
+
+This helps us to be well-within the uptime SLO of 99.95% most of the time. Problems begin when we go up to multi-TB workloads and 32-64 CU computes.
+During restart, compute loses all caches: LFC, shared buffers, file system cache. Depending on the workload, it can take a lot of time to warm up the caches,
+so that performance could be degraded and might be even unacceptable for certain workloads. The latter means that although current approach works well for small to
+medium workloads, we still have to do some additional work to avoid performance degradation after restart of large instances.
+
+## Non Goals
+
+- Details of the persistent storage for prewarm data are out of scope; there is a separate RFC for that: <https://github.com/neondatabase/neon/pull/9661>.
+- Complete compute/Postgres HA setup and flow. Although it was originally in scope of this RFC, during preliminary research it appeared to be a rabbit hole, so it deserves a separate RFC.
+- Low-level implementation details for Postgres replica-to-primary promotion. There are a lot of things to think and care about: how to start walproposer, [logical replication failover](https://www.postgresql.org/docs/current/logical-replication-failover.html), and so on, but it deserves at least a separate one-pager design document, if not an RFC.
+
+## Impacted components
+
+Postgres, compute_ctl, Control plane, Endpoint storage for unlogged storage of compute files.
+For the latter, we will need to implement a uniform abstraction layer on top of S3, ABS, etc., but
+S3 is used in text interchangeably with 'endpoint storage' for simplicity.
+
+## Proposed implementation
+
+### compute_ctl spec changes and auto-prewarm
+
+We are going to extend the current compute spec with the following attributes
+
+```rust
+struct ComputeSpec {
+    /// [All existing attributes]
+    ...
+    /// Whether to do auto-prewarm at start or not.
+    /// Default to `false`.
+    pub lfc_auto_prewarm: bool
+    /// Interval in seconds between automatic dumps of
+    /// LFC state into S3. Default `None`, which means 'off'.
+    pub lfc_dump_interval_sec: Option<i32>
+}
+```
+
+When `lfc_dump_interval_sec` is set to `N`, `compute_ctl` will periodically dump the LFC state
+and store it in S3, so that it could be used either for auto-prewarm after restart or by replica
+during the rolling restart. For enabling periodic dumping, we should consider the following value
+`lfc_dump_interval_sec=300` (5 minutes), same as in the upstream's `pg_prewarm.autoprewarm_interval`.
+
+When `lfc_auto_prewarm` is set to `true`, `compute_ctl` will start prewarming the LFC upon restart
+if and only if one of the previous states is present in S3.
+
+### compute_ctl API
+
+1. `POST /store_lfc_state` -- dump LFC state using Postgres SQL interface and store result in S3.
+    This has to be a blocking call, i.e. it will return only after the state is stored in S3.
+    If there is any concurrent request in progress, we should return `429 Too Many Requests`,
+    and let the caller retry.
+
+2. `GET /dump_lfc_state` -- dump LFC state using Postgres SQL interface and return it as is
+    in text format suitable for the future restore/prewarm. This API is not strictly needed at
+    the end state, but could be useful for a faster prototyping of a complete rolling restart flow
+    with prewarm, as it doesn't require persistent storage for the LFC state.
+
+3. `POST /restore_lfc_state` -- restore/prewarm LFC state with request
+
+    ```yaml
+    RestoreLFCStateRequest:
+      oneOf:
+        - type: object
+          required:
+            - lfc_state
+          properties:
+            lfc_state:
+              type: string
+              description: Raw LFC content dumped with GET `/dump_lfc_state`
+        - type: object
+          required:
+            - lfc_cache_key
+          properties:
+            lfc_cache_key:
+              type: string
+              description: |
+                endpoint_id of the source endpoint on the same branch
+                to use as a 'donor' for LFC content. Compute will look up
+                LFC content dump in S3 using this key and do prewarm.
+    ```
+
+    where `lfc_state` and `lfc_cache_key` are mutually exclusive.
+
+    The actual prewarming will happen asynchronously, so the caller needs to check the
+    prewarm status using the compute's standard `GET /status` API.
+
+4. `GET /status` -- extend existing API with following attributes
+
+    ```rust
+    struct ComputeStatusResponse {
+        // [All existing attributes]
+        ...
+        pub prewarm_state: PrewarmState
+    }
+
+    /// Compute prewarm state. Will be stored in the shared Compute state
+    /// in compute_ctl
+    struct PrewarmState {
+        pub status: PrewarmStatus
+        /// Total number of pages to prewarm
+        pub pages_total: i64
+        /// Number of pages prewarmed so far
+        pub pages_processed: i64
+        /// Optional prewarm error
+        pub error: Option<String>
+    }
+
+    pub enum PrewarmStatus {
+        /// Prewarming was never requested on this compute
+        Off,
+        /// Prewarming was requested, but not started yet
+        Pending,
+        /// Prewarming is in progress. The caller should track progress via
+        /// `PrewarmState::pages_processed` and `PrewarmState::pages_total`.
+        InProgress,
+        /// Prewarming has been successfully completed
+        Completed,
+        /// Prewarming failed. The caller should look at
+        /// `PrewarmState::error` for the reason.
+        Failed,
+        /// It is intended to be used by auto-prewarm if none of
+        /// the previous LFC states is available in S3.
+        /// This is a distinct state from the `Failed` because
+        /// technically it's not a failure and could happen if
+        /// compute was restarted before it dumped anything into S3,
+        /// or just after the initial rollout of the feature.
+        Skipped,
+    }
+    ```
+
+5. `POST /promote` -- this is a **blocking** API call to promote compute replica into primary.
+    This API should be very similar to the existing `POST /configure` API, i.e. accept the
+    spec (primary spec, because originally compute was started as replica). It's a distinct
+    API method because semantics and response codes are different:
+
+    - If promotion is done successfully, it will return `200 OK`.
+    - If compute is already primary, the call will be no-op and `compute_ctl`
+      will return `412 Precondition Failed`.
+    - If, for some reason, second request reaches compute that is in progress of promotion,
+      it will respond with `429 Too Many Requests`.
+    - If compute hit any permanent failure during promotion `500 Internal Server Error`
+      will be returned.
+
+### Control plane operations
+
+The complete flow will be present as a sequence diagram in the next section, but here
+we just want to list some important steps that have to be done by control plane during
+the rolling restart via warm replica, but without much of low-level implementation details.
+
+1. Register the 'intent' of the instance restart, but do not yet interrupt any workload at
+    the primary, and keep accepting new connections. This may require some endpoint state machine
+    changes, e.g. introduction of the `pending_restart` state. Being in this state also
+    **mustn't prevent any other operations except restart**: suspend, live-reconfiguration
+    (e.g. due to notify-attach call from the storage controller), deletion.
+
+2. Start new replica compute on the same timeline and start prewarming it. This process
+    may take quite a while, so the same concurrency considerations as in 1. should be applied
+    here as well.
+
+3. When warm replica is ready, control plane should:
+
+    3.1. Terminate the primary compute. Starting from here, **this is a critical section**,
+        if anything goes off, the only option is to start the primary normally and proceed
+        with auto-prewarm.
+
+    3.2. Send cache invalidation message to all proxies, notifying them that all new connections
+        should request and wait for the new connection details. At this stage, proxy has to also
+        drop any existing connections to the old primary, so that they don't perform stale reads.
+
+    3.3. Attach warm replica compute to the primary endpoint inside control plane metadata
+        database.
+
+    3.4. Promote replica to primary.
+
+    3.5. When everything is done, finalize the endpoint state to be just `active`.
+
+### Complete rolling restart flow
+
+```mermaid
+  sequenceDiagram
+
+  autonumber
+
+  participant proxy as Neon proxy
+
+  participant cplane as Control plane
+
+  participant primary as Compute (primary)
+  box Compute (replica)
+    participant ctl as compute_ctl
+    participant pg as Postgres
+  end
+
+  box Endpoint unlogged storage
+    participant s3proxy as Endpoint storage service
+    participant s3 as S3/ABS/etc.
+  end
+
+
+  cplane ->> primary: POST /store_lfc_state
+  primary -->> cplane: 200 OK
+
+  cplane ->> ctl: POST /restore_lfc_state
+  activate ctl
+  ctl -->> cplane: 202 Accepted
+
+  activate cplane
+  cplane ->> ctl: GET /status: poll prewarm status
+  ctl ->> s3proxy: GET /read_file
+  s3proxy ->> s3: read file
+  s3 -->> s3proxy: file content
+  s3proxy -->> ctl: 200 OK: file content
+
+  proxy ->> cplane: GET /proxy_wake_compute
+  cplane -->> proxy: 200 OK: old primary conninfo
+
+  ctl ->> pg: prewarm LFC
+  activate pg
+  pg -->> ctl: prewarm is completed
+  deactivate pg
+
+  ctl -->> cplane: 200 OK: prewarm is completed
+  deactivate ctl
+  deactivate cplane
+
+  cplane -->> cplane: reassign replica compute to endpoint,<br>start terminating the old primary compute
+  activate cplane
+  cplane ->> proxy: invalidate caches
+
+  proxy ->> cplane: GET /proxy_wake_compute
+
+  cplane -x primary: POST /terminate
+  primary -->> cplane: 200 OK
+  note over primary: old primary<br>compute terminated
+
+  cplane ->> ctl: POST /promote
+  activate ctl
+  ctl ->> pg: pg_ctl promote
+  activate pg
+  pg -->> ctl: done
+  deactivate pg
+  ctl -->> cplane: 200 OK
+  deactivate ctl
+
+  cplane -->> cplane: finalize operation
+  cplane -->> proxy: 200 OK: new primary conninfo
+  deactivate cplane
+```
+
+### Network bandwidth and prewarm speed
+
+It's currently known that pageserver can sustain about 3000 RPS per shard for a few running computes.
+Large tenants are usually split into 8 shards, so the final formula may look like this:
+
+```text
+8 shards * 3000 RPS * 8 KB =~ 190 MB/s
+```
+
+so depending on the LFC size, prewarming will take at least:
+
+- ~5s for 1 GB
+- ~50s for 10 GB
+- ~9m for 100 GB
+- \>1h for 1 TB
+
+In total, one pageserver is normally capped by 30k RPS, so it obviously can't sustain many computes
+doing prewarm at the same time. Later, we may need an additional mechanism for computes to throttle
+the prewarming requests gracefully.
+
+### Reliability, failure modes and corner cases
+
+We consider the following failures while implementing this RFC:
+
+1. Compute got interrupted/crashed/restarted during prewarm. The caller -- control plane -- should
+    detect that and start prewarm from the beginning.
+
+2. Control plane promotion request timed out or hit network issues. If it never reached the
+    compute, control plane should just repeat it. If it did reach the compute, then during
+    retry control plane can hit `429` as previous request triggered the promotion already.
+    In this case, control plane needs to retry until either `200` or
+    permanent error `500` is returned.
+
+3. Compute got interrupted/crashed/restarted during promotion. At restart it will ask for
+    a spec from control plane, and its content should signal compute to start as **primary**,
+    so it's expected that control plane will continue polling for certain period of time and
+    will discover that compute is ready to accept connections if restart is fast enough.
+
+4. Any other unexpected failure or timeout during prewarming. This **failure mustn't be fatal**,
+    control plane has to report failure, terminate replica and keep primary running.
+
+5. Any other unexpected failure or timeout during promotion. Unfortunately, at this moment
+    we already have the primary node stopped, so the only option is to start primary again
+    and proceed with auto-prewarm.
+
+6. Any unexpected failure during auto-prewarm. This **failure mustn't be fatal**,
+    `compute_ctl` has to report the failure, but do not crash the compute.
+
+7. Control plane failed to confirm that old primary has terminated. This can happen, especially
+    in the future HA setup. In this case, control plane has to ensure that it sent VM deletion
+    and pod termination requests to k8s, so long-term we do not have two running primaries
+    on the same timeline.
+
+### Security implications
+
+There are two security implications to consider:
+
+1. Access to `compute_ctl` API. It has to be accessible from the outside of compute, so all
+    new API methods have to be exposed on the **external** HTTP port and **must** be authenticated
+    with JWT.
+
+2. Read/write only your own LFC state data in S3. Although it's not really a security concern,
+    since LFC state is just a mapping of blocks present in LFC at certain moment in time;
+    it still has to be highly restricted, so that i) only computes on the same timeline can
+    read S3 state; ii) each compute can only write to the path that contains its `endpoint_id`.
+    Both of these must be validated by Endpoint storage service using the JWT token provided by `compute_ctl`.
+
+### Unresolved questions
+
+#### Billing, metrics and monitoring
+
+Currently, we only label computes with `endpoint_id` after attaching them to the endpoint.
+In this proposal, this means that temporary replica will remain unlabelled until it's promoted
+to primary. We can also hide it from users in the control plane API, but what to do with
+billing and monitoring is still unclear.
+
+We can probably mark it as 'billable' and tag with `project_id`, so it will be billed, but
+not interfere in any way with the current primary monitoring.
+
+Another thing to consider is how logs and metrics export will switch to the new compute.
+It's expected that OpenTelemetry collector will auto-discover the new compute and start
+scraping metrics from it.
+
+#### Auto-prewarm
+
+It's still an open question whether we need auto-prewarm at all. The author's gut-feeling is
+that yes, we need it, but maybe not for all workloads, so it could end up exposed as a
+user-controllable knob on the endpoint. There are two arguments for that:
+
+1. Auto-prewarm exists in upstream's `pg_prewarm`, _probably for a reason_.
+
+2. There could still be two flows where we cannot perform the rolling restart via the warm
+    replica: i) any failure or interruption during promotion; ii) wake up after scale-to-zero.
+    The latter might be challenged as well, i.e. one can argue that auto-prewarm may and will
+    compete with user-workload for storage resources. This is correct, but it might as well
+    reduce the time to get warm LFC and good performance.
+
+#### Low-level details of the replica promotion
+
+There are many things to consider here, but three items just off the top of my head:
+
+1. How to properly start the `walproposer` inside Postgres.
+
+2. What to do with logical replication. Currently, we do not include logical replication slots
+    inside basebackup, because nobody advances them at replica, so they just prevent the WAL
+    deletion. Yet, we do need to have them at primary after promotion. Starting with Postgres 17,
+    there is a new feature called
+    [logical replication failover](https://www.postgresql.org/docs/current/logical-replication-failover.html)
+    and `synchronized_standby_slots` setting, but we need a plan for the older versions. Should we
+    request a new basebackup during promotion?
+
+3. How do we guarantee that replica will receive all the latest WAL from safekeepers? Do some
+    'shallow' version of sync safekeepers without data copying? Or just a standard version of
+    sync safekeepers?
+
+## Alternative implementation
+
+The proposal already assumes one of the alternatives -- do not have any persistent storage for
+LFC state. This is possible to implement faster with the proposed API, but it means that
+we do not implement auto-prewarm yet.
+
+## Definition of Done
+
+At the end of implementing this RFC we should have two high-level settings that enable:
+
+1. Auto-prewarm of user computes upon restart.
+2. Perform primary compute restart via the warm replica promotion.
+
+It also has to be decided what the criteria are for enabling one or both of these flows for
+certain clients.
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -58,7 +58,7 @@ pub enum LfcPrewarmState {
    },
 }

-#[derive(Serialize, Default, Debug, Clone)]
+#[derive(Serialize, Default, Debug, Clone, PartialEq)]
 #[serde(tag = "status", rename_all = "snake_case")]
 pub enum LfcOffloadState {
    #[default]
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -181,10 +181,14 @@ pub struct ComputeSpec {
    /// JWT for authorizing requests to endpoint storage service
    pub endpoint_storage_token: Option<String>,

-    /// Download LFC state from endpoint_storage and pass it to Postgres on startup
    #[serde(default)]
+    /// Download LFC state from endpoint storage and pass it to Postgres on compute startup
    pub autoprewarm: bool,

+    #[serde(default)]
+    /// Upload LFC state to endpoint storage periodically. Default value (None) means "don't upload"
+    pub offload_lfc_interval_seconds: Option<std::num::NonZeroU64>,
+
    /// Suspend timeout in seconds.
    ///
    /// We use this value to derive other values, such as the installed extensions metric.
--- a/libs/compute_api/tests/cluster_spec.json
+++ b/libs/compute_api/tests/cluster_spec.json
@@ -90,6 +90,11 @@
                "value": "off",
                "vartype": "bool"
            },
+            {
+                "name": "offload_lfc_interval_seconds",
+                "value": "20",
+                "vartype": "integer"
+            },
            {
                "name": "neon.safekeepers",
                "value": "127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501",
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -386,6 +386,7 @@ pub enum NodeSchedulingPolicy {
    Pause,
    PauseForRestart,
    Draining,
+    Deleting,
 }

 impl FromStr for NodeSchedulingPolicy {
@@ -398,6 +399,7 @@ impl FromStr for NodeSchedulingPolicy {
            "pause" => Ok(Self::Pause),
            "pause_for_restart" => Ok(Self::PauseForRestart),
            "draining" => Ok(Self::Draining),
+            "deleting" => Ok(Self::Deleting),
            _ => Err(anyhow::anyhow!("Unknown scheduling state '{s}'")),
        }
    }
@@ -412,6 +414,7 @@ impl From<NodeSchedulingPolicy> for String {
            Pause => "pause",
            PauseForRestart => "pause_for_restart",
            Draining => "draining",
+            Deleting => "deleting",
        }
        .to_string()
    }
@@ -420,6 +423,7 @@ impl From<NodeSchedulingPolicy> for String {
 #[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
 pub enum SkSchedulingPolicy {
    Active,
+    Activating,
    Pause,
    Decomissioned,
 }
@@ -430,6 +434,7 @@ impl FromStr for SkSchedulingPolicy {
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        Ok(match s {
            "active" => Self::Active,
+            "activating" => Self::Activating,
            "pause" => Self::Pause,
            "decomissioned" => Self::Decomissioned,
            _ => {
@@ -446,6 +451,7 @@ impl From<SkSchedulingPolicy> for String {
        use SkSchedulingPolicy::*;
        match value {
            Active => "active",
+            Activating => "activating",
            Pause => "pause",
            Decomissioned => "decomissioned",
        }
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -78,7 +78,13 @@ pub fn is_expected_io_error(e: &io::Error) -> bool {
    use io::ErrorKind::*;
    matches!(
        e.kind(),
-        BrokenPipe | ConnectionRefused | ConnectionAborted | ConnectionReset | TimedOut
+        HostUnreachable
+            | NetworkUnreachable
+            | BrokenPipe
+            | ConnectionRefused
+            | ConnectionAborted
+            | ConnectionReset
+            | TimedOut,
    )
 }

--- a/libs/proxy/json/Cargo.toml
+++ b/libs/proxy/json/Cargo.toml
@@ -0,0 +1,12 @@
+[package]
+name = "json"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+[dependencies]
+ryu = "1"
+itoa = "1"
+
+[dev-dependencies]
+futures = "0.3"
--- a/libs/proxy/json/src/lib.rs
+++ b/libs/proxy/json/src/lib.rs
@@ -0,0 +1,412 @@
+//! A JSON serialization lib, designed for more flexibility than `serde_json` offers.
+//!
+//! Features:
+//!
+//! ## Dynamic construction
+//!
+//! Sometimes you have dynamic values you want to serialize, that are not already in a serde-aware model like a struct or a Vec etc.
+//! To achieve this with serde, you need to implement a lot of different traits on a lot of different new-types.
+//! Because of this, it's often easier to give-in and pull all the data into a serde-aware model (`serde_json::Value` or some intermediate struct),
+//! but that is often not very efficient.
+//!
+//! This crate allows full control over the JSON encoding without needing to implement any extra traits. Just call the
+//! relevant functions, and it will guarantee a correctly encoded JSON value.
+//!
+//! ## Async construction
+//!
+//! Similar to the above, sometimes the values arrive asynchronously. Often collecting those values in memory
+//! is more expensive than writing them as JSON, since the overhead of `Vec` and `String` is much higher, however
+//! there are exceptions.
+//!
+//! Serializing to JSON all in one go is also more CPU intensive and can cause lag spikes,
+//! whereas serializing values incrementally spreads out the CPU load and reduces lag.
+//!
+//! ## Examples
+//!
+//! To represent the following JSON as a compact string
+//!
+//! ```json
+//! {
+//!   "results": {
+//!     "rows": [
+//!       {
+//!         "id": 1,
+//!         "value": null
+//!       },
+//!       {
+//!         "id": 2,
+//!         "value": "hello"
+//!       }
+//!     ]
+//!   }
+//! }
+//! ```
+//!
+//! We can use the following code:
+//!
+//! ```
+//! // create the outer object
+//! let s = json::value_to_string!(|v| json::value_as_object!(|v| {
+//!     // create an entry with key "results" and start an object value associated with it.
+//!     let results = v.key("results");
+//!     json::value_as_object!(|results| {
+//!         // create an entry with key "rows" and start a list value associated with it.
+//!         let rows = results.key("rows");
+//!         json::value_as_list!(|rows| {
+//!             // create a list entry and start an object value associated with it.
+//!             let row = rows.entry();
+//!             json::value_as_object!(|row| {
+//!                 // add entry "id": 1
+//!                 row.entry("id", 1);
+//!                 // add entry "value": null
+//!                 row.entry("value", json::Null);
+//!             });
+//!
+//!             // create a list entry and start an object value associated with it.
+//!             let row = rows.entry();
+//!             json::value_as_object!(|row| {
+//!                 // add entry "id": 2
+//!                 row.entry("id", 2);
+//!                 // add entry "value": "hello"
+//!                 row.entry("value", "hello");
+//!             });
+//!         });
+//!     });
+//! }));
+//!
+//! assert_eq!(s, r#"{"results":{"rows":[{"id":1,"value":null},{"id":2,"value":"hello"}]}}"#);
+//! ```
+
+mod macros;
+mod str;
+mod value;
+
+pub use value::{Null, ValueEncoder};
+
+#[must_use]
+/// Serialize a single json value.
+pub struct ValueSer<'buf> {
+    buf: &'buf mut Vec<u8>,
+    start: usize,
+}
+
+impl<'buf> ValueSer<'buf> {
+    /// Create a new json value serializer.
+    pub fn new(buf: &'buf mut Vec<u8>) -> Self {
+        Self { buf, start: 0 }
+    }
+
+    /// Borrow the underlying buffer
+    pub fn as_buffer(&self) -> &[u8] {
+        self.buf
+    }
+
+    #[inline]
+    pub fn value(self, e: impl ValueEncoder) {
+        e.encode(self);
+    }
+
+    /// Write raw bytes to the buf. This must be already JSON encoded.
+    #[inline]
+    pub fn write_raw_json(self, data: &[u8]) {
+        self.buf.extend_from_slice(data);
+        self.finish();
+    }
+
+    /// Start a new object serializer.
+    #[inline]
+    pub fn object(self) -> ObjectSer<'buf> {
+        ObjectSer::new(self)
+    }
+
+    /// Start a new list serializer.
+    #[inline]
+    pub fn list(self) -> ListSer<'buf> {
+        ListSer::new(self)
+    }
+
+    /// Finish the value ser.
+    #[inline]
+    fn finish(self) {
+        // don't trigger the drop handler which triggers a rollback.
+        // this won't cause memory leaks because `ValueSer` owns no allocations.
+        std::mem::forget(self);
+    }
+}
+
+impl Drop for ValueSer<'_> {
+    fn drop(&mut self) {
+        self.buf.truncate(self.start);
+    }
+}
+
+#[must_use]
+/// Serialize a json object.
+pub struct ObjectSer<'buf> {
+    value: ValueSer<'buf>,
+    start: usize,
+}
+
+impl<'buf> ObjectSer<'buf> {
+    /// Start a new object serializer.
+    #[inline]
+    pub fn new(value: ValueSer<'buf>) -> Self {
+        value.buf.push(b'{');
+        let start = value.buf.len();
+        Self { value, start }
+    }
+
+    /// Borrow the underlying buffer
+    pub fn as_buffer(&self) -> &[u8] {
+        self.value.as_buffer()
+    }
+
+    /// Start a new object entry with the given string key, returning a [`ValueSer`] for the associated value.
+    #[inline]
+    pub fn key(&mut self, key: impl KeyEncoder) -> ValueSer<'_> {
+        key.write_key(self)
+    }
+
+    /// Write an entry (key-value pair) to the object.
+    #[inline]
+    pub fn entry(&mut self, key: impl KeyEncoder, val: impl ValueEncoder) {
+        self.key(key).value(val);
+    }
+
+    #[inline]
+    fn entry_inner(&mut self, f: impl FnOnce(&mut Vec<u8>)) -> ValueSer<'_> {
+        // track before the separator so that if the value is rolled back it also removes the separator.
+        let start = self.value.buf.len();
+
+        // push separator if necessary
+        if self.value.buf.len() > self.start {
+            self.value.buf.push(b',');
+        }
+        // push key
+        f(self.value.buf);
+        // push value separator
+        self.value.buf.push(b':');
+
+        // return value writer.
+        ValueSer {
+            buf: self.value.buf,
+            start,
+        }
+    }
+
+    /// Reset the buffer back to before this object was started.
+    #[inline]
+    pub fn rollback(self) -> ValueSer<'buf> {
+        // Do not fully reset the value, only reset it to before the `{`.
+        // This ensures any `,` before this value are not clobbered.
+        self.value.buf.truncate(self.start - 1);
+        self.value
+    }
+
+    /// Finish the object ser.
+    #[inline]
+    pub fn finish(self) {
+        self.value.buf.push(b'}');
+        self.value.finish();
+    }
+}
+
+pub trait KeyEncoder {
+    fn write_key<'a>(self, obj: &'a mut ObjectSer) -> ValueSer<'a>;
+}
+
+#[must_use]
+/// Serialize a json list.
+pub struct ListSer<'buf> {
+    value: ValueSer<'buf>,
+    start: usize,
+}
+
+impl<'buf> ListSer<'buf> {
+    /// Start a new list serializer.
+    #[inline]
+    pub fn new(value: ValueSer<'buf>) -> Self {
+        value.buf.push(b'[');
+        let start = value.buf.len();
+        Self { value, start }
+    }
+
+    /// Borrow the underlying buffer
+    pub fn as_buffer(&self) -> &[u8] {
+        self.value.as_buffer()
+    }
+
+    /// Write a value to the list.
+    #[inline]
+    pub fn push(&mut self, val: impl ValueEncoder) {
+        self.entry().value(val);
+    }
+
+    /// Start a new value entry in this list.
+    #[inline]
+    pub fn entry(&mut self) -> ValueSer<'_> {
+        // track before the separator so that if the value is rolled back it also removes the separator.
+        let start = self.value.buf.len();
+
+        // push separator if necessary
+        if self.value.buf.len() > self.start {
+            self.value.buf.push(b',');
+        }
+
+        // return value writer.
+        ValueSer {
+            buf: self.value.buf,
+            start,
+        }
+    }
+
+    /// Reset the buffer back to before this list was started.
+    #[inline]
+    pub fn rollback(self) -> ValueSer<'buf> {
+        // Do not fully reset the value, only reset it to before the `[`.
+        // This ensures any `,` before this value are not clobbered.
+        self.value.buf.truncate(self.start - 1);
+        self.value
+    }
+
+    /// Finish the list ser.
+    #[inline]
+    pub fn finish(self) {
+        self.value.buf.push(b']');
+        self.value.finish();
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::{Null, ValueSer};
+
+    #[test]
+    fn object() {
+        let mut buf = vec![];
+        let mut object = ValueSer::new(&mut buf).object();
+        object.entry("foo", "bar");
+        object.entry("baz", Null);
+        object.finish();
+
+        assert_eq!(buf, br#"{"foo":"bar","baz":null}"#);
+    }
+
+    #[test]
+    fn list() {
+        let mut buf = vec![];
+        let mut list = ValueSer::new(&mut buf).list();
+        list.entry().value("bar");
+        list.entry().value(Null);
+        list.finish();
+
+        assert_eq!(buf, br#"["bar",null]"#);
+    }
+
+    #[test]
+    fn object_macro() {
+        let res = crate::value_to_string!(|obj| {
+            crate::value_as_object!(|obj| {
+                obj.entry("foo", "bar");
+                obj.entry("baz", Null);
+            })
+        });
+
+        assert_eq!(res, r#"{"foo":"bar","baz":null}"#);
+    }
+
+    #[test]
+    fn list_macro() {
+        let res = crate::value_to_string!(|list| {
+            crate::value_as_list!(|list| {
+                list.entry().value("bar");
+                list.entry().value(Null);
+            })
+        });
+
+        assert_eq!(res, r#"["bar",null]"#);
+    }
+
+    #[test]
+    fn rollback_on_drop() {
+        let res = crate::value_to_string!(|list| {
+            crate::value_as_list!(|list| {
+                list.entry().value("bar");
+
+                'cancel: {
+                    let nested_list = list.entry();
+                    crate::value_as_list!(|nested_list| {
+                        nested_list.entry().value(1);
+
+                        assert_eq!(nested_list.as_buffer(), br#"["bar",[1"#);
+                        if true {
+                            break 'cancel;
+                        }
+                    })
+                }
+
+                assert_eq!(list.as_buffer(), br#"["bar""#);
+
+                list.entry().value(Null);
+            })
+        });
+
+        assert_eq!(res, r#"["bar",null]"#);
+    }
+
+    #[test]
+    fn rollback_object() {
+        let res = crate::value_to_string!(|obj| {
+            crate::value_as_object!(|obj| {
+                let entry = obj.key("1");
+                entry.value(1_i32);
+
+                let entry = obj.key("2");
+                let entry = {
+                    let mut nested_obj = entry.object();
+                    nested_obj.entry("foo", "bar");
+                    nested_obj.rollback()
+                };
+
+                entry.value(2_i32);
+            })
+        });
+
+        assert_eq!(res, r#"{"1":1,"2":2}"#);
+    }
+
+    #[test]
+    fn rollback_list() {
+        let res = crate::value_to_string!(|list| {
+            crate::value_as_list!(|list| {
+                let entry = list.entry();
+                entry.value(1_i32);
+
+                let entry = list.entry();
+                let entry = {
+                    let mut nested_list = entry.list();
+                    nested_list.push("foo");
+                    nested_list.rollback()
+                };
+
+                entry.value(2_i32);
+            })
+        });
+
+        assert_eq!(res, r#"[1,2]"#);
+    }
+
+    #[test]
+    fn string_escaping() {
+        let mut buf = vec![];
+        let mut object = ValueSer::new(&mut buf).object();
+
+        let key = "hello";
+        let value = "\n world";
+
+        object.entry(format_args!("{key:?}"), value);
+        object.finish();
+
+        assert_eq!(buf, br#"{"\"hello\"":"\n world"}"#);
+    }
+}
--- a/libs/proxy/json/src/macros.rs
+++ b/libs/proxy/json/src/macros.rs
@@ -0,0 +1,86 @@
+//! # Examples
+//!
+//! ```
+//! use futures::{StreamExt, TryStream, TryStreamExt};
+//!
+//! async fn stream_to_json_list<S, T, E>(mut s: S) -> Result<String, E>
+//! where
+//!     S: TryStream<Ok = T, Error = E> + Unpin,
+//!     T: json::ValueEncoder
+//! {
+//!     Ok(json::value_to_string!(|val| json::value_as_list!(|val| {
+//!         // note how we can use `.await` and `?` in here.
+//!         while let Some(value) = s.try_next().await? {
+//!             val.push(value);
+//!         }
+//!     })))
+//! }
+//!
+//! let stream = futures::stream::iter([1, 2, 3]).map(Ok::<i32, ()>);
+//! let json_string = futures::executor::block_on(stream_to_json_list(stream)).unwrap();
+//! assert_eq!(json_string, "[1,2,3]");
+//! ```
+
+/// A helper that serializes JSON into a new `Vec<u8>`: `$val` is a [`ValueSer`](crate::ValueSer) writing into it.
+///
+/// Implemented as a macro so that `.await`, `?`, `break` and `return` in `$body` act on the enclosing code.
+#[macro_export]
+macro_rules! value_to_vec {
+    (|$val:ident| $body:expr) => {{
+        let mut buf = vec![];
+        let $val = $crate::ValueSer::new(&mut buf);
+        let _: () = $body;
+        buf
+    }};
+}
+
+/// Like [`value_to_vec!`], but converts the buffer into a [`String`], panicking if it is not valid UTF-8.
+///
+/// Implemented as a macro so that `.await`, `?`, `break` and `return` in `$body` act on the enclosing code.
+#[macro_export]
+macro_rules! value_to_string {
+    (|$val:ident| $body:expr) => {{
+        ::std::string::String::from_utf8($crate::value_to_vec!(|$val| $body))
+            .expect("json should be valid utf8")
+    }};
+}
+
+/// A helper that ensures the [`ObjectSer::finish`](crate::ObjectSer::finish) method is called on completion.
+///
+/// Consumes `$val` and rebinds it, inside the body, to a `&mut` [`ObjectSer`](crate::ObjectSer) serializer.
+/// The serializer is only 'finished' if the body completes; the body's value is then returned.
+/// The serializer is rolled back if `break`/`return` escapes the body.
+///
+/// Implemented as a macro to preserve all control flow.
+#[macro_export]
+macro_rules! value_as_object {
+    (|$val:ident| $body:expr) => {{
+        let mut obj = $crate::ObjectSer::new($val);
+
+        let $val = &mut obj;
+        let res = $body;
+
+        obj.finish();
+        res
+    }};
+}
+
+/// A helper that ensures the [`ListSer::finish`](crate::ListSer::finish) method is called on completion.
+///
+/// Consumes `$val` and rebinds it, inside the body, to a `&mut` [`ListSer`](crate::ListSer) serializer.
+/// The serializer is only 'finished' if the body completes; the body's value is then returned.
+/// The serializer is rolled back if `break`/`return` escapes the body.
+///
+/// Implemented as a macro to preserve all control flow.
+#[macro_export]
+macro_rules! value_as_list {
+    (|$val:ident| $body:expr) => {{
+        let mut list = $crate::ListSer::new($val);
+
+        let $val = &mut list;
+        let res = $body;
+
+        list.finish();
+        res
+    }};
+}
--- a/libs/proxy/json/src/str.rs
+++ b/libs/proxy/json/src/str.rs
@@ -0,0 +1,166 @@
+//! Helpers for serializing escaped strings.
+//!
+//! ## License
+//!
+//! <https://github.com/serde-rs/json/blob/c1826ebcccb1a520389c6b78ad3da15db279220d/src/ser.rs#L1514-L1552>
+//! <https://github.com/serde-rs/json/blob/c1826ebcccb1a520389c6b78ad3da15db279220d/src/ser.rs#L2081-L2157>
+//! Licensed by David Tolnay under MIT or Apache-2.0.
+//!
+//! With modifications by Conrad Ludgate on behalf of Databricks.
+
+use std::fmt::{self, Write};
+
+/// Represents a character escape code in a type-safe manner.
+pub enum CharEscape {
+    /// An escaped quote `"`
+    Quote,
+    /// An escaped reverse solidus `\`
+    ReverseSolidus,
+    // /// An escaped solidus `/`
+    // Solidus,
+    /// An escaped backspace character (usually escaped as `\b`)
+    Backspace,
+    /// An escaped form feed character (usually escaped as `\f`)
+    FormFeed,
+    /// An escaped line feed character (usually escaped as `\n`)
+    LineFeed,
+    /// An escaped carriage return character (usually escaped as `\r`)
+    CarriageReturn,
+    /// An escaped tab character (usually escaped as `\t`)
+    Tab,
+    /// An escaped ASCII plane control character (usually escaped as
+    /// `\u00XX` where `XX` are two hex characters)
+    AsciiControl(u8),
+}
+
+impl CharEscape {
+    #[inline]
+    fn from_escape_table(escape: u8, byte: u8) -> CharEscape {
+        match escape {
+            self::BB => CharEscape::Backspace,
+            self::TT => CharEscape::Tab,
+            self::NN => CharEscape::LineFeed,
+            self::FF => CharEscape::FormFeed,
+            self::RR => CharEscape::CarriageReturn,
+            self::QU => CharEscape::Quote,
+            self::BS => CharEscape::ReverseSolidus,
+            self::UU => CharEscape::AsciiControl(byte),
+            _ => unreachable!(),
+        }
+    }
+}
+
+pub(crate) fn format_escaped_str(writer: &mut Vec<u8>, value: &str) {
+    writer.reserve(2 + value.len());
+
+    writer.push(b'"');
+
+    let rest = format_escaped_str_contents(writer, value);
+    writer.extend_from_slice(rest);
+
+    writer.push(b'"');
+}
+
+pub(crate) fn format_escaped_fmt(writer: &mut Vec<u8>, args: fmt::Arguments) {
+    writer.push(b'"');
+
+    Collect { buf: writer }
+        .write_fmt(args)
+        .expect("formatting should not error");
+
+    writer.push(b'"');
+}
+
+struct Collect<'buf> {
+    buf: &'buf mut Vec<u8>,
+}
+
+impl fmt::Write for Collect<'_> {
+    fn write_str(&mut self, s: &str) -> fmt::Result {
+        let last = format_escaped_str_contents(self.buf, s);
+        self.buf.extend(last);
+        Ok(())
+    }
+}
+
+// Copies `value` into `writer` up to and including its last escaped byte (as escape sequences); returns the unescaped tail for the caller to append.
+fn format_escaped_str_contents<'a>(writer: &mut Vec<u8>, value: &'a str) -> &'a [u8] {
+    let bytes = value.as_bytes();
+
+    let mut start = 0;
+
+    for (i, &byte) in bytes.iter().enumerate() {
+        let escape = ESCAPE[byte as usize];
+        if escape == 0 {
+            continue;
+        }
+
+        writer.extend_from_slice(&bytes[start..i]);
+
+        let char_escape = CharEscape::from_escape_table(escape, byte);
+        write_char_escape(writer, char_escape);
+
+        start = i + 1;
+    }
+
+    &bytes[start..]
+}
+
+const BB: u8 = b'b'; // \x08
+const TT: u8 = b't'; // \x09
+const NN: u8 = b'n'; // \x0A
+const FF: u8 = b'f'; // \x0C
+const RR: u8 = b'r'; // \x0D
+const QU: u8 = b'"'; // \x22
+const BS: u8 = b'\\'; // \x5C
+const UU: u8 = b'u'; // \x00...\x1F except the ones above
+const __: u8 = 0;
+
+// Lookup table of escape sequences. A value of b'x' at index i means that byte
+// i is escaped as "\x" in JSON. A value of 0 means that byte i is not escaped.
+static ESCAPE: [u8; 256] = [
+    //   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
+    UU, UU, UU, UU, UU, UU, UU, UU, BB, TT, NN, UU, FF, RR, UU, UU, // 0
+    UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, // 1
+    __, __, QU, __, __, __, __, __, __, __, __, __, __, __, __, __, // 2
+    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 3
+    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 4
+    __, __, __, __, __, __, __, __, __, __, __, __, BS, __, __, __, // 5
+    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 6
+    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 7
+    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 8
+    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 9
+    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // A
+    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // B
+    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // C
+    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // D
+    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // E
+    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // F
+];
+
+fn write_char_escape(writer: &mut Vec<u8>, char_escape: CharEscape) {
+    let s = match char_escape {
+        CharEscape::Quote => b"\\\"",
+        CharEscape::ReverseSolidus => b"\\\\",
+        // CharEscape::Solidus => b"\\/",
+        CharEscape::Backspace => b"\\b",
+        CharEscape::FormFeed => b"\\f",
+        CharEscape::LineFeed => b"\\n",
+        CharEscape::CarriageReturn => b"\\r",
+        CharEscape::Tab => b"\\t",
+        CharEscape::AsciiControl(byte) => {
+            static HEX_DIGITS: [u8; 16] = *b"0123456789abcdef";
+            let bytes = &[
+                b'\\',
+                b'u',
+                b'0',
+                b'0',
+                HEX_DIGITS[(byte >> 4) as usize],
+                HEX_DIGITS[(byte & 0xF) as usize],
+            ];
+            return writer.extend_from_slice(bytes);
+        }
+    };
+
+    writer.extend_from_slice(s);
+}
--- a/libs/proxy/json/src/value.rs
+++ b/libs/proxy/json/src/value.rs
@@ -0,0 +1,168 @@
+use core::fmt;
+use std::collections::{BTreeMap, HashMap};
+
+use crate::str::{format_escaped_fmt, format_escaped_str};
+use crate::{KeyEncoder, ObjectSer, ValueSer, value_as_list, value_as_object};
+
+/// Write a value to the underlying json representation.
+pub trait ValueEncoder {
+    fn encode(self, v: ValueSer<'_>);
+}
+
+pub(crate) fn write_int(x: impl itoa::Integer, b: &mut Vec<u8>) {
+    b.extend_from_slice(itoa::Buffer::new().format(x).as_bytes());
+}
+
+pub(crate) fn write_float(x: impl ryu::Float, b: &mut Vec<u8>) {
+    b.extend_from_slice(ryu::Buffer::new().format(x).as_bytes());
+}
+
+impl<T: Copy + ValueEncoder> ValueEncoder for &T {
+    #[inline]
+    fn encode(self, v: ValueSer<'_>) {
+        T::encode(*self, v);
+    }
+}
+
+impl ValueEncoder for &str {
+    #[inline]
+    fn encode(self, v: ValueSer<'_>) {
+        format_escaped_str(v.buf, self);
+        v.finish();
+    }
+}
+
+impl ValueEncoder for fmt::Arguments<'_> {
+    #[inline]
+    fn encode(self, v: ValueSer<'_>) {
+        if let Some(s) = self.as_str() {
+            format_escaped_str(v.buf, s);
+        } else {
+            format_escaped_fmt(v.buf, self);
+        }
+        v.finish();
+    }
+}
+
+macro_rules! int {
+    [$($t:ty),*] => {
+        $(
+            impl ValueEncoder for $t {
+                #[inline]
+                fn encode(self, v: ValueSer<'_>) {
+                    write_int(self, v.buf);
+                    v.finish();
+                }
+            }
+        )*
+    };
+}
+
+int![u8, u16, u32, u64, usize, u128];
+int![i8, i16, i32, i64, isize, i128];
+
+macro_rules! float {
+    [$($t:ty),*] => {
+        $(
+            impl ValueEncoder for $t {
+                #[inline]
+                fn encode(self, v: ValueSer<'_>) {
+                    write_float(self, v.buf);
+                    v.finish();
+                }
+            }
+        )*
+    };
+}
+
+float![f32, f64];
+
+impl ValueEncoder for bool {
+    #[inline]
+    fn encode(self, v: ValueSer<'_>) {
+        v.write_raw_json(if self { b"true" } else { b"false" });
+    }
+}
+
+impl<T: ValueEncoder> ValueEncoder for Option<T> {
+    #[inline]
+    fn encode(self, v: ValueSer<'_>) {
+        match self {
+            Some(value) => value.encode(v),
+            None => Null.encode(v),
+        }
+    }
+}
+
+impl KeyEncoder for &str {
+    #[inline]
+    fn write_key<'a>(self, obj: &'a mut ObjectSer) -> ValueSer<'a> {
+        let obj = &mut *obj;
+        obj.entry_inner(|b| format_escaped_str(b, self))
+    }
+}
+
+impl KeyEncoder for fmt::Arguments<'_> {
+    #[inline]
+    fn write_key<'a>(self, obj: &'a mut ObjectSer) -> ValueSer<'a> {
+        if let Some(key) = self.as_str() {
+            obj.entry_inner(|b| format_escaped_str(b, key))
+        } else {
+            obj.entry_inner(|b| format_escaped_fmt(b, self))
+        }
+    }
+}
+
+/// Represents the JSON null value.
+pub struct Null;
+
+impl ValueEncoder for Null {
+    #[inline]
+    fn encode(self, v: ValueSer<'_>) {
+        v.write_raw_json(b"null");
+    }
+}
+
+impl<T: ValueEncoder> ValueEncoder for Vec<T> {
+    #[inline]
+    fn encode(self, v: ValueSer<'_>) {
+        value_as_list!(|v| {
+            for t in self {
+                v.entry().value(t);
+            }
+        });
+    }
+}
+
+impl<T: Copy + ValueEncoder> ValueEncoder for &[T] {
+    #[inline]
+    fn encode(self, v: ValueSer<'_>) {
+        value_as_list!(|v| {
+            for t in self {
+                v.entry().value(t);
+            }
+        });
+    }
+}
+
+impl<K: KeyEncoder, V: ValueEncoder, S> ValueEncoder for HashMap<K, V, S> {
+    #[inline]
+    fn encode(self, o: ValueSer<'_>) {
+        value_as_object!(|o| {
+            for (k, v) in self {
+                o.entry(k, v);
+            }
+        });
+    }
+}
+
+impl<K: KeyEncoder, V: ValueEncoder> ValueEncoder for BTreeMap<K, V> {
+    #[inline]
+    fn encode(self, o: ValueSer<'_>) {
+        value_as_object!(|o| {
+            for (k, v) in self {
+                o.entry(k, v);
+            }
+        });
+    }
+}
--- a/libs/proxy/postgres-protocol2/src/authentication/sasl.rs
+++ b/libs/proxy/postgres-protocol2/src/authentication/sasl.rs
@@ -52,7 +52,7 @@ pub(crate) async fn hi(str: &[u8], salt: &[u8], iterations: u32) -> [u8; 32] {
        }
        // yield every ~250us
        // hopefully reduces tail latencies
-        if i % 1024 == 0 {
+        if i.is_multiple_of(1024) {
            yield_now().await
        }
    }
--- a/libs/proxy/tokio-postgres2/src/client.rs
+++ b/libs/proxy/tokio-postgres2/src/client.rs
@@ -90,7 +90,7 @@ pub struct InnerClient {
 }

 impl InnerClient {
-    pub fn start(&mut self) -> Result<PartialQuery, Error> {
+    pub fn start(&mut self) -> Result<PartialQuery<'_>, Error> {
        self.responses.waiting += 1;
        Ok(PartialQuery(Some(self)))
    }
@@ -227,7 +227,7 @@ impl Client {
        &mut self,
        statement: &str,
        params: I,
-    ) -> Result<RowStream, Error>
+    ) -> Result<RowStream<'_>, Error>
    where
        S: AsRef<str>,
        I: IntoIterator<Item = Option<S>>,
@@ -262,7 +262,7 @@ impl Client {
    pub(crate) async fn simple_query_raw(
        &mut self,
        query: &str,
-    ) -> Result<SimpleQueryStream, Error> {
+    ) -> Result<SimpleQueryStream<'_>, Error> {
        simple_query::simple_query(self.inner_mut(), query).await
    }

--- a/libs/proxy/tokio-postgres2/src/generic_client.rs
+++ b/libs/proxy/tokio-postgres2/src/generic_client.rs
@@ -12,7 +12,11 @@ mod private {
 /// This trait is "sealed", and cannot be implemented outside of this crate.
 pub trait GenericClient: private::Sealed {
    /// Like `Client::query_raw_txt`.
-    async fn query_raw_txt<S, I>(&mut self, statement: &str, params: I) -> Result<RowStream, Error>
+    async fn query_raw_txt<S, I>(
+        &mut self,
+        statement: &str,
+        params: I,
+    ) -> Result<RowStream<'_>, Error>
    where
        S: AsRef<str> + Sync + Send,
        I: IntoIterator<Item = Option<S>> + Sync + Send,
@@ -22,7 +26,11 @@ pub trait GenericClient: private::Sealed {
 impl private::Sealed for Client {}

 impl GenericClient for Client {
-    async fn query_raw_txt<S, I>(&mut self, statement: &str, params: I) -> Result<RowStream, Error>
+    async fn query_raw_txt<S, I>(
+        &mut self,
+        statement: &str,
+        params: I,
+    ) -> Result<RowStream<'_>, Error>
    where
        S: AsRef<str> + Sync + Send,
        I: IntoIterator<Item = Option<S>> + Sync + Send,
@@ -35,7 +43,11 @@ impl GenericClient for Client {
 impl private::Sealed for Transaction<'_> {}

 impl GenericClient for Transaction<'_> {
-    async fn query_raw_txt<S, I>(&mut self, statement: &str, params: I) -> Result<RowStream, Error>
+    async fn query_raw_txt<S, I>(
+        &mut self,
+        statement: &str,
+        params: I,
+    ) -> Result<RowStream<'_>, Error>
    where
        S: AsRef<str> + Sync + Send,
        I: IntoIterator<Item = Option<S>> + Sync + Send,
--- a/libs/proxy/tokio-postgres2/src/transaction.rs
+++ b/libs/proxy/tokio-postgres2/src/transaction.rs
@@ -47,7 +47,7 @@ impl<'a> Transaction<'a> {
        &mut self,
        statement: &str,
        params: I,
-    ) -> Result<RowStream, Error>
+    ) -> Result<RowStream<'_>, Error>
    where
        S: AsRef<str>,
        I: IntoIterator<Item = Option<S>>,
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -221,7 +221,7 @@ pub struct TimelineMembershipSwitchRequest {
 pub struct TimelineMembershipSwitchResponse {
    pub previous_conf: Configuration,
    pub current_conf: Configuration,
-    pub term: Term,
+    pub last_log_term: Term,
    pub flush_lsn: Lsn,
 }

--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -24,12 +24,28 @@ macro_rules! critical {
        if cfg!(debug_assertions) {
            panic!($($arg)*);
        }
+        // Increment the critical event counter
        $crate::logging::TRACING_EVENT_COUNT_METRIC.inc_critical();
        let backtrace = std::backtrace::Backtrace::capture();
        tracing::error!("CRITICAL: {}\n{backtrace}", format!($($arg)*));
    }};
 }

+#[macro_export]
+macro_rules! critical_timeline {
+    ($tenant_shard_id:expr, $timeline_id:expr, $($arg:tt)*) => {{
+        if cfg!(debug_assertions) {
+            panic!($($arg)*);
+        }
+        // Increment both the global critical event counter and the per-timeline Hadron counter
+        $crate::logging::TRACING_EVENT_COUNT_METRIC.inc_critical();
+        $crate::logging::HADRON_CRITICAL_STORAGE_EVENT_COUNT_METRIC.inc(&$tenant_shard_id.to_string(), &$timeline_id.to_string());
+        let backtrace = std::backtrace::Backtrace::capture();
+        tracing::error!("CRITICAL: [tenant_shard_id: {}, timeline_id: {}] {}\n{backtrace}",
+                       $tenant_shard_id, $timeline_id, format!($($arg)*));
+    }};
+}
+
 #[derive(EnumString, strum_macros::Display, VariantNames, Eq, PartialEq, Debug, Clone, Copy)]
 #[strum(serialize_all = "snake_case")]
 pub enum LogFormat {
@@ -61,6 +77,36 @@ pub struct TracingEventCountMetric {
    trace: IntCounter,
 }

+// Begin Hadron: Add a HadronCriticalStorageEventCountMetric metric that is sliced by tenant_shard_id and timeline_id
+pub struct HadronCriticalStorageEventCountMetric {
+    critical: IntCounterVec,
+}
+
+pub static HADRON_CRITICAL_STORAGE_EVENT_COUNT_METRIC: Lazy<HadronCriticalStorageEventCountMetric> =
+    Lazy::new(|| {
+        let vec = metrics::register_int_counter_vec!(
+            "hadron_critical_storage_event_count",
+            "Number of critical storage events, by tenant_id and timeline_id",
+            &["tenant_shard_id", "timeline_id"]
+        )
+        .expect("failed to define metric");
+        HadronCriticalStorageEventCountMetric::new(vec)
+    });
+
+impl HadronCriticalStorageEventCountMetric {
+    fn new(vec: IntCounterVec) -> Self {
+        Self { critical: vec }
+    }
+
+    // Public so that the exported `critical_timeline!` macro can call it from other crates.
+    pub fn inc(&self, tenant_shard_id: &str, timeline_id: &str) {
+        self.critical
+            .with_label_values(&[tenant_shard_id, timeline_id])
+            .inc();
+    }
+}
+// End Hadron
+
 pub static TRACING_EVENT_COUNT_METRIC: Lazy<TracingEventCountMetric> = Lazy::new(|| {
    let vec = metrics::register_int_counter_vec!(
        "libmetrics_tracing_event_count",
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -28,6 +28,7 @@ use reqwest::Url;
 use storage_broker::Uri;
 use utils::id::{NodeId, TimelineId};
 use utils::logging::{LogFormat, SecretString};
+use utils::serde_percent::Percent;

 use crate::tenant::storage_layer::inmemory_layer::IndexEntry;
 use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
@@ -459,7 +460,16 @@ impl PageServerConf {
            metric_collection_endpoint,
            metric_collection_bucket,
            synthetic_size_calculation_interval,
-            disk_usage_based_eviction,
+            disk_usage_based_eviction: Some(disk_usage_based_eviction.unwrap_or(
+                DiskUsageEvictionTaskConfig {
+                    max_usage_pct: Percent::new(80).unwrap(),
+                    min_avail_bytes: 2_000_000_000,
+                    period: Duration::from_secs(60),
+                    #[cfg(feature = "testing")]
+                    mock_statvfs: None,
+                    eviction_order: Default::default(),
+                },
+            )),
            test_remote_failures,
            ondemand_download_behavior_treat_error_as_warn,
            background_task_maximum_delay,
@@ -697,6 +707,8 @@ impl ConfigurableSemaphore {
 #[cfg(test)]
 mod tests {

+    use std::time::Duration;
+
    use camino::Utf8PathBuf;
    use rstest::rstest;
    use utils::id::NodeId;
@@ -798,4 +810,20 @@ mod tests {
        PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir)
            .expect("parse_and_validate");
    }
+
+    #[test]
+    fn test_config_disk_usage_based_eviction_is_valid() {
+        let input = r#"
+            control_plane_api = "http://localhost:6666"
+        "#;
+        let config_toml = toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(input)
+            .expect("disk_usage_based_eviction is valid");
+        let workdir = Utf8PathBuf::from("/nonexistent");
+        let config = PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir).unwrap();
+        let disk_usage_based_eviction = config.disk_usage_based_eviction.unwrap();
+        assert_eq!(disk_usage_based_eviction.max_usage_pct.get(), 80);
+        assert_eq!(disk_usage_based_eviction.min_avail_bytes, 2_000_000_000);
+        assert_eq!(disk_usage_based_eviction.period, Duration::from_secs(60));
+        assert_eq!(disk_usage_based_eviction.eviction_order, Default::default());
+    }
 }
--- a/pageserver/src/consumption_metrics/upload.rs
+++ b/pageserver/src/consumption_metrics/upload.rs
@@ -99,7 +99,7 @@ pub(super) async fn upload_metrics_bucket(

    // Compose object path
    let datetime: DateTime<Utc> = SystemTime::now().into();
-    let ts_prefix = datetime.format("year=%Y/month=%m/day=%d/%H:%M:%SZ");
+    let ts_prefix = datetime.format("year=%Y/month=%m/day=%d/hour=%H/%H:%M:%SZ");
    let path = RemotePath::from_string(&format!("{ts_prefix}_{node_id}.ndjson.gz"))?;

    // Set up a gzip writer into a buffer
@@ -109,7 +109,7 @@ pub(super) async fn upload_metrics_bucket(

    // Serialize and write into compressed buffer
    let started_at = std::time::Instant::now();
-    for res in serialize_in_chunks(CHUNK_SIZE, metrics, idempotency_keys) {
+    for res in serialize_in_chunks_ndjson(CHUNK_SIZE, metrics, idempotency_keys) {
        let (_chunk, body) = res?;
        gzip_writer.write_all(&body).await?;
    }
@@ -216,6 +216,86 @@ fn serialize_in_chunks<'a>(
    }
 }

+/// Serializes the input metrics as NDJSON in chunks of at most `chunk_size`
+/// events, each event being a separate JSON object on its own line. The provided
+/// idempotency keys are injected into the corresponding metric events (reused
+/// across different metrics sinks); panics unless there is one key per input.
+fn serialize_in_chunks_ndjson<'a>(
+    chunk_size: usize,
+    input: &'a [NewRawMetric],
+    idempotency_keys: &'a [IdempotencyKey<'a>],
+) -> impl ExactSizeIterator<Item = Result<(&'a [NewRawMetric], bytes::Bytes), serde_json::Error>> + 'a
+{
+    use bytes::BufMut;
+
+    assert_eq!(input.len(), idempotency_keys.len());
+
+    struct Iter<'a> {
+        inner: std::slice::Chunks<'a, NewRawMetric>,
+        idempotency_keys: std::slice::Iter<'a, IdempotencyKey<'a>>,
+        chunk_size: usize,
+
+        // write to a BytesMut so that we can cheaply clone the frozen Bytes for retries
+        buffer: bytes::BytesMut,
+        // up to `chunk_size` events, created for the first chunk and updated in place for later ones
+        scratch: Vec<Event<Ids, Name>>,
+    }
+
+    impl<'a> Iterator for Iter<'a> {
+        type Item = Result<(&'a [NewRawMetric], bytes::Bytes), serde_json::Error>;
+
+        fn next(&mut self) -> Option<Self::Item> {
+            let chunk = self.inner.next()?;
+
+            if self.scratch.is_empty() {
+                // first round: create events with N strings
+                self.scratch.extend(
+                    chunk
+                        .iter()
+                        .zip(&mut self.idempotency_keys)
+                        .map(|(raw_metric, key)| raw_metric.as_event(key)),
+                );
+            } else {
+                // next rounds: update_in_place to reuse allocations
+                assert_eq!(self.scratch.len(), self.chunk_size);
+                itertools::izip!(self.scratch.iter_mut(), chunk, &mut self.idempotency_keys)
+                    .for_each(|(slot, raw_metric, key)| raw_metric.update_in_place(slot, key));
+            }
+
+            // Serialize each event as NDJSON (one JSON object per line)
+            for event in self.scratch[..chunk.len()].iter() {
+                let res = serde_json::to_writer((&mut self.buffer).writer(), event);
+                if let Err(e) = res {
+                    return Some(Err(e));
+                }
+                // Add newline after each event to follow NDJSON format
+                self.buffer.put_u8(b'\n');
+            }
+
+            Some(Ok((chunk, self.buffer.split().freeze())))
+        }
+
+        fn size_hint(&self) -> (usize, Option<usize>) {
+            self.inner.size_hint()
+        }
+    }
+
+    impl ExactSizeIterator for Iter<'_> {}
+
+    let buffer = bytes::BytesMut::new();
+    let inner = input.chunks(chunk_size);
+    let idempotency_keys = idempotency_keys.iter();
+    let scratch = Vec::new();
+
+    Iter {
+        inner,
+        idempotency_keys,
+        chunk_size,
+        buffer,
+        scratch,
+    }
+}
+
 trait RawMetricExt {
    fn as_event(&self, key: &IdempotencyKey<'_>) -> Event<Ids, Name>;
    fn update_in_place(&self, event: &mut Event<Ids, Name>, key: &IdempotencyKey<'_>);
@@ -479,6 +559,43 @@ mod tests {
        }
    }

+    #[test]
+    fn chunked_serialization_ndjson() {
+        let examples = metric_samples();
+        assert!(examples.len() > 1);
+
+        let now = Utc::now();
+        let idempotency_keys = (0..examples.len())
+            .map(|i| FixedGen::new(now, "1", i as u16).generate())
+            .collect::<Vec<_>>();
+
+        // Parse NDJSON format - each line is a separate JSON object
+        let parse_ndjson = |body: &[u8]| -> Vec<Event<Ids, Name>> {
+            let body_str = std::str::from_utf8(body).unwrap();
+            body_str
+                .trim_end_matches('\n')
+                .lines()
+                .filter(|line| !line.is_empty())
+                .map(|line| serde_json::from_str::<Event<Ids, Name>>(line).unwrap())
+                .collect()
+        };
+
+        let correct = serialize_in_chunks_ndjson(examples.len(), &examples, &idempotency_keys)
+            .map(|res| res.unwrap().1)
+            .flat_map(|body| parse_ndjson(&body))
+            .collect::<Vec<_>>();
+
+        for chunk_size in 1..examples.len() {
+            let actual = serialize_in_chunks_ndjson(chunk_size, &examples, &idempotency_keys)
+                .map(|res| res.unwrap().1)
+                .flat_map(|body| parse_ndjson(&body))
+                .collect::<Vec<_>>();
+
+            // if these are equal, it means that multi-chunking version works as well
+            assert_eq!(correct, actual);
+        }
+    }
+
    #[derive(Clone, Copy)]
    struct FixedGen<'a>(chrono::DateTime<chrono::Utc>, &'a str, u16);

--- a/pageserver/src/feature_resolver.rs
+++ b/pageserver/src/feature_resolver.rs
@@ -6,12 +6,13 @@ use posthog_client_lite::{
    CaptureEvent, FeatureResolverBackgroundLoop, PostHogEvaluationError,
    PostHogFlagFilterPropertyValue,
 };
+use rand::Rng;
 use remote_storage::RemoteStorageKind;
 use serde_json::json;
 use tokio_util::sync::CancellationToken;
 use utils::id::TenantId;

-use crate::{config::PageServerConf, metrics::FEATURE_FLAG_EVALUATION};
+use crate::{config::PageServerConf, metrics::FEATURE_FLAG_EVALUATION, tenant::TenantShard};

 const DEFAULT_POSTHOG_REFRESH_INTERVAL: Duration = Duration::from_secs(600);

@@ -138,6 +139,7 @@ impl FeatureResolver {
                }
                Arc::new(properties)
            };
+
            let fake_tenants = {
                let mut tenants = Vec::new();
                for i in 0..10 {
@@ -147,9 +149,16 @@ impl FeatureResolver {
                        conf.id,
                        i
                    );
+
+                    let tenant_properties = PerTenantProperties {
+                        remote_size_mb: Some(rand::thread_rng().gen_range(100.0..1000000.00)),
+                    }
+                    .into_posthog_properties();
+
                    let properties = Self::collect_properties_inner(
                        distinct_id.clone(),
                        Some(&internal_properties),
+                        &tenant_properties,
                    );
                    tenants.push(CaptureEvent {
                        event: "initial_tenant_report".to_string(),
@@ -183,6 +192,7 @@ impl FeatureResolver {
    fn collect_properties_inner(
        tenant_id: String,
        internal_properties: Option<&HashMap<String, PostHogFlagFilterPropertyValue>>,
+        tenant_properties: &HashMap<String, PostHogFlagFilterPropertyValue>,
    ) -> HashMap<String, PostHogFlagFilterPropertyValue> {
        let mut properties = HashMap::new();
        if let Some(internal_properties) = internal_properties {
@@ -194,6 +204,9 @@ impl FeatureResolver {
            "tenant_id".to_string(),
            PostHogFlagFilterPropertyValue::String(tenant_id),
        );
+        for (key, value) in tenant_properties.iter() {
+            properties.insert(key.clone(), value.clone());
+        }
        properties
    }

@@ -201,8 +214,13 @@ impl FeatureResolver {
    pub(crate) fn collect_properties(
        &self,
        tenant_id: TenantId,
+        tenant_properties: &HashMap<String, PostHogFlagFilterPropertyValue>,
    ) -> HashMap<String, PostHogFlagFilterPropertyValue> {
-        Self::collect_properties_inner(tenant_id.to_string(), self.internal_properties.as_deref())
+        Self::collect_properties_inner(
+            tenant_id.to_string(),
+            self.internal_properties.as_deref(),
+            tenant_properties,
+        )
    }

    /// Evaluate a multivariate feature flag. Currently, we do not support any properties.
@@ -214,6 +232,7 @@ impl FeatureResolver {
        &self,
        flag_key: &str,
        tenant_id: TenantId,
+        tenant_properties: &HashMap<String, PostHogFlagFilterPropertyValue>,
    ) -> Result<String, PostHogEvaluationError> {
        let force_overrides = self.force_overrides_for_testing.load();
        if let Some(value) = force_overrides.get(flag_key) {
@@ -224,7 +243,7 @@ impl FeatureResolver {
            let res = inner.feature_store().evaluate_multivariate(
                flag_key,
                &tenant_id.to_string(),
-                &self.collect_properties(tenant_id),
+                &self.collect_properties(tenant_id, tenant_properties),
            );
            match &res {
                Ok(value) => {
@@ -257,6 +276,7 @@ impl FeatureResolver {
        &self,
        flag_key: &str,
        tenant_id: TenantId,
+        tenant_properties: &HashMap<String, PostHogFlagFilterPropertyValue>,
    ) -> Result<(), PostHogEvaluationError> {
        let force_overrides = self.force_overrides_for_testing.load();
        if let Some(value) = force_overrides.get(flag_key) {
@@ -271,7 +291,7 @@ impl FeatureResolver {
            let res = inner.feature_store().evaluate_boolean(
                flag_key,
                &tenant_id.to_string(),
-                &self.collect_properties(tenant_id),
+                &self.collect_properties(tenant_id, tenant_properties),
            );
            match &res {
                Ok(()) => {
@@ -317,3 +337,98 @@ impl FeatureResolver {
            .store(Arc::new(force_overrides));
    }
 }
+
+/// Per-tenant values that can be exposed as PostHog flag-evaluation properties.
+struct PerTenantProperties {
+    /// Value published as `tenant_remote_size_mb`; `None` leaves the property out.
+    pub remote_size_mb: Option<f64>,
+}
+
+impl PerTenantProperties {
+    /// Converts the set fields into a property map; fields that are `None` are omitted.
+    pub fn into_posthog_properties(self) -> HashMap<String, PostHogFlagFilterPropertyValue> {
+        let mut properties = HashMap::new();
+        if let Some(remote_size_mb) = self.remote_size_mb {
+            properties.insert(
+                "tenant_remote_size_mb".to_string(),
+                PostHogFlagFilterPropertyValue::Number(remote_size_mb),
+            );
+        }
+        properties
+    }
+}
+
+/// A [`FeatureResolver`] bound to one tenant. It remembers the tenant id and a cached set of
+/// tenant properties, and supplies both to the wrapped resolver on every evaluation.
+#[derive(Clone)]
+pub struct TenantFeatureResolver {
+    inner: FeatureResolver,
+    tenant_id: TenantId,
+    /// Replaced wholesale by `update_cached_tenant_properties`; shared between clones.
+    cached_tenant_properties: Arc<ArcSwap<HashMap<String, PostHogFlagFilterPropertyValue>>>,
+}
+
+impl TenantFeatureResolver {
+    /// Wraps `inner` for `tenant_id`, starting with an empty property cache.
+    pub fn new(inner: FeatureResolver, tenant_id: TenantId) -> Self {
+        Self {
+            inner,
+            tenant_id,
+            cached_tenant_properties: Arc::new(ArcSwap::new(Arc::new(HashMap::new()))),
+        }
+    }
+
+    /// Evaluates a multivariate flag for this tenant using the cached tenant properties.
+    pub fn evaluate_multivariate(&self, flag_key: &str) -> Result<String, PostHogEvaluationError> {
+        self.inner.evaluate_multivariate(
+            flag_key,
+            self.tenant_id,
+            &self.cached_tenant_properties.load(),
+        )
+    }
+
+    /// Evaluates a boolean flag for this tenant using the cached tenant properties.
+    pub fn evaluate_boolean(&self, flag_key: &str) -> Result<(), PostHogEvaluationError> {
+        self.inner.evaluate_boolean(
+            flag_key,
+            self.tenant_id,
+            &self.cached_tenant_properties.load(),
+        )
+    }
+
+    /// Returns the properties the wrapped resolver collects for this tenant, given the cache.
+    pub fn collect_properties(&self) -> HashMap<String, PostHogFlagFilterPropertyValue> {
+        self.inner
+            .collect_properties(self.tenant_id, &self.cached_tenant_properties.load())
+    }
+
+    /// Forwards to the wrapped resolver; no tenant information is involved.
+    pub fn is_feature_flag_boolean(&self, flag_key: &str) -> Result<bool, PostHogEvaluationError> {
+        self.inner.is_feature_flag_boolean(flag_key)
+    }
+
+    /// Recomputes the cached tenant properties from the shard's timelines and publishes them.
+    ///
+    /// `tenant_remote_size_mb` is the sum over all timelines, in MiB. If any timeline reports a
+    /// size of 0 the property is omitted instead of publishing a partial sum.
+    pub fn update_cached_tenant_properties(&self, tenant_shard: &TenantShard) {
+        // Start from Some(0.0): starting from None would make the accumulation below a no-op
+        // and the property would never be published.
+        let mut remote_size_mb = Some(0.0);
+        for timeline in tenant_shard.list_timelines() {
+            // NOTE(review): this reads the *resident* physical size metric although the
+            // property is named "remote" -- confirm this is the intended source.
+            let size = timeline.metrics.resident_physical_size_get();
+            if size == 0 {
+                remote_size_mb = None;
+                break;
+            }
+            if let Some(ref mut remote_size_mb) = remote_size_mb {
+                *remote_size_mb += size as f64 / 1024.0 / 1024.0;
+            }
+        }
+        self.cached_tenant_properties.store(Arc::new(
+            PerTenantProperties { remote_size_mb }.into_posthog_properties(),
+        ));
+    }
+}
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -2438,6 +2438,7 @@ async fn timeline_offload_handler(
            .map_err(|e| {
                match e {
                    OffloadError::Cancelled => ApiError::ResourceUnavailable("Timeline shutting down".into()),
+                    OffloadError::AlreadyInProgress => ApiError::Conflict("Timeline already being offloaded or deleted".into()),
                    _ => ApiError::InternalServerError(anyhow!(e))
                }
            })?;
@@ -3697,23 +3698,25 @@ async fn tenant_evaluate_feature_flag(
        let tenant = state
            .tenant_manager
            .get_attached_tenant_shard(tenant_shard_id)?;
-        let properties = tenant.feature_resolver.collect_properties(tenant_shard_id.tenant_id);
+        // TODO: the properties we get here might be stale right after they are collected. But such races are rare (updated every 10s)
+        // and we don't need to worry about them for now.
+        let properties = tenant.feature_resolver.collect_properties();
        if as_type.as_deref() == Some("boolean") {
-            let result = tenant.feature_resolver.evaluate_boolean(&flag, tenant_shard_id.tenant_id);
+            let result = tenant.feature_resolver.evaluate_boolean(&flag);
            let result = result.map(|_| true).map_err(|e| e.to_string());
            json_response(StatusCode::OK, json!({ "result": result, "properties": properties }))
        } else if as_type.as_deref() == Some("multivariate") {
-            let result = tenant.feature_resolver.evaluate_multivariate(&flag, tenant_shard_id.tenant_id).map_err(|e| e.to_string());
+            let result = tenant.feature_resolver.evaluate_multivariate(&flag).map_err(|e| e.to_string());
            json_response(StatusCode::OK, json!({ "result": result, "properties": properties }))
        } else {
            // Auto infer the type of the feature flag.
            let is_boolean = tenant.feature_resolver.is_feature_flag_boolean(&flag).map_err(|e| ApiError::InternalServerError(anyhow::anyhow!("{e}")))?;
            if is_boolean {
-                let result = tenant.feature_resolver.evaluate_boolean(&flag, tenant_shard_id.tenant_id);
+                let result = tenant.feature_resolver.evaluate_boolean(&flag);
                let result = result.map(|_| true).map_err(|e| e.to_string());
                json_response(StatusCode::OK, json!({ "result": result, "properties": properties }))
            } else {
-                let result = tenant.feature_resolver.evaluate_multivariate(&flag, tenant_shard_id.tenant_id).map_err(|e| e.to_string());
+                let result = tenant.feature_resolver.evaluate_multivariate(&flag).map_err(|e| e.to_string());
                json_response(StatusCode::OK, json!({ "result": result, "properties": properties }))
            }
        }
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -50,6 +50,7 @@ use tokio::io::{AsyncRead, AsyncReadExt as _, AsyncWrite, AsyncWriteExt as _, Bu
 use tokio::task::JoinHandle;
 use tokio_util::sync::CancellationToken;
 use tonic::service::Interceptor as _;
+use tonic::transport::server::TcpConnectInfo;
 use tracing::*;
 use utils::auth::{Claims, Scope, SwappableJwtAuth};
 use utils::id::{TenantId, TenantTimelineId, TimelineId};
@@ -3685,8 +3686,15 @@ impl proto::PageService for GrpcPageServiceHandler {
                yield match result {
                    Ok(resp) => resp,
                    // Convert per-request errors to GetPageResponses as appropriate, or terminate
-                    // the stream with a tonic::Status.
-                    Err(err) => page_api::GetPageResponse::try_from_status(err, req_id)?.into(),
+                    // the stream with a tonic::Status. Log the error regardless, since
+                    // ObservabilityLayer can't automatically log stream errors.
+                    Err(status) => {
+                        // TODO: it would be nice if we could propagate the get_page() fields here.
+                        span.in_scope(|| {
+                            warn!("request failed with {:?}: {}", status.code(), status.message());
+                        });
+                        page_api::GetPageResponse::try_from_status(status, req_id)?.into()
+                    }
                }
            }
        };
@@ -3824,40 +3832,85 @@ impl<S: tonic::server::NamedService> tonic::server::NamedService for Observabili
    const NAME: &'static str = S::NAME; // propagate inner service name
 }

-impl<S, B> tower::Service<http::Request<B>> for ObservabilityLayerService<S>
+impl<S, Req, Resp> tower::Service<http::Request<Req>> for ObservabilityLayerService<S>
 where
-    S: tower::Service<http::Request<B>>,
+    S: tower::Service<http::Request<Req>, Response = http::Response<Resp>> + Send,
    S::Future: Send + 'static,
 {
    type Response = S::Response;
    type Error = S::Error;
    type Future = BoxFuture<'static, Result<Self::Response, Self::Error>>;

-    fn call(&mut self, mut req: http::Request<B>) -> Self::Future {
+    fn call(&mut self, mut req: http::Request<Req>) -> Self::Future {
        // Record the request start time as a request extension.
        //
        // TODO: we should start a timer here instead, but it currently requires a timeline handle
        // and SmgrQueryType, which we don't have yet. Refactor it to provide it later.
        req.extensions_mut().insert(ReceivedAt(Instant::now()));

-        // Create a basic tracing span. Enter the span for the current thread (to use it for inner
-        // sync code like interceptors), and instrument the future (to use it for inner async code
-        // like the page service itself).
+        // Extract the peer address and gRPC method.
+        let peer = req
+            .extensions()
+            .get::<TcpConnectInfo>()
+            .and_then(|info| info.remote_addr())
+            .map(|addr| addr.to_string())
+            .unwrap_or_default();
+
+        let method = req
+            .uri()
+            .path()
+            .split('/')
+            .nth(2)
+            .unwrap_or(req.uri().path())
+            .to_string();
+
+        // Create a basic tracing span.
        //
-        // The instrument() call below is not sufficient. It only affects the returned future, and
-        // only takes effect when the caller polls it. Any sync code executed when we call
-        // self.inner.call() below (such as interceptors) runs outside of the returned future, and
-        // is not affected by it. We therefore have to enter the span on the current thread too.
+        // Enter the span for the current thread and instrument the future. It is not sufficient to
+        // only instrument the future, since it only takes effect after the future is returned and
+        // polled, not when the inner service is called below (e.g. during interceptor execution).
        let span = info_span!(
            "grpc:pageservice",
-            // Set by TenantMetadataInterceptor.
+            // These will be populated by TenantMetadataInterceptor.
            tenant_id = field::Empty,
            timeline_id = field::Empty,
            shard_id = field::Empty,
+            // NB: empty fields must be listed first above. Otherwise, the field names will be
+            // clobbered when the empty fields are populated. They will be output last regardless.
+            %peer,
+            %method,
        );
        let _guard = span.enter();

-        Box::pin(self.inner.call(req).instrument(span.clone()))
+        // Construct a future for calling the inner service, but don't await it. This avoids having
+        // to clone the inner service into the future below.
+        let call = self.inner.call(req);
+
+        async move {
+            // Await the inner service call.
+            let result = call.await;
+
+            // Log gRPC error statuses. This won't include request info from handler spans, but it
+            // will catch all errors (even those emitted before handler spans are constructed). Only
+            // unary request errors are logged here, not streaming response errors.
+            if let Ok(ref resp) = result
+                && let Some(status) = tonic::Status::from_header_map(resp.headers())
+                && status.code() != tonic::Code::Ok
+            {
+                // TODO: it would be nice if we could propagate the handler span's request fields
+                // here. This could e.g. be done by attaching the request fields to
+                // tonic::Status::metadata via a proc macro.
+                warn!(
+                    "request failed with {:?}: {}",
+                    status.code(),
+                    status.message()
+                );
+            }
+
+            result
+        }
+        .instrument(span.clone())
+        .boxed()
    }

    fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll<Result<(), Self::Error>> {
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -86,7 +86,7 @@ use crate::context;
 use crate::context::RequestContextBuilder;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError};
-use crate::feature_resolver::FeatureResolver;
+use crate::feature_resolver::{FeatureResolver, TenantFeatureResolver};
 use crate::l0_flush::L0FlushGlobalState;
 use crate::metrics::{
    BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN, CONCURRENT_INITDBS,
@@ -386,7 +386,7 @@ pub struct TenantShard {

    l0_flush_global_state: L0FlushGlobalState,

-    pub(crate) feature_resolver: FeatureResolver,
+    pub(crate) feature_resolver: TenantFeatureResolver,
 }
 impl std::fmt::Debug for TenantShard {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -3263,7 +3263,7 @@ impl TenantShard {
                };
                let gc_compaction_strategy = self
                    .feature_resolver
-                    .evaluate_multivariate("gc-comapction-strategy", self.tenant_shard_id.tenant_id)
+                    .evaluate_multivariate("gc-comapction-strategy")
                    .ok();
                let span = if let Some(gc_compaction_strategy) = gc_compaction_strategy {
                    info_span!("gc_compact_timeline", timeline_id = %timeline.timeline_id, strategy = %gc_compaction_strategy)
@@ -3285,6 +3285,7 @@ impl TenantShard {
                    .or_else(|err| match err {
                        // Ignore this, we likely raced with unarchival.
                        OffloadError::NotArchived => Ok(()),
+                        OffloadError::AlreadyInProgress => Ok(()),
                        err => Err(err),
                    })?;
            }
@@ -3408,6 +3409,9 @@ impl TenantShard {
        if let Some(ref walredo_mgr) = self.walredo_mgr {
            walredo_mgr.maybe_quiesce(WALREDO_IDLE_TIMEOUT);
        }
+
+        // Update the feature resolver with the latest tenant-specific data.
+        self.feature_resolver.update_cached_tenant_properties(self);
    }

    pub fn timeline_has_no_attached_children(&self, timeline_id: TimelineId) -> bool {
@@ -4490,7 +4494,10 @@ impl TenantShard {
            gc_block: Default::default(),
            l0_flush_global_state,
            basebackup_cache,
-            feature_resolver,
+            feature_resolver: TenantFeatureResolver::new(
+                feature_resolver,
+                tenant_shard_id.tenant_id,
+            ),
        }
    }

--- a/pageserver/src/tenant/storage_layer/batch_split_writer.rs
+++ b/pageserver/src/tenant/storage_layer/batch_split_writer.rs
@@ -182,7 +182,7 @@ impl BatchLayerWriter {
 /// An image writer that takes images and produces multiple image layers.
 #[must_use]
 pub struct SplitImageLayerWriter<'a> {
-    inner: ImageLayerWriter,
+    inner: Option<ImageLayerWriter>,
    target_layer_size: u64,
    lsn: Lsn,
    conf: &'static PageServerConf,
@@ -196,7 +196,7 @@ pub struct SplitImageLayerWriter<'a> {

 impl<'a> SplitImageLayerWriter<'a> {
    #[allow(clippy::too_many_arguments)]
-    pub async fn new(
+    pub fn new(
        conf: &'static PageServerConf,
        timeline_id: TimelineId,
        tenant_shard_id: TenantShardId,
@@ -205,22 +205,10 @@ impl<'a> SplitImageLayerWriter<'a> {
        target_layer_size: u64,
        gate: &'a utils::sync::gate::Gate,
        cancel: CancellationToken,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<Self> {
-        Ok(Self {
+    ) -> Self {
+        Self {
            target_layer_size,
-            // XXX make this lazy like in SplitDeltaLayerWriter?
-            inner: ImageLayerWriter::new(
-                conf,
-                timeline_id,
-                tenant_shard_id,
-                &(start_key..Key::MAX),
-                lsn,
-                gate,
-                cancel.clone(),
-                ctx,
-            )
-            .await?,
+            inner: None,
            conf,
            timeline_id,
            tenant_shard_id,
@@ -229,7 +217,7 @@ impl<'a> SplitImageLayerWriter<'a> {
            start_key,
            gate,
            cancel,
-        })
+        }
    }

    pub async fn put_image(
@@ -238,12 +226,31 @@ impl<'a> SplitImageLayerWriter<'a> {
        img: Bytes,
        ctx: &RequestContext,
    ) -> Result<(), PutError> {
+        if self.inner.is_none() {
+            self.inner = Some(
+                ImageLayerWriter::new(
+                    self.conf,
+                    self.timeline_id,
+                    self.tenant_shard_id,
+                    &(self.start_key..Key::MAX),
+                    self.lsn,
+                    self.gate,
+                    self.cancel.clone(),
+                    ctx,
+                )
+                .await
+                .map_err(PutError::Other)?,
+            );
+        }
+
+        let inner = self.inner.as_mut().unwrap();
+
        // The current estimation is an upper bound of the space that the key/image could take
        // because we did not consider compression in this estimation. The resulting image layer
        // could be smaller than the target size.
        let addition_size_estimation = KEY_SIZE as u64 + img.len() as u64;
-        if self.inner.num_keys() >= 1
-            && self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size
+        if inner.num_keys() >= 1
+            && inner.estimated_size() + addition_size_estimation >= self.target_layer_size
        {
            let next_image_writer = ImageLayerWriter::new(
                self.conf,
@@ -257,7 +264,7 @@ impl<'a> SplitImageLayerWriter<'a> {
            )
            .await
            .map_err(PutError::Other)?;
-            let prev_image_writer = std::mem::replace(&mut self.inner, next_image_writer);
+            let prev_image_writer = std::mem::replace(inner, next_image_writer);
            self.batches.add_unfinished_image_writer(
                prev_image_writer,
                self.start_key..key,
@@ -265,7 +272,7 @@ impl<'a> SplitImageLayerWriter<'a> {
            );
            self.start_key = key;
        }
-        self.inner.put_image(key, img, ctx).await
+        inner.put_image(key, img, ctx).await
    }

    pub(crate) async fn finish_with_discard_fn<D, F>(
@@ -282,8 +289,10 @@ impl<'a> SplitImageLayerWriter<'a> {
        let Self {
            mut batches, inner, ..
        } = self;
-        if inner.num_keys() != 0 {
-            batches.add_unfinished_image_writer(inner, self.start_key..end_key, self.lsn);
+        if let Some(inner) = inner {
+            if inner.num_keys() != 0 {
+                batches.add_unfinished_image_writer(inner, self.start_key..end_key, self.lsn);
+            }
        }
        batches.finish_with_discard_fn(tline, ctx, discard_fn).await
    }
@@ -498,10 +507,7 @@ mod tests {
            4 * 1024 * 1024,
            &tline.gate,
            tline.cancel.clone(),
-            &ctx,
-        )
-        .await
-        .unwrap();
+        );

        let mut delta_writer = SplitDeltaLayerWriter::new(
            tenant.conf,
@@ -577,10 +583,7 @@ mod tests {
            4 * 1024 * 1024,
            &tline.gate,
            tline.cancel.clone(),
-            &ctx,
-        )
-        .await
-        .unwrap();
+        );
        let mut delta_writer = SplitDeltaLayerWriter::new(
            tenant.conf,
            tline.timeline_id,
@@ -676,10 +679,7 @@ mod tests {
            4 * 1024,
            &tline.gate,
            tline.cancel.clone(),
-            &ctx,
-        )
-        .await
-        .unwrap();
+        );

        let mut delta_writer = SplitDeltaLayerWriter::new(
            tenant.conf,
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -78,7 +78,7 @@ use utils::rate_limit::RateLimit;
 use utils::seqwait::SeqWait;
 use utils::simple_rcu::{Rcu, RcuReadGuard};
 use utils::sync::gate::{Gate, GateGuard};
-use utils::{completion, critical, fs_ext, pausable_failpoint};
+use utils::{completion, critical_timeline, fs_ext, pausable_failpoint};
 #[cfg(test)]
 use wal_decoder::models::value::Value;
 use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta};
@@ -106,7 +106,7 @@ use crate::context::{
    DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
 };
 use crate::disk_usage_eviction_task::{DiskUsageEvictionInfo, EvictionCandidate, finite_f32};
-use crate::feature_resolver::FeatureResolver;
+use crate::feature_resolver::TenantFeatureResolver;
 use crate::keyspace::{KeyPartitioning, KeySpace};
 use crate::l0_flush::{self, L0FlushGlobalState};
 use crate::metrics::{
@@ -202,7 +202,7 @@ pub struct TimelineResources {
    pub l0_compaction_trigger: Arc<Notify>,
    pub l0_flush_global_state: l0_flush::L0FlushGlobalState,
    pub basebackup_cache: Arc<BasebackupCache>,
-    pub feature_resolver: FeatureResolver,
+    pub feature_resolver: TenantFeatureResolver,
 }

 pub struct Timeline {
@@ -450,7 +450,7 @@ pub struct Timeline {
    /// A channel to send async requests to prepare a basebackup for the basebackup cache.
    basebackup_cache: Arc<BasebackupCache>,

-    feature_resolver: FeatureResolver,
+    feature_resolver: TenantFeatureResolver,
 }

 pub(crate) enum PreviousHeatmap {
@@ -2144,14 +2144,31 @@ impl Timeline {
        debug_assert_current_span_has_tenant_and_timeline_id();

        // Regardless of whether we're going to try_freeze_and_flush
-        // or not, stop ingesting any more data.
+        // or not, cancel walreceiver to stop ingesting more data asap.
+        //
+        // Note that we're accepting a race condition here where we may
+        // do the final flush below, before walreceiver observes the
+        // cancellation and exits.
+        // This means we may open a new InMemoryLayer after the final flush below.
+        // Flush loop is also still running for a short while, so, in theory, it
+        // could also make its way into the upload queue.
+        //
+        // If we wait for the shutdown of the walreceiver before moving on to the
+        // flush, then that would be avoided. But we don't do it because the
+        // walreceiver entertains reads internally, which means that it possibly
+        // depends on the download of layers. Layer download is only sensitive to
+        // the cancellation of the entire timeline, so cancelling the walreceiver
+        // will have no effect on the individual get requests.
+        // This would cause problems when there are a lot of ongoing downloads or
+        // S3 is unavailable: detach, deletion, etc. would hang, and we couldn't
+        // deallocate the resources of the timeline.
        let walreceiver = self.walreceiver.lock().unwrap().take();
        tracing::debug!(
            is_some = walreceiver.is_some(),
            "Waiting for WalReceiverManager..."
        );
        if let Some(walreceiver) = walreceiver {
-            walreceiver.shutdown().await;
+            walreceiver.cancel().await;
        }
        // ... and inform any waiters for newer LSNs that there won't be any.
        self.last_record_lsn.shutdown();
@@ -4729,7 +4746,7 @@ impl Timeline {
                }

                // Fetch the next layer to flush, if any.
-                let (layer, l0_count, frozen_count, frozen_size) = {
+                let (layer, l0_count, frozen_count, frozen_size, open_layer_size) = {
                    let layers = self.layers.read(LayerManagerLockHolder::FlushLoop).await;
                    let Ok(lm) = layers.layer_map() else {
                        info!("dropping out of flush loop for timeline shutdown");
@@ -4742,8 +4759,13 @@ impl Timeline {
                        .iter()
                        .map(|l| l.estimated_in_mem_size())
                        .sum();
+                    let open_layer_size: u64 = lm
+                        .open_layer
+                        .as_ref()
+                        .map(|l| l.estimated_in_mem_size())
+                        .unwrap_or(0);
                    let layer = lm.frozen_layers.front().cloned();
-                    (layer, l0_count, frozen_count, frozen_size)
+                    (layer, l0_count, frozen_count, frozen_size, open_layer_size)
                    // drop 'layers' lock
                };
                let Some(layer) = layer else {
@@ -4756,7 +4778,7 @@ impl Timeline {
                    if l0_count >= stall_threshold {
                        warn!(
                            "stalling layer flushes for compaction backpressure at {l0_count} \
-                            L0 layers ({frozen_count} frozen layers with {frozen_size} bytes)"
+                            L0 layers ({frozen_count} frozen layers with {frozen_size} bytes, {open_layer_size} bytes in open layer)"
                        );
                        let stall_timer = self
                            .metrics
@@ -4809,7 +4831,7 @@ impl Timeline {
                        let delay = flush_duration.as_secs_f64();
                        info!(
                            "delaying layer flush by {delay:.3}s for compaction backpressure at \
-                            {l0_count} L0 layers ({frozen_count} frozen layers with {frozen_size} bytes)"
+                            {l0_count} L0 layers ({frozen_count} frozen layers with {frozen_size} bytes, {open_layer_size} bytes in open layer)"
                        );
                        let _delay_timer = self
                            .metrics
@@ -5308,6 +5330,7 @@ impl Timeline {
        ctx: &RequestContext,
        img_range: Range<Key>,
        io_concurrency: IoConcurrency,
+        progress: Option<(usize, usize)>,
    ) -> Result<ImageLayerCreationOutcome, CreateImageLayersError> {
        let mut wrote_keys = false;

@@ -5384,11 +5407,15 @@ impl Timeline {
            }
        }

+        let progress_report = progress
+            .map(|(idx, total)| format!("({idx}/{total}) "))
+            .unwrap_or_default();
        if wrote_keys {
            // Normal path: we have written some data into the new image layer for this
            // partition, so flush it to disk.
            info!(
-                "produced image layer for rel {}",
+                "{} produced image layer for rel {}",
+                progress_report,
                ImageLayerName {
                    key_range: img_range.clone(),
                    lsn
@@ -5398,7 +5425,12 @@ impl Timeline {
                unfinished_image_layer: image_layer_writer,
            })
        } else {
-            tracing::debug!("no data in range {}-{}", img_range.start, img_range.end);
+            tracing::debug!(
+                "{} no data in range {}-{}",
+                progress_report,
+                img_range.start,
+                img_range.end
+            );
            Ok(ImageLayerCreationOutcome::Empty)
        }
    }
@@ -5633,7 +5665,8 @@ impl Timeline {
            }
        }

-        for partition in partition_parts.iter() {
+        let total = partition_parts.len();
+        for (idx, partition) in partition_parts.iter().enumerate() {
            if self.cancel.is_cancelled() {
                return Err(CreateImageLayersError::Cancelled);
            }
@@ -5718,6 +5751,7 @@ impl Timeline {
                    ctx,
                    img_range.clone(),
                    io_concurrency,
+                    Some((idx, total)),
                )
                .await?
            } else {
@@ -6807,7 +6841,11 @@ impl Timeline {
                    Err(walredo::Error::Cancelled) => return Err(PageReconstructError::Cancelled),
                    Err(walredo::Error::Other(err)) => {
                        if fire_critical_error {
-                            critical!("walredo failure during page reconstruction: {err:?}");
+                            critical_timeline!(
+                                self.tenant_shard_id,
+                                self.timeline_id,
+                                "walredo failure during page reconstruction: {err:?}"
+                            );
                        }
                        return Err(PageReconstructError::WalRedo(
                            err.context("reconstruct a page image"),
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -9,7 +9,7 @@ use std::ops::{Deref, Range};
 use std::sync::Arc;
 use std::time::{Duration, Instant};

-use super::layer_manager::{LayerManagerLockHolder, LayerManagerReadGuard};
+use super::layer_manager::LayerManagerLockHolder;
 use super::{
    CompactFlags, CompactOptions, CompactionError, CreateImageLayersError, DurationRecorder,
    GetVectoredError, ImageLayerCreationMode, LastImageLayerCreationStatus, RecordedDuration,
@@ -36,7 +36,7 @@ use serde::Serialize;
 use tokio::sync::{OwnedSemaphorePermit, Semaphore};
 use tokio_util::sync::CancellationToken;
 use tracing::{Instrument, debug, error, info, info_span, trace, warn};
-use utils::critical;
+use utils::critical_timeline;
 use utils::id::TimelineId;
 use utils::lsn::Lsn;
 use wal_decoder::models::record::NeonWalRecord;
@@ -101,7 +101,11 @@ pub enum GcCompactionQueueItem {
        /// Whether the compaction is triggered automatically (determines whether we need to update L2 LSN)
        auto: bool,
    },
-    SubCompactionJob(CompactOptions),
+    SubCompactionJob {
+        i: usize,
+        total: usize,
+        options: CompactOptions,
+    },
    Notify(GcCompactionJobId, Option<Lsn>),
 }

@@ -163,7 +167,7 @@ impl GcCompactionQueueItem {
                running,
                job_id: id.0,
            }),
-            GcCompactionQueueItem::SubCompactionJob(options) => Some(CompactInfoResponse {
+            GcCompactionQueueItem::SubCompactionJob { options, .. } => Some(CompactInfoResponse {
                compact_key_range: options.compact_key_range,
                compact_lsn_range: options.compact_lsn_range,
                sub_compaction: options.sub_compaction,
@@ -489,7 +493,7 @@ impl GcCompactionQueue {
                .map(|job| job.compact_lsn_range.end)
                .max()
                .unwrap();
-            for job in jobs {
+            for (i, job) in jobs.into_iter().enumerate() {
                // Unfortunately we need to convert the `GcCompactJob` back to `CompactionOptions`
                // until we do further refactors to allow directly call `compact_with_gc`.
                let mut flags: EnumSet<CompactFlags> = EnumSet::default();
@@ -507,7 +511,11 @@ impl GcCompactionQueue {
                    compact_lsn_range: Some(job.compact_lsn_range.into()),
                    sub_compaction_max_job_size_mb: None,
                };
-                pending_tasks.push(GcCompactionQueueItem::SubCompactionJob(options));
+                pending_tasks.push(GcCompactionQueueItem::SubCompactionJob {
+                    options,
+                    i,
+                    total: jobs_len,
+                });
            }

            if !auto {
@@ -651,7 +659,7 @@ impl GcCompactionQueue {
                    }
                }
            }
-            GcCompactionQueueItem::SubCompactionJob(options) => {
+            GcCompactionQueueItem::SubCompactionJob { options, i, total } => {
                // TODO: error handling, clear the queue if any task fails?
                let _gc_guard = match gc_block.start().await {
                    Ok(guard) => guard,
@@ -663,6 +671,7 @@ impl GcCompactionQueue {
                        )));
                    }
                };
+                info!("running gc-compaction subcompaction job {}/{}", i, total);
                let res = timeline.compact_with_options(cancel, options, ctx).await;
                let compaction_result = match res {
                    Ok(res) => res,
@@ -1310,7 +1319,7 @@ impl Timeline {
            || cfg!(feature = "testing")
            || self
                .feature_resolver
-                .evaluate_boolean("image-compaction-boundary", self.tenant_shard_id.tenant_id)
+                .evaluate_boolean("image-compaction-boundary")
                .is_ok()
        {
            let last_repartition_lsn = self.partitioning.read().1;
@@ -1381,7 +1390,11 @@ impl Timeline {
                            GetVectoredError::MissingKey(_),
                        ) = err
                        {
-                            critical!("missing key during compaction: {err:?}");
+                            critical_timeline!(
+                                self.tenant_shard_id,
+                                self.timeline_id,
+                                "missing key during compaction: {err:?}"
+                            );
                        }
                    })?;

@@ -1409,7 +1422,11 @@ impl Timeline {

            // Alert on critical errors that indicate data corruption.
            Err(err) if err.is_critical() => {
-                critical!("could not compact, repartitioning keyspace failed: {err:?}");
+                critical_timeline!(
+                    self.tenant_shard_id,
+                    self.timeline_id,
+                    "could not compact, repartitioning keyspace failed: {err:?}"
+                );
            }

            // Log other errors. No partitioning? This is normal, if the timeline was just created
@@ -1591,13 +1608,15 @@ impl Timeline {
        let started = Instant::now();

        let mut replace_image_layers = Vec::new();
+        let total = layers_to_rewrite.len();

-        for layer in layers_to_rewrite {
+        for (i, layer) in layers_to_rewrite.into_iter().enumerate() {
            if self.cancel.is_cancelled() {
                return Err(CompactionError::ShuttingDown);
            }

-            info!(layer=%layer, "rewriting layer after shard split");
+            info!(layer=%layer, "rewriting layer after shard split: {}/{}", i + 1, total);
+
            let mut image_layer_writer = ImageLayerWriter::new(
                self.conf,
                self.timeline_id,
@@ -1779,20 +1798,14 @@ impl Timeline {
        } = {
            let phase1_span = info_span!("compact_level0_phase1");
            let ctx = ctx.attached_child();
-            let mut stats = CompactLevel0Phase1StatsBuilder {
+            let stats = CompactLevel0Phase1StatsBuilder {
                version: Some(2),
                tenant_id: Some(self.tenant_shard_id),
                timeline_id: Some(self.timeline_id),
                ..Default::default()
            };

-            let begin = tokio::time::Instant::now();
-            let phase1_layers_locked = self.layers.read(LayerManagerLockHolder::Compaction).await;
-            let now = tokio::time::Instant::now();
-            stats.read_lock_acquisition_micros =
-                DurationRecorder::Recorded(RecordedDuration(now - begin), now);
            self.compact_level0_phase1(
-                phase1_layers_locked,
                stats,
                target_file_size,
                force_compaction_ignore_threshold,
@@ -1813,16 +1826,19 @@ impl Timeline {
    }

    /// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment.
-    async fn compact_level0_phase1<'a>(
-        self: &'a Arc<Self>,
-        guard: LayerManagerReadGuard<'a>,
+    async fn compact_level0_phase1(
+        self: &Arc<Self>,
        mut stats: CompactLevel0Phase1StatsBuilder,
        target_file_size: u64,
        force_compaction_ignore_threshold: bool,
        ctx: &RequestContext,
    ) -> Result<CompactLevel0Phase1Result, CompactionError> {
-        stats.read_lock_held_spawn_blocking_startup_micros =
-            stats.read_lock_acquisition_micros.till_now(); // set by caller
+        let begin = tokio::time::Instant::now();
+        let guard = self.layers.read(LayerManagerLockHolder::Compaction).await;
+        let now = tokio::time::Instant::now();
+        stats.read_lock_acquisition_micros =
+            DurationRecorder::Recorded(RecordedDuration(now - begin), now);
+
        let layers = guard.layer_map()?;
        let level0_deltas = layers.level0_deltas();
        stats.level0_deltas_count = Some(level0_deltas.len());
@@ -1857,6 +1873,12 @@ impl Timeline {
            .map(|x| guard.get_from_desc(x))
            .collect::<Vec<_>>();

+        drop_layer_manager_rlock(guard);
+
+        // This is the last LSN that we have seen for L0 compaction in the timeline. This LSN might be updated
+        // by the time we finish the compaction. So we need to get it here.
+        let l0_last_record_lsn = self.get_last_record_lsn();
+
        // Gather the files to compact in this iteration.
        //
        // Start with the oldest Level 0 delta file, and collect any other
@@ -1944,9 +1966,7 @@ impl Timeline {
        // we don't accidentally use it later in the function.
        drop(level0_deltas);

-        stats.read_lock_held_prerequisites_micros = stats
-            .read_lock_held_spawn_blocking_startup_micros
-            .till_now();
+        stats.compaction_prerequisites_micros = stats.read_lock_acquisition_micros.till_now();

        // TODO: replace with streaming k-merge
        let all_keys = {
@@ -1968,7 +1988,7 @@ impl Timeline {
            all_keys
        };

-        stats.read_lock_held_key_sort_micros = stats.read_lock_held_prerequisites_micros.till_now();
+        stats.read_lock_held_key_sort_micros = stats.compaction_prerequisites_micros.till_now();

        // Determine N largest holes where N is number of compacted layers. The vec is sorted by key range start.
        //
@@ -2002,7 +2022,6 @@ impl Timeline {
                }
            }
            let max_holes = deltas_to_compact.len();
-            let last_record_lsn = self.get_last_record_lsn();
            let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128;
            let min_hole_coverage_size = 3; // TODO: something more flexible?
            // min-heap (reserve space for one more element added before eviction)
@@ -2021,8 +2040,12 @@ impl Timeline {
                        // has not so much sense, because largest holes will corresponds field1/field2 changes.
                        // But we are mostly interested to eliminate holes which cause generation of excessive image layers.
                        // That is why it is better to measure size of hole as number of covering image layers.
-                        let coverage_size =
-                            layers.image_coverage(&key_range, last_record_lsn).len();
+                        let coverage_size = {
+                            // TODO: optimize this with copy-on-write layer map.
+                            let guard = self.layers.read(LayerManagerLockHolder::Compaction).await;
+                            let layers = guard.layer_map()?;
+                            layers.image_coverage(&key_range, l0_last_record_lsn).len()
+                        };
                        if coverage_size >= min_hole_coverage_size {
                            heap.push(Hole {
                                key_range,
@@ -2041,7 +2064,6 @@ impl Timeline {
            holes
        };
        stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now();
-        drop_layer_manager_rlock(guard);

        if self.cancel.is_cancelled() {
            return Err(CompactionError::ShuttingDown);
@@ -2382,9 +2404,8 @@ struct CompactLevel0Phase1StatsBuilder {
    tenant_id: Option<TenantShardId>,
    timeline_id: Option<TimelineId>,
    read_lock_acquisition_micros: DurationRecorder,
-    read_lock_held_spawn_blocking_startup_micros: DurationRecorder,
    read_lock_held_key_sort_micros: DurationRecorder,
-    read_lock_held_prerequisites_micros: DurationRecorder,
+    compaction_prerequisites_micros: DurationRecorder,
    read_lock_held_compute_holes_micros: DurationRecorder,
    read_lock_drop_micros: DurationRecorder,
    write_layer_files_micros: DurationRecorder,
@@ -2399,9 +2420,8 @@ struct CompactLevel0Phase1Stats {
    tenant_id: TenantShardId,
    timeline_id: TimelineId,
    read_lock_acquisition_micros: RecordedDuration,
-    read_lock_held_spawn_blocking_startup_micros: RecordedDuration,
    read_lock_held_key_sort_micros: RecordedDuration,
-    read_lock_held_prerequisites_micros: RecordedDuration,
+    compaction_prerequisites_micros: RecordedDuration,
    read_lock_held_compute_holes_micros: RecordedDuration,
    read_lock_drop_micros: RecordedDuration,
    write_layer_files_micros: RecordedDuration,
@@ -2426,16 +2446,12 @@ impl TryFrom<CompactLevel0Phase1StatsBuilder> for CompactLevel0Phase1Stats {
                .read_lock_acquisition_micros
                .into_recorded()
                .ok_or_else(|| anyhow!("read_lock_acquisition_micros not set"))?,
-            read_lock_held_spawn_blocking_startup_micros: value
-                .read_lock_held_spawn_blocking_startup_micros
-                .into_recorded()
-                .ok_or_else(|| anyhow!("read_lock_held_spawn_blocking_startup_micros not set"))?,
            read_lock_held_key_sort_micros: value
                .read_lock_held_key_sort_micros
                .into_recorded()
                .ok_or_else(|| anyhow!("read_lock_held_key_sort_micros not set"))?,
-            read_lock_held_prerequisites_micros: value
-                .read_lock_held_prerequisites_micros
+            compaction_prerequisites_micros: value
+                .compaction_prerequisites_micros
                .into_recorded()
                .ok_or_else(|| anyhow!("read_lock_held_prerequisites_micros not set"))?,
            read_lock_held_compute_holes_micros: value
@@ -3503,22 +3519,16 @@ impl Timeline {
        // Only create image layers when there is no ancestor branches. TODO: create covering image layer
        // when some condition meet.
        let mut image_layer_writer = if !has_data_below {
-            Some(
-                SplitImageLayerWriter::new(
-                    self.conf,
-                    self.timeline_id,
-                    self.tenant_shard_id,
-                    job_desc.compaction_key_range.start,
-                    lowest_retain_lsn,
-                    self.get_compaction_target_size(),
-                    &self.gate,
-                    self.cancel.clone(),
-                    ctx,
-                )
-                .await
-                .context("failed to create image layer writer")
-                .map_err(CompactionError::Other)?,
-            )
+            Some(SplitImageLayerWriter::new(
+                self.conf,
+                self.timeline_id,
+                self.tenant_shard_id,
+                job_desc.compaction_key_range.start,
+                lowest_retain_lsn,
+                self.get_compaction_target_size(),
+                &self.gate,
+                self.cancel.clone(),
+            ))
        } else {
            None
        };
@@ -4352,6 +4362,7 @@ impl TimelineAdaptor {
                ctx,
                key_range.clone(),
                IoConcurrency::sequential(),
+                None,
            )
            .await?;

--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -182,6 +182,7 @@ pub(crate) async fn generate_tombstone_image_layer(
    detached: &Arc<Timeline>,
    ancestor: &Arc<Timeline>,
    ancestor_lsn: Lsn,
+    historic_layers_to_copy: &[Layer],
    ctx: &RequestContext,
 ) -> Result<Option<ResidentLayer>, Error> {
    tracing::info!(
@@ -199,6 +200,20 @@ pub(crate) async fn generate_tombstone_image_layer(
    let image_lsn = ancestor_lsn;

    {
+        for layer in historic_layers_to_copy {
+            let desc = layer.layer_desc();
+            if !desc.is_delta
+                && desc.lsn_range.start == image_lsn
+                && overlaps_with(&key_range, &desc.key_range)
+            {
+                tracing::info!(
+                    layer=%layer, "will copy tombstone from ancestor instead of creating a new one"
+                );
+
+                return Ok(None);
+            }
+        }
+
        let layers = detached
            .layers
            .read(LayerManagerLockHolder::DetachAncestor)
@@ -450,7 +465,8 @@ pub(super) async fn prepare(
        Vec::with_capacity(straddling_branchpoint.len() + rest_of_historic.len() + 1);

    if let Some(tombstone_layer) =
-        generate_tombstone_image_layer(detached, &ancestor, ancestor_lsn, ctx).await?
+        generate_tombstone_image_layer(detached, &ancestor, ancestor_lsn, &rest_of_historic, ctx)
+            .await?
    {
        new_layers.push(tombstone_layer.into());
    }
--- a/pageserver/src/tenant/timeline/offload.rs
+++ b/pageserver/src/tenant/timeline/offload.rs
@@ -19,6 +19,8 @@ pub(crate) enum OffloadError {
    NotArchived,
    #[error(transparent)]
    RemoteStorage(anyhow::Error),
+    #[error("Offload or deletion already in progress")]
+    AlreadyInProgress,
    #[error("Unexpected offload error: {0}")]
    Other(anyhow::Error),
 }
@@ -44,20 +46,26 @@ pub(crate) async fn offload_timeline(
        timeline.timeline_id,
        TimelineDeleteGuardKind::Offload,
    );
-    if let Err(DeleteTimelineError::HasChildren(children)) = delete_guard_res {
-        let is_archived = timeline.is_archived();
-        if is_archived == Some(true) {
-            tracing::error!("timeline is archived but has non-archived children: {children:?}");
+    let (timeline, guard) = match delete_guard_res {
+        Ok(timeline_and_guard) => timeline_and_guard,
+        Err(DeleteTimelineError::HasChildren(children)) => {
+            let is_archived = timeline.is_archived();
+            if is_archived == Some(true) {
+                tracing::error!("timeline is archived but has non-archived children: {children:?}");
+                return Err(OffloadError::NotArchived);
+            }
+            tracing::info!(
+                ?is_archived,
+                "timeline is not archived and has unarchived children"
+            );
            return Err(OffloadError::NotArchived);
        }
-        tracing::info!(
-            ?is_archived,
-            "timeline is not archived and has unarchived children"
-        );
-        return Err(OffloadError::NotArchived);
+        Err(DeleteTimelineError::AlreadyInProgress(_)) => {
+            tracing::info!("timeline offload or deletion already in progress");
+            return Err(OffloadError::AlreadyInProgress);
+        }
+        Err(e) => return Err(OffloadError::Other(anyhow::anyhow!(e))),
    };
-    let (timeline, guard) =
-        delete_guard_res.map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?;

    let TimelineOrOffloaded::Timeline(timeline) = timeline else {
        tracing::error!("timeline already offloaded, but given timeline object");
--- a/pageserver/src/tenant/timeline/walreceiver.rs
+++ b/pageserver/src/tenant/timeline/walreceiver.rs
@@ -63,7 +63,6 @@ pub struct WalReceiver {
    /// All task spawned by [`WalReceiver::start`] and its children are sensitive to this token.
    /// It's a child token of [`Timeline`] so that timeline shutdown can cancel WalReceiver tasks early for `freeze_and_flush=true`.
    cancel: CancellationToken,
-    task: tokio::task::JoinHandle<()>,
 }

 impl WalReceiver {
@@ -80,7 +79,7 @@ impl WalReceiver {
        let loop_status = Arc::new(std::sync::RwLock::new(None));
        let manager_status = Arc::clone(&loop_status);
        let cancel = timeline.cancel.child_token();
-        let task = WALRECEIVER_RUNTIME.spawn({
+        let _task = WALRECEIVER_RUNTIME.spawn({
            let cancel = cancel.clone();
            async move {
                debug_assert_current_span_has_tenant_and_timeline_id();
@@ -121,25 +120,14 @@ impl WalReceiver {
        Self {
            manager_status,
            cancel,
-            task,
        }
    }

    #[instrument(skip_all, level = tracing::Level::DEBUG)]
-    pub async fn shutdown(self) {
+    pub async fn cancel(self) {
        debug_assert_current_span_has_tenant_and_timeline_id();
        debug!("cancelling walreceiver tasks");
        self.cancel.cancel();
-        match self.task.await {
-            Ok(()) => debug!("Shutdown success"),
-            Err(je) if je.is_cancelled() => unreachable!("not used"),
-            Err(je) if je.is_panic() => {
-                // already logged by panic hook
-            }
-            Err(je) => {
-                error!("shutdown walreceiver task join error: {je}")
-            }
-        }
    }

    pub(crate) fn status(&self) -> Option<ConnectionManagerStatus> {
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -100,6 +100,7 @@ pub(super) async fn connection_manager_loop_step(
    // with other streams on this client (other connection managers). When
    // object goes out of scope, stream finishes in drop() automatically.
    let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id, cancel).await?;
+    let mut broker_reset_interval = tokio::time::interval(tokio::time::Duration::from_secs(30));
    debug!("Subscribed for broker timeline updates");

    loop {
@@ -156,7 +157,10 @@ pub(super) async fn connection_manager_loop_step(
            // Got a new update from the broker
            broker_update = broker_subscription.message() /* TODO: review cancellation-safety */ => {
                match broker_update {
-                    Ok(Some(broker_update)) => connection_manager_state.register_timeline_update(broker_update),
+                    Ok(Some(broker_update)) => {
+                        broker_reset_interval.reset();
+                        connection_manager_state.register_timeline_update(broker_update);
+                    },
                    Err(status) => {
                        match status.code() {
                            Code::Unknown if status.message().contains("stream closed because of a broken pipe") || status.message().contains("connection reset") || status.message().contains("error reading a body from connection") => {
@@ -178,6 +182,14 @@ pub(super) async fn connection_manager_loop_step(
                }
            },

+            _ = broker_reset_interval.tick() => {
+                if wait_lsn_status.borrow().is_some() {
+                    tracing::warn!("No broker updates received for a while, but waiting for WAL. Re-setting stream ...")
+                }
+
+                broker_subscription = subscribe_for_timeline_updates(broker_client, id, cancel).await?;
+            },
+
            new_event = async {
                // Reminder: this match arm needs to be cancellation-safe.
                loop {
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -25,7 +25,7 @@ use tokio_postgres::replication::ReplicationStream;
 use tokio_postgres::{Client, SimpleQueryMessage, SimpleQueryRow};
 use tokio_util::sync::CancellationToken;
 use tracing::{Instrument, debug, error, info, trace, warn};
-use utils::critical;
+use utils::critical_timeline;
 use utils::id::NodeId;
 use utils::lsn::Lsn;
 use utils::pageserver_feedback::PageserverFeedback;
@@ -275,20 +275,12 @@ pub(super) async fn handle_walreceiver_connection(
    let copy_stream = replication_client.copy_both_simple(&query).await?;
    let mut physical_stream = pin!(ReplicationStream::new(copy_stream));

-    let walingest_future = WalIngest::new(timeline.as_ref(), startpoint, &ctx);
-    let walingest_res = select! {
-        walingest_res = walingest_future => walingest_res,
-        _ = cancellation.cancelled() => {
-            // We are doing reads in WalIngest::new, and those can hang as they come from the network.
-            // Timeline cancellation hits the walreceiver cancellation token before it hits the timeline global one.
-            debug!("Connection cancelled");
-            return Err(WalReceiverError::Cancelled);
-        },
-    };
-    let mut walingest = walingest_res.map_err(|e| match e.kind {
-        crate::walingest::WalIngestErrorKind::Cancelled => WalReceiverError::Cancelled,
-        _ => WalReceiverError::Other(e.into()),
-    })?;
+    let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx)
+        .await
+        .map_err(|e| match e.kind {
+            crate::walingest::WalIngestErrorKind::Cancelled => WalReceiverError::Cancelled,
+            _ => WalReceiverError::Other(e.into()),
+        })?;

    let (format, compression) = match protocol {
        PostgresClientProtocol::Interpreted {
@@ -368,9 +360,13 @@ pub(super) async fn handle_walreceiver_connection(
                        match raw_wal_start_lsn.cmp(&expected_wal_start) {
                            std::cmp::Ordering::Greater => {
                                let msg = format!(
-                                    "Gap in streamed WAL: [{expected_wal_start}, {raw_wal_start_lsn})"
+                                    "Gap in streamed WAL: [{expected_wal_start}, {raw_wal_start_lsn}"
+                                );
+                                critical_timeline!(
+                                    timeline.tenant_shard_id,
+                                    timeline.timeline_id,
+                                    "{msg}"
                                );
-                                critical!("{msg}");
                                return Err(WalReceiverError::Other(anyhow!(msg)));
                            }
                            std::cmp::Ordering::Less => {
@@ -383,7 +379,11 @@ pub(super) async fn handle_walreceiver_connection(
                                            "Received record with next_record_lsn multiple times ({} < {})",
                                            first_rec.next_record_lsn, expected_wal_start
                                        );
-                                        critical!("{msg}");
+                                        critical_timeline!(
+                                            timeline.tenant_shard_id,
+                                            timeline.timeline_id,
+                                            "{msg}"
+                                        );
                                        return Err(WalReceiverError::Other(anyhow!(msg)));
                                    }
                                }
@@ -452,7 +452,11 @@ pub(super) async fn handle_walreceiver_connection(
                            // TODO: we can't differentiate cancellation errors with
                            // anyhow::Error, so just ignore it if we're cancelled.
                            if !cancellation.is_cancelled() && !timeline.is_stopping() {
-                                critical!("{err:?}")
+                                critical_timeline!(
+                                    timeline.tenant_shard_id,
+                                    timeline.timeline_id,
+                                    "{err:?}"
+                                );
                            }
                        })?;

--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -40,7 +40,7 @@ use tracing::*;
 use utils::bin_ser::{DeserializeError, SerializeError};
 use utils::lsn::Lsn;
 use utils::rate_limit::RateLimit;
-use utils::{critical, failpoint_support};
+use utils::{critical_timeline, failpoint_support};
 use wal_decoder::models::record::NeonWalRecord;
 use wal_decoder::models::*;

@@ -418,18 +418,30 @@ impl WalIngest {
        // as there has historically been cases where PostgreSQL has cleared spurious VM pages. See:
        // https://github.com/neondatabase/neon/pull/10634.
        let Some(vm_size) = get_relsize(modification, vm_rel, ctx).await? else {
-            critical!("clear_vm_bits for unknown VM relation {vm_rel}");
+            critical_timeline!(
+                modification.tline.tenant_shard_id,
+                modification.tline.timeline_id,
+                "clear_vm_bits for unknown VM relation {vm_rel}"
+            );
            return Ok(());
        };
        if let Some(blknum) = new_vm_blk {
            if blknum >= vm_size {
-                critical!("new_vm_blk {blknum} not in {vm_rel} of size {vm_size}");
+                critical_timeline!(
+                    modification.tline.tenant_shard_id,
+                    modification.tline.timeline_id,
+                    "new_vm_blk {blknum} not in {vm_rel} of size {vm_size}"
+                );
                new_vm_blk = None;
            }
        }
        if let Some(blknum) = old_vm_blk {
            if blknum >= vm_size {
-                critical!("old_vm_blk {blknum} not in {vm_rel} of size {vm_size}");
+                critical_timeline!(
+                    modification.tline.tenant_shard_id,
+                    modification.tline.timeline_id,
+                    "old_vm_blk {blknum} not in {vm_rel} of size {vm_size}"
+                );
                old_vm_blk = None;
            }
        }
--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -22,7 +22,8 @@ OBJS = \
 	walproposer.o \
 	walproposer_pg.o \
 	neon_ddl_handler.o \
-	walsender_hooks.o
+	walsender_hooks.o \
+	$(NEON_CARGO_ARTIFACT_TARGET_DIR)/libcommunicator.a

 PG_CPPFLAGS = -I$(libpq_srcdir)
 SHLIB_LINK_INTERNAL = $(libpq)
@@ -54,6 +55,17 @@ WALPROP_OBJS = \
 	neon_utils.o \
 	walproposer_compat.o

+# libcommunicator.a is built by cargo from the Rust sources under communicator/
+# subdirectory. `cargo build` also generates communicator_bindings.h.
+neon.o: communicator/communicator_bindings.h
+
+$(NEON_CARGO_ARTIFACT_TARGET_DIR)/libcommunicator.a communicator/communicator_bindings.h &:
+	(cd $(srcdir)/communicator && cargo build $(CARGO_BUILD_FLAGS) $(CARGO_PROFILE))
+
+# Force `cargo build` every time. Some of the Rust sources might have
+# changed.
+.PHONY: $(NEON_CARGO_ARTIFACT_TARGET_DIR)/libcommunicator.a communicator/communicator_bindings.h
+
 .PHONY: walproposer-lib
 walproposer-lib: CPPFLAGS += -DWALPROPOSER_LIB
 walproposer-lib: libwalproposer.a;
--- a/pgxn/neon/communicator/.gitignore
+++ b/pgxn/neon/communicator/.gitignore
@@ -0,0 +1,2 @@
+# generated file (with cbindgen, see build.rs)
+communicator_bindings.h
--- a/pgxn/neon/communicator/Cargo.toml
+++ b/pgxn/neon/communicator/Cargo.toml
@@ -0,0 +1,20 @@
+[package]
+name = "communicator"
+version = "0.1.0"
+license.workspace = true
+edition.workspace = true
+
+[lib]
+crate-type = ["staticlib"]
+
+[features]
+# 'testing' feature is currently unused in the communicator, but we accept it for convenience of
+# calling build scripts, so that you can pass the same feature to all packages.
+testing = []
+
+[dependencies]
+neon-shmem.workspace = true
+workspace_hack = { version = "0.1", path = "../../../workspace_hack" }
+
+[build-dependencies]
+cbindgen.workspace = true
--- a/pgxn/neon/communicator/README.md
+++ b/pgxn/neon/communicator/README.md
@@ -0,0 +1,8 @@
+This package will evolve into a "compute-pageserver communicator"
+process and machinery. For now, it's just a dummy that doesn't do
+anything interesting, but it allows us to test the compilation and
+linking of Rust code into the Postgres extensions.
+
+At compilation time, pgxn/neon/communicator/ produces a static
+library, libcommunicator.a. It is linked to the neon.so extension
+library.
--- a/pgxn/neon/communicator/build.rs
+++ b/pgxn/neon/communicator/build.rs
@@ -0,0 +1,20 @@
+use std::env;
+
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let crate_dir = env::var("CARGO_MANIFEST_DIR").unwrap();
+
+    match cbindgen::generate(crate_dir) {
+        Ok(bindings) => {
+            bindings.write_to_file("communicator_bindings.h");
+        }
+        Err(cbindgen::Error::ParseSyntaxError { .. }) => {
+            // This means there was a syntax error in the Rust sources. Don't panic, because
+            // we want the build to continue and the Rust compiler to hit the error. The
+            // Rust compiler produces a better error message than cbindgen.
+            eprintln!("Generating C bindings failed because of a Rust syntax error");
+        }
+        Err(err) => panic!("Unable to generate C bindings: {err:?}"),
+    };
+
+    Ok(())
+}
--- a/pgxn/neon/communicator/cbindgen.toml
+++ b/pgxn/neon/communicator/cbindgen.toml
@@ -0,0 +1,4 @@
+language = "C"
+
+[enum]
+prefix_with_name = true
--- a/pgxn/neon/communicator/src/lib.rs
+++ b/pgxn/neon/communicator/src/lib.rs
@@ -0,0 +1,6 @@
+/// dummy function, just to test linking Rust functions into the C
+/// extension
+#[unsafe(no_mangle)]
+pub extern "C" fn communicator_dummy(arg: u32) -> u32 {
+    arg + 1
+}
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -43,6 +43,9 @@
 #include "storage/ipc.h"
 #endif

+/* the rust bindings, generated by cbindgen */
+#include "communicator/communicator_bindings.h"
+
 PG_MODULE_MAGIC;
 void		_PG_init(void);

@@ -87,6 +90,14 @@ static const struct config_enum_entry running_xacts_overflow_policies[] = {
 	{NULL, 0, false}
 };

+static const struct config_enum_entry debug_compare_local_modes[] = {
+	{"none", DEBUG_COMPARE_LOCAL_NONE, false},
+	{"prefetch", DEBUG_COMPARE_LOCAL_PREFETCH, false},
+	{"lfc", DEBUG_COMPARE_LOCAL_LFC, false},
+	{"all", DEBUG_COMPARE_LOCAL_ALL, false},
+	{NULL, 0, false}
+};
+
 /*
 * XXX: These private to procarray.c, but we need them here.
 */
@@ -444,6 +455,9 @@ _PG_init(void)
 	shmem_startup_hook = neon_shmem_startup_hook;
 #endif

+	/* dummy call to a Rust function in the communicator library, to check that it works */
+	(void) communicator_dummy(123);
+
 	pg_init_libpagestore();
 	lfc_init();
 	pg_init_walproposer();
@@ -519,6 +533,16 @@ _PG_init(void)
 							GUC_UNIT_KB,
 							NULL, NULL, NULL);

+	DefineCustomEnumVariable(
+							"neon.debug_compare_local",
+							"Debug mode for comparing content of pages in prefetch ring/LFC/PS and local disk",
+							NULL,
+							&debug_compare_local,
+							DEBUG_COMPARE_LOCAL_NONE,
+							debug_compare_local_modes,
+							PGC_POSTMASTER,
+							0,
+							NULL, NULL, NULL);
 	/*
 	 * Important: This must happen after other parts of the extension are
 	 * loaded, otherwise any settings to GUCs that were set before the
--- a/pgxn/neon/neon_ddl_handler.c
+++ b/pgxn/neon/neon_ddl_handler.c
@@ -98,12 +98,14 @@ typedef struct
 typedef struct DdlHashTable
 {
 	struct DdlHashTable *prev_table;
+	int			subtrans_level;
 	HTAB	   *db_table;
 	HTAB	   *role_table;
 } DdlHashTable;

 static DdlHashTable RootTable;
 static DdlHashTable *CurrentDdlTable = &RootTable;
+static int SubtransLevel; /* current nesting level of subtransactions */

 static void
 PushKeyValue(JsonbParseState **state, char *key, char *value)
@@ -332,9 +334,25 @@ SendDeltasToControlPlane()
 	}
 }

+static void
+InitCurrentDdlTableIfNeeded()
+{
+	/* Lazy construction of DdlHashTable chain */
+	if (SubtransLevel > CurrentDdlTable->subtrans_level)
+	{
+		DdlHashTable *new_table = MemoryContextAlloc(CurTransactionContext, sizeof(DdlHashTable));
+		new_table->prev_table = CurrentDdlTable;
+		new_table->subtrans_level = SubtransLevel;
+		new_table->role_table = NULL;
+		new_table->db_table = NULL;
+		CurrentDdlTable = new_table;
+	}
+}
+
 static void
 InitDbTableIfNeeded()
 {
+	InitCurrentDdlTableIfNeeded();
 	if (!CurrentDdlTable->db_table)
 	{
 		HASHCTL		db_ctl = {};
@@ -353,6 +371,7 @@ InitDbTableIfNeeded()
 static void
 InitRoleTableIfNeeded()
 {
+	InitCurrentDdlTableIfNeeded();
 	if (!CurrentDdlTable->role_table)
 	{
 		HASHCTL		role_ctl = {};
@@ -371,19 +390,21 @@ InitRoleTableIfNeeded()
 static void
 PushTable()
 {
-	DdlHashTable *new_table = MemoryContextAlloc(CurTransactionContext, sizeof(DdlHashTable));
-
-	new_table->prev_table = CurrentDdlTable;
-	new_table->role_table = NULL;
-	new_table->db_table = NULL;
-	CurrentDdlTable = new_table;
+	SubtransLevel += 1;
 }

 static void
 MergeTable()
 {
-	DdlHashTable *old_table = CurrentDdlTable;
+	DdlHashTable *old_table;

+	Assert(SubtransLevel >= CurrentDdlTable->subtrans_level);
+	if (--SubtransLevel >= CurrentDdlTable->subtrans_level)
+	{
+		return;
+	}
+
+	old_table = CurrentDdlTable;
 	CurrentDdlTable = old_table->prev_table;

 	if (old_table->db_table)
@@ -476,11 +497,15 @@ MergeTable()
 static void
 PopTable()
 {
-	/*
-	 * Current table gets freed because it is allocated in aborted
-	 * subtransaction's memory context.
-	 */
-	CurrentDdlTable = CurrentDdlTable->prev_table;
+	Assert(SubtransLevel >= CurrentDdlTable->subtrans_level);
+	if (--SubtransLevel < CurrentDdlTable->subtrans_level)
+	{
+		/*
+		 * Current table gets freed because it is allocated in aborted
+		 * subtransaction's memory context.
+		 */
+		CurrentDdlTable = CurrentDdlTable->prev_table;
+	}
 }

 static void
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -177,6 +177,22 @@ extern StringInfoData nm_pack_request(NeonRequest *msg);
 extern NeonResponse *nm_unpack_response(StringInfo s);
 extern char *nm_to_string(NeonMessage *msg);

+/*
+ * If debug_compare_local>DEBUG_COMPARE_LOCAL_NONE, we pass through all the SMGR API
+ * calls to md.c, and *also* do the calls to the Page Server. On every
+ * read, compare the versions we read from local disk and Page Server,
+ * and Assert that they are identical.
+ */
+typedef enum
+{
+	DEBUG_COMPARE_LOCAL_NONE,     /* normal mode - pages are stored locally only for unlogged relations */
+	DEBUG_COMPARE_LOCAL_PREFETCH, /* if page is found in prefetch ring, then compare it with local and return */
+	DEBUG_COMPARE_LOCAL_LFC,      /* if page is found in LFC or prefetch ring, then compare it with local and return */
+	DEBUG_COMPARE_LOCAL_ALL       /* always fetch page from PS and compare it with local */
+} DebugCompareLocalMode;
+
+extern int debug_compare_local;
+
 /*
 * API
 */
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -76,21 +76,11 @@
 typedef PGAlignedBlock PGIOAlignedBlock;
 #endif

-/*
- * If DEBUG_COMPARE_LOCAL is defined, we pass through all the SMGR API
- * calls to md.c, and *also* do the calls to the Page Server. On every
- * read, compare the versions we read from local disk and Page Server,
- * and Assert that they are identical.
- */
-/* #define DEBUG_COMPARE_LOCAL */
-
-#ifdef DEBUG_COMPARE_LOCAL
 #include "access/nbtree.h"
 #include "storage/bufpage.h"
 #include "access/xlog_internal.h"

 static char *hexdump_page(char *page);
-#endif

 #define IS_LOCAL_REL(reln) (\
 	NInfoGetDbOid(InfoFromSMgrRel(reln)) != 0 && \
@@ -108,6 +98,8 @@ typedef enum
 	UNLOGGED_BUILD_NOT_PERMANENT
 } UnloggedBuildPhase;

+int debug_compare_local;
+
 static NRelFileInfo unlogged_build_rel_info;
 static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;

@@ -478,9 +470,10 @@ neon_init(void)
 	old_redo_read_buffer_filter = redo_read_buffer_filter;
 	redo_read_buffer_filter = neon_redo_read_buffer_filter;

-#ifdef DEBUG_COMPARE_LOCAL
-	mdinit();
-#endif
+	if (debug_compare_local)
+	{
+		mdinit();
+	}
 }

 /*
@@ -803,13 +796,16 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)

 		case RELPERSISTENCE_TEMP:
 		case RELPERSISTENCE_UNLOGGED:
-#ifdef DEBUG_COMPARE_LOCAL
-			mdcreate(reln, forkNum, forkNum == INIT_FORKNUM || isRedo);
-			if (forkNum == MAIN_FORKNUM)
-				mdcreate(reln, INIT_FORKNUM, true);
-#else
-			mdcreate(reln, forkNum, isRedo);
-#endif
+			if (debug_compare_local)
+			{
+				mdcreate(reln, forkNum, forkNum == INIT_FORKNUM || isRedo);
+				if (forkNum == MAIN_FORKNUM)
+					mdcreate(reln, INIT_FORKNUM, true);
+			}
+			else
+			{
+				mdcreate(reln, forkNum, isRedo);
+			}
 			return;

 		default:
@@ -848,10 +844,11 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
 	else
 		set_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0);

-#ifdef DEBUG_COMPARE_LOCAL
-	if (IS_LOCAL_REL(reln))
-		mdcreate(reln, forkNum, isRedo);
-#endif
+	if (debug_compare_local)
+	{
+		if (IS_LOCAL_REL(reln))
+			mdcreate(reln, forkNum, isRedo);
+	}
 }

 /*
@@ -877,7 +874,7 @@ neon_unlink(NRelFileInfoBackend rinfo, ForkNumber forkNum, bool isRedo)
 {
 	/*
 	 * Might or might not exist locally, depending on whether it's an unlogged
-	 * or permanent relation (or if DEBUG_COMPARE_LOCAL is set). Try to
+	 * or permanent relation (or if debug_compare_local is set). Try to
 	 * unlink, it won't do any harm if the file doesn't exist.
 	 */
 	mdunlink(rinfo, forkNum, isRedo);
@@ -973,10 +970,11 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,

 	lfc_write(InfoFromSMgrRel(reln), forkNum, blkno, buffer);

-#ifdef DEBUG_COMPARE_LOCAL
-	if (IS_LOCAL_REL(reln))
-		mdextend(reln, forkNum, blkno, buffer, skipFsync);
-#endif
+	if (debug_compare_local)
+	{
+		if (IS_LOCAL_REL(reln))
+			mdextend(reln, forkNum, blkno, buffer, skipFsync);
+	}

 	/*
 	 * smgr_extend is often called with an all-zeroes page, so
@@ -1051,10 +1049,11 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
 						relpath(reln->smgr_rlocator, forkNum),
 						InvalidBlockNumber)));

-#ifdef DEBUG_COMPARE_LOCAL
-	if (IS_LOCAL_REL(reln))
-		mdzeroextend(reln, forkNum, blocknum, nblocks, skipFsync);
-#endif
+	if (debug_compare_local)
+	{
+		if (IS_LOCAL_REL(reln))
+			mdzeroextend(reln, forkNum, blocknum, nblocks, skipFsync);
+	}

 	/* Don't log any pages if we're not allowed to do so. */
 	if (!XLogInsertAllowed())
@@ -1265,10 +1264,11 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,

 	communicator_prefetch_pump_state();

-#ifdef DEBUG_COMPARE_LOCAL
-	if (IS_LOCAL_REL(reln))
-		mdwriteback(reln, forknum, blocknum, nblocks);
-#endif
+	if (debug_compare_local)
+	{
+		if (IS_LOCAL_REL(reln))
+			mdwriteback(reln, forknum, blocknum, nblocks);
+	}
 }

 /*
@@ -1282,7 +1282,6 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	communicator_read_at_lsnv(rinfo, forkNum, blkno, &request_lsns, &buffer, 1, NULL);
 }

-#ifdef DEBUG_COMPARE_LOCAL
 static void
 compare_with_local(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void* buffer, XLogRecPtr request_lsn)
 {
@@ -1364,7 +1363,6 @@ compare_with_local(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, voi
 		}
 	}
 }
-#endif


 #if PG_MAJORVERSION_NUM < 17
@@ -1417,22 +1415,28 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
 	if (communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, &bufferp, &present))
 	{
 		/* Prefetch hit */
-#ifdef DEBUG_COMPARE_LOCAL
-		compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn);
-#else
-		return;
-#endif
+		if (debug_compare_local >= DEBUG_COMPARE_LOCAL_PREFETCH)
+		{
+			compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn);
+		}
+		if (debug_compare_local <= DEBUG_COMPARE_LOCAL_PREFETCH)
+		{
+			return;
+		}
 	}

 	/* Try to read from local file cache */
 	if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer))
 	{
 		MyNeonCounters->file_cache_hits_total++;
-#ifdef DEBUG_COMPARE_LOCAL
-		compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn);
-#else
-		return;
-#endif
+		if (debug_compare_local >= DEBUG_COMPARE_LOCAL_LFC)
+		{
+			compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn);
+		}
+		if (debug_compare_local <= DEBUG_COMPARE_LOCAL_LFC)
+		{
+			return;
+		}
 	}

 	neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer);
@@ -1442,15 +1446,15 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
 	 */
 	communicator_prefetch_pump_state();

-#ifdef DEBUG_COMPARE_LOCAL
-	compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn);
-#endif
+	if (debug_compare_local)
+	{
+		compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn);
+	}
 }
 #endif /* PG_MAJORVERSION_NUM <= 16 */

 #if PG_MAJORVERSION_NUM >= 17

-#ifdef DEBUG_COMPARE_LOCAL
 static void
 compare_with_localv(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void** buffers, BlockNumber nblocks, neon_request_lsns* request_lsns, bits8* read_pages)
 {
@@ -1465,7 +1469,6 @@ compare_with_localv(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, vo
 		}
 	}
 }
-#endif


 static void
@@ -1516,13 +1519,19 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 													blocknum, request_lsns, nblocks,
 													buffers, read_pages);

-#ifdef DEBUG_COMPARE_LOCAL
-	compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages);
-	memset(read_pages, 0, sizeof(read_pages));
-#else
-	if (prefetch_result == nblocks)
+	if (debug_compare_local >= DEBUG_COMPARE_LOCAL_PREFETCH)
+	{
+		compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages);
+	}
+	if (debug_compare_local <= DEBUG_COMPARE_LOCAL_PREFETCH && prefetch_result == nblocks)
+	{
 		return;
-#endif
+	}
+	if (debug_compare_local > DEBUG_COMPARE_LOCAL_PREFETCH)
+	{
+		memset(read_pages, 0, sizeof(read_pages));
+	}
+

 	/* Try to read from local file cache */
 	lfc_result = lfc_readv_select(InfoFromSMgrRel(reln), forknum, blocknum, buffers,
@@ -1531,14 +1540,19 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 	if (lfc_result > 0)
 		MyNeonCounters->file_cache_hits_total += lfc_result;

-#ifdef DEBUG_COMPARE_LOCAL
-	compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages);
-	memset(read_pages, 0, sizeof(read_pages));
-#else
-	/* Read all blocks from LFC, so we're done */
-	if (prefetch_result + lfc_result == nblocks)
+	if (debug_compare_local >= DEBUG_COMPARE_LOCAL_LFC)
+	{
+		compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages);
+	}
+	if (debug_compare_local <= DEBUG_COMPARE_LOCAL_LFC && prefetch_result + lfc_result == nblocks)
+	{
+		/* Read all blocks from LFC, so we're done */
 		return;
-#endif
+	}
+	if (debug_compare_local > DEBUG_COMPARE_LOCAL_LFC)
+	{
+		memset(read_pages, 0, sizeof(read_pages));
+	}

 	communicator_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns,
 							  buffers, nblocks, read_pages);
@@ -1548,14 +1562,14 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 	 */
 	communicator_prefetch_pump_state();

-#ifdef DEBUG_COMPARE_LOCAL
-	memset(read_pages, 0xFF, sizeof(read_pages));
-	compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages);
-#endif
+	if (debug_compare_local)
+	{
+		memset(read_pages, 0xFF, sizeof(read_pages));
+		compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages);
+	}
 }
 #endif

-#ifdef DEBUG_COMPARE_LOCAL
 static char *
 hexdump_page(char *page)
 {
@@ -1574,7 +1588,6 @@ hexdump_page(char *page)

 	return result.data;
 }
-#endif

 #if PG_MAJORVERSION_NUM < 17
 /*
@@ -1596,12 +1609,8 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
 	switch (reln->smgr_relpersistence)
 	{
 		case 0:
-#ifndef DEBUG_COMPARE_LOCAL
 			/* This is a bit tricky. Check if the relation exists locally */
-			if (mdexists(reln, forknum))
-#else
-			if (mdexists(reln, INIT_FORKNUM))
-#endif
+			if (mdexists(reln, debug_compare_local ? INIT_FORKNUM : forknum))
 			{
 				/* It exists locally. Guess it's unlogged then. */
 #if PG_MAJORVERSION_NUM >= 17
@@ -1656,14 +1665,17 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo

 	communicator_prefetch_pump_state();

-#ifdef DEBUG_COMPARE_LOCAL
-	if (IS_LOCAL_REL(reln))
+	if (debug_compare_local)
+	{
+		if (IS_LOCAL_REL(reln))
+		{
 		#if PG_MAJORVERSION_NUM >= 17
-		mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync);
+			mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync);
 		#else
-		mdwrite(reln, forknum, blocknum, buffer, skipFsync);
+			mdwrite(reln, forknum, blocknum, buffer, skipFsync);
 		#endif
-#endif
+		}
+	}
 }
 #endif

@@ -1677,12 +1689,8 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 	switch (reln->smgr_relpersistence)
 	{
 		case 0:
-#ifndef DEBUG_COMPARE_LOCAL
 			/* This is a bit tricky. Check if the relation exists locally */
-			if (mdexists(reln, forknum))
-#else
-			if (mdexists(reln, INIT_FORKNUM))
-#endif
+			if (mdexists(reln, debug_compare_local ? INIT_FORKNUM : forknum))
 			{
 				/* It exists locally. Guess it's unlogged then. */
 				mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync);
@@ -1720,10 +1728,11 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,

 	communicator_prefetch_pump_state();

-#ifdef DEBUG_COMPARE_LOCAL
-	if (IS_LOCAL_REL(reln))
-		mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync);
-#endif
+	if (debug_compare_local)
+	{
+		if (IS_LOCAL_REL(reln))
+			mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync);
+	}
 }

 #endif
@@ -1862,10 +1871,11 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, Blo
 	 */
 	neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forknum);

-#ifdef DEBUG_COMPARE_LOCAL
-	if (IS_LOCAL_REL(reln))
-		mdtruncate(reln, forknum, old_blocks, nblocks);
-#endif
+	if (debug_compare_local)
+	{
+		if (IS_LOCAL_REL(reln))
+			mdtruncate(reln, forknum, old_blocks, nblocks);
+	}
 }

 /*
@@ -1904,10 +1914,11 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum)

 	communicator_prefetch_pump_state();

-#ifdef DEBUG_COMPARE_LOCAL
-	if (IS_LOCAL_REL(reln))
-		mdimmedsync(reln, forknum);
-#endif
+	if (debug_compare_local)
+	{
+		if (IS_LOCAL_REL(reln))
+			mdimmedsync(reln, forknum);
+	}
 }

 #if PG_MAJORVERSION_NUM >= 17
@@ -1934,10 +1945,11 @@ neon_registersync(SMgrRelation reln, ForkNumber forknum)

 	neon_log(SmgrTrace, "[NEON_SMGR] registersync noop");

-#ifdef DEBUG_COMPARE_LOCAL
-	if (IS_LOCAL_REL(reln))
-		mdimmedsync(reln, forknum);
-#endif
+	if (debug_compare_local)
+	{
+		if (IS_LOCAL_REL(reln))
+			mdimmedsync(reln, forknum);
+	}
 }
 #endif

@@ -1978,10 +1990,11 @@ neon_start_unlogged_build(SMgrRelation reln)
 		case RELPERSISTENCE_UNLOGGED:
 			unlogged_build_rel_info = InfoFromSMgrRel(reln);
 			unlogged_build_phase = UNLOGGED_BUILD_NOT_PERMANENT;
-#ifdef DEBUG_COMPARE_LOCAL
-			if (!IsParallelWorker())
-				mdcreate(reln, INIT_FORKNUM, true);
-#endif
+			if (debug_compare_local)
+			{
+				if (!IsParallelWorker())
+					mdcreate(reln, INIT_FORKNUM, true);
+			}
 			return;

 		default:
@@ -2009,11 +2022,7 @@ neon_start_unlogged_build(SMgrRelation reln)
 	 */
 	if (!IsParallelWorker())
 	{
-#ifndef DEBUG_COMPARE_LOCAL
-		mdcreate(reln, MAIN_FORKNUM, false);
-#else
-		mdcreate(reln, INIT_FORKNUM, true);
-#endif
+		mdcreate(reln, debug_compare_local ? INIT_FORKNUM : MAIN_FORKNUM, false);
 	}
 }

@@ -2107,14 +2116,14 @@ neon_end_unlogged_build(SMgrRelation reln)
 			lfc_invalidate(InfoFromNInfoB(rinfob), forknum, nblocks);

 			mdclose(reln, forknum);
-#ifndef DEBUG_COMPARE_LOCAL
-			/* use isRedo == true, so that we drop it immediately */
-			mdunlink(rinfob, forknum, true);
-#endif
+			if (!debug_compare_local)
+			{
+				/* use isRedo == true, so that we drop it immediately */
+				mdunlink(rinfob, forknum, true);
+			}
 		}
-#ifdef DEBUG_COMPARE_LOCAL
-		mdunlink(rinfob, INIT_FORKNUM, true);
-#endif
+		if (debug_compare_local)
+			mdunlink(rinfob, INIT_FORKNUM, true);
 	}
 	NRelFileInfoInvalidate(unlogged_build_rel_info);
 	unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
--- a/proxy/README.md
+++ b/proxy/README.md
@@ -138,3 +138,62 @@ Now from client you can start a new session:
 ```sh
 PGSSLROOTCERT=./server.crt psql  "postgresql://proxy:password@endpoint.local.neon.build:4432/postgres?sslmode=verify-full"
 ```
+
+## auth broker setup:
+
+Create a postgres instance:
+```sh
+docker run \
+  --detach \
+  --name proxy-postgres \
+  --env POSTGRES_HOST_AUTH_METHOD=trust \
+  --env POSTGRES_USER=authenticated \
+  --env POSTGRES_DB=database \
+  --publish 5432:5432 \
+  postgres:17-bookworm
+```
+
+Create a configuration file called `local_proxy.json` in the root of the repo (it is also used by the auth broker to validate JWTs):
+```json
+{
+    "jwks": [
+        {
+            "id": "1",
+            "role_names": ["authenticator", "authenticated", "anon"],
+            "jwks_url": "https://climbing-minnow-11.clerk.accounts.dev/.well-known/jwks.json",
+            "provider_name": "foo",
+            "jwt_audience": null
+        }
+    ]
+}
+```
+
+Start the local proxy:
+```sh
+cargo run --bin local_proxy --features testing -- \
+  --disable-pg-session-jwt \
+  --http 0.0.0.0:7432
+```
+
+Start the auth broker:
+```sh
+LOGFMT=text OTEL_SDK_DISABLED=true cargo run --bin proxy --features testing -- \
+  -c server.crt -k server.key \
+  --is-auth-broker true \
+  --wss 0.0.0.0:8080 \
+  --http 0.0.0.0:7002 \
+  --auth-backend local
+```
+
+Create a JWT in your auth provider (e.g. Clerk) and set it in the `NEON_JWT` environment variable.
+```sh
+export NEON_JWT="..."
+```
+
+Run a query against the auth broker:
+```sh
+curl -k "https://foo.local.neon.build:8080/sql" \
+  -H "Authorization: Bearer $NEON_JWT" \
+  -H "neon-connection-string: postgresql://authenticator@foo.local.neon.build/database" \
+  -d '{"query":"select 1","params":[]}'
+```
--- a/proxy/src/auth/backend/console_redirect.rs
+++ b/proxy/src/auth/backend/console_redirect.rs
@@ -164,21 +164,20 @@ async fn authenticate(
        })?
        .map_err(ConsoleRedirectError::from)?;

-    if auth_config.ip_allowlist_check_enabled {
-        if let Some(allowed_ips) = &db_info.allowed_ips {
-            if !auth::check_peer_addr_is_in_list(&ctx.peer_addr(), allowed_ips) {
-                return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr()));
-            }
-        }
+    if auth_config.ip_allowlist_check_enabled
+        && let Some(allowed_ips) = &db_info.allowed_ips
+        && !auth::check_peer_addr_is_in_list(&ctx.peer_addr(), allowed_ips)
+    {
+        return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr()));
    }

    // Check if the access over the public internet is allowed, otherwise block. Note that
    // the console redirect is not behind the VPC service endpoint, so we don't need to check
    // the VPC endpoint ID.
-    if let Some(public_access_allowed) = db_info.public_access_allowed {
-        if !public_access_allowed {
-            return Err(auth::AuthError::NetworkNotAllowed);
-        }
+    if let Some(public_access_allowed) = db_info.public_access_allowed
+        && !public_access_allowed
+    {
+        return Err(auth::AuthError::NetworkNotAllowed);
    }

    client.write_message(BeMessage::NoticeResponse("Connecting to database."));
--- a/proxy/src/auth/backend/jwt.rs
+++ b/proxy/src/auth/backend/jwt.rs
@@ -399,36 +399,36 @@ impl JwkCacheEntryLock {

        tracing::debug!(?payload, "JWT signature valid with claims");

-        if let Some(aud) = expected_audience {
-            if payload.audience.0.iter().all(|s| s != aud) {
-                return Err(JwtError::InvalidClaims(
-                    JwtClaimsError::InvalidJwtTokenAudience,
-                ));
-            }
+        if let Some(aud) = expected_audience
+            && payload.audience.0.iter().all(|s| s != aud)
+        {
+            return Err(JwtError::InvalidClaims(
+                JwtClaimsError::InvalidJwtTokenAudience,
+            ));
        }

        let now = SystemTime::now();

-        if let Some(exp) = payload.expiration {
-            if now >= exp + CLOCK_SKEW_LEEWAY {
-                return Err(JwtError::InvalidClaims(JwtClaimsError::JwtTokenHasExpired(
-                    exp.duration_since(SystemTime::UNIX_EPOCH)
-                        .unwrap_or_default()
-                        .as_secs(),
-                )));
-            }
+        if let Some(exp) = payload.expiration
+            && now >= exp + CLOCK_SKEW_LEEWAY
+        {
+            return Err(JwtError::InvalidClaims(JwtClaimsError::JwtTokenHasExpired(
+                exp.duration_since(SystemTime::UNIX_EPOCH)
+                    .unwrap_or_default()
+                    .as_secs(),
+            )));
        }

-        if let Some(nbf) = payload.not_before {
-            if nbf >= now + CLOCK_SKEW_LEEWAY {
-                return Err(JwtError::InvalidClaims(
-                    JwtClaimsError::JwtTokenNotYetReadyToUse(
-                        nbf.duration_since(SystemTime::UNIX_EPOCH)
-                            .unwrap_or_default()
-                            .as_secs(),
-                    ),
-                ));
-            }
+        if let Some(nbf) = payload.not_before
+            && nbf >= now + CLOCK_SKEW_LEEWAY
+        {
+            return Err(JwtError::InvalidClaims(
+                JwtClaimsError::JwtTokenNotYetReadyToUse(
+                    nbf.duration_since(SystemTime::UNIX_EPOCH)
+                        .unwrap_or_default()
+                        .as_secs(),
+                ),
+            ));
        }

        Ok(ComputeCredentialKeys::JwtPayload(payloadb))
--- a/proxy/src/auth/backend/mod.rs
+++ b/proxy/src/auth/backend/mod.rs
@@ -171,7 +171,6 @@ impl ComputeUserInfo {
 pub(crate) enum ComputeCredentialKeys {
    AuthKeys(AuthKeys),
    JwtPayload(Vec<u8>),
-    None,
 }

 impl TryFrom<ComputeUserInfoMaybeEndpoint> for ComputeUserInfo {
@@ -346,15 +345,13 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint> {
                    Err(e) => {
                        // The password could have been changed, so we invalidate the cache.
                        // We should only invalidate the cache if the TTL might have expired.
-                        if e.is_password_failed() {
-                            #[allow(irrefutable_let_patterns)]
-                            if let ControlPlaneClient::ProxyV1(api) = &*api {
-                                if let Some(ep) = &user_info.endpoint_id {
-                                    api.caches
-                                        .project_info
-                                        .maybe_invalidate_role_secret(ep, &user_info.user);
-                                }
-                            }
+                        if e.is_password_failed()
+                            && let ControlPlaneClient::ProxyV1(api) = &*api
+                            && let Some(ep) = &user_info.endpoint_id
+                        {
+                            api.caches
+                                .project_info
+                                .maybe_invalidate_role_secret(ep, &user_info.user);
                        }

                        Err(e)
--- a/proxy/src/binary/local_proxy.rs
+++ b/proxy/src/binary/local_proxy.rs
@@ -1,43 +1,37 @@
 use std::net::SocketAddr;
 use std::pin::pin;
-use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;

-use anyhow::{Context, bail, ensure};
+use anyhow::bail;
 use arc_swap::ArcSwapOption;
-use camino::{Utf8Path, Utf8PathBuf};
+use camino::Utf8PathBuf;
 use clap::Parser;
-use compute_api::spec::LocalProxySpec;
 use futures::future::Either;
-use thiserror::Error;
 use tokio::net::TcpListener;
 use tokio::sync::Notify;
 use tokio::task::JoinSet;
 use tokio_util::sync::CancellationToken;
-use tracing::{debug, error, info, warn};
+use tracing::{debug, error, info};
 use utils::sentry_init::init_sentry;
 use utils::{pid_file, project_build_tag, project_git_version};

 use crate::auth::backend::jwt::JwkCache;
-use crate::auth::backend::local::{JWKS_ROLE_MAP, LocalBackend};
+use crate::auth::backend::local::LocalBackend;
 use crate::auth::{self};
 use crate::cancellation::CancellationHandler;
 use crate::config::{
    self, AuthenticationConfig, ComputeConfig, HttpConfig, ProxyConfig, RetryConfig,
+    refresh_config_loop,
 };
 use crate::control_plane::locks::ApiLocks;
-use crate::control_plane::messages::{EndpointJwksResponse, JwksSettings};
-use crate::ext::TaskExt;
 use crate::http::health_server::AppMetrics;
-use crate::intern::RoleNameInt;
 use crate::metrics::{Metrics, ThreadPoolMetrics};
 use crate::rate_limiter::{EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo};
 use crate::scram::threadpool::ThreadPool;
 use crate::serverless::cancel_set::CancelSet;
 use crate::serverless::{self, GlobalConnPoolOptions};
 use crate::tls::client_config::compute_client_config_with_root_certs;
-use crate::types::RoleName;
 use crate::url::ApiUrl;

 project_git_version!(GIT_VERSION);
@@ -82,6 +76,11 @@ struct LocalProxyCliArgs {
    /// Path of the local proxy PID file
    #[clap(long, default_value = "./local_proxy.pid")]
    pid_path: Utf8PathBuf,
+    /// Disable pg_session_jwt extension installation
+    /// This is useful for testing the local proxy with vanilla postgres.
+    #[clap(long, default_value = "false")]
+    #[cfg(feature = "testing")]
+    disable_pg_session_jwt: bool,
 }

 #[derive(clap::Args, Clone, Copy, Debug)]
@@ -282,6 +281,8 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig
        wake_compute_retry_config: RetryConfig::parse(RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)?,
        connect_compute_locks,
        connect_to_compute: compute_config,
+        #[cfg(feature = "testing")]
+        disable_pg_session_jwt: args.disable_pg_session_jwt,
    })))
 }

@@ -293,132 +294,3 @@ fn build_auth_backend(args: &LocalProxyCliArgs) -> &'static auth::Backend<'stati

    Box::leak(Box::new(auth_backend))
 }
-
-#[derive(Error, Debug)]
-enum RefreshConfigError {
-    #[error(transparent)]
-    Read(#[from] std::io::Error),
-    #[error(transparent)]
-    Parse(#[from] serde_json::Error),
-    #[error(transparent)]
-    Validate(anyhow::Error),
-    #[error(transparent)]
-    Tls(anyhow::Error),
-}
-
-async fn refresh_config_loop(config: &ProxyConfig, path: Utf8PathBuf, rx: Arc<Notify>) {
-    let mut init = true;
-    loop {
-        rx.notified().await;
-
-        match refresh_config_inner(config, &path).await {
-            Ok(()) => {}
-            // don't log for file not found errors if this is the first time we are checking
-            // for computes that don't use local_proxy, this is not an error.
-            Err(RefreshConfigError::Read(e))
-                if init && e.kind() == std::io::ErrorKind::NotFound =>
-            {
-                debug!(error=?e, ?path, "could not read config file");
-            }
-            Err(RefreshConfigError::Tls(e)) => {
-                error!(error=?e, ?path, "could not read TLS certificates");
-            }
-            Err(e) => {
-                error!(error=?e, ?path, "could not read config file");
-            }
-        }
-
-        init = false;
-    }
-}
-
-async fn refresh_config_inner(
-    config: &ProxyConfig,
-    path: &Utf8Path,
-) -> Result<(), RefreshConfigError> {
-    let bytes = tokio::fs::read(&path).await?;
-    let data: LocalProxySpec = serde_json::from_slice(&bytes)?;
-
-    let mut jwks_set = vec![];
-
-    fn parse_jwks_settings(jwks: compute_api::spec::JwksSettings) -> anyhow::Result<JwksSettings> {
-        let mut jwks_url = url::Url::from_str(&jwks.jwks_url).context("parsing JWKS url")?;
-
-        ensure!(
-            jwks_url.has_authority()
-                && (jwks_url.scheme() == "http" || jwks_url.scheme() == "https"),
-            "Invalid JWKS url. Must be HTTP",
-        );
-
-        ensure!(
-            jwks_url.host().is_some_and(|h| h != url::Host::Domain("")),
-            "Invalid JWKS url. No domain listed",
-        );
-
-        // clear username, password and ports
-        jwks_url
-            .set_username("")
-            .expect("url can be a base and has a valid host and is not a file. should not error");
-        jwks_url
-            .set_password(None)
-            .expect("url can be a base and has a valid host and is not a file. should not error");
-        // local testing is hard if we need to have a specific restricted port
-        if cfg!(not(feature = "testing")) {
-            jwks_url.set_port(None).expect(
-                "url can be a base and has a valid host and is not a file. should not error",
-            );
-        }
-
-        // clear query params
-        jwks_url.set_fragment(None);
-        jwks_url.query_pairs_mut().clear().finish();
-
-        if jwks_url.scheme() != "https" {
-            // local testing is hard if we need to set up https support.
-            if cfg!(not(feature = "testing")) {
-                jwks_url
-                    .set_scheme("https")
-                    .expect("should not error to set the scheme to https if it was http");
-            } else {
-                warn!(scheme = jwks_url.scheme(), "JWKS url is not HTTPS");
-            }
-        }
-
-        Ok(JwksSettings {
-            id: jwks.id,
-            jwks_url,
-            _provider_name: jwks.provider_name,
-            jwt_audience: jwks.jwt_audience,
-            role_names: jwks
-                .role_names
-                .into_iter()
-                .map(RoleName::from)
-                .map(|s| RoleNameInt::from(&s))
-                .collect(),
-        })
-    }
-
-    for jwks in data.jwks.into_iter().flatten() {
-        jwks_set.push(parse_jwks_settings(jwks).map_err(RefreshConfigError::Validate)?);
-    }
-
-    info!("successfully loaded new config");
-    JWKS_ROLE_MAP.store(Some(Arc::new(EndpointJwksResponse { jwks: jwks_set })));
-
-    if let Some(tls_config) = data.tls {
-        let tls_config = tokio::task::spawn_blocking(move || {
-            crate::tls::server_config::configure_tls(
-                tls_config.key_path.as_ref(),
-                tls_config.cert_path.as_ref(),
-                None,
-                false,
-            )
-        })
-        .await
-        .propagate_task_panic()
-        .map_err(RefreshConfigError::Tls)?;
-        config.tls_config.store(Some(Arc::new(tls_config)));
-    }
-
-    Ok(())
-}
--- a/proxy/src/binary/pg_sni_router.rs
+++ b/proxy/src/binary/pg_sni_router.rs
@@ -4,6 +4,7 @@
 //! This allows connecting to pods/services running in the same Kubernetes cluster from
 //! the outside. Similar to an ingress controller for HTTPS.

+use std::io;
 use std::net::SocketAddr;
 use std::path::Path;
 use std::sync::Arc;
@@ -229,7 +230,6 @@ pub(super) async fn task_main(
                    .set_nodelay(true)
                    .context("failed to set socket option")?;

-                info!(%peer_addr, "serving");
                let ctx = RequestContext::new(
                    session_id,
                    ConnectionInfo {
@@ -241,6 +241,14 @@ pub(super) async fn task_main(
                handle_client(ctx, dest_suffix, tls_config, compute_tls_config, socket).await
            }
            .unwrap_or_else(|e| {
+                if let Some(FirstMessage(io_error)) = e.downcast_ref() {
+                    // this is noisy. if we get EOF on the very first message that's likely
+                    // just NLB doing a healthcheck.
+                    if io_error.kind() == io::ErrorKind::UnexpectedEof {
+                        return;
+                    }
+                }
+
                // Acknowledge that the task has finished with an error.
                error!("per-client task finished with an error: {e:#}");
            })
@@ -257,12 +265,19 @@ pub(super) async fn task_main(
    Ok(())
 }

+#[derive(Debug, thiserror::Error)]
+#[error(transparent)]
+struct FirstMessage(io::Error);
+
 async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
    ctx: &RequestContext,
    raw_stream: S,
    tls_config: Arc<rustls::ServerConfig>,
 ) -> anyhow::Result<TlsStream<S>> {
-    let (mut stream, msg) = PqStream::parse_startup(Stream::from_raw(raw_stream)).await?;
+    let (mut stream, msg) = PqStream::parse_startup(Stream::from_raw(raw_stream))
+        .await
+        .map_err(FirstMessage)?;
+
    match msg {
        FeStartupPacket::SslRequest { direct: None } => {
            let raw = stream.accept_tls().await?;
--- a/proxy/src/binary/proxy.rs
+++ b/proxy/src/binary/proxy.rs
@@ -10,21 +10,29 @@ use std::time::Duration;
 use anyhow::Context;
 use anyhow::{bail, ensure};
 use arc_swap::ArcSwapOption;
+#[cfg(any(test, feature = "testing"))]
+use camino::Utf8PathBuf;
 use futures::future::Either;
 use itertools::{Itertools, Position};
 use rand::{Rng, thread_rng};
 use remote_storage::RemoteStorageConfig;
 use tokio::net::TcpListener;
+#[cfg(any(test, feature = "testing"))]
+use tokio::sync::Notify;
 use tokio::task::JoinSet;
 use tokio_util::sync::CancellationToken;
-use tracing::{Instrument, error, info, warn};
+use tracing::{error, info, warn};
 use utils::sentry_init::init_sentry;
 use utils::{project_build_tag, project_git_version};

 use crate::auth::backend::jwt::JwkCache;
+#[cfg(any(test, feature = "testing"))]
+use crate::auth::backend::local::LocalBackend;
 use crate::auth::backend::{ConsoleRedirectBackend, MaybeOwned};
 use crate::batch::BatchQueue;
 use crate::cancellation::{CancellationHandler, CancellationProcessor};
+#[cfg(any(test, feature = "testing"))]
+use crate::config::refresh_config_loop;
 use crate::config::{
    self, AuthenticationConfig, CacheOptions, ComputeConfig, HttpConfig, ProjectInfoCacheOptions,
    ProxyConfig, ProxyProtocolV2, remote_storage_from_toml,
@@ -60,6 +68,9 @@ enum AuthBackendType {

    #[cfg(any(test, feature = "testing"))]
    Postgres,
+
+    #[cfg(any(test, feature = "testing"))]
+    Local,
 }

 /// Neon proxy/router
@@ -74,6 +85,10 @@ struct ProxyCliArgs {
    proxy: SocketAddr,
    #[clap(value_enum, long, default_value_t = AuthBackendType::ConsoleRedirect)]
    auth_backend: AuthBackendType,
+    /// Path of the local proxy config file (used for local-file auth backend)
+    #[clap(long, default_value = "./local_proxy.json")]
+    #[cfg(any(test, feature = "testing"))]
+    config_path: Utf8PathBuf,
    /// listen for management callback connection on ip:port
    #[clap(short, long, default_value = "127.0.0.1:7000")]
    mgmt: SocketAddr,
@@ -180,7 +195,9 @@ struct ProxyCliArgs {
    #[clap(long, default_value = config::ProjectInfoCacheOptions::CACHE_DEFAULT_OPTIONS)]
    project_info_cache: String,
    /// cache for all valid endpoints
-    #[clap(long, default_value = config::EndpointCacheConfig::CACHE_DEFAULT_OPTIONS)]
+    // TODO: remove after a couple of releases.
+    #[clap(long, default_value_t = String::new())]
+    #[deprecated]
    endpoint_cache_config: String,
    #[clap(flatten)]
    parquet_upload: ParquetUploadArgs,
@@ -226,6 +243,14 @@ struct ProxyCliArgs {

    #[clap(flatten)]
    pg_sni_router: PgSniRouterArgs,
+
+    /// if this is not local proxy, this toggles whether we accept Postgres REST requests
+    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
+    is_rest_broker: bool,
+
+    /// cache for `db_schema_cache` introspection (use `size=0` to disable)
+    #[clap(long, default_value = "size=1000,ttl=1h")]
+    db_schema_cache: String,
 }

 #[derive(clap::Args, Clone, Copy, Debug)]
@@ -386,6 +411,8 @@ pub async fn run() -> anyhow::Result<()> {
        64,
    ));

+    #[cfg(any(test, feature = "testing"))]
+    let refresh_config_notify = Arc::new(Notify::new());
    // client facing tasks. these will exit on error or on cancellation
    // cancellation returns Ok(())
    let mut client_tasks = JoinSet::new();
@@ -412,6 +439,17 @@ pub async fn run() -> anyhow::Result<()> {
                    endpoint_rate_limiter.clone(),
                ));
            }
+
+            // if auth backend is local, we need to load the config file
+            #[cfg(any(test, feature = "testing"))]
+            if let auth::Backend::Local(_) = &auth_backend {
+                refresh_config_notify.notify_one();
+                tokio::spawn(refresh_config_loop(
+                    config,
+                    args.config_path,
+                    refresh_config_notify.clone(),
+                ));
+            }
        }
        Either::Right(auth_backend) => {
            if let Some(proxy_listener) = proxy_listener {
@@ -462,7 +500,13 @@ pub async fn run() -> anyhow::Result<()> {

    // maintenance tasks. these never return unless there's an error
    let mut maintenance_tasks = JoinSet::new();
-    maintenance_tasks.spawn(crate::signals::handle(cancellation_token.clone(), || {}));
+
+    maintenance_tasks.spawn(crate::signals::handle(cancellation_token.clone(), {
+        move || {
+            #[cfg(any(test, feature = "testing"))]
+            refresh_config_notify.notify_one();
+        }
+    }));
    maintenance_tasks.spawn(http::health_server::task_main(
        http_listener,
        AppMetrics {
@@ -478,52 +522,42 @@ pub async fn run() -> anyhow::Result<()> {
        maintenance_tasks.spawn(usage_metrics::task_main(metrics_config));
    }

-    #[cfg_attr(not(any(test, feature = "testing")), expect(irrefutable_let_patterns))]
-    if let Either::Left(auth::Backend::ControlPlane(api, ())) = &auth_backend {
-        if let crate::control_plane::client::ControlPlaneClient::ProxyV1(api) = &**api {
-            if let Some(client) = redis_client {
-                // project info cache and invalidation of that cache.
-                let cache = api.caches.project_info.clone();
-                maintenance_tasks.spawn(notifications::task_main(client.clone(), cache.clone()));
-                maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });
+    if let Either::Left(auth::Backend::ControlPlane(api, ())) = &auth_backend
+        && let crate::control_plane::client::ControlPlaneClient::ProxyV1(api) = &**api
+        && let Some(client) = redis_client
+    {
+        // project info cache and invalidation of that cache.
+        let cache = api.caches.project_info.clone();
+        maintenance_tasks.spawn(notifications::task_main(client.clone(), cache.clone()));
+        maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });

-                // Try to connect to Redis 3 times with 1 + (0..0.1) second interval.
-                // This prevents immediate exit and pod restart,
-                // which can cause hammering of the redis in case of connection issues.
-                // cancellation key management
-                let mut redis_kv_client = RedisKVClient::new(client.clone());
-                for attempt in (0..3).with_position() {
-                    match redis_kv_client.try_connect().await {
-                        Ok(()) => {
-                            info!("Connected to Redis KV client");
-                            cancellation_handler.init_tx(BatchQueue::new(CancellationProcessor {
-                                client: redis_kv_client,
-                                batch_size: args.cancellation_batch_size,
-                            }));
+        // Try to connect to Redis 3 times with 1 + (0..0.1) second interval.
+        // This prevents immediate exit and pod restart,
+        // which can cause hammering of the redis in case of connection issues.
+        // cancellation key management
+        let mut redis_kv_client = RedisKVClient::new(client.clone());
+        for attempt in (0..3).with_position() {
+            match redis_kv_client.try_connect().await {
+                Ok(()) => {
+                    info!("Connected to Redis KV client");
+                    cancellation_handler.init_tx(BatchQueue::new(CancellationProcessor {
+                        client: redis_kv_client,
+                        batch_size: args.cancellation_batch_size,
+                    }));

-                            break;
-                        }
-                        Err(e) => {
-                            error!("Failed to connect to Redis KV client: {e}");
-                            if matches!(attempt, Position::Last(_)) {
-                                bail!(
-                                    "Failed to connect to Redis KV client after {} attempts",
-                                    attempt.into_inner()
-                                );
-                            }
-                            let jitter = thread_rng().gen_range(0..100);
-                            tokio::time::sleep(Duration::from_millis(1000 + jitter)).await;
-                        }
-                    }
+                    break;
+                }
+                Err(e) => {
+                    error!("Failed to connect to Redis KV client: {e}");
+                    if matches!(attempt, Position::Last(_)) {
+                        bail!(
+                            "Failed to connect to Redis KV client after {} attempts",
+                            attempt.into_inner()
+                        );
+                    }
+                    let jitter = thread_rng().gen_range(0..100);
+                    tokio::time::sleep(Duration::from_millis(1000 + jitter)).await;
                }
-
-                // listen for notifications of new projects/endpoints/branches
-                let cache = api.caches.endpoints_cache.clone();
-                let span = tracing::info_span!("endpoints_cache");
-                maintenance_tasks.spawn(
-                    async move { cache.do_read(client, cancellation_token.clone()).await }
-                        .instrument(span),
-                );
            }
        }
    }
@@ -653,6 +687,8 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
        wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?,
        connect_compute_locks,
        connect_to_compute: compute_config,
+        #[cfg(feature = "testing")]
+        disable_pg_session_jwt: false,
    };

    let config = Box::leak(Box::new(config));
@@ -671,18 +707,15 @@ fn build_auth_backend(
            let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?;
            let project_info_cache_config: ProjectInfoCacheOptions =
                args.project_info_cache.parse()?;
-            let endpoint_cache_config: config::EndpointCacheConfig =
-                args.endpoint_cache_config.parse()?;

            info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}");
            info!(
                "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}"
            );
-            info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}");
+
            let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new(
                wake_compute_cache_config,
                project_info_cache_config,
-                endpoint_cache_config,
            )));

            let config::ConcurrencyLockOptions {
@@ -752,18 +785,15 @@ fn build_auth_backend(
            let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?;
            let project_info_cache_config: ProjectInfoCacheOptions =
                args.project_info_cache.parse()?;
-            let endpoint_cache_config: config::EndpointCacheConfig =
-                args.endpoint_cache_config.parse()?;

            info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}");
            info!(
                "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}"
            );
-            info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}");
+
            let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new(
                wake_compute_cache_config,
                project_info_cache_config,
-                endpoint_cache_config,
            )));

            let config::ConcurrencyLockOptions {
@@ -806,6 +836,19 @@ fn build_auth_backend(

            Ok(Either::Right(config))
        }
+
+        #[cfg(any(test, feature = "testing"))]
+        AuthBackendType::Local => {
+            let postgres: SocketAddr = "127.0.0.1:7432".parse()?;
+            let compute_ctl: ApiUrl = "http://127.0.0.1:3081/".parse()?;
+            let auth_backend = crate::auth::Backend::Local(
+                crate::auth::backend::MaybeOwned::Owned(LocalBackend::new(postgres, compute_ctl)),
+            );
+
+            let config = Box::leak(Box::new(auth_backend));
+
+            Ok(Either::Left(config))
+        }
    }
 }

--- a/proxy/src/cache/endpoints.rs
+++ b/proxy/src/cache/endpoints.rs
@@ -1,283 +0,0 @@
-use std::convert::Infallible;
-use std::future::pending;
-use std::sync::atomic::{AtomicBool, Ordering};
-use std::sync::{Arc, Mutex};
-
-use clashmap::ClashSet;
-use redis::streams::{StreamReadOptions, StreamReadReply};
-use redis::{AsyncCommands, FromRedisValue, Value};
-use serde::Deserialize;
-use tokio_util::sync::CancellationToken;
-use tracing::info;
-
-use crate::config::EndpointCacheConfig;
-use crate::context::RequestContext;
-use crate::ext::LockExt;
-use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt};
-use crate::metrics::{Metrics, RedisErrors, RedisEventsCount};
-use crate::rate_limiter::GlobalRateLimiter;
-use crate::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
-use crate::types::EndpointId;
-
-// TODO: this could be an enum, but events in Redis need to be fixed first.
-// ProjectCreated was sent with type:branch_created. So we ignore type.
-#[derive(Deserialize, Debug, Clone, PartialEq)]
-struct ControlPlaneEvent {
-    endpoint_created: Option<EndpointCreated>,
-    branch_created: Option<BranchCreated>,
-    project_created: Option<ProjectCreated>,
-    #[serde(rename = "type")]
-    _type: Option<String>,
-}
-
-#[derive(Deserialize, Debug, Clone, PartialEq)]
-struct EndpointCreated {
-    endpoint_id: EndpointIdInt,
-}
-
-#[derive(Deserialize, Debug, Clone, PartialEq)]
-struct BranchCreated {
-    branch_id: BranchIdInt,
-}
-
-#[derive(Deserialize, Debug, Clone, PartialEq)]
-struct ProjectCreated {
-    project_id: ProjectIdInt,
-}
-
-impl TryFrom<&Value> for ControlPlaneEvent {
-    type Error = anyhow::Error;
-    fn try_from(value: &Value) -> Result<Self, Self::Error> {
-        let json = String::from_redis_value(value)?;
-        Ok(serde_json::from_str(&json)?)
-    }
-}
-
-pub struct EndpointsCache {
-    config: EndpointCacheConfig,
-    endpoints: ClashSet<EndpointIdInt>,
-    branches: ClashSet<BranchIdInt>,
-    projects: ClashSet<ProjectIdInt>,
-    ready: AtomicBool,
-    limiter: Arc<Mutex<GlobalRateLimiter>>,
-}
-
-impl EndpointsCache {
-    pub(crate) fn new(config: EndpointCacheConfig) -> Self {
-        Self {
-            limiter: Arc::new(Mutex::new(GlobalRateLimiter::new(
-                config.limiter_info.clone(),
-            ))),
-            config,
-            endpoints: ClashSet::new(),
-            branches: ClashSet::new(),
-            projects: ClashSet::new(),
-            ready: AtomicBool::new(false),
-        }
-    }
-
-    pub(crate) fn is_valid(&self, ctx: &RequestContext, endpoint: &EndpointId) -> bool {
-        if !self.ready.load(Ordering::Acquire) {
-            // the endpoint cache is not yet fully initialised.
-            return true;
-        }
-
-        if !self.should_reject(endpoint) {
-            ctx.set_rejected(false);
-            return true;
-        }
-
-        // report that we might want to reject this endpoint
-        ctx.set_rejected(true);
-
-        // If cache is disabled, just collect the metrics and return.
-        if self.config.disable_cache {
-            return true;
-        }
-
-        // If the limiter allows, we can pretend like it's valid
-        // (incase it is, due to redis channel lag).
-        if self.limiter.lock_propagate_poison().check() {
-            return true;
-        }
-
-        // endpoint not found, and there's too much load.
-        false
-    }
-
-    fn should_reject(&self, endpoint: &EndpointId) -> bool {
-        if endpoint.is_endpoint() {
-            let Some(endpoint) = EndpointIdInt::get(endpoint) else {
-                // if we haven't interned this endpoint, it's not in the cache.
-                return true;
-            };
-            !self.endpoints.contains(&endpoint)
-        } else if endpoint.is_branch() {
-            let Some(branch) = BranchIdInt::get(endpoint) else {
-                // if we haven't interned this branch, it's not in the cache.
-                return true;
-            };
-            !self.branches.contains(&branch)
-        } else {
-            let Some(project) = ProjectIdInt::get(endpoint) else {
-                // if we haven't interned this project, it's not in the cache.
-                return true;
-            };
-            !self.projects.contains(&project)
-        }
-    }
-
-    fn insert_event(&self, event: ControlPlaneEvent) {
-        if let Some(endpoint_created) = event.endpoint_created {
-            self.endpoints.insert(endpoint_created.endpoint_id);
-            Metrics::get()
-                .proxy
-                .redis_events_count
-                .inc(RedisEventsCount::EndpointCreated);
-        } else if let Some(branch_created) = event.branch_created {
-            self.branches.insert(branch_created.branch_id);
-            Metrics::get()
-                .proxy
-                .redis_events_count
-                .inc(RedisEventsCount::BranchCreated);
-        } else if let Some(project_created) = event.project_created {
-            self.projects.insert(project_created.project_id);
-            Metrics::get()
-                .proxy
-                .redis_events_count
-                .inc(RedisEventsCount::ProjectCreated);
-        }
-    }
-
-    pub async fn do_read(
-        &self,
-        mut con: ConnectionWithCredentialsProvider,
-        cancellation_token: CancellationToken,
-    ) -> anyhow::Result<Infallible> {
-        let mut last_id = "0-0".to_string();
-        loop {
-            if let Err(e) = con.connect().await {
-                tracing::error!("error connecting to redis: {:?}", e);
-                self.ready.store(false, Ordering::Release);
-            }
-            if let Err(e) = self.read_from_stream(&mut con, &mut last_id).await {
-                tracing::error!("error reading from redis: {:?}", e);
-                self.ready.store(false, Ordering::Release);
-            }
-            if cancellation_token.is_cancelled() {
-                info!("cancellation token is cancelled, exiting");
-                // Maintenance tasks run forever. Sleep forever when canceled.
-                pending::<()>().await;
-            }
-            tokio::time::sleep(self.config.retry_interval).await;
-        }
-    }
-
-    async fn read_from_stream(
-        &self,
-        con: &mut ConnectionWithCredentialsProvider,
-        last_id: &mut String,
-    ) -> anyhow::Result<()> {
-        tracing::info!("reading endpoints/branches/projects from redis");
-        self.batch_read(
-            con,
-            StreamReadOptions::default().count(self.config.initial_batch_size),
-            last_id,
-            true,
-        )
-        .await?;
-        tracing::info!("ready to filter user requests");
-        self.ready.store(true, Ordering::Release);
-        self.batch_read(
-            con,
-            StreamReadOptions::default()
-                .count(self.config.default_batch_size)
-                .block(self.config.xread_timeout.as_millis() as usize),
-            last_id,
-            false,
-        )
-        .await
-    }
-
-    async fn batch_read(
-        &self,
-        conn: &mut ConnectionWithCredentialsProvider,
-        opts: StreamReadOptions,
-        last_id: &mut String,
-        return_when_finish: bool,
-    ) -> anyhow::Result<()> {
-        let mut total: usize = 0;
-        loop {
-            let mut res: StreamReadReply = conn
-                .xread_options(&[&self.config.stream_name], &[last_id.as_str()], &opts)
-                .await?;
-
-            if res.keys.is_empty() {
-                if return_when_finish {
-                    if total != 0 {
-                        break;
-                    }
-                    anyhow::bail!(
-                        "Redis stream {} is empty, cannot be used to filter endpoints",
-                        self.config.stream_name
-                    );
-                }
-                // If we are not returning when finish, we should wait for more data.
-                continue;
-            }
-            if res.keys.len() != 1 {
-                anyhow::bail!("Cannot read from redis stream {}", self.config.stream_name);
-            }
-
-            let key = res.keys.pop().expect("Checked length above");
-            let len = key.ids.len();
-            for stream_id in key.ids {
-                total += 1;
-                for value in stream_id.map.values() {
-                    match value.try_into() {
-                        Ok(event) => self.insert_event(event),
-                        Err(err) => {
-                            Metrics::get().proxy.redis_errors_total.inc(RedisErrors {
-                                channel: &self.config.stream_name,
-                            });
-                            tracing::error!("error parsing value {value:?}: {err:?}");
-                        }
-                    }
-                }
-                if total.is_power_of_two() {
-                    tracing::debug!("endpoints read {}", total);
-                }
-                *last_id = stream_id.id;
-            }
-            if return_when_finish && len <= self.config.default_batch_size {
-                break;
-            }
-        }
-        tracing::info!("read {} endpoints/branches/projects from redis", total);
-        Ok(())
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_parse_control_plane_event() {
-        let s = r#"{"branch_created":null,"endpoint_created":{"endpoint_id":"ep-rapid-thunder-w0qqw2q9"},"project_created":null,"type":"endpoint_created"}"#;
-
-        let endpoint_id: EndpointId = "ep-rapid-thunder-w0qqw2q9".into();
-
-        assert_eq!(
-            serde_json::from_str::<ControlPlaneEvent>(s).unwrap(),
-            ControlPlaneEvent {
-                endpoint_created: Some(EndpointCreated {
-                    endpoint_id: endpoint_id.into(),
-                }),
-                branch_created: None,
-                project_created: None,
-                _type: Some("endpoint_created".into()),
-            }
-        );
-    }
-}
--- a/proxy/src/cache/mod.rs
+++ b/proxy/src/cache/mod.rs
@@ -1,5 +1,4 @@
 pub(crate) mod common;
-pub(crate) mod endpoints;
 pub(crate) mod project_info;
 mod timed_lru;

--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -64,6 +64,13 @@ impl Pipeline {
        let responses = self.replies;
        let batch_size = self.inner.len();

+        if !client.credentials_refreshed() {
+            tracing::debug!(
+                "Redis credentials are not refreshed. Sleeping for 5 seconds before retrying..."
+            );
+            tokio::time::sleep(Duration::from_secs(5)).await;
+        }
+
        match client.query(&self.inner).await {
            // for each reply, we expect that many values.
            Ok(Value::Array(values)) if values.len() == responses => {
@@ -127,6 +134,14 @@ impl QueueProcessing for CancellationProcessor {
    }

    async fn apply(&mut self, batch: Vec<Self::Req>) -> Vec<Self::Res> {
+        if !self.client.credentials_refreshed() {
+            // this will cause a timeout for cancellation operations
+            tracing::debug!(
+                "Redis credentials are not refreshed. Sleeping for 5 seconds before retrying..."
+            );
+            tokio::time::sleep(Duration::from_secs(5)).await;
+        }
+
        let mut pipeline = Pipeline::with_capacity(batch.len());

        let batch_size = batch.len();
--- a/proxy/src/compute/mod.rs
+++ b/proxy/src/compute/mod.rs
@@ -165,7 +165,7 @@ impl AuthInfo {
                ComputeCredentialKeys::AuthKeys(AuthKeys::ScramSha256(auth_keys)) => {
                    Some(Auth::Scram(Box::new(auth_keys)))
                }
-                ComputeCredentialKeys::JwtPayload(_) | ComputeCredentialKeys::None => None,
+                ComputeCredentialKeys::JwtPayload(_) => None,
            },
            server_params: StartupMessageParams::default(),
            skip_db_user: false,
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -4,17 +4,26 @@ use std::time::Duration;

 use anyhow::{Context, Ok, bail, ensure};
 use arc_swap::ArcSwapOption;
+use camino::{Utf8Path, Utf8PathBuf};
 use clap::ValueEnum;
+use compute_api::spec::LocalProxySpec;
 use remote_storage::RemoteStorageConfig;
+use thiserror::Error;
+use tokio::sync::Notify;
+use tracing::{debug, error, info, warn};

 use crate::auth::backend::jwt::JwkCache;
+use crate::auth::backend::local::JWKS_ROLE_MAP;
 use crate::control_plane::locks::ApiLocks;
-use crate::rate_limiter::{RateBucketInfo, RateLimitAlgorithm, RateLimiterConfig};
+use crate::control_plane::messages::{EndpointJwksResponse, JwksSettings};
+use crate::ext::TaskExt;
+use crate::intern::RoleNameInt;
+use crate::rate_limiter::{RateLimitAlgorithm, RateLimiterConfig};
 use crate::scram::threadpool::ThreadPool;
 use crate::serverless::GlobalConnPoolOptions;
 use crate::serverless::cancel_set::CancelSet;
 pub use crate::tls::server_config::{TlsConfig, configure_tls};
-use crate::types::Host;
+use crate::types::{Host, RoleName};

 pub struct ProxyConfig {
    pub tls_config: ArcSwapOption<TlsConfig>,
@@ -26,6 +35,8 @@ pub struct ProxyConfig {
    pub wake_compute_retry_config: RetryConfig,
    pub connect_compute_locks: ApiLocks<Host>,
    pub connect_to_compute: ComputeConfig,
+    #[cfg(feature = "testing")]
+    pub disable_pg_session_jwt: bool,
 }

 pub struct ComputeConfig {
@@ -69,79 +80,6 @@ pub struct AuthenticationConfig {
    pub console_redirect_confirmation_timeout: tokio::time::Duration,
 }

-#[derive(Debug)]
-pub struct EndpointCacheConfig {
-    /// Batch size to receive all endpoints on the startup.
-    pub initial_batch_size: usize,
-    /// Batch size to receive endpoints.
-    pub default_batch_size: usize,
-    /// Timeouts for the stream read operation.
-    pub xread_timeout: Duration,
-    /// Stream name to read from.
-    pub stream_name: String,
-    /// Limiter info (to distinguish when to enable cache).
-    pub limiter_info: Vec<RateBucketInfo>,
-    /// Disable cache.
-    /// If true, cache is ignored, but reports all statistics.
-    pub disable_cache: bool,
-    /// Retry interval for the stream read operation.
-    pub retry_interval: Duration,
-}
-
-impl EndpointCacheConfig {
-    /// Default options for [`crate::control_plane::NodeInfoCache`].
-    /// Notice that by default the limiter is empty, which means that cache is disabled.
-    pub const CACHE_DEFAULT_OPTIONS: &'static str = "initial_batch_size=1000,default_batch_size=10,xread_timeout=5m,stream_name=controlPlane,disable_cache=true,limiter_info=1000@1s,retry_interval=1s";
-
-    /// Parse cache options passed via cmdline.
-    /// Example: [`Self::CACHE_DEFAULT_OPTIONS`].
-    fn parse(options: &str) -> anyhow::Result<Self> {
-        let mut initial_batch_size = None;
-        let mut default_batch_size = None;
-        let mut xread_timeout = None;
-        let mut stream_name = None;
-        let mut limiter_info = vec![];
-        let mut disable_cache = false;
-        let mut retry_interval = None;
-
-        for option in options.split(',') {
-            let (key, value) = option
-                .split_once('=')
-                .with_context(|| format!("bad key-value pair: {option}"))?;
-
-            match key {
-                "initial_batch_size" => initial_batch_size = Some(value.parse()?),
-                "default_batch_size" => default_batch_size = Some(value.parse()?),
-                "xread_timeout" => xread_timeout = Some(humantime::parse_duration(value)?),
-                "stream_name" => stream_name = Some(value.to_string()),
-                "limiter_info" => limiter_info.push(RateBucketInfo::from_str(value)?),
-                "disable_cache" => disable_cache = value.parse()?,
-                "retry_interval" => retry_interval = Some(humantime::parse_duration(value)?),
-                unknown => bail!("unknown key: {unknown}"),
-            }
-        }
-        RateBucketInfo::validate(&mut limiter_info)?;
-
-        Ok(Self {
-            initial_batch_size: initial_batch_size.context("missing `initial_batch_size`")?,
-            default_batch_size: default_batch_size.context("missing `default_batch_size`")?,
-            xread_timeout: xread_timeout.context("missing `xread_timeout`")?,
-            stream_name: stream_name.context("missing `stream_name`")?,
-            disable_cache,
-            limiter_info,
-            retry_interval: retry_interval.context("missing `retry_interval`")?,
-        })
-    }
-}
-
-impl FromStr for EndpointCacheConfig {
-    type Err = anyhow::Error;
-
-    fn from_str(options: &str) -> Result<Self, Self::Err> {
-        let error = || format!("failed to parse endpoint cache options '{options}'");
-        Self::parse(options).with_context(error)
-    }
-}
 #[derive(Debug)]
 pub struct MetricBackupCollectionConfig {
    pub remote_storage_config: Option<RemoteStorageConfig>,
@@ -409,6 +347,135 @@ impl FromStr for ConcurrencyLockOptions {
    }
 }

+#[derive(Error, Debug)]
+pub(crate) enum RefreshConfigError {
+    #[error(transparent)]
+    Read(#[from] std::io::Error),
+    #[error(transparent)]
+    Parse(#[from] serde_json::Error),
+    #[error(transparent)]
+    Validate(anyhow::Error),
+    #[error(transparent)]
+    Tls(anyhow::Error),
+}
+
+pub(crate) async fn refresh_config_loop(config: &ProxyConfig, path: Utf8PathBuf, rx: Arc<Notify>) {
+    let mut init = true;
+    loop {
+        rx.notified().await;
+
+        match refresh_config_inner(config, &path).await {
+            std::result::Result::Ok(()) => {}
+            // only log at debug level for file-not-found errors on the first check:
+            // for computes that don't use local_proxy, this is not an error.
+            Err(RefreshConfigError::Read(e))
+                if init && e.kind() == std::io::ErrorKind::NotFound =>
+            {
+                debug!(error=?e, ?path, "could not read config file");
+            }
+            Err(RefreshConfigError::Tls(e)) => {
+                error!(error=?e, ?path, "could not read TLS certificates");
+            }
+            Err(e) => {
+                error!(error=?e, ?path, "could not read config file");
+            }
+        }
+
+        init = false;
+    }
+}
+
+pub(crate) async fn refresh_config_inner(
+    config: &ProxyConfig,
+    path: &Utf8Path,
+) -> Result<(), RefreshConfigError> {
+    let bytes = tokio::fs::read(&path).await?;
+    let data: LocalProxySpec = serde_json::from_slice(&bytes)?;
+
+    let mut jwks_set = vec![];
+
+    fn parse_jwks_settings(jwks: compute_api::spec::JwksSettings) -> anyhow::Result<JwksSettings> {
+        let mut jwks_url = url::Url::from_str(&jwks.jwks_url).context("parsing JWKS url")?;
+
+        ensure!(
+            jwks_url.has_authority()
+                && (jwks_url.scheme() == "http" || jwks_url.scheme() == "https"),
+            "Invalid JWKS url. Must be HTTP",
+        );
+
+        ensure!(
+            jwks_url.host().is_some_and(|h| h != url::Host::Domain("")),
+            "Invalid JWKS url. No domain listed",
+        );
+
+        // clear username, password and ports
+        jwks_url
+            .set_username("")
+            .expect("url can be a base and has a valid host and is not a file. should not error");
+        jwks_url
+            .set_password(None)
+            .expect("url can be a base and has a valid host and is not a file. should not error");
+        // local testing is hard if we need to have a specific restricted port
+        if cfg!(not(feature = "testing")) {
+            jwks_url.set_port(None).expect(
+                "url can be a base and has a valid host and is not a file. should not error",
+            );
+        }
+
+        // clear the fragment and query params
+        jwks_url.set_fragment(None);
+        jwks_url.query_pairs_mut().clear().finish();
+
+        if jwks_url.scheme() != "https" {
+            // local testing is hard if we need to set up https support.
+            if cfg!(not(feature = "testing")) {
+                jwks_url
+                    .set_scheme("https")
+                    .expect("should not error to set the scheme to https if it was http");
+            } else {
+                warn!(scheme = jwks_url.scheme(), "JWKS url is not HTTPS");
+            }
+        }
+
+        Ok(JwksSettings {
+            id: jwks.id,
+            jwks_url,
+            _provider_name: jwks.provider_name,
+            jwt_audience: jwks.jwt_audience,
+            role_names: jwks
+                .role_names
+                .into_iter()
+                .map(RoleName::from)
+                .map(|s| RoleNameInt::from(&s))
+                .collect(),
+        })
+    }
+
+    for jwks in data.jwks.into_iter().flatten() {
+        jwks_set.push(parse_jwks_settings(jwks).map_err(RefreshConfigError::Validate)?);
+    }
+
+    info!("successfully loaded new config");
+    JWKS_ROLE_MAP.store(Some(Arc::new(EndpointJwksResponse { jwks: jwks_set })));
+
+    if let Some(tls_config) = data.tls {
+        let tls_config = tokio::task::spawn_blocking(move || {
+            crate::tls::server_config::configure_tls(
+                tls_config.key_path.as_ref(),
+                tls_config.cert_path.as_ref(),
+                None,
+                false,
+            )
+        })
+        .await
+        .propagate_task_panic()
+        .map_err(RefreshConfigError::Tls)?;
+        config.tls_config.store(Some(Arc::new(tls_config)));
+    }
+
+    std::result::Result::Ok(())
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;
--- a/proxy/src/context/mod.rs
+++ b/proxy/src/context/mod.rs
@@ -7,7 +7,7 @@ use once_cell::sync::OnceCell;
 use smol_str::SmolStr;
 use tokio::sync::mpsc;
 use tracing::field::display;
-use tracing::{Span, debug, error, info_span};
+use tracing::{Span, error, info_span};
 use try_lock::TryLock;
 use uuid::Uuid;

@@ -15,10 +15,7 @@ use self::parquet::RequestData;
 use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo};
 use crate::error::ErrorKind;
 use crate::intern::{BranchIdInt, ProjectIdInt};
-use crate::metrics::{
-    ConnectOutcome, InvalidEndpointsGroup, LatencyAccumulated, LatencyTimer, Metrics, Protocol,
-    Waiting,
-};
+use crate::metrics::{LatencyAccumulated, LatencyTimer, Metrics, Protocol, Waiting};
 use crate::pqproto::StartupMessageParams;
 use crate::protocol2::{ConnectionInfo, ConnectionInfoExtra};
 use crate::types::{DbName, EndpointId, RoleName};
@@ -70,8 +67,6 @@ struct RequestContextInner {
    // This sender is only used to log the length of session in case of success.
    disconnect_sender: Option<mpsc::UnboundedSender<RequestData>>,
    pub(crate) latency_timer: LatencyTimer,
-    // Whether proxy decided that it's not a valid endpoint end rejected it before going to cplane.
-    rejected: Option<bool>,
    disconnect_timestamp: Option<chrono::DateTime<Utc>>,
 }

@@ -106,7 +101,6 @@ impl Clone for RequestContext {
            auth_method: inner.auth_method.clone(),
            jwt_issuer: inner.jwt_issuer.clone(),
            success: inner.success,
-            rejected: inner.rejected,
            cold_start_info: inner.cold_start_info,
            pg_options: inner.pg_options.clone(),
            testodrome_query_id: inner.testodrome_query_id.clone(),
@@ -151,7 +145,6 @@ impl RequestContext {
            auth_method: None,
            jwt_issuer: None,
            success: false,
-            rejected: None,
            cold_start_info: ColdStartInfo::Unknown,
            pg_options: None,
            testodrome_query_id: None,
@@ -183,11 +176,6 @@ impl RequestContext {
        )
    }

-    pub(crate) fn set_rejected(&self, rejected: bool) {
-        let mut this = self.0.try_lock().expect("should not deadlock");
-        this.rejected = Some(rejected);
-    }
-
    pub(crate) fn set_cold_start_info(&self, info: ColdStartInfo) {
        self.0
            .try_lock()
@@ -209,11 +197,9 @@ impl RequestContext {
        if let Some(options_str) = options.get("options") {
            // If not found directly, try to extract it from the options string
            for option in options_str.split_whitespace() {
-                if option.starts_with("neon_query_id:") {
-                    if let Some(value) = option.strip_prefix("neon_query_id:") {
-                        this.set_testodrome_id(value.into());
-                        break;
-                    }
+                if let Some(value) = option.strip_prefix("neon_query_id:") {
+                    this.set_testodrome_id(value.into());
+                    break;
                }
            }
        }
@@ -463,38 +449,6 @@ impl RequestContextInner {
    }

    fn log_connect(&mut self) {
-        let outcome = if self.success {
-            ConnectOutcome::Success
-        } else {
-            ConnectOutcome::Failed
-        };
-
-        // TODO: get rid of entirely/refactor
-        // check for false positives
-        // AND false negatives
-        if let Some(rejected) = self.rejected {
-            let ep = self
-                .endpoint_id
-                .as_ref()
-                .map(|x| x.as_str())
-                .unwrap_or_default();
-            // This makes sense only if cache is disabled
-            debug!(
-                ?outcome,
-                ?rejected,
-                ?ep,
-                "check endpoint is valid with outcome"
-            );
-            Metrics::get()
-                .proxy
-                .invalid_endpoints_total
-                .inc(InvalidEndpointsGroup {
-                    protocol: self.protocol,
-                    rejected: rejected.into(),
-                    outcome,
-                });
-        }
-
        if let Some(tx) = self.sender.take() {
            // If type changes, this error handling needs to be updated.
            let tx: mpsc::UnboundedSender<RequestData> = tx;
--- a/proxy/src/control_plane/client/cplane_proxy_v1.rs
+++ b/proxy/src/control_plane/client/cplane_proxy_v1.rs
@@ -159,13 +159,6 @@ impl NeonControlPlaneClient {
        ctx: &RequestContext,
        endpoint: &EndpointId,
    ) -> Result<Vec<AuthRule>, GetEndpointJwksError> {
-        if !self
-            .caches
-            .endpoints_cache
-            .is_valid(ctx, &endpoint.normalize())
-        {
-            return Err(GetEndpointJwksError::EndpointNotFound);
-        }
        let request_id = ctx.session_id().to_string();
        async {
            let request = self
@@ -250,10 +243,8 @@ impl NeonControlPlaneClient {
            info!(duration = ?start.elapsed(), "received http response");
            let body = parse_body::<WakeCompute>(response.status(), response.bytes().await?)?;

-            // Unfortunately, ownership won't let us use `Option::ok_or` here.
-            let (host, port) = match parse_host_port(&body.address) {
-                None => return Err(WakeComputeError::BadComputeAddress(body.address)),
-                Some(x) => x,
+            let Some((host, port)) = parse_host_port(&body.address) else {
+                return Err(WakeComputeError::BadComputeAddress(body.address));
            };

            let host_addr = IpAddr::from_str(host).ok();
@@ -302,11 +293,6 @@ impl super::ControlPlaneApi for NeonControlPlaneClient {
            return Ok(secret);
        }

-        if !self.caches.endpoints_cache.is_valid(ctx, normalized_ep) {
-            info!("endpoint is not valid, skipping the request");
-            return Err(GetAuthInfoError::UnknownEndpoint);
-        }
-
        let auth_info = self.do_get_auth_req(ctx, endpoint, role).await?;

        let control = EndpointAccessControl {
@@ -348,11 +334,6 @@ impl super::ControlPlaneApi for NeonControlPlaneClient {
            return Ok(control);
        }

-        if !self.caches.endpoints_cache.is_valid(ctx, normalized_ep) {
-            info!("endpoint is not valid, skipping the request");
-            return Err(GetAuthInfoError::UnknownEndpoint);
-        }
-
        let auth_info = self.do_get_auth_req(ctx, endpoint, role).await?;

        let control = EndpointAccessControl {
--- a/proxy/src/control_plane/client/mod.rs
+++ b/proxy/src/control_plane/client/mod.rs
@@ -13,9 +13,8 @@ use tracing::{debug, info};
 use super::{EndpointAccessControl, RoleAccessControl};
 use crate::auth::backend::ComputeUserInfo;
 use crate::auth::backend::jwt::{AuthRule, FetchAuthRules, FetchAuthRulesError};
-use crate::cache::endpoints::EndpointsCache;
 use crate::cache::project_info::ProjectInfoCacheImpl;
-use crate::config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions};
+use crate::config::{CacheOptions, ProjectInfoCacheOptions};
 use crate::context::RequestContext;
 use crate::control_plane::{CachedNodeInfo, ControlPlaneApi, NodeInfoCache, errors};
 use crate::error::ReportableError;
@@ -121,15 +120,12 @@ pub struct ApiCaches {
    pub(crate) node_info: NodeInfoCache,
    /// Cache which stores project_id -> endpoint_ids mapping.
    pub project_info: Arc<ProjectInfoCacheImpl>,
-    /// List of all valid endpoints.
-    pub endpoints_cache: Arc<EndpointsCache>,
 }

 impl ApiCaches {
    pub fn new(
        wake_compute_cache_config: CacheOptions,
        project_info_cache_config: ProjectInfoCacheOptions,
-        endpoint_cache_config: EndpointCacheConfig,
    ) -> Self {
        Self {
            node_info: NodeInfoCache::new(
@@ -139,7 +135,6 @@ impl ApiCaches {
                true,
            ),
            project_info: Arc::new(ProjectInfoCacheImpl::new(project_info_cache_config)),
-            endpoints_cache: Arc::new(EndpointsCache::new(endpoint_cache_config)),
        }
    }
 }
@@ -213,7 +208,12 @@ impl<K: Hash + Eq + Clone> ApiLocks<K> {
        self.metrics
            .semaphore_acquire_seconds
            .observe(now.elapsed().as_secs_f64());
-        debug!("acquired permit {:?}", now.elapsed().as_secs_f64());
+
+        if permit.is_ok() {
+            debug!(elapsed = ?now.elapsed(), "acquired permit");
+        } else {
+            debug!(elapsed = ?now.elapsed(), "timed out acquiring permit");
+        }
        Ok(WakeComputePermit { permit: permit? })
    }

--- a/proxy/src/control_plane/errors.rs
+++ b/proxy/src/control_plane/errors.rs
@@ -99,10 +99,6 @@ pub(crate) enum GetAuthInfoError {

    #[error(transparent)]
    ApiError(ControlPlaneError),
-
-    /// Proxy does not know about the endpoint in advanced
-    #[error("endpoint not found in endpoint cache")]
-    UnknownEndpoint,
 }

 // This allows more useful interactions than `#[from]`.
@@ -119,8 +115,6 @@ impl UserFacingError for GetAuthInfoError {
            Self::BadSecret => REQUEST_FAILED.to_owned(),
            // However, API might return a meaningful error.
            Self::ApiError(e) => e.to_string_client(),
-            // pretend like control plane returned an error.
-            Self::UnknownEndpoint => REQUEST_FAILED.to_owned(),
        }
    }
 }
@@ -130,8 +124,6 @@ impl ReportableError for GetAuthInfoError {
        match self {
            Self::BadSecret => crate::error::ErrorKind::ControlPlane,
            Self::ApiError(_) => crate::error::ErrorKind::ControlPlane,
-            // we only apply endpoint filtering if control plane is under high load.
-            Self::UnknownEndpoint => crate::error::ErrorKind::ServiceRateLimit,
        }
    }
 }
@@ -200,9 +192,6 @@ impl CouldRetry for WakeComputeError {

 #[derive(Debug, Error)]
 pub enum GetEndpointJwksError {
-    #[error("endpoint not found")]
-    EndpointNotFound,
-
    #[error("failed to build control plane request: {0}")]
    RequestBuild(#[source] reqwest::Error),

--- a/proxy/src/error.rs
+++ b/proxy/src/error.rs
@@ -78,16 +78,6 @@ pub(crate) trait ReportableError: fmt::Display + Send + 'static {
    fn get_error_kind(&self) -> ErrorKind;
 }

-impl ReportableError for postgres_client::error::Error {
-    fn get_error_kind(&self) -> ErrorKind {
-        if self.as_db_error().is_some() {
-            ErrorKind::Postgres
-        } else {
-            ErrorKind::Compute
-        }
-    }
-}
-
 /// Flattens `Result<Result<T>>` into `Result<T>`.
 pub fn flatten_err<T>(r: Result<anyhow::Result<T>, JoinError>) -> anyhow::Result<T> {
    r.context("join error").and_then(|x| x)
--- a/proxy/src/logging.rs
+++ b/proxy/src/logging.rs
@@ -52,7 +52,7 @@ pub async fn init() -> anyhow::Result<LoggingGuard> {
            StderrWriter {
                stderr: std::io::stderr(),
            },
-            &["request_id", "session_id", "conn_id"],
+            &["conn_id", "ep", "query_id", "request_id", "session_id"],
        ))
    } else {
        None
@@ -271,18 +271,18 @@ where
        });

        // In case logging fails we generate a simpler JSON object.
-        if let Err(err) = res {
-            if let Ok(mut line) = serde_json::to_vec(&serde_json::json!( {
+        if let Err(err) = res
+            && let Ok(mut line) = serde_json::to_vec(&serde_json::json!( {
                "timestamp": now.to_rfc3339_opts(chrono::SecondsFormat::Micros, true),
                "level": "ERROR",
                "message": format_args!("cannot log event: {err:?}"),
                "fields": {
                    "event": format_args!("{event:?}"),
                },
-            })) {
-                line.push(b'\n');
-                self.writer.make_writer().write_all(&line).ok();
-            }
+            }))
+        {
+            line.push(b'\n');
+            self.writer.make_writer().write_all(&line).ok();
        }
    }

@@ -583,10 +583,11 @@ impl EventFormatter {
            THREAD_ID.with(|tid| serializer.serialize_entry("thread_id", tid))?;

            // TODO: tls cache? name could change
-            if let Some(thread_name) = std::thread::current().name() {
-                if !thread_name.is_empty() && thread_name != "tokio-runtime-worker" {
-                    serializer.serialize_entry("thread_name", thread_name)?;
-                }
+            if let Some(thread_name) = std::thread::current().name()
+                && !thread_name.is_empty()
+                && thread_name != "tokio-runtime-worker"
+            {
+                serializer.serialize_entry("thread_name", thread_name)?;
            }

            if let Some(task_id) = tokio::task::try_id() {
@@ -596,10 +597,10 @@ impl EventFormatter {
            serializer.serialize_entry("target", meta.target())?;

            // Skip adding module if it's the same as target.
-            if let Some(module) = meta.module_path() {
-                if module != meta.target() {
-                    serializer.serialize_entry("module", module)?;
-                }
+            if let Some(module) = meta.module_path()
+                && module != meta.target()
+            {
+                serializer.serialize_entry("module", module)?;
            }

            if let Some(file) = meta.file() {
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -236,13 +236,6 @@ pub enum Bool {
    False,
 }

-#[derive(FixedCardinalityLabel, Copy, Clone)]
-#[label(singleton = "outcome")]
-pub enum Outcome {
-    Success,
-    Failed,
-}
-
 #[derive(FixedCardinalityLabel, Copy, Clone)]
 #[label(singleton = "outcome")]
 pub enum CacheOutcome {
--- a/proxy/src/pglb/copy_bidirectional.rs
+++ b/proxy/src/pglb/copy_bidirectional.rs
@@ -90,27 +90,27 @@ where
        // TODO: 1 info log, with a enum label for close direction.

        // Early termination checks from compute to client.
-        if let TransferState::Done(_) = compute_to_client {
-            if let TransferState::Running(buf) = &client_to_compute {
-                info!("Compute is done, terminate client");
-                // Initiate shutdown
-                client_to_compute = TransferState::ShuttingDown(buf.amt);
-                client_to_compute_result =
-                    transfer_one_direction(cx, &mut client_to_compute, client, compute)
-                        .map_err(ErrorSource::from_client)?;
-            }
+        if let TransferState::Done(_) = compute_to_client
+            && let TransferState::Running(buf) = &client_to_compute
+        {
+            info!("Compute is done, terminate client");
+            // Initiate shutdown
+            client_to_compute = TransferState::ShuttingDown(buf.amt);
+            client_to_compute_result =
+                transfer_one_direction(cx, &mut client_to_compute, client, compute)
+                    .map_err(ErrorSource::from_client)?;
        }

        // Early termination checks from client to compute.
-        if let TransferState::Done(_) = client_to_compute {
-            if let TransferState::Running(buf) = &compute_to_client {
-                info!("Client is done, terminate compute");
-                // Initiate shutdown
-                compute_to_client = TransferState::ShuttingDown(buf.amt);
-                compute_to_client_result =
-                    transfer_one_direction(cx, &mut compute_to_client, compute, client)
-                        .map_err(ErrorSource::from_compute)?;
-            }
+        if let TransferState::Done(_) = client_to_compute
+            && let TransferState::Running(buf) = &compute_to_client
+        {
+            info!("Client is done, terminate compute");
+            // Initiate shutdown
+            compute_to_client = TransferState::ShuttingDown(buf.amt);
+            compute_to_client_result =
+                transfer_one_direction(cx, &mut compute_to_client, compute, client)
+                    .map_err(ErrorSource::from_compute)?;
        }

        // It is not a problem if ready! returns early ... (comment remains the same)
--- a/proxy/src/rate_limiter/leaky_bucket.rs
+++ b/proxy/src/rate_limiter/leaky_bucket.rs
@@ -39,7 +39,11 @@ impl<K: Hash + Eq> LeakyBucketRateLimiter<K> {

        let config = config.map_or(self.default_config, Into::into);

-        if self.access_count.fetch_add(1, Ordering::AcqRel) % 2048 == 0 {
+        if self
+            .access_count
+            .fetch_add(1, Ordering::AcqRel)
+            .is_multiple_of(2048)
+        {
            self.do_gc(now);
        }

--- a/proxy/src/rate_limiter/limiter.rs
+++ b/proxy/src/rate_limiter/limiter.rs
@@ -16,44 +16,6 @@ use super::LeakyBucketConfig;
 use crate::ext::LockExt;
 use crate::intern::EndpointIdInt;

-pub struct GlobalRateLimiter {
-    data: Vec<RateBucket>,
-    info: Vec<RateBucketInfo>,
-}
-
-impl GlobalRateLimiter {
-    pub fn new(info: Vec<RateBucketInfo>) -> Self {
-        Self {
-            data: vec![
-                RateBucket {
-                    start: Instant::now(),
-                    count: 0,
-                };
-                info.len()
-            ],
-            info,
-        }
-    }
-
-    /// Check that number of connections is below `max_rps` rps.
-    pub fn check(&mut self) -> bool {
-        let now = Instant::now();
-
-        let should_allow_request = self
-            .data
-            .iter_mut()
-            .zip(&self.info)
-            .all(|(bucket, info)| bucket.should_allow_request(info, now, 1));
-
-        if should_allow_request {
-            // only increment the bucket counts if the request will actually be accepted
-            self.data.iter_mut().for_each(|b| b.inc(1));
-        }
-
-        should_allow_request
-    }
-}
-
 // Simple per-endpoint rate limiter.
 //
 // Check that number of connections to the endpoint is below `max_rps` rps.
@@ -211,7 +173,11 @@ impl<K: Hash + Eq, R: Rng, S: BuildHasher + Clone> BucketRateLimiter<K, R, S> {
        // worst case memory usage is about:
        //    = 2 * 2048 * 64 * (48B + 72B)
        //    = 30MB
-        if self.access_count.fetch_add(1, Ordering::AcqRel) % 2048 == 0 {
+        if self
+            .access_count
+            .fetch_add(1, Ordering::AcqRel)
+            .is_multiple_of(2048)
+        {
            self.do_gc();
        }

--- a/Show More
+++ b/Show More