Compare commits

..

106 Commits

Author SHA1 Message Date
Conrad Ludgate
18303e4d68 clean up 2024-08-13 15:08:57 +01:00
Conrad Ludgate
3df6d368e3 split out binaries 2024-08-13 15:08:57 +01:00
Conrad Ludgate
b62e7c0138 proxy: experiment with idea to split crates 2024-08-13 15:08:54 +01:00
Conrad Ludgate
a2968c6cf8 move proxy to proxy/code 2024-08-13 15:01:48 +01:00
Conrad Ludgate
bae1288671 make jwk renewal permits a bit more type safe 2024-08-13 11:08:25 +01:00
Conrad Ludgate
1254d8f56e address some comments 2024-08-13 10:24:14 +01:00
Conrad Ludgate
073508493c remove async_trait for FetchAuthRules 2024-08-12 16:14:53 +01:00
Conrad Ludgate
7cb2349296 add jwks size limiter 2024-08-12 11:48:57 +01:00
Conrad Ludgate
87151f9efd ignore marvin vuln 2024-08-12 09:01:30 +01:00
Conrad Ludgate
96fe084c57 compact mock server 2024-08-12 09:01:04 +01:00
Conrad Ludgate
20fdf3e19f extract fetch/update routine 2024-08-12 09:01:04 +01:00
Conrad Ludgate
c6b36d8171 fix lints 2024-08-12 09:01:04 +01:00
Conrad Ludgate
0e8a848937 finish happy path test 2024-08-12 09:01:04 +01:00
Conrad Ludgate
db4085fe22 mock tests for jwk renewal 2024-08-12 09:01:04 +01:00
Conrad Ludgate
0d895ba002 strip down supported algorithms to just RS256 and ES256 2024-08-12 09:01:04 +01:00
Conrad Ludgate
103f34e954 flesh out JWKs cache 2024-08-12 09:01:04 +01:00
Conrad Ludgate
262378e561 flesh out jwt code 2024-08-12 09:01:04 +01:00
Conrad Ludgate
9f38ab39c6 stash jwts 2024-08-12 09:01:04 +01:00
Conrad Ludgate
fa92328423 start stubbing jwt 2024-08-12 09:01:04 +01:00
dependabot[bot]
f7a3380aec chore(deps): bump aiohttp from 3.9.4 to 3.10.2 (#8684) 2024-08-11 12:21:32 +01:00
Arpad Müller
507f1a5bdd Also pass HOME env var in access_env_vars (#8685)
Noticed this while debugging a test failure in #8673 which only occurs
with real S3 instead of mock S3: if you authenticate to S3 via
`AWS_PROFILE`, then it requires the `HOME` env var to be set so that it
can read inside the `~/.aws` directory.

The scrubber abstraction `StorageScrubber::scrubber_cli` in
`neon_fixtures.py` would otherwise not work. My earlier PR #6556 has
done similar things for the `neon_local` wrapper.

You can try:

```
aws sso login --profile dev
export ENABLE_REAL_S3_REMOTE_STORAGE=y REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests REMOTE_STORAGE_S3_REGION=eu-central-1 AWS_PROFILE=dev
RUST_BACKTRACE=1 BUILD_TYPE=debug DEFAULT_PG_VERSION=16 ./scripts/pytest -vv --tb=short -k test_scrubber_tenant_snapshot
```

before and after this patch: this patch fixes it.
2024-08-10 12:04:47 +00:00
John Spray
401dcd3551 Update docs/SUMMARY.md (#8665)
## Problem

This page had many dead links, and was confusing for folks looking for
documentation about our product.

Closes: https://github.com/neondatabase/neon/issues/8535

## Summary of changes

- Add a link to the product docs up top
- Remove dead/placeholder links
2024-08-09 18:30:15 +01:00
Alexander Bayandin
4a53cd0fc3 Dockerfiles: remove cachepot (#8666)
## Problem
We install and try to use `cachepot`, but it is not configured correctly
and doesn't work (after https://github.com/neondatabase/neon/pull/2290).

## Summary of changes
- Remove `cachepot`
2024-08-09 15:48:16 +01:00
Vlad Lazar
f5cef7bf7f storcon: skip draining shard if its secondary is lagging too much (#8644)
## Problem
Migrations of tenant shards with cold secondaries are holding up drains
during production deployments.

## Summary of changes
If a secondary location is lagging by more than 256MiB (configurable,
but that's the default), then skip cutting the shard over to the secondary as part of the node drain.
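
As an editorial illustration only, a minimal sketch of the lag check described above, with hypothetical names and LSNs modeled as plain byte offsets (not the actual storage controller types):

```
/// Hypothetical helper: decide whether to cut a shard over to its secondary
/// during a node drain. `max_lag_bytes` would default to 256 MiB.
fn should_cutover_to_secondary(primary_lsn: u64, secondary_lsn: u64, max_lag_bytes: u64) -> bool {
    // LSNs are byte offsets into the WAL, so their difference is a byte lag.
    let lag = primary_lsn.saturating_sub(secondary_lsn);
    lag <= max_lag_bytes
}

fn main() {
    let max_lag = 256 * 1024 * 1024; // 256 MiB default
    assert!(should_cutover_to_secondary(1_000_000, 900_000, max_lag));
    assert!(!should_cutover_to_secondary(2_000_000_000, 1_000_000_000, max_lag));
}
```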
2024-08-09 15:45:07 +01:00
John Spray
e6770d79fd pageserver: don't treat NotInitialized::Stopped as unexpected (#8675)
## Problem

This type of error can happen during shutdown & was triggering a circuit
breaker alert.

## Summary of changes

- Map NotInitialized::Stopped to CompactionError::ShuttingDown, so that
we may handle it cleanly
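
A hedged sketch of the direction of that mapping, using simplified stand-in enums (the real types live in the pageserver crate):

```
// Hypothetical, simplified error types, just to illustrate the mapping:
// a Stopped state seen during shutdown becomes the benign ShuttingDown
// variant rather than an "unexpected" error that trips the circuit breaker.
#[derive(Debug)]
enum NotInitialized {
    Uninitialized,
    Stopped,
}

#[derive(Debug)]
enum CompactionError {
    ShuttingDown,
    Other(String),
}

fn map_not_initialized(e: NotInitialized) -> CompactionError {
    match e {
        NotInitialized::Stopped => CompactionError::ShuttingDown,
        NotInitialized::Uninitialized => {
            CompactionError::Other("timeline is not initialized".to_string())
        }
    }
}

fn main() {
    println!("{:?}", map_not_initialized(NotInitialized::Stopped));
}
```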
2024-08-09 14:01:56 +01:00
Alexander Bayandin
201f56baf7 CI(pin-build-tools-image): fix permissions for Azure login (#8671)
## Problem

Azure login fails in `pin-build-tools-image` workflow because the job
doesn't have the required permissions.

```
Error: Please make sure to give write permissions to id-token in the workflow.
Error: Login failed with Error: Error message: Unable to get ACTIONS_ID_TOKEN_REQUEST_URL env variable. Double check if the 'auth-type' is correct. Refer to https://github.com/Azure/login#readme for more information.
```

## Summary of changes
- Add `id-token: write` permission to `pin-build-tools-image`
- Add an input to force image tagging
- Unify pushing to Docker Hub with other registries
- Split the job into two to have fewer `if`s
2024-08-09 12:05:43 +01:00
Alex Chi Z.
a155914c1c fix(neon): disable create tablespace stmt (#8657)
part of https://github.com/neondatabase/neon/issues/8653

Disable the CREATE TABLESPACE statement. It turns out it requires much less
effort to add a regress-test-mode flag than to patch the test cases, and
given that we might need to support tablespaces in the future, I decided
to add a new flag `regress_test_mode` to change the behavior of CREATE
TABLESPACE.

Tested manually that without setting `regress_test_mode`, CREATE TABLESPACE
is rejected.



---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2024-08-09 09:18:55 +01:00
Conrad Ludgate
7e08fbd1b9 Revert "proxy: update tokio-postgres to allow arbitrary config params (#8076)" (#8654)
This reverts #8076 - which was already reverted from the release branch
since forever (it would have been a breaking change to release for all
users who currently set TimeZone options). It's causing conflicts now so
we should revert it here as well.
2024-08-09 09:09:29 +01:00
Peter Bendel
2ca5ff26d7 Run a subset of benchmarking job steps on GitHub action runners in Azure - closer to the system under test (#8651)
## Problem

Latency from one cloud provider to another is higher than latency within the
same cloud provider.
Some of our benchmarks are latency-sensitive: we run pgbench or psql in
the GitHub Actions runner while the system under test runs in a Neon
database project.
For realistic perf TPS and latency results we need to compare apples to
apples and run the database client at the same "latency distance" for
all tests.

## Summary of changes

Move job steps that test Neon databases deployed on Azure into Azure
action runners.
- bench strategy variant using azure database
- pgvector strategy variant using azure database
- pgbench-compare strategy variants using azure database

## Test run

https://github.com/neondatabase/neon/actions/runs/10314848502
2024-08-09 08:36:29 +01:00
Alexander Bayandin
8acce00953 Dockerfiles: fix LegacyKeyValueFormat & JSONArgsRecommended (#8664)
## Problem
CI complains in all PRs:
```
"ENV key=value" should be used instead of legacy "ENV key value" format 
```
https://docs.docker.com/reference/build-checks/legacy-key-value-format/

See 
- https://github.com/neondatabase/neon/pull/8644/files ("Unchanged files
with check annotations" section)
- https://github.com/neondatabase/neon/actions/runs/10304090562?pr=8644
("Annotations" section)


## Summary of changes
- Use `ENV key=value` instead of `ENV key value` in all Dockerfiles
2024-08-09 07:54:54 +01:00
Alexander Bayandin
d28a6f2576 CI(build-tools): update Rust, Python, Mold (#8667)
## Problem
- Rust 1.80.1 has been released:
https://blog.rust-lang.org/2024/08/08/Rust-1.80.1.html
- Python 3.9.19 has been released:
https://www.python.org/downloads/release/python-3919/
- Mold 2.33.0 has been released:
https://github.com/rui314/mold/releases/tag/v2.33.0
- Unpinned `cargo-deny` in `build-tools` got updated to the latest
version and doesn't work anymore with the current config file

## Summary of changes
- Bump Rust to 1.80.1
- Bump Python to 3.9.19
- Bump Mold to 2.33.0 
- Pin `cargo-deny`, `cargo-hack`, `cargo-hakari`, `cargo-nextest`,
`rustfilt` versions
- Update `deny.toml` to the latest format, see
https://github.com/EmbarkStudios/cargo-deny/pull/611
2024-08-09 06:17:16 +00:00
John Spray
4431688dc6 tests: don't require kafka client for regular tests (#8662)
## Problem

We're adding more third party dependencies to support more diverse +
realistic test cases in `test_runner/logical_repl`. I ❤️ these
tests, they are a good thing.

The slight glitch is that python packaging is hard, and some third-party
python packages have issues. For example, the current kafka dependency
doesn't work on the latest Python. We can mitigate that by only importing
these more specialized dependencies in the tests that use them.

## Summary of changes

- Move the `kafka` import into a test body, so that folks running the
regular `test_runner/regress` tests don't have to have a working kafka
client package.
2024-08-08 19:24:21 +01:00
John Spray
953b7d4f7e pageserver: remove paranoia double-calculation of retain_lsns (#8617)
## Problem

This code was to mitigate risk in
https://github.com/neondatabase/neon/pull/8427

As expected, we did not hit this code path - the new continuous updates
of gc_info are working fine, we can remove this code now.

## Summary of changes

- Remove block that double-checks retain_lsns
2024-08-08 12:57:48 +01:00
Joonas Koivunen
8561b2c628 fix: stop leaking BackgroundPurges (#8650)
avoid "leaking" the completions of BackgroundPurges by:

1. switching it to TaskTracker for provided close+wait
2. stop using tokio::fs::remove_dir_all which will consume two units of
memory instead of one blocking task

Additionally, use more graceful shutdown in tests which do actually some
background cleanup.
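
A minimal sketch of the close+wait pattern, assuming `tokio` and `tokio-util` dependencies (not the pageserver's actual purge code):

```
use tokio_util::task::TaskTracker;

// Track background purge tasks so shutdown can close the tracker and wait
// for all of them, instead of leaking their completions.
async fn demo() {
    let tracker = TaskTracker::new();

    let path = std::path::PathBuf::from("/tmp/some-tenant-dir-to-purge");
    // One blocking task doing the filesystem work with std::fs, instead of
    // tokio::fs::remove_dir_all (which spawns its own blocking work internally).
    tracker.spawn_blocking(move || {
        let _ = std::fs::remove_dir_all(&path);
    });

    // Shutdown path: refuse new tasks, then wait for in-flight purges.
    tracker.close();
    tracker.wait().await;
}

#[tokio::main]
async fn main() {
    demo().await;
}
```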
2024-08-08 12:02:53 +01:00
Joonas Koivunen
21638ee96c fix(test): do not fail test for filesystem race (#8643)
evidence:
https://neon-github-public-dev.s3.amazonaws.com/reports/pr-8632/10287641784/index.html#suites/0e58fb04d9998963e98e45fe1880af7d/c7a46335515142b/
2024-08-08 10:34:47 +01:00
Konstantin Knizhnik
cbe8c77997 Use synchronous commit for logical replication worker (#8645)
## Problem

See
https://neondb.slack.com/archives/C03QLRH7PPD/p1723038557449239?thread_ts=1722868375.476789&cid=C03QLRH7PPD


Logical replication subscriptions use `synchronous_commit=off` by default,
which causes problems with the safekeeper.

## Summary of changes

Set `synchronous_commit=on` for logical replication subscription in
test_subscriber_restart.py

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2024-08-08 10:23:57 +03:00
John Spray
cf3eac785b pageserver: make bench_ingest build (but panic) on macOS (#8641)
## Problem

Some developers build on macOS, which doesn't have io_uring.

## Summary of changes

- Add `io_engine_for_bench`, which on Linux will give io_uring or panic
if it's unavailable, and on macOS will always panic.

We do not want to run such benchmarks with StdFs: the results aren't
interesting, and will actively waste the time of any developers who
start investigating performance before they realize they're using a
known-slow I/O backend.

Why not just conditionally compile this benchmark on linux only? Because
even on linux, I still want it to refuse to run if it can't get
io_uring.
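
A hedged sketch of that idea using conditional compilation (names simplified; not the actual pageserver code):

```
#[derive(Debug)]
enum IoEngine {
    TokioEpollUring,
}

#[cfg(target_os = "linux")]
fn io_engine_for_bench() -> IoEngine {
    // In the real code this would probe whether io_uring is actually usable
    // and panic if it is not, rather than silently falling back to StdFs.
    IoEngine::TokioEpollUring
}

#[cfg(not(target_os = "linux"))]
fn io_engine_for_bench() -> IoEngine {
    panic!("this benchmark requires io_uring, which is unavailable on this OS");
}

fn main() {
    println!("benchmarking with {:?}", io_engine_for_bench());
}
```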
2024-08-07 21:17:08 +01:00
Yuchen Liang
542385e364 feat(pageserver): add direct io pageserver config (#8622)
Part of #8130, [RFC: Direct IO For Pageserver](https://github.com/neondatabase/neon/blob/problame/direct-io-rfc/docs/rfcs/034-direct-io-for-pageserver.md)

## Description

Add pageserver config for evaluating/enabling direct I/O. 

- Disabled: current default, uses buffered io as is.
- Evaluate: still uses buffered io, but could do alignment checking and
perf simulation (pad latency by direct io RW to a fake file).
- Enabled: uses direct io, behavior on alignment error is configurable.


Signed-off-by: Yuchen Liang <yuchen@neon.tech>
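
A sketch of the three evaluation modes as an enum, with hypothetical names and fields (the actual pageserver config type may differ):

```
// Disabled: buffered IO as is. Evaluate: still buffered, but can check
// alignment and simulate direct-IO latency against a scratch file.
// Enabled: real direct IO; behavior on alignment errors is configurable.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum DirectIoMode {
    Disabled,
    Evaluate { check_alignment: bool },
    Enabled { fail_on_misalignment: bool },
}

fn main() {
    let mode = DirectIoMode::Evaluate { check_alignment: true };
    println!("{mode:?}");
}
```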
2024-08-07 21:04:19 +01:00
Joonas Koivunen
05dd1ae9e0 fix: drain completed page_service connections (#8632)
We've noticed increased memory usage with the latest release. Drain the
joinset of `page_service` connection handlers to avoid leaking them
until shutdown. An alternative would be to use a TaskTracker.
TaskTracker was not discussed in original PR #8339 review, so not hot
fixing it in here either.
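
A minimal illustration of the draining idea with a plain `JoinSet` (names and structure simplified; not the actual page_service code):

```
use tokio::task::JoinSet;

#[tokio::main]
async fn main() {
    let mut connections: JoinSet<()> = JoinSet::new();
    for i in 0..4 {
        connections.spawn(async move {
            // stand-in for handling one page_service connection
            println!("connection {i} done");
        });
    }

    // Drain: reap each handler's result as it finishes, freeing its slot
    // instead of retaining it until pageserver shutdown.
    while let Some(res) = connections.join_next().await {
        res.expect("connection task panicked");
    }
}
```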
2024-08-07 17:14:45 +00:00
Cihan Demirci
8468d51a14 cicd: push build-tools image to ACR as well (#8638)
https://github.com/neondatabase/cloud/issues/15899
2024-08-07 17:53:47 +01:00
Joonas Koivunen
a81fab4826 refactor(timeline_detach_ancestor): replace ordered reparented with a hashset (#8629)
Earlier I was thinking we'd need an (ancestor_lsn, timeline_id)-ordered
list of reparented timelines. Turns out we did not need it at all. Replace it with
an unordered hashset. Additionally, refactor out the query for reparented direct
children; it will later be used from more places.

Split off from #8430.

Cc: #6994
2024-08-07 18:19:00 +02:00
Alex Chi Z.
b3eea45277 fix(pageserver): dump the key when it's invalid (#8633)
We see an assertion error in staging. Dump the key to guess where it was
from, and then we can fix it.

Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-08-07 16:37:46 +01:00
Joonas Koivunen
fc78774f39 fix: EphemeralFiles can outlive their Timeline via enum LayerManager (#8229)
Ephemeral files clean up on drop but did not delay shutdown, leading to
problems with restarting the tenant. The solution is as proposed:
- make ephemeral files carry the gate guard to delay `Timeline::gate`
closing
- flush in-memory layers and strong references to those on
`Timeline::shutdown`

The above are realized by making LayerManager an `enum` with `Open` and
`Closed` variants, and fail requests to modify `LayerMap`.

Additionally:

- fix too eager anyhow conversions in compaction
- unify how we freeze layers and handle errors
- optimize likely_resident_layers to read LayerFileManager hashmap
values instead of bouncing through LayerMap

Fixes: #7830
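
A hedged sketch of the enum shape described above (stand-in types, not the actual LayerManager): once the Timeline shuts down, the manager flips to `Closed` and requests to modify the layer map are rejected instead of racing with shutdown.

```
struct LayerMap; // stand-in for the real layer map

enum LayerManager {
    Open(LayerMap),
    Closed,
}

#[derive(Debug)]
struct ShuttingDown;

impl LayerManager {
    fn modify(&mut self) -> Result<&mut LayerMap, ShuttingDown> {
        match self {
            LayerManager::Open(map) => Ok(map),
            LayerManager::Closed => Err(ShuttingDown),
        }
    }
}

fn main() {
    let mut mgr = LayerManager::Open(LayerMap);
    assert!(mgr.modify().is_ok());
    mgr = LayerManager::Closed;
    assert!(mgr.modify().is_err());
}
```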
2024-08-07 17:50:09 +03:00
Conrad Ludgate
ad0988f278 proxy: random changes (#8602)
## Problem

1. Hard to correlate startup parameters with the endpoint that provided
them.
2. Some configurations are not needed in the `ProxyConfig` struct.

## Summary of changes

Because of some borrow checker fun, I needed to switch to an
interior-mutability implementation of our `RequestMonitoring` context
system. Using https://docs.rs/try-lock/latest/try_lock/ as a cheap lock
for such a use-case (needed to be thread safe).

Removed the lock of each startup message; instead, we just log the
startup params on a successful handshake.

Also removed some values from `ProxyConfig` and kept them as arguments
(needed for the local-proxy config).
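
A minimal sketch of the interior-mutability idea with the linked `try-lock` crate, using made-up field names (not the actual `RequestMonitoring` type):

```
use try_lock::TryLock;

// Shared, thread-safe request context that call sites can update through a
// cheap try-lock without needing `&mut` access.
struct RequestContext {
    endpoint: TryLock<Option<String>>,
}

impl RequestContext {
    fn new() -> Self {
        Self { endpoint: TryLock::new(None) }
    }

    fn set_endpoint(&self, ep: &str) {
        // `try_lock` never blocks; contention is not expected in practice.
        if let Some(mut slot) = self.endpoint.try_lock() {
            *slot = Some(ep.to_owned());
        }
    }
}

fn main() {
    let ctx = RequestContext::new();
    ctx.set_endpoint("ep-cool-name-123456");
    let guard = ctx.endpoint.try_lock().expect("uncontended");
    println!("endpoint recorded: {:?}", *guard);
}
```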
2024-08-07 14:37:03 +01:00
Arpad Müller
4d7c0dac93 Add missing colon to ArchivalConfigRequest specification (#8627)
Add a missing colon to the API specification of `ArchivalConfigRequest`.
The `state` field is required. Pointed out by Gleb.
2024-08-07 14:53:52 +02:00
Arpad Müller
00c981576a Lower level for timeline cancellations during gc (#8626)
Timeline cancellation running in parallel with gc yields error log lines
like:

```
Gc failed 1 times, retrying in 2s: TimelineCancelled
```

They are completely harmless though and normal to occur. Therefore, only
print those messages at an info level. Still print them at all so that
we know what is going on if we focus on a single timeline.
2024-08-07 09:29:52 +02:00
Arpad Müller
c3f2240fbd storage broker: only print one line for version and build tag in init (#8624)
This makes it more consistent with pageserver and safekeeper. Also, it
is easier to collect the two values into one data point.
2024-08-07 09:14:26 +02:00
Yuchen Liang
ed5724d79d scrubber: clean up scan_metadata before prod (#8565)
Part of #8128.

## Problem
Currently, the scrubber `scan_metadata` command returns an error
code if the metadata on remote storage is corrupted with fatal errors.
To safely deploy this command in a cronjob, we want to differentiate
between failures of the scrubber command itself and erroneous
metadata. At the same time, we also want our regression tests to catch
corrupted metadata using the scrubber command.

## Summary of changes

- Return an error code only when the scrubber command itself fails
- Use explicit checks on errors and warnings to determine metadata
health in regression tests.

**Resolve conflict with `tenant-snapshot` command (after shard split):**
[`test_scrubber_tenant_snapshot`](https://github.com/neondatabase/neon/blob/yuchen/scrubber-scan-cleanup-before-prod/test_runner/regress/test_storage_scrubber.py#L23)
failed before applying 422a8443dd
- When taking a snapshot, the old `index_part.json` in the unsharded
tenant directory is not kept.
- The current `list_timeline_blobs` implementation considers a missing
`index_part.json` a parse error.
- During the scan, we only analyze shards with the highest shard
count, so we will not get a parse error, but we do need to add the
layers to the tenant object listing; otherwise we will get an "index is
referencing a layer that is not in remote storage" error.
- **Action:** Add s3_layers from `list_timeline_blobs` regardless of
parsing error

Signed-off-by: Yuchen Liang <yuchen@neon.tech>
2024-08-06 18:55:42 +01:00
John Spray
ca5390a89d pageserver: add bench_ingest (#7409)
## Problem

We lack a rust bench for the inmemory layer and delta layer write paths:
it is useful to benchmark these components independent of postgres & WAL
decoding.

Related: https://github.com/neondatabase/neon/issues/8452

## Summary of changes

- Refactor DeltaLayerWriter to avoid carrying a Timeline, so that it can
be cleanly tested + benched without a Tenant/Timeline test harness. It
only needed the Timeline for building `Layer`, so this can be done in a
separate step.
- Add `bench_ingest`, which exercises a variety of workload "shapes"
(big values, small values, sequential keys, random keys)
- Include a small uncontroversial optimization: in `freeze`, only
exhaustively walk values to assert ordering relative to end_lsn in debug
mode.

These benches are limited by drive performance on a lot of machines, but
still useful as a local tool for iterating on CPU/memory improvements
around this code path.

Anecdotal measurements on Hetzner AX102 (Ryzen 7950xd):

```

ingest-small-values/ingest 128MB/100b seq
                        time:   [1.1160 s 1.1230 s 1.1289 s]
                        thrpt:  [113.38 MiB/s 113.98 MiB/s 114.70 MiB/s]
Found 1 outliers among 10 measurements (10.00%)
  1 (10.00%) low mild
Benchmarking ingest-small-values/ingest 128MB/100b rand: Warming up for 3.0000 s
Warning: Unable to complete 10 samples in 10.0s. You may wish to increase target time to 18.9s.
ingest-small-values/ingest 128MB/100b rand
                        time:   [1.9001 s 1.9056 s 1.9110 s]
                        thrpt:  [66.982 MiB/s 67.171 MiB/s 67.365 MiB/s]
Benchmarking ingest-small-values/ingest 128MB/100b rand-1024keys: Warming up for 3.0000 s
Warning: Unable to complete 10 samples in 10.0s. You may wish to increase target time to 11.0s.
ingest-small-values/ingest 128MB/100b rand-1024keys
                        time:   [1.0715 s 1.0828 s 1.0937 s]
                        thrpt:  [117.04 MiB/s 118.21 MiB/s 119.46 MiB/s]
ingest-small-values/ingest 128MB/100b seq, no delta
                        time:   [425.49 ms 429.07 ms 432.04 ms]
                        thrpt:  [296.27 MiB/s 298.32 MiB/s 300.83 MiB/s]
Found 1 outliers among 10 measurements (10.00%)
  1 (10.00%) low mild

ingest-big-values/ingest 128MB/8k seq
                        time:   [373.03 ms 375.84 ms 379.17 ms]
                        thrpt:  [337.58 MiB/s 340.57 MiB/s 343.13 MiB/s]
Found 1 outliers among 10 measurements (10.00%)
  1 (10.00%) high mild
ingest-big-values/ingest 128MB/8k seq, no delta
                        time:   [81.534 ms 82.811 ms 83.364 ms]
                        thrpt:  [1.4994 GiB/s 1.5095 GiB/s 1.5331 GiB/s]
Found 1 outliers among 10 measurements (10.00%)


```
2024-08-06 16:39:40 +00:00
John Spray
3727c6fbbe pageserver: use layer visibility when composing heatmap (#8616)
## Problem

Sometimes, a layer is Covered but hasn't yet been evicted from local disk
(e.g. shortly after image layer generation). It is not a good use of
resources to download these to a secondary location, as there's a good
chance they will never be read.

This follows the previous change that added layer visibility:
- #8511 

Part of epic:
- https://github.com/neondatabase/neon/issues/8398

## Summary of changes

- When generating heatmaps, only include Visible layers
- Update test_secondary_downloads to filter to visible layers when
listing layers from an attached location
2024-08-06 17:15:40 +01:00
John Spray
42229aacf6 pageserver: fixes for layer visibility metric (#8603)
## Problem

In staging, we could see that occasionally tenants were wrapping their
pageserver_visible_physical_size metric past zero to 2^64.

This is harmless right now, but will matter more later when we start
using visible size in things like the /utilization endpoint.

## Summary of changes

- Add debug asserts that detect this case. `test_gc_of_remote_layers`
works as a reproducer for this issue once the asserts are added.
- Tighten up the interface around access_stats so that only Layer can
mutate it.
- In Layer, wrap calls to `record_access` in code that will update the
visible size statistic if the access implicitly marks the layer visible
(this was what caused the bug)
- In LayerManager::rewrite_layers, use the proper set_visibility layer
function instead of directly using access_stats (this is an additional
path where metrics could go bad.)
- Removed unused instances of LayerAccessStats in DeltaLayer and
ImageLayer which I noticed while reviewing the code paths that call
record_access.
2024-08-06 14:47:01 +01:00
John Spray
b7beaa0fd7 tests: improve stability of test_storage_controller_many_tenants (#8607)
## Problem

The controller scale test does random migrations. These mutate secondary
locations, and therefore can cause secondary optimizations to happen in
the background, violating the test's expectation that consistency_check
will work as there are no reconciliations running.

Example:
https://neon-github-public-dev.s3.amazonaws.com/reports/main/10247161379/index.html#suites/07874de07c4a1c9effe0d92da7755ebf/6316beacd3fb3060/

## Summary of changes

- Only migrate to existing secondary locations, not randomly picked
nodes, so that we can do a fast reconcile_until_idle (otherwise
reconcile_until_idle takes a long time to create new secondary
locations).
- Do a reconcile_until_idle before consistency_check.
2024-08-06 12:58:33 +01:00
a-masterov
16c91ff5d3 enable rum test (#8380)
## Problem
We need to test the rum extension automatically as part of the GitHub
workflow.

## Summary of changes

The rum test is enabled.
2024-08-06 13:56:42 +02:00
a-masterov
078f941dc8 Add a test using Debezium as a client for the logical replication (#8568)
## Problem
We need to test the logical replication with some external consumers.
## Summary of changes
A test of the logical replication with Debezium as a consumer was added.
---------

Co-authored-by: Alexander Bayandin <alexander@neon.tech>
2024-08-06 13:08:55 +02:00
Arseny Sher
68bcbf8227 Add package-mode=false to poetry.
We don't use it for packaging, and 'poetry install' will soon error
otherwise. Also remove name and version fields as these are not required for
non-packaging mode.
2024-08-06 13:53:23 +03:00
Arpad Müller
a31c95cb40 storage_scrubber: migrate scan_safekeeper_metadata to remote_storage (#8595)
Migrates the safekeeper-specific parts of `ScanMetadata` to
GenericRemoteStorage, making it Azure-ready.
 
Part of https://github.com/neondatabase/neon/issues/7547
2024-08-06 10:51:39 +00:00
Joonas Koivunen
dc7eb5ae5a chore: bump index part version (#8611)
#8600 missed the hunk changing the index_part.json informative version.
Include it in this PR; in addition, add more non-warning index_part.json
versions to the scrubber.
2024-08-06 11:45:41 +01:00
Vlad Lazar
44fedfd6c3 pageserver: remove legacy read path (#8601)
## Problem

We have been maintaining two read paths (legacy and vectored) for a
while now. The legacy read-path was only used for cross validation in some tests.

## Summary of changes
* Tweak all tests that were using the legacy read path to use the
vectored read path instead
* Remove the read path dispatching based on the pageserver configs
* Remove the legacy read path code

We will be able to remove the single blob io code in
`pageserver/src/tenant/blob_io.rs` when https://github.com/neondatabase/neon/issues/7386 is complete.

Closes https://github.com/neondatabase/neon/issues/8005
2024-08-06 10:14:01 +01:00
Joonas Koivunen
138f008bab feat: persistent gc blocking (#8600)
Currently, we do not have facilities to persistently block GC on a
tenant for whatever reason. We could do a tenant configuration update,
but that is risky for generation numbers and would also be transient.
Introduce a `gc_block` facility in the tenant, which manages per
timeline blocking reasons.

Additionally, add HTTP endpoints for enabling/disabling manual gc
blocking for a specific timeline. For debugging, individual tenant
status now includes a similar string representation logged when GC is
skipped.

Cc: #6994
2024-08-06 10:09:56 +01:00
Joonas Koivunen
6a6f30e378 fix: make Timeline::set_disk_consistent_lsn use fetch_max (#8311)
Now it is safe to use from multiple callers, as we have two callers.
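
An illustration of why `fetch_max` makes concurrent updates safe, with a plain `AtomicU64` standing in for the pageserver's Lsn wrapper: regardless of the order in which callers land, the stored LSN only ever moves forward.

```
use std::sync::atomic::{AtomicU64, Ordering};

fn main() {
    let disk_consistent_lsn = AtomicU64::new(0);

    disk_consistent_lsn.fetch_max(0x2000, Ordering::AcqRel);
    // A late caller with an older value cannot move the LSN backwards.
    disk_consistent_lsn.fetch_max(0x1000, Ordering::AcqRel);

    assert_eq!(disk_consistent_lsn.load(Ordering::Acquire), 0x2000);
}
```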
2024-08-06 08:52:01 +01:00
Alex Chi Z.
8f3bc5ae35 feat(pageserver): support dry-run for gc-compaction, add statistics (#8557)
Add dry-run mode that does not produce any image layer + delta layer. I
will use this code to do some experiments and see how much space we can
reclaim for tenants on staging. Part of
https://github.com/neondatabase/neon/issues/8002

* Add dry-run mode that runs the full compaction process without
updating the layer map. (We never call finish on the writers and the
files will be removed before exiting the function).
* Add compaction statistics and print them at the end of compaction.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-08-06 02:07:48 +00:00
Alexander Bayandin
e6e578821b CI(benchmarking): set pub/sub projects for LR tests (#8483)
## Problem

> Currently, long-running LR tests recreate endpoints every night. We'd
like to have a long-running buildup of history to exercise the pageserver
in this case (instead of "unit-testing" the same behavior every night).

Closes #8317

## Summary of changes
- Update Postgres version for replication tests
- Set `BENCHMARK_PROJECT_ID_PUB`/`BENCHMARK_PROJECT_ID_SUB` env vars to
projects that were created for this purpose

---------

Co-authored-by: Sasha Krassovsky <krassovskysasha@gmail.com>
2024-08-05 22:06:47 +00:00
Joonas Koivunen
c32807ac19 fix: allow awaiting logical size for root timelines (#8604)
Currently if `GET
/v1/tenant/x/timeline/y?force-await-initial-logical-size=true` is
requested for a root timeline created within the current pageserver
session, the request handler panics hitting the debug assertion. These
timelines will always have an accurate (at initdb import) calculated
logical size. Fix is to never attempt prioritizing timeline size
calculation if we already have an exact value.

Split off from #8528.
2024-08-05 21:21:33 +01:00
Alexander Bayandin
50daff9655 CI(trigger-e2e-tests): fix deadlock with Build and Test workflow (#8606)
## Problem

In some cases, a deadlock between `build-and-test` and
`trigger-e2e-tests` workflows can happen:

```
Build and Test

Canceling since a deadlock for concurrency group 'Build and Test-8600/merge-anysha' was detected between 'top level workflow' and 'trigger-e2e-tests'
```

I don't understand the reason completely, probably `${{ github.workflow
}}` got evaluated to the same value and somehow caused the issue.
We don't need to limit concurrency for `trigger-e2e-tests`
workflow.

See
https://neondb.slack.com/archives/C059ZC138NR/p1722869486708179?thread_ts=1722869027.960029&cid=C059ZC138NR
2024-08-05 19:47:59 +01:00
Alexander Bayandin
bd845c7587 CI(trigger-e2e-tests): wait for promote-images job from the last commit (#8592)
## Problem

We don't trigger e2e tests for draft PRs, but we do trigger them once a
PR is in the "Ready for review" state.
Sometimes, a PR can be marked as "Ready for review" before we finish
image building. In such cases, triggering e2e tests fails.

## Summary of changes
- Make `trigger-e2e-tests` job poll status of `promote-images` job from
the build-and-test workflow for the last commit. And trigger only if the
status is `success`
- Remove explicit image checking from the workflow
- Add `concurrency` for the `trigger-e2e-tests` workflow to make it
possible to cancel jobs in progress (if PR moves from "Draft" to "Ready
for review" several times in a row)
2024-08-05 12:25:23 +01:00
Konstantin Knizhnik
f63c8e5a8c Update Postgres versions to use smgrexists() instead of access() to check if Oid is used (#8597)
## Problem

PR #7992 was merged without corresponding changes in the Postgres submodules,
which is why test_oid_overflow.py is failing now.

## Summary of changes

Bump Postgres versions

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
2024-08-05 14:24:54 +03:00
Alex Chi Z.
200fa56b04 feat(pageserver): support split delta layers (#8599)
part of https://github.com/neondatabase/neon/issues/8002

Similar to https://github.com/neondatabase/neon/pull/8574, we add
auto-split support for delta layers. Tests are reused from image layer
split writers.


---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-08-05 10:30:49 +00:00
dotdister
0f3dac265b safekeeper: remove unused partial_backup_enabled option (#8547)
## Problem
There is an unused safekeeper option `partial_backup_enabled`.

`partial_backup_enabled` was implemented in #6530, but the option has been
unconditionally enabled since #8022.

If you intended to keep this option for a specific reason, I will close
this PR.

## Summary of changes
I removed an unused safekeeper option `partial_backup_enabled`.
2024-08-05 09:23:59 +02:00
Alex Chi Z.
1dc496a2c9 feat(pageserver): support auto split layers based on size (#8574)
part of https://github.com/neondatabase/neon/issues/8002

## Summary of changes

Add a `SplitImageWriter` that automatically splits image layers based on
the estimated target image layer size. This does not consider compression,
and we might need a better metric.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
Co-authored-by: Arpad Müller <arpad-m@users.noreply.github.com>
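
A hedged sketch of the auto-split idea with made-up names (not the real `SplitImageWriter`): accumulate values into the current layer and start a new one once the estimated size crosses the target.

```
struct SplitWriter {
    target_layer_size: u64,
    current_size: u64,
    finished_layers: usize,
}

impl SplitWriter {
    fn new(target_layer_size: u64) -> Self {
        Self { target_layer_size, current_size: 0, finished_layers: 0 }
    }

    fn put_image(&mut self, _key: u64, value: &[u8]) {
        // Estimate only: real layers add index/header overhead, and compression
        // (not modeled here, as the commit message notes) changes the on-disk size.
        if self.current_size + value.len() as u64 > self.target_layer_size
            && self.current_size > 0
        {
            self.finished_layers += 1;
            self.current_size = 0;
        }
        self.current_size += value.len() as u64;
    }
}

fn main() {
    let mut w = SplitWriter::new(1024);
    for key in 0..100u64 {
        w.put_image(key, &[0u8; 100]);
    }
    println!("produced {} full layers plus one open layer", w.finished_layers);
}
```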
2024-08-05 06:55:36 +01:00
Alex Chi Z.
6814bdd30b fix(pageserver): deadlock in gc-compaction (#8590)
We need both compaction and gc lock for gc-compaction. The lock order
should be the same everywhere, otherwise there could be a deadlock where
A waits for B and B waits for A.

We also had a double-lock issue. The compaction lock gets acquired in
the outer `compact` function. Note that the unit tests call
`compact_with_gc` directly and therefore do not trigger the issue.

## Summary of changes

Ensure all places acquire the compaction lock and then the gc lock. Remove
an extra compaction lock acquisition.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
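
A sketch of the invariant only (not the real pageserver code): every path takes the compaction lock first and the GC lock second; if one path took them in the opposite order, two tasks could each hold one lock and wait forever for the other.

```
use std::sync::Arc;
use tokio::sync::Mutex;

async fn gc_compaction(compaction_lock: Arc<Mutex<()>>, gc_lock: Arc<Mutex<()>>) {
    let _compaction_guard = compaction_lock.lock().await; // always first
    let _gc_guard = gc_lock.lock().await;                 // always second
    // ... run gc-compaction while holding both ...
}

#[tokio::main]
async fn main() {
    let compaction_lock = Arc::new(Mutex::new(()));
    let gc_lock = Arc::new(Mutex::new(()));
    gc_compaction(compaction_lock, gc_lock).await;
}
```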
2024-08-03 00:52:04 +01:00
John Spray
0a667bc8ef tests: add test_historic_storage_formats (#8423)
## Problem

Currently, our backward compatibility tests only look one release back.
That means, for example, that when we switch on image layer compression
by default, we'll test reading of uncompressed layers for one release,
and then stop doing it. When we make an index_part.json format change,
we'll test against the old format for a week, then stop (unless we write
separate unit tests for each old format).

The reality in the field is that data in old formats will continue to
exist for weeks/months/years. When we make major format changes, we
should retain examples of the old format data, and continuously verify
that the latest code can still read them.

This test uses contents from a new path in the public S3 bucket,
`compatibility-data-snapshots/`. It is populated by hand. The first
important artifact is one from before we switch on compression, so that
we will keep testing reads of uncompressed data. We will generate more
artifacts ahead of other key changes, like when we update remote storage
format for archival timelines.

Closes: https://github.com/neondatabase/cloud/issues/15576
2024-08-02 18:28:23 +01:00
Arthur Petukhovsky
f3acfb2d80 Improve safekeepers eviction rate limiting (#8456)
This commit tries to fix regular load spikes on staging, caused by too
many eviction and partial upload operations running at the same time.
Usually it was happening after a restart; for partial backup, the load was
delayed.
- Add a semaphore for evictions (2 permits by default)
- Rename `resident_since` to `evict_not_before` and smooth out the curve
by using random duration
- Use random duration in partial uploads as well

related to https://github.com/neondatabase/neon/issues/6338
some discussion in
https://neondb.slack.com/archives/C033RQ5SPDH/p1720601531744029
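
An illustration of the two mechanisms described above with made-up numbers and simplified structure (assumes the `tokio` and `rand` crates): a small semaphore caps concurrent evictions, and a randomized "not before" delay smears the work out instead of letting it spike after a restart.

```
use std::sync::Arc;
use std::time::Duration;
use tokio::sync::Semaphore;

#[tokio::main]
async fn main() {
    let eviction_permits = Arc::new(Semaphore::new(2)); // 2 permits by default

    let mut tasks = tokio::task::JoinSet::new();
    for partial_timeline in 0..8u32 {
        let permits = eviction_permits.clone();
        tasks.spawn(async move {
            // evict_not_before: random smear in [0, 1000) ms rather than a fixed point.
            let jitter = Duration::from_millis((rand::random::<f64>() * 1000.0) as u64);
            tokio::time::sleep(jitter).await;

            let _permit = permits.acquire().await.expect("semaphore closed");
            println!("evicting timeline {partial_timeline}");
        });
    }
    while tasks.join_next().await.is_some() {}
}
```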
2024-08-02 15:26:46 +01:00
Arpad Müller
8c828c586e Wait for completion of the upload queue in flush_frozen_layer (#8550)
Makes `flush_frozen_layer` add a barrier to the upload queue and wait for
that barrier to be reached before it lets the flushing complete.

This gives us backpressure and ensures that writes can't build up in an
unbounded fashion.

Fixes #7317
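
A simplified sketch of the barrier idea with hypothetical types (not the real upload queue): flush enqueues a Barrier carrying a oneshot sender and waits for the queue worker to reach it, so layer flushing cannot run arbitrarily far ahead of uploads.

```
use tokio::sync::{mpsc, oneshot};

enum UploadOp {
    UploadLayer(String),
    Barrier(oneshot::Sender<()>),
}

#[tokio::main]
async fn main() {
    let (tx, mut rx) = mpsc::unbounded_channel::<UploadOp>();

    // Queue worker: processes operations in order.
    let worker = tokio::spawn(async move {
        while let Some(op) = rx.recv().await {
            match op {
                UploadOp::UploadLayer(name) => println!("uploaded {name}"),
                UploadOp::Barrier(done) => {
                    let _ = done.send(()); // everything queued before us is done
                }
            }
        }
    });

    tx.send(UploadOp::UploadLayer("layer-000-abc".into())).unwrap();

    // flush_frozen_layer: wait for the barrier before declaring the flush complete.
    let (barrier_tx, barrier_rx) = oneshot::channel();
    tx.send(UploadOp::Barrier(barrier_tx)).unwrap();
    barrier_rx.await.expect("upload queue worker died");

    drop(tx);
    worker.await.unwrap();
}
```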
2024-08-02 13:07:12 +02:00
John Spray
2334fed762 storage_controller: start adding chaos hooks (#7946)
Chaos injection bridges the gap between automated testing (where we do
lots of different things with small, short-lived tenants), and staging
(where we do many fewer things, but with larger, long-lived tenants).

This PR adds a first type of chaos which isn't really very chaotic: it's
live migration of tenants between healthy pageservers. This nevertheless
provides continuous checks that things like clean, prompt shutdown of
tenants works for realistically deployed pageservers with realistically
large tenants.
2024-08-02 09:37:44 +01:00
John Spray
c53799044d pageserver: refine how we delete timelines after shard split (#8436)
## Problem

Previously, when we do a timeline deletion, shards will delete layers
that belong to an ancestor. That is not a correctness issue, because
when we delete a timeline, we're always deleting it from all shards, and
destroying data for that timeline is clearly fine.

However, there exists a race where one shard might start doing this
deletion while another shard has not yet received the deletion request,
and might try to access an ancestral layer. This creates ambiguity over
the "all layers referenced by my index should always exist" invariant,
which is important to detecting and reporting corruption.

Now that we have a GC mode for clearing up ancestral layers, we can rely
on that to clean up such layers, and avoid deleting them right away.
This makes things easier to reason about: there are now no cases where a
shard will delete a layer that belongs to a ShardIndex other than
itself.

## Summary of changes

- Modify behavior of RemoteTimelineClient::delete_all
- Add `test_scrubber_physical_gc_timeline_deletion` to exercise this
case
- Tweak AWS SDK config in the scrubber to enable retries. Motivated by
seeing the test for this feature encounter some transient "service
error" S3 errors (which are probably nothing to do with the changes in
this PR)
2024-08-02 08:00:46 +01:00
Alexander Bayandin
e7477855b7 test_runner: don't create artifacts if Allure is not enabled (#8580)
## Problem

`allure_attach_from_dir` method might create `tar.zst` archives even
if `--alluredir` is not set (i.e. Allure results collection is disabled)

## Summary of changes
- Don't run `allure_attach_from_dir` if `--alluredir`  is not set
2024-08-01 15:55:43 +00:00
Alex Chi Z.
f4a668a27d fix(pageserver): skip existing layers for btm-gc-compaction (#8498)
part of https://github.com/neondatabase/neon/issues/8002

Due to the limitation of the current layer map implementation, we cannot
directly replace a layer. It's interpreted as an insert and a deletion,
and there will be a file-exists error when renaming the newly-created layer
to replace the old layer. We work around that by changing the end key of
the image layer. A long-term fix would involve a refactor around the
layer file naming. For delta layers, we simply skip layers with the same
key range produced, though it is possible to add an extra key as an
alternative solution.

* The image layer range for the layers generated from gc-compaction will
be Key::MIN..(Key..MAX-1), to avoid being recognized as an L0 delta
layer.
* Skip existing layers if it turns out that we need to generate a layer
with the same persistent key in the same generation.

Note that it is possible that the newly-generated layer has different
content from the existing layer. For example, when the user drops a
retain_lsn, the compaction could have combined or dropped some records,
therefore creating a smaller layer than the existing one. We discard the
"optimized" layer for now because we cannot deal with such rewrites
within the same generation.


---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
Co-authored-by: Christian Schwarz <christian@neon.tech>
2024-08-01 15:00:06 +01:00
Alex Chi Z.
970f2923b2 storage-scrubber: log version on start (#8571)
Helps us better identify which version of storage scrubber is running.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-08-01 13:52:34 +00:00
John Spray
1678dea20f pageserver: add layer visibility calculation (#8511)
## Problem

We recently added a "visibility" state to layers, but nothing
initializes it.

Part of:
- #8398 

## Summary of changes

- Add a dependency on `range-set-blaze`, which is used as a fast
incrementally updated alternative to KeySpace. We could also use this to
replace the internals of KeySpaceRandomAccum if we wanted to. Writing a
type that does this kind of "BtreeMap & merge overlapping entries" thing
isn't super complicated, but no reason to write this ourselves when
there's a third party impl available.
- Add a function to layermap to calculate visibilities for each layer
- Add a function to Timeline to call into layermap and then apply these
visibilities to the Layer objects.
- Invoke the calculation during startup, after image layer creations,
and when removing branches. Branch removal and image layer creation are
the two ways that a layer can go from Visible to Covered.
- Add unit test & benchmark for the visibility calculation
- Expose `pageserver_visible_physical_size` metric, which should always
be <= `pageserver_remote_physical_size`.
- This metric will feed into the /v1/utilization endpoint later: the
visible size indicates how much space we would like to use on this
pageserver for this tenant.
- When `pageserver_visible_physical_size` is greater than
`pageserver_resident_physical_size`, this is a sign that the tenant has
long-idle branches, which result in layers that are visible in
principle, but not used in practice.

This does not keep visibility hints up to date in all cases:
particularly, when creating a child timeline, any previously covered
layers will not get marked Visible until they are accessed.

Updates after image layer creation could be implemented as more of a
special case, but this would require more new code: the existing depth
calculation code doesn't maintain+yield the list of deltas that would be
covered by an image layer.

## Performance

This operation is done rarely (at startup and at timeline deletion), so
needs to be efficient but not ultra-fast.

There is a new `visibility` bench that measures runtime for a synthetic
100k layers case (`sequential`) and a real layer map (`real_map`) with
~26k layers.

The benchmark shows runtimes of single digit milliseconds (on a ryzen
7950). This confirms that the runtime shouldn't be a problem at startup
(as we already incur S3-level latencies there), but that it's slow
enough that we definitely shouldn't call it more often than necessary,
and it may be worthwhile to optimize further later (things like: when
removing a branch, only bother scanning layers below the branchpoint)

```
visibility/sequential   time:   [4.5087 ms 4.5894 ms 4.6775 ms]
                        change: [+2.0826% +3.9097% +5.8995%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 24 outliers among 100 measurements (24.00%)
  2 (2.00%) high mild
  22 (22.00%) high severe
min: 0/1696070, max: 93/1C0887F0
visibility/real_map     time:   [7.0796 ms 7.0832 ms 7.0871 ms]
                        change: [+0.3900% +0.4505% +0.5164%] (p = 0.00 < 0.05)
                        Change within noise threshold.
Found 4 outliers among 100 measurements (4.00%)
  3 (3.00%) high mild
  1 (1.00%) high severe
min: 0/1696070, max: 93/1C0887F0
visibility/real_map_many_branches
                        time:   [4.5285 ms 4.5355 ms 4.5434 ms]
                        change: [-1.0012% -0.8004% -0.5969%] (p = 0.00 < 0.05)
                        Change within noise threshold.
```
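
A heavily simplified sketch of the visibility rule only (the real implementation uses range-set-blaze, LSN-aware read points, and branch points; names here are made up): walk layers from most recent to oldest and mark a layer Covered once its whole key range is already shadowed by newer image layers.

```
use std::ops::Range;

#[derive(Debug, PartialEq)]
enum Visibility { Visible, Covered }

struct LayerDesc { key_range: Range<u32>, is_image: bool }

fn compute_visibility(layers_newest_first: &[LayerDesc]) -> Vec<Visibility> {
    // Extremely naive covered-keys tracking, good enough for an illustration.
    let mut covered = vec![false; 1 << 16];
    layers_newest_first
        .iter()
        .map(|layer| {
            let fully_covered = layer.key_range.clone().all(|k| covered[k as usize]);
            if layer.is_image {
                for k in layer.key_range.clone() {
                    covered[k as usize] = true;
                }
            }
            if fully_covered { Visibility::Covered } else { Visibility::Visible }
        })
        .collect()
}

fn main() {
    let layers = vec![
        LayerDesc { key_range: 0..100, is_image: true },   // newest image layer
        LayerDesc { key_range: 10..50, is_image: false },  // older delta, fully shadowed
        LayerDesc { key_range: 50..200, is_image: false }, // extends past the image
    ];
    println!("{:?}", compute_visibility(&layers)); // [Visible, Covered, Visible]
}
```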
2024-08-01 09:25:35 +00:00
Arpad Müller
163f2eaf79 Reduce linux-raw-sys duplication (#8577)
Before, we had four versions of linux-raw-sys in our dependency graph:

```
  linux-raw-sys@0.1.4
  linux-raw-sys@0.3.8
  linux-raw-sys@0.4.13
  linux-raw-sys@0.6.4
```

now it's only two:

```
  linux-raw-sys@0.4.13
  linux-raw-sys@0.6.4
```

The changes in this PR are minimal. In order to get to this state, one
only has to update procfs in Cargo.toml to 0.16 and run `cargo update -p
tempfile -p is-terminal -p prometheus`.
2024-08-01 08:22:21 +00:00
Christian Schwarz
980d506bda pageserver: shutdown all walredo managers 8s into shutdown (#8572)
# Motivation

The working theory for hung systemd during PS deploy
(https://github.com/neondatabase/cloud/issues/11387) is that leftover
walredo processes trigger a race condition.

In https://github.com/neondatabase/neon/pull/8150 I arranged that a
clean Tenant shutdown does actually kill its walredo processes.

But many prod machines don't manage to shut down all their tenants until
the 10s systemd timeout hits and, presumably, triggers the race
condition in systemd / the Linux kernel that causes the frozen systemd

# Solution

This PR bolts on a rather ugly mechanism to shut down tenant managers
out of order 8s after we've received the SIGTERM from systemd.

# Changes

- add a global registry of `Weak<WalRedoManager>`
- add a special thread spawned during `shutdown_pageserver` that sleeps
for 8s, then shuts down all redo managers in the registry and prevents
new redo managers from being created
- propagate the new failure mode of tenant spawning throughout the code
base
- make sure shut down tenant manager results in
PageReconstructError::Cancelled so that if Timeline::get calls come in
after the shutdown, they do the right thing
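
A sketch of the global-registry mechanism described above, with stand-in types and a shortened delay (the real code waits 8s and lives in the pageserver): keep Weak handles to every redo manager so a watchdog thread can shut down whatever is still alive.

```
use std::sync::{Arc, Mutex, OnceLock, Weak};
use std::time::Duration;

struct WalRedoManager { tenant: String }

impl WalRedoManager {
    fn shutdown(&self) {
        println!("killing walredo process for tenant {}", self.tenant);
    }
}

fn registry() -> &'static Mutex<Vec<Weak<WalRedoManager>>> {
    static REGISTRY: OnceLock<Mutex<Vec<Weak<WalRedoManager>>>> = OnceLock::new();
    REGISTRY.get_or_init(|| Mutex::new(Vec::new()))
}

fn register(mgr: &Arc<WalRedoManager>) {
    registry().lock().unwrap().push(Arc::downgrade(mgr));
}

fn main() {
    let mgr = Arc::new(WalRedoManager { tenant: "tenant-a".into() });
    register(&mgr);

    // Spawned from shutdown_pageserver: wait, then force-shutdown everything
    // that ordinary tenant shutdown has not gotten to yet.
    let watchdog = std::thread::spawn(|| {
        std::thread::sleep(Duration::from_millis(100)); // 8s in the real code
        for weak in registry().lock().unwrap().iter() {
            if let Some(mgr) = weak.upgrade() {
                mgr.shutdown();
            }
        }
    });
    watchdog.join().unwrap();
}
```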
2024-08-01 07:57:09 +02:00
Alex Chi Z.
d6c79b77df test(pageserver): add test_gc_feedback_with_snapshots (#8474)
should be working after https://github.com/neondatabase/neon/pull/8328
gets merged. Part of https://github.com/neondatabase/neon/issues/8002

adds a new perf benchmark case that ensures garbage can be collected
when branches exist

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-07-31 17:55:19 -04:00
Alexander Bayandin
3350daeb9a CI(create-test-report): fix missing benchmark results in Allure report (#8540)
## Problem

In https://github.com/neondatabase/neon/pull/8241 I've accidentally
removed `create-test-report` dependency on `benchmarks` job

## Summary of changes
- Run `create-test-report` after `benchmarks` job
2024-07-31 19:47:59 +01:00
Arpad Müller
939d50a41c storage_scrubber: migrate FindGarbage to remote_storage (#8548)
Uses the newly added APIs from #8541 named `stream_tenants_generic` and
`stream_objects_with_retries` and extends them with
`list_objects_with_retries_generic` and
`stream_tenant_timelines_generic` to migrate the `find-garbage` command
of the scrubber to `GenericRemoteStorage`.

Part of https://github.com/neondatabase/neon/issues/7547
2024-07-31 18:24:42 +00:00
John Spray
2f9ada13c4 controller: simplify reconciler generation increment logic (#8560)
## Problem

This code was confusing, untested and covered:
- an impossible case, where the intent state is AttachedStale (we never do
this)
- a rare edge case (going from AttachedMulti to Attached), which we were
not testing, and in any case the pageserver internally does the same
Tenant reset in this transition as it would do if we incremented
generation.

Closes: https://github.com/neondatabase/neon/issues/8367

## Summary of changes

- Simplify the logic to only skip incrementing the generation if the
location already has the expected generation and the exact same mode.
2024-07-31 18:37:47 +01:00
Cihan Demirci
ff51b565d3 cicd: change Azure storage details [2/2] (#8562)
Change Azure storage configuration to point to updated variables/secrets.

Also update subscription id variable.
2024-07-31 17:42:10 +01:00
Tristan Partin
5e0409de95 Fix negative replication delay metric
In some cases, we can get a negative metric for replication_delay_bytes.
My best guess from all the research I've done is that we evaluate
pg_last_wal_receive_lsn() before pg_last_wal_replay_lsn(), and that by
the time everything is said and done, the replay LSN has advanced past
the receive LSN. In this case, our lag can effectively be modeled as
0 due to the speed of the WAL reception and replay.
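
An illustration of that reasoning with stand-in u64 LSNs: if the replay LSN has overtaken the receive LSN by the time both are sampled, report the lag as 0 rather than a negative/wrapped value.

```
fn replication_delay_bytes(last_receive_lsn: u64, last_replay_lsn: u64) -> u64 {
    last_receive_lsn.saturating_sub(last_replay_lsn)
}

fn main() {
    assert_eq!(replication_delay_bytes(1_000, 400), 600);
    // Replay sampled later and already past the receive sample: model lag as 0.
    assert_eq!(replication_delay_bytes(1_000, 1_050), 0);
}
```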
2024-07-31 10:16:58 -05:00
Christian Schwarz
4e3b70e308 refactor(page_service): Timeline gate guard holding + cancellation + shutdown (#8339)
Since the introduction of sharding, the protocol handling loop in
`handle_pagerequests` cannot know anymore which concrete
`Tenant`/`Timeline` object any of the incoming `PagestreamFeMessage`
resolves to.
In fact, one message might resolve to one `Tenant`/`Timeline` while
the next one may resolve to another one.

To avoid going to tenant manager, we added the `shard_timelines` which
acted as an ever-growing cache that held timeline gate guards open for
the lifetime of the connection.
The consequence of holding the gate guards open was that we had to be
sensitive to every cached `Timeline::cancel` on each interaction with
the network connection, so that Timeline shutdown would not have to wait
for network connection interaction.

We can do better than that, meaning more efficiency & better
abstraction.
I proposed a sketch for it in

* https://github.com/neondatabase/neon/pull/8286

and this PR implements an evolution of that sketch.

The main idea is is that `mod page_service` shall be solely concerned
with the following:
1. receiving requests by speaking the protocol / pagestream subprotocol
2. dispatching the request to a corresponding method on the correct
shard/`Timeline` object
3. sending response by speaking the protocol / pagestream subprotocol.

The cancellation sensitivity responsibilities are clear cut:
* while in `page_service` code, sensitivity to page_service cancellation
is sufficient
* while in `Timeline` code, sensitivity to `Timeline::cancel` is
sufficient

To enforce these responsibilities, we introduce the notion of a
`timeline::handle::Handle` to a `Timeline` object that is checked out
from a `timeline::handle::Cache` for **each request**.
The `Handle` derefs to `Timeline` and is supposed to be used for a
single async method invocation on `Timeline`.
See the lengthy doc comment in `mod handle` for details of the design.
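
A heavily simplified sketch of the Cache/Handle idea (the real types live in the pageserver's `mod handle` and deal with gates and cancellation): each request checks out a short-lived Handle for the shard it resolves to, instead of the connection pinning every Timeline it ever touched.

```
use std::collections::HashMap;
use std::sync::Arc;

struct Timeline { shard: u32 }

struct Handle(Arc<Timeline>);

impl std::ops::Deref for Handle {
    type Target = Timeline;
    fn deref(&self) -> &Timeline { &self.0 }
}

#[derive(Default)]
struct Cache {
    cached: HashMap<u32, Arc<Timeline>>,
}

impl Cache {
    // Per request: resolve key -> shard, reuse the cached entry if still valid,
    // otherwise look the Timeline up again (lookup and validity checks omitted).
    fn get(&mut self, shard: u32) -> Handle {
        let tl = self
            .cached
            .entry(shard)
            .or_insert_with(|| Arc::new(Timeline { shard }))
            .clone();
        Handle(tl)
    }
}

fn main() {
    let mut cache = Cache::default();
    let handle = cache.get(3); // checked out for exactly one request
    println!("serving request on shard {}", handle.shard);
}
```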
2024-07-31 17:05:45 +02:00
Alex Chi Z.
61a65f61f3 feat(pageserver): support btm-gc-compaction for child branches (#8519)
part of https://github.com/neondatabase/neon/issues/8002

For child branches, we will pull the image of the modified keys from the
parent into the child branch, which creates a full history for
generating key retention. If there are not enough delta keys, the image
won't be written in the end, and we will only keep the deltas inside the
child branch. We could avoid the wasteful work to pull the image from
the parent if we can know the number of deltas in advance, in the future
(currently we always pull image for all modified keys in the child
branch)


---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-07-31 15:48:48 +01:00
Alexander Bayandin
d21246c8bd CI(regress-tests): run less regression tests (#8561)
## Problem
We run regression tests on `release` & `debug` builds for each of the
three supported Postgres versions (6 in total).
With upcoming ARM support and Postgres 17, the number of jobs will jump
to 16, which is a lot.

See the internal discussion here:
https://neondb.slack.com/archives/C033A2WE6BZ/p1722365908404329

## Summary of changes
- Run `regress-tests` job in debug builds only with the latest Postgres
version
- Do not do `debug` builds on release branches
2024-07-31 15:10:27 +01:00
Christian Schwarz
4825b0fec3 compaction_level0_phase1: bypass PS PageCache for data blocks (#8543)
part of https://github.com/neondatabase/neon/issues/8184

# Problem

We want to bypass PS PageCache for all data block reads, but
`compact_level0_phase1` currently uses `ValueRef::load` to load the WAL
records from delta layers.
Internally, that maps to `FileBlockReader:read_blk` which hits the
PageCache
[here](e78341e1c2/pageserver/src/tenant/block_io.rs (L229-L236)).

# Solution

This PR adds a mode for `compact_level0_phase1` that uses the
`MergeIterator` for reading the `Value`s from the delta layer files.

`MergeIterator` is a streaming k-merge that uses vectored blob_io under
the hood, which bypasses the PS PageCache for data blocks.

Other notable changes:
* change the `DiskBtreeReader::into_stream` to buffer the node, instead
of holding a `PageCache` `PageReadGuard`.
* Without this, we run out of page cache slots in
`test_pageserver_compaction_smoke`.
* Generally, `PageReadGuard`s aren't supposed to be held across await
points, so, this is a general bugfix.

# Testing / Validation / Performance

`MergeIterator` has not yet been used in production; it's being
developed as part of
* https://github.com/neondatabase/neon/issues/8002

Therefore, this PR adds a validation mode that compares the existing
approach's value iterator with the new approach's stream output, item by
item.
If they're not identical, we log a warning / fail the unit/regression
test.
To avoid flooding the logs, we apply a global rate limit of once per 10
seconds.
In any case, we use the existing approach's value.

Expected performance impact that will be monitored in staging / nightly
benchmarks / eventually pre-prod:
* with validation:
  * increased CPU usage
  * ~doubled VirtualFile read bytes/second metric
* no change in disk IO usage because the kernel page cache will likely
have the pages buffered on the second read
* without validation:
* slightly higher DRAM usage because each iterator participating in the
k-merge has a dedicated buffer (as opposed to before, where compactions
would rely on the PS PageCache as a shared evicting buffer)
* less disk IO if previously there were repeat PageCache misses (likely
case on a busy production Pageserver)
* lower CPU usage: PageCache out of the picture, fewer syscalls are made
(vectored blob io batches reads)

# Rollout

The new code is used with validation mode enabled-by-default.
This gets us validation everywhere by default, specifically in
- Rust unit tests
- Python tests
- Nightly pagebench (shouldn't really matter)
- Staging

Before the next release, I'll merge the following aws.git PR that
configures prod to continue using the existing behavior:

* https://github.com/neondatabase/aws/pull/1663

# Interactions With Other Features

This work & rollout should complete before Direct IO is enabled because
Direct IO would double the IOPS & latency for each compaction read
(#8240).

# Future Work

The streaming k-merge's memory usage is proportional to the amount of
memory per participating layer.

But `compact_level0_phase1` still loads all keys into memory for
`all_keys_iter`.
Thus, it continues to have active memory usage proportional to the
number of keys involved in the compaction.

Future work should replace `all_keys_iter` with a streaming keys
iterator.
This PR has a draft in its first commit, which I later reverted because
it's not necessary to achieve the goal of this PR / issue #8184.
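
A hedged sketch of the validation idea with made-up names (not the actual compaction code): walk the old iterator and the new stream in lockstep, warn on mismatch, and always act on the old value so validation cannot change behavior.

```
fn validated(
    old: impl Iterator<Item = (u64, Vec<u8>)>,
    new: impl Iterator<Item = (u64, Vec<u8>)>,
) -> impl Iterator<Item = (u64, Vec<u8>)> {
    old.zip(new).map(|(old_item, new_item)| {
        if old_item != new_item {
            // The real code rate-limits this to once per 10 seconds and can
            // fail unit/regression tests instead of just warning.
            eprintln!("WARN: k-merge mismatch at key {}", old_item.0);
        }
        old_item // keep using the existing approach's value
    })
}

fn main() {
    let old = vec![(1u64, vec![1u8]), (2, vec![2])];
    let new = vec![(1u64, vec![1u8]), (2, vec![3])];
    for (key, value) in validated(old.into_iter(), new.into_iter()) {
        println!("key={key} value={value:?}");
    }
}
```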
2024-07-31 14:17:59 +02:00
Cihan Demirci
a4df3c8488 cicd: change Azure storage details [1/2] (#8553)
Change Azure storage configuration to point to new variables/secrets. They have
the `_NEW` suffix in order not to disrupt any tests while we complete the
switch.
2024-07-30 19:34:15 +00:00
Christian Schwarz
d95b46f3f3 cleanup(compact_level0_phase1): some commentary and wrapping into block expressions (#8544)
Byproduct of scouting done for
https://github.com/neondatabase/neon/issues/8184

refs https://github.com/neondatabase/neon/issues/8184
2024-07-30 18:13:18 +02:00
Yuchen Liang
85bef9f05d feat(scrubber): post scan_metadata results to storage controller (#8502)
Part of #8128, followup to #8480. closes #8421. 

Enable scrubber to optionally post metadata scan health results to
storage controller.

Signed-off-by: Yuchen Liang <yuchen@neon.tech>
2024-07-30 16:07:34 +01:00
Yuchen Liang
e374d6778e feat(storcon): store scrubber metadata scan result (#8480)
Part of #8128, followed by #8502.

## Problem

Currently we lack mechanism to alert unhealthy `scan_metadata` status if
we start running this scrubber command as part of a cronjob. With the
storage controller client introduced to storage scrubber in #8196, it is
viable to set up alert by storing health status in the storage
controller database.

We intentionally do not store the full output to the database as the
json blobs potentially makes the table really huge. Instead, only a
health status and a timestamp recording the last time metadata health
status is posted on a tenant shard.

Signed-off-by: Yuchen Liang <yuchen@neon.tech>
2024-07-30 14:32:00 +01:00
Anton Chaporgin
9ceaf9a986 [neon/acr] impr: push to ACR while building images (#8545)
This tests the ability to push into ACR using OIDC. Proved it worked by running slightly modified YAML.
In `promote-images` we push the following images `neon compute-tools {vm-,}compute-node-{v14,v15,v16}` into `neoneastus2`.

https://github.com/neondatabase/cloud/issues/14640
2024-07-30 14:15:53 +01:00
Alexander Bayandin
f72fe68626 CI(benchmarking): make neonvm default provisioner (#8538)
## Problem

We don't allow regular end-users to use `k8s-pod` provisioner, 
but we still use it in nightly benchmarks

## Summary of changes
- Remove `provisioner` input from `neon-create-project` action, use
`k8s-neonvm` as the default provisioner
- Change `neon-` platform prefix to `neonvm-`
- Remove `neon-captest-freetier` and `neon-captest-new` as we already
have their `neonvm` counterparts
2024-07-30 13:38:23 +01:00
Arpad Müller
9fabdda2dc scrubber: add remote_storage based listing APIs and use them in find-large-objects (#8541)
Add two new functions `stream_objects_with_retries` and
`stream_tenants_generic` and use them in the `find-large-objects`
subcommand, migrating it to `remote_storage`.

Also adds the `size` field to the `ListingObject` struct.

Part of #7547
2024-07-30 09:00:37 +00:00
Arpad Müller
1c7b06c988 Add metrics for input data considered and taken for compression (#8522)
If compression is enabled, we currently try compressing each image
larger than a specific size and if the compressed version is smaller, we
write that one, otherwise we use the uncompressed image. However, this
might sometimes be a wasteful process if there is a substantial number
of images that don't compress well.

The compression metrics added in #8420
`pageserver_compression_image_in_bytes_total` and
`pageserver_compression_image_out_bytes_total` are well designed for
answering the question how space efficient the total compression process
is end-to-end, which helps one to decide whether to enable it or not.

To answer the question of how much waste there is in terms of trial
compression, i.e. CPU time, we add two metrics:

* one about the images that have been trial-compressed (considered), and
* one about the images where the compressed image has actually been
written (chosen).

There are different ways of weighting them: for example, one could
look at the count, or at the compressed size. But the main contributor to
compression CPU usage is the amount of data processed, so we weight the
images by their *uncompressed* size. In other words, the two metrics
are:

* `pageserver_compression_image_in_bytes_considered`
* `pageserver_compression_image_in_bytes_chosen`
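
To make the weighting concrete, here is a minimal sketch of the accounting (plain atomics standing in for the actual Prometheus counters; the minimum-size threshold check is omitted):

```rust
use std::sync::atomic::{AtomicU64, Ordering};

// Stand-ins for the two new counters; both are weighted by the uncompressed size.
static IMAGE_IN_BYTES_CONSIDERED: AtomicU64 = AtomicU64::new(0);
static IMAGE_IN_BYTES_CHOSEN: AtomicU64 = AtomicU64::new(0);

/// Trial-compress an image and return the bytes to write: the compressed
/// version if it is smaller, otherwise the original.
fn maybe_compress(image: &[u8], compress: impl Fn(&[u8]) -> Vec<u8>) -> Vec<u8> {
    IMAGE_IN_BYTES_CONSIDERED.fetch_add(image.len() as u64, Ordering::Relaxed);
    let compressed = compress(image);
    if compressed.len() < image.len() {
        // The compressed image is actually written ("chosen").
        IMAGE_IN_BYTES_CHOSEN.fetch_add(image.len() as u64, Ordering::Relaxed);
        compressed
    } else {
        image.to_vec()
    }
}
```

The ratio of the two counters then shows how much trial-compression work ends up being discarded.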

Part of #5431
2024-07-30 09:59:15 +02:00
John Spray
52b02d95c8 scrubber: enable cleaning up garbage tenants from known deletion bugs, add object age safety check (#8461)
## Problem

Old storage buckets can contain a lot of tenants that aren't known to
the control plane at all, because they belonged to test jobs that get
their control plane state cleaned up shortly after running.

In general, it's somewhat unsafe to purge these, as it's hard to
distinguish "control plane doesn't know about this, so it's garbage"
from "control plane said it didn't know about this, which indicates a bug in
the scrubber, the control plane, or the configured API URL".

However, the most common case is that we see only a small husk of a
tenant in S3 from a specific old behavior of the software, for example:
- We had a bug where heatmaps weren't deleted on tenant delete
- When WAL DR was first deployed, we didn't delete initdb.tar.zst on
tenant deletion

## Summary of changes

- Add a KnownBug variant for the garbage reason
- Include such cases in the "safe" deletion mode (`--mode=deleted`)
- Add code that inspects tenants missing in the control plane to identify
cases of known bugs (this is somewhat slow, but should go away once we've
cleaned all these up)
- Add an additional `-min-age` safety check similar to physical GC,
where even if everything indicates objects aren't needed, we won't
delete something that has been modified too recently.
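
A rough sketch of these two additions (the names below are illustrative, not the scrubber's actual types):

```rust
use std::time::{Duration, SystemTime};

// Why a tenant unknown to the control plane is considered garbage.
enum GarbageReason {
    /// The control plane explicitly reported the tenant as deleted.
    DeletedInConsole,
    /// Residue of a known historical deletion bug (e.g. leftover heatmaps or
    /// initdb.tar.zst); included in the "safe" `--mode=deleted` deletions.
    KnownBug,
}

/// The `-min-age` safety check: even if everything says the objects are
/// garbage, refuse to delete anything modified too recently.
fn old_enough(last_modified: SystemTime, min_age: Duration) -> bool {
    match last_modified.elapsed() {
        Ok(age) => age >= min_age,
        Err(_) => false, // timestamp in the future: treat as too recent, keep it
    }
}
```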

---------

Co-authored-by: Yuchen Liang <70461588+yliang412@users.noreply.github.com>
Co-authored-by: Arpad Müller <arpad-m@users.noreply.github.com>
2024-07-29 17:50:44 +01:00
Christian Schwarz
4be58522fb l0_flush: use mode=direct by default => coverage in automated tests (#8534)
Testing in staging and pre-prod has been [going well](https://github.com/neondatabase/neon/issues/7418#issuecomment-2255474917).

This PR enables mode=direct by default, thereby providing additional
coverage in the automated tests:
- Rust tests
- Integration tests
- Nightly pagebench (likely irrelevant because it's read-only)

Production deployments continue to use `mode=page-cache` for the time
being: https://github.com/neondatabase/aws/pull/1655

refs https://github.com/neondatabase/neon/issues/7418
2024-07-29 16:49:22 +02:00
Christian Schwarz
d09dad0ea2 pageserver: fail if id is present in pageserver.toml (#8489)
Overall plan:
https://www.notion.so/neondatabase/Rollout-Plan-simplified-pageserver-initialization-f935ae02b225444e8a41130b7d34e4ea?pvs=4

---

`identity.toml` is the authoritative place for `id` as of
https://github.com/neondatabase/neon/pull/7766

refs https://github.com/neondatabase/neon/issues/7736
2024-07-29 15:16:32 +01:00
John Spray
5775662276 pageserver: fix return code from secondary_download_handler (#8508)
## Problem

The secondary download HTTP API is meant to return 200 if the download
is complete, and 202 if it is still in progress. In #8198 the download
implementation was changed to drop out with success early if it
over-runs a time budget, which resulted in 200 responses for incomplete
downloads.

This breaks storcon_cli's "tenant-warmup" command, which uses the OK
status to indicate that the download is complete.

## Summary of changes

- Only return 200 if we get an Ok() _and_ the progress stats indicate
the download is complete.
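
Schematically, the fix amounts to the following (types simplified for the sketch, not the actual handler code):

```rust
// Simplified progress stats; the real handler inspects the pageserver's
// secondary download progress and builds an HTTP response from it.
struct Progress {
    layers_total: usize,
    layers_downloaded: usize,
}

fn download_status(result: Result<Progress, String>) -> u16 {
    match result {
        // 200 only when the call succeeded *and* the stats say we are done.
        Ok(p) if p.layers_downloaded >= p.layers_total => 200,
        // Success, but the time budget ran out before completion: still in progress.
        Ok(_) => 202,
        // Any error maps to a failure status.
        Err(_) => 500,
    }
}
```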
2024-07-29 15:05:30 +01:00
Joonas Koivunen
bdfc9ca7e9 test: deflake test_duplicate_creation (#8536)
By including `remote_consistent_lsn_visible` in the comparison we risk
flakiness coming from outside of timeline creation. Mask out
`remote_consistent_lsn_visible` for the comparison.

Evidence:
https://neon-github-public-dev.s3.amazonaws.com/reports/pr-8489/10142336315/index.html#suites/ffbb7f9930a77115316b58ff32b7c719/89ff0270bf58577a
2024-07-29 13:41:06 +01:00
a-masterov
1d8cf5b3a9 Add a test for clickhouse as a logical replication consumer (#8408)
## Problem

We need to test logical replication with 3rd-party tools regularly. 

## Summary of changes

Added a test using ClickHouse as a client

Co-authored-by: Alexander Bayandin <alexander@neon.tech>
2024-07-29 14:35:12 +02:00
Arpad Müller
859f019185 Adopt list_streaming in tenant deletion (#8504)
Uses the Stream-based `list_streaming` function added by #8457 in tenant
deletion, as suggested in https://github.com/neondatabase/neon/pull/7932#issuecomment-2150480180.

We don't have to worry about retries, as the function is wrapped inside
an outer retry block. If there is a retryable error either during the
listing or during deletion, we just do a fresh start.

Also adds `+ Send` bounds as they are required by the
`delete_tenant_remote` function.
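
As a sketch of the calling pattern (the names and signatures below are assumptions, not the `remote_storage` API; it only illustrates the streaming loop and why the `+ Send` bounds are needed):

```rust
use futures::{Stream, StreamExt};

/// Drain a freshly created listing stream and delete every object it yields.
/// Any error bubbles up, and the outer retry block restarts the whole
/// listing + deletion from scratch.
async fn delete_listed<S, D, F>(mut listing: S, mut delete: D) -> anyhow::Result<()>
where
    S: Stream<Item = anyhow::Result<String>> + Unpin + Send,
    D: FnMut(String) -> F + Send,
    F: std::future::Future<Output = anyhow::Result<()>> + Send,
{
    while let Some(key) = listing.next().await {
        delete(key?).await?;
    }
    Ok(())
}
```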
2024-07-29 12:05:18 +02:00
197 changed files with 5916 additions and 2703 deletions

View File

@@ -8,6 +8,9 @@ self-hosted-runner:
- small-arm64
- us-east-2
config-variables:
- BENCHMARK_PROJECT_ID_PUB
- BENCHMARK_PROJECT_ID_SUB
- REMOTE_STORAGE_AZURE_CONTAINER
- REMOTE_STORAGE_AZURE_REGION
- SLACK_UPCOMING_RELEASE_CHANNEL_ID
- DEV_AWS_OIDC_ROLE_ARN

View File

@@ -56,6 +56,10 @@ concurrency:
jobs:
bench:
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
permissions:
contents: write
statuses: write
id-token: write # Required for OIDC authentication in azure runners
strategy:
fail-fast: false
matrix:
@@ -63,9 +67,13 @@ jobs:
- DEFAULT_PG_VERSION: 16
PLATFORM: "neon-staging"
region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
RUNNER: [ self-hosted, us-east-2, x64 ]
IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
- DEFAULT_PG_VERSION: 16
PLATFORM: "azure-staging"
region_id: 'azure-eastus2'
RUNNER: [ self-hosted, eastus2, x64 ]
IMAGE: neondatabase/build-tools:pinned
env:
TEST_PG_BENCH_DURATIONS_MATRIX: "300"
TEST_PG_BENCH_SCALES_MATRIX: "10,100"
@@ -76,14 +84,21 @@ jobs:
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
PLATFORM: ${{ matrix.PLATFORM }}
runs-on: [ self-hosted, us-east-2, x64 ]
runs-on: ${{ matrix.RUNNER }}
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
image: ${{ matrix.IMAGE }}
options: --init
steps:
- uses: actions/checkout@v4
- name: Configure AWS credentials # necessary on Azure runners
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 18000 # 5 hours
- name: Download Neon artifact
uses: ./.github/actions/download
with:
@@ -147,7 +162,7 @@ jobs:
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
env:
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
DEFAULT_PG_VERSION: 14
DEFAULT_PG_VERSION: 16
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
@@ -161,6 +176,7 @@ jobs:
steps:
- uses: actions/checkout@v4
- name: Download Neon artifact
uses: ./.github/actions/download
with:
@@ -168,7 +184,7 @@ jobs:
path: /tmp/neon/
prefix: latest
- name: Run benchmark
- name: Run Logical Replication benchmarks
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}
@@ -176,12 +192,15 @@ jobs:
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 5400
pg_version: ${{ env.DEFAULT_PG_VERSION }}
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
BENCHMARK_PROJECT_ID_PUB: ${{ vars.BENCHMARK_PROJECT_ID_PUB }}
BENCHMARK_PROJECT_ID_SUB: ${{ vars.BENCHMARK_PROJECT_ID_SUB }}
- name: Run benchmark
- name: Run Physical Replication benchmarks
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}
@@ -234,6 +253,9 @@ jobs:
id: pgbench-compare-matrix
run: |
region_id_default=${{ env.DEFAULT_REGION_ID }}
runner_default='["self-hosted", "us-east-2", "x64"]'
runner_azure='["self-hosted", "eastus2", "x64"]'
image_default="369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned"
matrix='{
"pg_version" : [
16
@@ -247,16 +269,19 @@ jobs:
"neonvm-captest-new"
],
"db_size": [ "10gb" ],
"include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" },
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb" },
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
"runner": ['"$runner_default"'],
"image": [ "'"$image_default"'" ],
"include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_default"', "image": "'"$image_default"'" },
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" },
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned" },
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }]
}'
if [ "$(date +%A)" = "Saturday" ]; then
matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb"}]')
matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }]')
fi
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -299,6 +324,10 @@ jobs:
pgbench-compare:
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
needs: [ generate-matrices ]
permissions:
contents: write
statuses: write
id-token: write # Required for OIDC authentication in azure runners
strategy:
fail-fast: false
@@ -314,9 +343,9 @@ jobs:
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
PLATFORM: ${{ matrix.platform }}
runs-on: [ self-hosted, us-east-2, x64 ]
runs-on: ${{ matrix.runner }}
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
image: ${{ matrix.image }}
options: --init
# Increase timeout to 8h, default timeout is 6h
@@ -325,6 +354,13 @@ jobs:
steps:
- uses: actions/checkout@v4
- name: Configure AWS credentials # necessary on Azure runners
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 18000 # 5 hours
- name: Download Neon artifact
uses: ./.github/actions/download
with:
@@ -432,12 +468,20 @@ jobs:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
pgbench-pgvector:
permissions:
contents: write
statuses: write
id-token: write # Required for OIDC authentication in azure runners
strategy:
fail-fast: false
matrix:
include:
- PLATFORM: "neonvm-captest-pgvector"
RUNNER: [ self-hosted, us-east-2, x64 ]
IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
- PLATFORM: "azure-captest-pgvector"
RUNNER: [ self-hosted, eastus2, x64 ]
IMAGE: neondatabase/build-tools:pinned
env:
TEST_PG_BENCH_DURATIONS_MATRIX: "15m"
@@ -450,9 +494,9 @@ jobs:
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
PLATFORM: ${{ matrix.PLATFORM }}
runs-on: [ self-hosted, us-east-2, x64 ]
runs-on: ${{ matrix.RUNNER }}
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
image: ${{ matrix.IMAGE }}
options: --init
steps:
@@ -463,12 +507,12 @@ jobs:
- name: Install postgresql-16 where pytest expects it
run: |
cd /home/nonroot
wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/libpq5_16.3-1.pgdg110%2B1_amd64.deb
wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.3-1.pgdg110%2B1_amd64.deb
wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.3-1.pgdg110%2B1_amd64.deb
dpkg -x libpq5_16.3-1.pgdg110+1_amd64.deb pg
dpkg -x postgresql-client-16_16.3-1.pgdg110+1_amd64.deb pg
dpkg -x postgresql-16_16.3-1.pgdg110+1_amd64.deb pg
wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/libpq5_16.4-1.pgdg110%2B1_amd64.deb
wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.4-1.pgdg110%2B1_amd64.deb
wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.4-1.pgdg110%2B1_amd64.deb
dpkg -x libpq5_16.4-1.pgdg110+1_amd64.deb pg
dpkg -x postgresql-client-16_16.4-1.pgdg110+1_amd64.deb pg
dpkg -x postgresql-16_16.4-1.pgdg110+1_amd64.deb pg
mkdir -p /tmp/neon/pg_install/v16/bin
ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench
ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v16/bin/psql
@@ -493,6 +537,13 @@ jobs:
esac
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
- name: Configure AWS credentials # necessary on Azure runners to read/write from/to S3
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 18000 # 5 hours
- name: Benchmark pgvector hnsw indexing
uses: ./.github/actions/run-python-test-set
@@ -521,7 +572,7 @@ jobs:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
- name: Create Allure report
if: ${{ !cancelled() }}
uses: ./.github/actions/allure-report-generate

View File

@@ -149,8 +149,6 @@ jobs:
env:
BUILD_TYPE: release
# remove the cachepot wrapper and build without crate caches
RUSTC_WRAPPER: ""
# build with incremental compilation produce partial results
# so do not attempt to cache this build, also disable the incremental compilation
CARGO_INCREMENTAL: 0

View File

@@ -66,7 +66,31 @@ jobs:
ports:
- 9000:9000
- 8123:8123
zookeeper:
image: quay.io/debezium/zookeeper:2.7
ports:
- 2181:2181
kafka:
image: quay.io/debezium/kafka:2.7
env:
ZOOKEEPER_CONNECT: "zookeeper:2181"
KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092
KAFKA_BROKER_ID: 1
KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
KAFKA_JMX_PORT: 9991
ports:
- 9092:9092
debezium:
image: quay.io/debezium/connect:2.7
env:
BOOTSTRAP_SERVERS: kafka:9092
GROUP_ID: 1
CONFIG_STORAGE_TOPIC: debezium-config
OFFSET_STORAGE_TOPIC: debezium-offset
STATUS_STORAGE_TOPIC: debezium-status
DEBEZIUM_CONFIG_CONNECTOR_CLASS: io.debezium.connector.postgresql.PostgresConnector
ports:
- 8083:8083
steps:
- uses: actions/checkout@v4

View File

@@ -7,12 +7,20 @@ on:
description: 'Source tag'
required: true
type: string
force:
description: 'Force the image to be pinned'
default: false
type: boolean
workflow_call:
inputs:
from-tag:
description: 'Source tag'
required: true
type: string
force:
description: 'Force the image to be pinned'
default: false
type: boolean
defaults:
run:
@@ -22,15 +30,18 @@ concurrency:
group: pin-build-tools-image-${{ inputs.from-tag }}
cancel-in-progress: false
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
permissions: {}
jobs:
tag-image:
runs-on: ubuntu-22.04
env:
FROM_TAG: ${{ inputs.from-tag }}
TO_TAG: pinned
env:
FROM_TAG: ${{ inputs.from-tag }}
TO_TAG: pinned
jobs:
check-manifests:
runs-on: ubuntu-22.04
outputs:
skip: ${{ steps.check-manifests.outputs.skip }}
steps:
- name: Check if we really need to pin the image
@@ -47,27 +58,44 @@ jobs:
echo "skip=${skip}" | tee -a $GITHUB_OUTPUT
tag-image:
needs: check-manifests
# use format(..) to catch both inputs.force = true AND inputs.force = 'true'
if: needs.check-manifests.outputs.skip == 'false' || format('{0}', inputs.force) == 'true'
runs-on: ubuntu-22.04
permissions:
id-token: write # for `azure/login`
steps:
- uses: docker/login-action@v3
if: steps.check-manifests.outputs.skip == 'false'
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub
if: steps.check-manifests.outputs.skip == 'false'
run: |
docker buildx imagetools create -t neondatabase/build-tools:${TO_TAG} \
neondatabase/build-tools:${FROM_TAG}
- uses: docker/login-action@v3
if: steps.check-manifests.outputs.skip == 'false'
with:
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
- name: Tag build-tools with `${{ env.TO_TAG }}` in ECR
if: steps.check-manifests.outputs.skip == 'false'
- name: Azure login
uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1
with:
client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }}
tenant-id: ${{ secrets.AZURE_TENANT_ID }}
subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }}
- name: Login to ACR
run: |
az acr login --name=neoneastus2
- name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub, ECR, and ACR
run: |
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG} \
-t neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG} \
-t neondatabase/build-tools:${TO_TAG} \
neondatabase/build-tools:${FROM_TAG}

View File

@@ -13,8 +13,6 @@ defaults:
env:
# A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix
E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
jobs:
cancel-previous-e2e-tests:
@@ -64,19 +62,35 @@ jobs:
needs: [ tag ]
runs-on: ubuntu-22.04
env:
EVENT_ACTION: ${{ github.event.action }}
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
TAG: ${{ needs.tag.outputs.build-tag }}
steps:
- name: check if ecr image are present
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
- name: Wait for `promote-images` job to finish
# It's important to have a timeout here, the script in the step can run infinitely
timeout-minutes: 60
run: |
for REPO in neon compute-tools compute-node-v14 vm-compute-node-v14 compute-node-v15 vm-compute-node-v15 compute-node-v16 vm-compute-node-v16; do
OUTPUT=$(aws ecr describe-images --repository-name ${REPO} --region eu-central-1 --query "imageDetails[?imageTags[?contains(@, '${TAG}')]]" --output text)
if [ "$OUTPUT" == "" ]; then
echo "$REPO with image tag $TAG not found" >> $GITHUB_OUTPUT
exit 1
fi
if [ "${GITHUB_EVENT_NAME}" != "pull_request" ] || [ "${EVENT_ACTION}" != "ready_for_review" ]; then
exit 0
fi
# For PRs we use the run id as the tag
BUILD_AND_TEST_RUN_ID=${TAG}
while true; do
conclusion=$(gh run --repo ${GITHUB_REPOSITORY} view ${BUILD_AND_TEST_RUN_ID} --json jobs --jq '.jobs[] | select(.name == "promote-images") | .conclusion')
case "$conclusion" in
success)
break
;;
failure | cancelled | skipped)
echo "The 'promote-images' job didn't succeed: '${conclusion}'. Exiting..."
exit 1
;;
*)
echo "The 'promote-images' hasn't succeed yet. Waiting..."
sleep 60
;;
esac
done
- name: Set e2e-platforms

Cargo.lock (generated), 371 lines changed
View File

@@ -484,7 +484,7 @@ dependencies = [
"http 0.2.9",
"http 1.1.0",
"once_cell",
"p256",
"p256 0.11.1",
"percent-encoding",
"ring 0.17.6",
"sha2",
@@ -848,6 +848,12 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "349a06037c7bf932dd7e7d1f653678b2038b9ad46a74102f1fc7bd7872678cce"
[[package]]
name = "base16ct"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c7f02d4ea65f2c1853089ffd8d2787bdbc63de2f0d29dedbcf8ccdfa0ccd4cf"
[[package]]
name = "base64"
version = "0.13.1"
@@ -971,9 +977,9 @@ checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1"
[[package]]
name = "bytemuck"
version = "1.16.0"
version = "1.16.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78834c15cb5d5efe3452d58b1e8ba890dd62d21907f867f383358198e56ebca5"
checksum = "102087e286b4677862ea56cf8fc58bb2cdfa8725c40ffb80fe3a008eb7f2fc83"
[[package]]
name = "byteorder"
@@ -1526,8 +1532,10 @@ version = "0.5.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0dc92fb57ca44df6db8059111ab3af99a63d5d0f8375d9972e319a379c6bab76"
dependencies = [
"generic-array",
"rand_core 0.6.4",
"subtle",
"zeroize",
]
[[package]]
@@ -1621,6 +1629,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fffa369a668c8af7dbf8b5e56c9f744fbd399949ed171606040001947de40b1c"
dependencies = [
"const-oid",
"pem-rfc7468",
"zeroize",
]
@@ -1720,6 +1729,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
dependencies = [
"block-buffer",
"const-oid",
"crypto-common",
"subtle",
]
@@ -1771,11 +1781,25 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "413301934810f597c1d19ca71c8710e99a3f1ba28a0d2ebc01551a2daeea3c5c"
dependencies = [
"der 0.6.1",
"elliptic-curve",
"rfc6979",
"elliptic-curve 0.12.3",
"rfc6979 0.3.1",
"signature 1.6.4",
]
[[package]]
name = "ecdsa"
version = "0.16.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ee27f32b5c5292967d2d4a9d7f1e0b0aed2c15daded5a60300e4abb9d8020bca"
dependencies = [
"der 0.7.8",
"digest",
"elliptic-curve 0.13.8",
"rfc6979 0.4.0",
"signature 2.2.0",
"spki 0.7.3",
]
[[package]]
name = "either"
version = "1.8.1"
@@ -1788,16 +1812,36 @@ version = "0.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e7bb888ab5300a19b8e5bceef25ac745ad065f3c9f7efc6de1b91958110891d3"
dependencies = [
"base16ct",
"base16ct 0.1.1",
"crypto-bigint 0.4.9",
"der 0.6.1",
"digest",
"ff",
"ff 0.12.1",
"generic-array",
"group",
"pkcs8",
"group 0.12.1",
"pkcs8 0.9.0",
"rand_core 0.6.4",
"sec1",
"sec1 0.3.0",
"subtle",
"zeroize",
]
[[package]]
name = "elliptic-curve"
version = "0.13.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5e6043086bf7973472e0c7dff2142ea0b680d30e18d9cc40f267efbf222bd47"
dependencies = [
"base16ct 0.2.0",
"crypto-bigint 0.5.5",
"digest",
"ff 0.13.0",
"generic-array",
"group 0.13.0",
"pem-rfc7468",
"pkcs8 0.10.2",
"rand_core 0.6.4",
"sec1 0.7.3",
"subtle",
"zeroize",
]
@@ -1951,6 +1995,16 @@ dependencies = [
"subtle",
]
[[package]]
name = "ff"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ded41244b729663b1e574f1b4fb731469f69f79c17667b5d776b16cda0479449"
dependencies = [
"rand_core 0.6.4",
"subtle",
]
[[package]]
name = "filetime"
version = "0.2.22"
@@ -2148,6 +2202,7 @@ checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a"
dependencies = [
"typenum",
"version_check",
"zeroize",
]
[[package]]
@@ -2214,7 +2269,18 @@ version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5dfbfb3a6cfbd390d5c9564ab283a0349b9b9fcd46a706c1eb10e0db70bfbac7"
dependencies = [
"ff",
"ff 0.12.1",
"rand_core 0.6.4",
"subtle",
]
[[package]]
name = "group"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f0f9ef7462f7c099f518d754361858f86d8a07af53ba9af0fe635bbccb151a63"
dependencies = [
"ff 0.13.0",
"rand_core 0.6.4",
"subtle",
]
@@ -2776,6 +2842,42 @@ dependencies = [
"libc",
]
[[package]]
name = "jose-b64"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bec69375368709666b21c76965ce67549f2d2db7605f1f8707d17c9656801b56"
dependencies = [
"base64ct",
"serde",
"subtle",
"zeroize",
]
[[package]]
name = "jose-jwa"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ab78e053fe886a351d67cf0d194c000f9d0dcb92906eb34d853d7e758a4b3a7"
dependencies = [
"serde",
]
[[package]]
name = "jose-jwk"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "280fa263807fe0782ecb6f2baadc28dffc04e00558a58e33bfdb801d11fd58e7"
dependencies = [
"jose-b64",
"jose-jwa",
"p256 0.13.2",
"p384",
"rsa",
"serde",
"zeroize",
]
[[package]]
name = "js-sys"
version = "0.3.69"
@@ -2835,6 +2937,9 @@ name = "lazy_static"
version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
dependencies = [
"spin 0.5.2",
]
[[package]]
name = "lazycell"
@@ -3204,6 +3309,23 @@ dependencies = [
"num-traits",
]
[[package]]
name = "num-bigint-dig"
version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc84195820f291c7697304f3cbdadd1cb7199c0efc917ff5eafd71225c136151"
dependencies = [
"byteorder",
"lazy_static",
"libm",
"num-integer",
"num-iter",
"num-traits",
"rand 0.8.5",
"smallvec",
"zeroize",
]
[[package]]
name = "num-complex"
version = "0.4.4"
@@ -3481,11 +3603,33 @@ version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "51f44edd08f51e2ade572f141051021c5af22677e42b7dd28a88155151c33594"
dependencies = [
"ecdsa",
"elliptic-curve",
"ecdsa 0.14.8",
"elliptic-curve 0.12.3",
"sha2",
]
[[package]]
name = "p256"
version = "0.13.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c9863ad85fa8f4460f9c48cb909d38a0d689dba1f6f6988a5e3e0d31071bcd4b"
dependencies = [
"ecdsa 0.16.9",
"elliptic-curve 0.13.8",
"primeorder",
"sha2",
]
[[package]]
name = "p384"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "70786f51bcc69f6a4c0360e063a4cac5419ef7c5cd5b3c99ad70f3be5ba79209"
dependencies = [
"elliptic-curve 0.13.8",
"primeorder",
]
[[package]]
name = "pagebench"
version = "0.1.0"
@@ -3847,6 +3991,15 @@ dependencies = [
"serde",
]
[[package]]
name = "pem-rfc7468"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "88b39c9bfcfc231068454382784bb460aae594343fb030d46e9f50a645418412"
dependencies = [
"base64ct",
]
[[package]]
name = "percent-encoding"
version = "2.2.0"
@@ -3863,6 +4016,29 @@ dependencies = [
"indexmap 1.9.3",
]
[[package]]
name = "pg_sni_router"
version = "0.1.0"
dependencies = [
"anyhow",
"clap",
"futures",
"git-version",
"itertools 0.10.5",
"pq_proto",
"proxy-core",
"proxy-sasl",
"rustls 0.22.4",
"rustls-pemfile 2.1.1",
"socket2 0.5.5",
"tokio",
"tokio-util",
"tracing",
"tracing-utils",
"utils",
"uuid",
]
[[package]]
name = "phf"
version = "0.11.1"
@@ -3913,6 +4089,17 @@ version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
[[package]]
name = "pkcs1"
version = "0.7.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8ffb9f10fa047879315e6625af03c164b16962a5368d724ed16323b68ace47f"
dependencies = [
"der 0.7.8",
"pkcs8 0.10.2",
"spki 0.7.3",
]
[[package]]
name = "pkcs8"
version = "0.9.0"
@@ -3923,6 +4110,16 @@ dependencies = [
"spki 0.6.0",
]
[[package]]
name = "pkcs8"
version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7"
dependencies = [
"der 0.7.8",
"spki 0.7.3",
]
[[package]]
name = "pkg-config"
version = "0.3.27"
@@ -3960,7 +4157,7 @@ dependencies = [
[[package]]
name = "postgres"
version = "0.19.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
dependencies = [
"bytes",
"fallible-iterator",
@@ -3973,7 +4170,7 @@ dependencies = [
[[package]]
name = "postgres-protocol"
version = "0.6.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
dependencies = [
"base64 0.20.0",
"byteorder",
@@ -3992,7 +4189,7 @@ dependencies = [
[[package]]
name = "postgres-types"
version = "0.2.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
dependencies = [
"bytes",
"fallible-iterator",
@@ -4116,6 +4313,15 @@ dependencies = [
"syn 2.0.52",
]
[[package]]
name = "primeorder"
version = "0.13.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "353e1ca18966c16d9deb1c69278edbc5f194139612772bd9537af60ac231e1e6"
dependencies = [
"elliptic-curve 0.13.8",
]
[[package]]
name = "proc-macro-hack"
version = "0.5.20+deprecated"
@@ -4230,9 +4436,38 @@ dependencies = [
[[package]]
name = "proxy"
version = "0.1.0"
dependencies = [
"anyhow",
"aws-config",
"clap",
"futures",
"git-version",
"humantime",
"itertools 0.10.5",
"metrics",
"pq_proto",
"proxy-core",
"proxy-sasl",
"remote_storage",
"rustls 0.22.4",
"rustls-pemfile 2.1.1",
"socket2 0.5.5",
"tikv-jemallocator",
"tokio",
"tokio-util",
"tracing",
"tracing-utils",
"utils",
"uuid",
]
[[package]]
name = "proxy-core"
version = "0.1.0"
dependencies = [
"ahash",
"anyhow",
"arc-swap",
"async-compression",
"async-trait",
"atomic-take",
@@ -4250,11 +4485,11 @@ dependencies = [
"consumption_metrics",
"crossbeam-deque",
"dashmap",
"ecdsa 0.16.9",
"env_logger",
"fallible-iterator",
"framed-websockets",
"futures",
"git-version",
"hashbrown 0.14.5",
"hashlink",
"hex",
@@ -4270,12 +4505,14 @@ dependencies = [
"indexmap 2.0.1",
"ipnet",
"itertools 0.10.5",
"jose-jwa",
"jose-jwk",
"lasso",
"md5",
"measured",
"metrics",
"once_cell",
"opentelemetry",
"p256 0.13.2",
"parking_lot 0.12.1",
"parquet",
"parquet_derive",
@@ -4284,7 +4521,7 @@ dependencies = [
"postgres-protocol",
"postgres_backend",
"pq_proto",
"prometheus",
"proxy-sasl",
"rand 0.8.5",
"rand_distr",
"rcgen",
@@ -4296,6 +4533,7 @@ dependencies = [
"reqwest-retry",
"reqwest-tracing",
"routerify",
"rsa",
"rstest",
"rustc-hash",
"rustls 0.22.4",
@@ -4305,6 +4543,7 @@ dependencies = [
"serde",
"serde_json",
"sha2",
"signature 2.2.0",
"smallvec",
"smol_str",
"socket2 0.5.5",
@@ -4312,7 +4551,6 @@ dependencies = [
"task-local-extensions",
"thiserror",
"tikv-jemalloc-ctl",
"tikv-jemallocator",
"tokio",
"tokio-postgres",
"tokio-postgres-rustls",
@@ -4324,6 +4562,7 @@ dependencies = [
"tracing-opentelemetry",
"tracing-subscriber",
"tracing-utils",
"try-lock",
"typed-json",
"url",
"urlencoding",
@@ -4334,6 +4573,35 @@ dependencies = [
"x509-parser",
]
[[package]]
name = "proxy-sasl"
version = "0.1.0"
dependencies = [
"ahash",
"anyhow",
"base64 0.13.1",
"bytes",
"crossbeam-deque",
"hmac",
"itertools 0.10.5",
"lasso",
"measured",
"parking_lot 0.12.1",
"pbkdf2",
"postgres-protocol",
"pq_proto",
"rand 0.8.5",
"rustls 0.22.4",
"sha2",
"subtle",
"thiserror",
"tokio",
"tracing",
"uuid",
"workspace_hack",
"x509-parser",
]
[[package]]
name = "quick-xml"
version = "0.31.0"
@@ -4806,6 +5074,16 @@ dependencies = [
"zeroize",
]
[[package]]
name = "rfc6979"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8dd2a808d456c4a54e300a23e9f5a67e122c3024119acbfd73e3bf664491cb2"
dependencies = [
"hmac",
"subtle",
]
[[package]]
name = "ring"
version = "0.16.20"
@@ -4866,6 +5144,26 @@ dependencies = [
"archery",
]
[[package]]
name = "rsa"
version = "0.9.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5d0e5124fcb30e76a7e79bfee683a2746db83784b86289f6251b54b7950a0dfc"
dependencies = [
"const-oid",
"digest",
"num-bigint-dig",
"num-integer",
"num-traits",
"pkcs1",
"pkcs8 0.10.2",
"rand_core 0.6.4",
"signature 2.2.0",
"spki 0.7.3",
"subtle",
"zeroize",
]
[[package]]
name = "rstest"
version = "0.18.2"
@@ -5194,10 +5492,24 @@ version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3be24c1842290c45df0a7bf069e0c268a747ad05a192f2fd7dcfdbc1cba40928"
dependencies = [
"base16ct",
"base16ct 0.1.1",
"der 0.6.1",
"generic-array",
"pkcs8",
"pkcs8 0.9.0",
"subtle",
"zeroize",
]
[[package]]
name = "sec1"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3e97a565f76233a6003f9f5c54be1d9c5bdfa3eccfb189469f11ec4901c47dc"
dependencies = [
"base16ct 0.2.0",
"der 0.7.8",
"generic-array",
"pkcs8 0.10.2",
"subtle",
"zeroize",
]
@@ -5544,6 +5856,7 @@ version = "2.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de"
dependencies = [
"digest",
"rand_core 0.6.4",
]
@@ -6186,7 +6499,7 @@ dependencies = [
[[package]]
name = "tokio-postgres"
version = "0.7.7"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
dependencies = [
"async-trait",
"byteorder",
@@ -6563,9 +6876,9 @@ dependencies = [
[[package]]
name = "try-lock"
version = "0.2.4"
version = "0.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3528ecfd12c466c6f163363caf2d02a71161dd5e1cc6ae7b34207ea2d42d81ed"
checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b"
[[package]]
name = "tungstenite"
@@ -7378,13 +7691,17 @@ dependencies = [
"clap",
"clap_builder",
"crossbeam-utils",
"crypto-bigint 0.5.5",
"der 0.7.8",
"deranged",
"digest",
"either",
"fail",
"futures-channel",
"futures-executor",
"futures-io",
"futures-util",
"generic-array",
"getrandom 0.2.11",
"hashbrown 0.14.5",
"hex",
@@ -7392,6 +7709,7 @@ dependencies = [
"hyper 0.14.26",
"indexmap 1.9.3",
"itertools 0.10.5",
"lazy_static",
"libc",
"log",
"memchr",
@@ -7415,7 +7733,9 @@ dependencies = [
"serde",
"serde_json",
"sha2",
"signature 2.2.0",
"smallvec",
"spki 0.7.3",
"subtle",
"syn 1.0.109",
"syn 2.0.52",
@@ -7526,6 +7846,7 @@ version = "1.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "525b4ec142c6b68a2d10f01f7bbf6755599ca3f81ea53b8431b7dd348f5fdb2d"
dependencies = [
"serde",
"zeroize_derive",
]

View File

@@ -9,7 +9,10 @@ members = [
"pageserver/ctl",
"pageserver/client",
"pageserver/pagebench",
"proxy",
"proxy/core",
"proxy/sasl",
"proxy/proxy",
"proxy/pg_sni_router",
"safekeeper",
"storage_broker",
"storage_controller",
@@ -184,6 +187,7 @@ tracing = "0.1"
tracing-error = "0.2.0"
tracing-opentelemetry = "0.21.0"
tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
try-lock = "0.2.5"
twox-hash = { version = "1.6.3", default-features = false }
typed-json = "0.1"
url = "2.2"

View File

@@ -17,7 +17,7 @@ COPY --chown=nonroot pgxn pgxn
COPY --chown=nonroot Makefile Makefile
COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh
ENV BUILD_TYPE release
ENV BUILD_TYPE=release
RUN set -e \
&& mold -run make -j $(nproc) -s neon-pg-ext \
&& rm -rf pg_install/build \
@@ -29,24 +29,12 @@ WORKDIR /home/nonroot
ARG GIT_VERSION=local
ARG BUILD_TAG
# Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds.
# Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations.
# cachepot falls back to local filesystem if S3 is misconfigured, not failing the build
ARG RUSTC_WRAPPER=cachepot
ENV AWS_REGION=eu-central-1
ENV CACHEPOT_S3_KEY_PREFIX=cachepot
ARG CACHEPOT_BUCKET=neon-github-dev
#ARG AWS_ACCESS_KEY_ID
#ARG AWS_SECRET_ACCESS_KEY
COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server
COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server
COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server
COPY --from=pg-build /home/nonroot/pg_install/v16/lib pg_install/v16/lib
COPY --chown=nonroot . .
# Show build caching stats to check if it was used in the end.
# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
RUN set -e \
&& PQ_LIB_DIR=$(pwd)/pg_install/v16/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \
--bin pg_sni_router \
@@ -58,8 +46,7 @@ RUN set -e \
--bin proxy \
--bin neon_local \
--bin storage_scrubber \
--locked --release \
&& cachepot -s
--locked --release
# Build final image
#
@@ -104,7 +91,7 @@ RUN mkdir -p /data/.neon/ && \
# When running a binary that links with libpq, default to using our most recent postgres version. Binaries
# that want a particular postgres version will select it explicitly: this is just a default.
ENV LD_LIBRARY_PATH /usr/local/v16/lib
ENV LD_LIBRARY_PATH=/usr/local/v16/lib
VOLUME ["/data"]
@@ -112,5 +99,5 @@ USER neon
EXPOSE 6400
EXPOSE 9898
CMD /usr/local/bin/pageserver -D /data/.neon
CMD ["/usr/local/bin/pageserver", "-D", "/data/.neon"]

View File

@@ -58,7 +58,7 @@ RUN set -e \
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
# protobuf-compiler (protoc)
ENV PROTOC_VERSION 25.1
ENV PROTOC_VERSION=25.1
RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \
&& unzip -q protoc.zip -d protoc \
&& mv protoc/bin/protoc /usr/local/bin/protoc \
@@ -99,7 +99,7 @@ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "aws
&& rm awscliv2.zip
# Mold: A Modern Linker
ENV MOLD_VERSION v2.31.0
ENV MOLD_VERSION=v2.33.0
RUN set -e \
&& git clone https://github.com/rui314/mold.git \
&& mkdir mold/build \
@@ -168,7 +168,7 @@ USER nonroot:nonroot
WORKDIR /home/nonroot
# Python
ENV PYTHON_VERSION=3.9.18 \
ENV PYTHON_VERSION=3.9.19 \
PYENV_ROOT=/home/nonroot/.pyenv \
PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH
RUN set -e \
@@ -192,9 +192,14 @@ WORKDIR /home/nonroot
# Rust
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
ENV RUSTC_VERSION=1.80.0
ENV RUSTC_VERSION=1.80.1
ENV RUSTUP_HOME="/home/nonroot/.rustup"
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
ARG RUSTFILT_VERSION=0.2.1
ARG CARGO_HAKARI_VERSION=0.9.30
ARG CARGO_DENY_VERSION=0.16.1
ARG CARGO_HACK_VERSION=0.6.31
ARG CARGO_NEXTEST_VERSION=0.9.72
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
chmod +x rustup-init && \
./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \
@@ -203,15 +208,13 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
. "$HOME/.cargo/env" && \
cargo --version && rustup --version && \
rustup component add llvm-tools-preview rustfmt clippy && \
cargo install --git https://github.com/paritytech/cachepot && \
cargo install rustfilt && \
cargo install cargo-hakari && \
cargo install cargo-deny --locked && \
cargo install cargo-hack && \
cargo install cargo-nextest && \
cargo install rustfilt --version ${RUSTFILT_VERSION} && \
cargo install cargo-hakari --version ${CARGO_HAKARI_VERSION} && \
cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \
cargo install cargo-hack --version ${CARGO_HACK_VERSION} && \
cargo install cargo-nextest --version ${CARGO_NEXTEST_VERSION} && \
rm -rf /home/nonroot/.cargo/registry && \
rm -rf /home/nonroot/.cargo/git
ENV RUSTC_WRAPPER=cachepot
# Show versions
RUN whoami \

View File

@@ -94,7 +94,7 @@ RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar
DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \
make clean && cp -R /sfcgal/* /
ENV PATH "/usr/local/pgsql/bin:$PATH"
ENV PATH="/usr/local/pgsql/bin:$PATH"
RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \
echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \
@@ -411,7 +411,7 @@ FROM build-deps AS timescaledb-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ARG PG_VERSION
ENV PATH "/usr/local/pgsql/bin:$PATH"
ENV PATH="/usr/local/pgsql/bin:$PATH"
RUN case "${PG_VERSION}" in \
"v14" | "v15") \
@@ -444,7 +444,7 @@ FROM build-deps AS pg-hint-plan-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ARG PG_VERSION
ENV PATH "/usr/local/pgsql/bin:$PATH"
ENV PATH="/usr/local/pgsql/bin:$PATH"
RUN case "${PG_VERSION}" in \
"v14") \
@@ -480,7 +480,7 @@ RUN case "${PG_VERSION}" in \
FROM build-deps AS pg-cron-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH "/usr/local/pgsql/bin/:$PATH"
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \
echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \
mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . && \
@@ -506,7 +506,7 @@ RUN apt-get update && \
libboost-system1.74-dev \
libeigen3-dev
ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \
echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \
mkdir rdkit-src && cd rdkit-src && tar xzf ../rdkit.tar.gz --strip-components=1 -C . && \
@@ -546,7 +546,7 @@ RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.
FROM build-deps AS pg-uuidv7-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH "/usr/local/pgsql/bin/:$PATH"
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \
echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \
mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \
@@ -563,7 +563,7 @@ RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz
FROM build-deps AS pg-roaringbitmap-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH "/usr/local/pgsql/bin/:$PATH"
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \
echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \
mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \
@@ -580,7 +580,7 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4
FROM build-deps AS pg-semver-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH "/usr/local/pgsql/bin/:$PATH"
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \
echo "fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 pg_semver.tar.gz" | sha256sum --check && \
mkdir pg_semver-src && cd pg_semver-src && tar xzf ../pg_semver.tar.gz --strip-components=1 -C . && \
@@ -598,7 +598,7 @@ FROM build-deps AS pg-embedding-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ARG PG_VERSION
ENV PATH "/usr/local/pgsql/bin/:$PATH"
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN case "${PG_VERSION}" in \
"v14" | "v15") \
export PG_EMBEDDING_VERSION=0.3.5 \
@@ -622,7 +622,7 @@ RUN case "${PG_VERSION}" in \
FROM build-deps AS pg-anon-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH "/usr/local/pgsql/bin/:$PATH"
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \
echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \
mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \
@@ -750,7 +750,7 @@ RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -
FROM build-deps AS wal2json-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH "/usr/local/pgsql/bin/:$PATH"
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \
echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \
mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \
@@ -766,7 +766,7 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.
FROM build-deps AS pg-ivm-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH "/usr/local/pgsql/bin/:$PATH"
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \
echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \
mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . && \
@@ -783,7 +783,7 @@ RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_iv
FROM build-deps AS pg-partman-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH "/usr/local/pgsql/bin/:$PATH"
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \
echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \
mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . && \
@@ -933,7 +933,8 @@ COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src
#COPY --from=pg-tiktoken-pg-build /home/nonroot/pg_tiktoken.tar.gz /ext-src
COPY --from=hypopg-pg-build /hypopg.tar.gz /ext-src
COPY --from=pg-hashids-pg-build /pg_hashids.tar.gz /ext-src
#COPY --from=rum-pg-build /rum.tar.gz /ext-src
COPY --from=rum-pg-build /rum.tar.gz /ext-src
COPY patches/rum.patch /ext-src
#COPY --from=pgtap-pg-build /pgtap.tar.gz /ext-src
COPY --from=ip4r-pg-build /ip4r.tar.gz /ext-src
COPY --from=prefix-pg-build /prefix.tar.gz /ext-src
@@ -945,7 +946,7 @@ COPY patches/pg_hintplan.patch /ext-src
COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src
COPY patches/pg_cron.patch /ext-src
#COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src
COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src
#COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src
COPY --from=pg-uuidv7-pg-build /pg_uuidv7.tar.gz /ext-src
COPY --from=pg-roaringbitmap-pg-build /pg_roaringbitmap.tar.gz /ext-src
COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src
@@ -960,6 +961,7 @@ RUN cd /ext-src/ && for f in *.tar.gz; \
rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \
|| exit 1; rm -f $f; done
RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch
RUN cd /ext-src/rum-src && patch -p1 <../rum.patch
# cmake is required for the h3 test
RUN apt-get update && apt-get install -y cmake
RUN patch -p1 < /ext-src/pg_hintplan.patch
@@ -1032,6 +1034,6 @@ RUN apt update && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
ENV LANG en_US.utf8
ENV LANG=en_US.utf8
USER postgres
ENTRYPOINT ["/usr/local/bin/compute_ctl"]

View File

@@ -158,6 +158,8 @@ pub struct NeonStorageControllerConf {
/// Threshold for auto-splitting a tenant into shards
pub split_threshold: Option<u64>,
pub max_secondary_lag_bytes: Option<u64>,
}
impl NeonStorageControllerConf {
@@ -173,6 +175,7 @@ impl Default for NeonStorageControllerConf {
max_offline: Self::DEFAULT_MAX_OFFLINE_INTERVAL,
max_warming_up: Self::DEFAULT_MAX_WARMING_UP_INTERVAL,
split_threshold: None,
max_secondary_lag_bytes: None,
}
}
}

View File

@@ -383,6 +383,10 @@ impl StorageController {
args.push(format!("--split-threshold={split_threshold}"))
}
if let Some(lag) = self.config.max_secondary_lag_bytes.as_ref() {
args.push(format!("--max-secondary-lag-bytes={lag}"))
}
args.push(format!(
"--neon-local-repo-dir={}",
self.env.base_data_dir.display()

View File

@@ -4,6 +4,7 @@
# to your expectations and requirements.
# Root options
[graph]
targets = [
{ triple = "x86_64-unknown-linux-gnu" },
{ triple = "aarch64-unknown-linux-gnu" },
@@ -12,6 +13,7 @@ targets = [
]
all-features = false
no-default-features = false
[output]
feature-depth = 1
# This section is considered when running `cargo deny check advisories`
@@ -19,17 +21,16 @@ feature-depth = 1
# https://embarkstudios.github.io/cargo-deny/checks/advisories/cfg.html
[advisories]
db-urls = ["https://github.com/rustsec/advisory-db"]
vulnerability = "deny"
unmaintained = "warn"
yanked = "warn"
notice = "warn"
ignore = []
[[advisories.ignore]]
id = "RUSTSEC-2023-0071"
reason = "the marvin attack only affects private key decryption, not public key signature verification"
# This section is considered when running `cargo deny check licenses`
# More documentation for the licenses section can be found here:
# https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html
[licenses]
unlicensed = "deny"
allow = [
"Apache-2.0",
"Artistic-2.0",
@@ -42,10 +43,6 @@ allow = [
"OpenSSL",
"Unicode-DFS-2016",
]
deny = []
copyleft = "warn"
allow-osi-fsf-free = "neither"
default = "deny"
confidence-threshold = 0.8
exceptions = [
# Zlib license has some restrictions if we decide to change sth

View File

@@ -78,7 +78,7 @@ for pg_version in 14 15 16; do
docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/ext-src/pg_hint_plan-src/
rm -rf $TMPDIR
# We are running tests now
if docker exec -e SKIP=rum-src,timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \
if docker exec -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \
$TEST_CONTAINER_NAME /run-tests.sh | tee testout.txt
then
cleanup

View File

@@ -1,15 +1,15 @@
#!/bin/bash
set -x
cd /ext-src
cd /ext-src || exit 2
FAILED=
LIST=$((echo ${SKIP} | sed 's/,/\n/g'; ls -d *-src) | sort | uniq -u)
LIST=$( (echo "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u)
for d in ${LIST}
do
[ -d ${d} ] || continue
[ -d "${d}" ] || continue
psql -c "select 1" >/dev/null || break
make -C ${d} installcheck || FAILED="${d} ${FAILED}"
USE_PGXS=1 make -C "${d}" installcheck || FAILED="${d} ${FAILED}"
done
[ -z "${FAILED}" ] && exit 0
echo ${FAILED}
echo "${FAILED}"
exit 1

View File

@@ -1,13 +1,18 @@
# Summary
# Looking for `neon.tech` docs?
This page links to a selection of technical content about the open source code in this repository.
Please visit https://neon.tech/docs for documentation about using the Neon service, which is based on the code
in this repository.
# Architecture
[Introduction]()
- [Separation of Compute and Storage](./separation-compute-storage.md)
# Architecture
- [Compute]()
- [WAL proposer]()
- [WAL Backpressure]()
- [Postgres changes](./core_changes.md)
- [Pageserver](./pageserver.md)
@@ -16,33 +21,15 @@
- [WAL Redo](./pageserver-walredo.md)
- [Page cache](./pageserver-pagecache.md)
- [Storage](./pageserver-storage.md)
- [Datadir mapping]()
- [Layer files]()
- [Branching]()
- [Garbage collection]()
- [Cloud Storage]()
- [Processing a GetPage request](./pageserver-processing-getpage.md)
- [Processing WAL](./pageserver-processing-wal.md)
- [Management API]()
- [Tenant Rebalancing]()
- [WAL Service](walservice.md)
- [Consensus protocol](safekeeper-protocol.md)
- [Management API]()
- [Rebalancing]()
- [Control Plane]()
- [Proxy]()
- [Source view](./sourcetree.md)
- [docker.md](./docker.md) — Docker images and building pipeline.
- [Error handling and logging](./error-handling.md)
- [Testing]()
- [Unit testing]()
- [Integration testing]()
- [Benchmarks]()
- [Glossary](./glossary.md)
@@ -58,28 +45,6 @@
# RFCs
- [RFCs](./rfcs/README.md)
- [002-storage](rfcs/002-storage.md)
- [003-laptop-cli](rfcs/003-laptop-cli.md)
- [004-durability](rfcs/004-durability.md)
- [005-zenith_local](rfcs/005-zenith_local.md)
- [006-laptop-cli-v2-CLI](rfcs/006-laptop-cli-v2-CLI.md)
- [006-laptop-cli-v2-repository-structure](rfcs/006-laptop-cli-v2-repository-structure.md)
- [007-serverless-on-laptop](rfcs/007-serverless-on-laptop.md)
- [008-push-pull](rfcs/008-push-pull.md)
- [009-snapshot-first-storage-cli](rfcs/009-snapshot-first-storage-cli.md)
- [009-snapshot-first-storage](rfcs/009-snapshot-first-storage.md)
- [009-snapshot-first-storage-pitr](rfcs/009-snapshot-first-storage-pitr.md)
- [010-storage_details](rfcs/010-storage_details.md)
- [011-retention-policy](rfcs/011-retention-policy.md)
- [012-background-tasks](rfcs/012-background-tasks.md)
- [013-term-history](rfcs/013-term-history.md)
- [014-safekeepers-gossip](rfcs/014-safekeepers-gossip.md)
- [014-storage-lsm](rfcs/014-storage-lsm.md)
- [015-storage-messaging](rfcs/015-storage-messaging.md)
- [016-connection-routing](rfcs/016-connection-routing.md)
- [017-timeline-data-management](rfcs/017-timeline-data-management.md)
- [018-storage-messaging-2](rfcs/018-storage-messaging-2.md)
- [019-tenant-timeline-lifecycles](rfcs/019-tenant-timeline-lifecycles.md)
- [cluster-size-limits](rfcs/cluster-size-limits.md)
Major changes are documented in RFCS:
- See [RFCs](./rfcs/README.md) for more information
- view the RFCs at https://github.com/neondatabase/neon/tree/main/docs/rfcs

View File

@@ -107,7 +107,10 @@ impl Key {
/// As long as Neon does not support tablespace (because of lack of access to local file system),
/// we can assume that only some predefined namespace OIDs are used which can fit in u16
pub fn to_i128(&self) -> i128 {
assert!(self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222);
assert!(
self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222,
"invalid key: {self}",
);
(((self.field1 & 0x7F) as i128) << 120)
| (((self.field2 & 0xFFFF) as i128) << 104)
| ((self.field3 as i128) << 72)

View File

@@ -637,6 +637,13 @@ pub struct TenantInfo {
pub current_physical_size: Option<u64>, // physical size is only included in `tenant_status` endpoint
pub attachment_status: TenantAttachmentStatus,
pub generation: u32,
/// Opaque explanation if gc is being blocked.
///
/// Only looked up for the individual tenant detail, not the listing. This is purely for
/// debugging, not included in openapi.
#[serde(skip_serializing_if = "Option::is_none")]
pub gc_blocking: Option<String>,
}
#[derive(Serialize, Deserialize, Clone)]
@@ -940,6 +947,8 @@ pub struct TopTenantShardsResponse {
}
pub mod virtual_file {
use std::path::PathBuf;
#[derive(
Copy,
Clone,
@@ -958,6 +967,53 @@ pub mod virtual_file {
#[cfg(target_os = "linux")]
TokioEpollUring,
}
/// Direct IO modes for a pageserver.
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
pub enum DirectIoMode {
/// Direct IO disabled (uses usual buffered IO).
#[default]
Disabled,
/// Direct IO still disabled, but alignment checks and perf simulations are performed.
Evaluate {
/// Alignment check level
alignment_check: DirectIoAlignmentCheckLevel,
/// Latency padding for performance simulation.
latency_padding: DirectIoLatencyPadding,
},
/// Direct IO enabled.
Enabled {
/// Actions to perform on alignment error.
on_alignment_error: DirectIoOnAlignmentErrorAction,
},
}
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
#[serde(rename_all = "kebab-case")]
pub enum DirectIoAlignmentCheckLevel {
#[default]
Error,
Log,
None,
}
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
#[serde(rename_all = "kebab-case")]
pub enum DirectIoOnAlignmentErrorAction {
Error,
#[default]
FallbackToBuffered,
}
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
#[serde(tag = "type", rename_all = "kebab-case")]
pub enum DirectIoLatencyPadding {
/// Pad virtual file operations with IO to a fake file.
FakeFileRW { path: PathBuf },
#[default]
None,
}
}
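As a reading aid, here is a minimal, self-contained Rust sketch of how a caller might branch on the mode. The `IoMode` enum below is a local, simplified mirror used only for illustration (not the `DirectIoMode` type above), and the helper name is made up:

```rust
// Illustration only: a simplified local mirror of the direct-IO mode enum.
enum IoMode {
    Disabled,
    Evaluate, // alignment-check / latency-padding fields elided
    Enabled,  // on-alignment-error field elided
}

// Per the doc comments above, only `Enabled` leaves buffered IO behind;
// `Evaluate` keeps buffered IO and layers checks / simulated latency on top.
fn uses_buffered_io(mode: &IoMode) -> bool {
    matches!(mode, IoMode::Disabled | IoMode::Evaluate)
}

fn main() {
    assert!(uses_buffered_io(&IoMode::Evaluate));
    assert!(!uses_buffered_io(&IoMode::Enabled));
}
```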
// Wrapped in libpq CopyData
@@ -1427,6 +1483,7 @@ mod tests {
current_physical_size: Some(42),
attachment_status: TenantAttachmentStatus::Attached,
generation: 1,
gc_blocking: None,
};
let expected_active = json!({
"id": original_active.id.to_string(),
@@ -1449,6 +1506,7 @@ mod tests {
current_physical_size: Some(42),
attachment_status: TenantAttachmentStatus::Attached,
generation: 1,
gc_blocking: None,
};
let expected_broken = json!({
"id": original_broken.id.to_string(),

View File

@@ -1,6 +1,8 @@
use std::collections::HashSet;
use utils::id::TimelineId;
#[derive(Debug, Default, PartialEq, serde::Serialize, serde::Deserialize)]
pub struct AncestorDetached {
pub reparented_timelines: Vec<TimelineId>,
pub reparented_timelines: HashSet<TimelineId>,
}

View File

@@ -144,7 +144,20 @@ impl PgConnectionConfig {
// implement and this function is hardly a bottleneck. The function is only called around
// establishing a new connection.
#[allow(unstable_name_collisions)]
config.options(&encode_options(&self.options));
config.options(
&self
.options
.iter()
.map(|s| {
if s.contains(['\\', ' ']) {
Cow::Owned(s.replace('\\', "\\\\").replace(' ', "\\ "))
} else {
Cow::Borrowed(s.as_str())
}
})
.intersperse(Cow::Borrowed(" ")) // TODO: use impl from std once it's stabilized
.collect::<String>(),
);
}
config
}
@@ -165,21 +178,6 @@ impl PgConnectionConfig {
}
}
#[allow(unstable_name_collisions)]
fn encode_options(options: &[String]) -> String {
options
.iter()
.map(|s| {
if s.contains(['\\', ' ']) {
Cow::Owned(s.replace('\\', "\\\\").replace(' ', "\\ "))
} else {
Cow::Borrowed(s.as_str())
}
})
.intersperse(Cow::Borrowed(" ")) // TODO: use impl from std once it's stabilized
.collect::<String>()
}
impl fmt::Display for PgConnectionConfig {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
// The password is intentionally hidden and not part of this display string.
@@ -208,7 +206,7 @@ impl fmt::Debug for PgConnectionConfig {
#[cfg(test)]
mod tests_pg_connection_config {
use crate::{encode_options, PgConnectionConfig};
use crate::PgConnectionConfig;
use once_cell::sync::Lazy;
use url::Host;
@@ -257,12 +255,18 @@ mod tests_pg_connection_config {
#[test]
fn test_with_options() {
let options = encode_options(&[
"hello".to_owned(),
"world".to_owned(),
"with space".to_owned(),
"and \\ backslashes".to_owned(),
let cfg = PgConnectionConfig::new_host_port(STUB_HOST.clone(), 123).extend_options([
"hello",
"world",
"with space",
"and \\ backslashes",
]);
assert_eq!(options, "hello world with\\ space and\\ \\\\\\ backslashes");
assert_eq!(cfg.host(), &*STUB_HOST);
assert_eq!(cfg.port(), 123);
assert_eq!(cfg.raw_address(), "stub.host.example:123");
assert_eq!(
cfg.to_tokio_postgres_config().get_options(),
Some("hello world with\\ space and\\ \\\\\\ backslashes")
);
}
}

View File

@@ -128,7 +128,7 @@ pub mod circuit_breaker;
///
/// #############################################################################################
/// TODO this macro is not the way the library is intended to be used, see <https://github.com/neondatabase/neon/issues/1565> for details.
/// We use `cachepot` to reduce our current CI build times: <https://github.com/neondatabase/cloud/pull/1033#issuecomment-1100935036>
/// We used `cachepot` to reduce our current CI build times: <https://github.com/neondatabase/cloud/pull/1033#issuecomment-1100935036>
/// Yet, it seems to ignore the GIT_VERSION env variable, passed to Docker build, even with build.rs that contains
/// `println!("cargo:rerun-if-env-changed=GIT_VERSION");` code for cachepot cache invalidation.
/// The problem needs further investigation and regular `const` declaration instead of a macro.

View File

@@ -78,8 +78,9 @@ impl Drop for GateGuard {
}
}
#[derive(Debug)]
#[derive(Debug, thiserror::Error)]
pub enum GateError {
#[error("gate is closed")]
GateClosed,
}

View File

@@ -108,3 +108,7 @@ harness = false
[[bench]]
name = "bench_walredo"
harness = false
[[bench]]
name = "bench_ingest"
harness = false

View File

@@ -0,0 +1,239 @@
use std::{env, num::NonZeroUsize};
use bytes::Bytes;
use camino::Utf8PathBuf;
use criterion::{criterion_group, criterion_main, Criterion};
use pageserver::{
config::PageServerConf,
context::{DownloadBehavior, RequestContext},
l0_flush::{L0FlushConfig, L0FlushGlobalState},
page_cache,
repository::Value,
task_mgr::TaskKind,
tenant::storage_layer::InMemoryLayer,
virtual_file,
};
use pageserver_api::{key::Key, shard::TenantShardId};
use utils::{
bin_ser::BeSer,
id::{TenantId, TimelineId},
};
// A very cheap hash for generating non-sequential keys.
fn murmurhash32(mut h: u32) -> u32 {
h ^= h >> 16;
h = h.wrapping_mul(0x85ebca6b);
h ^= h >> 13;
h = h.wrapping_mul(0xc2b2ae35);
h ^= h >> 16;
h
}
enum KeyLayout {
/// Sequential unique keys
Sequential,
/// Random unique keys
Random,
/// Random keys, but only keep the bits selected by the mask (limits key cardinality)
RandomReuse(u32),
}
enum WriteDelta {
Yes,
No,
}
async fn ingest(
conf: &'static PageServerConf,
put_size: usize,
put_count: usize,
key_layout: KeyLayout,
write_delta: WriteDelta,
) -> anyhow::Result<()> {
let mut lsn = utils::lsn::Lsn(1000);
let mut key = Key::from_i128(0x0);
let timeline_id = TimelineId::generate();
let tenant_id = TenantId::generate();
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
tokio::fs::create_dir_all(conf.timeline_path(&tenant_shard_id, &timeline_id)).await?;
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
let gate = utils::sync::gate::Gate::default();
let entered = gate.enter().unwrap();
let layer =
InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, entered, &ctx).await?;
let data = Value::Image(Bytes::from(vec![0u8; put_size])).ser()?;
let ctx = RequestContext::new(
pageserver::task_mgr::TaskKind::WalReceiverConnectionHandler,
pageserver::context::DownloadBehavior::Download,
);
for i in 0..put_count {
lsn += put_size as u64;
// Generate lots of keys within a single relation, which simulates the typical bulk ingest case: people
// usually care the most about write performance when they're blasting a huge batch of data into a huge table.
match key_layout {
KeyLayout::Sequential => {
// Use sequential order to illustrate the experience a user is likely to have
// when ingesting bulk data.
key.field6 = i as u32;
}
KeyLayout::Random => {
// Use random-order keys to avoid giving a false advantage to data structures that are
// faster when inserting on the end.
key.field6 = murmurhash32(i as u32);
}
KeyLayout::RandomReuse(mask) => {
// Use low bits only, to limit cardinality
key.field6 = murmurhash32(i as u32) & mask;
}
}
layer.put_value(key, lsn, &data, &ctx).await?;
}
layer.freeze(lsn + 1).await;
if matches!(write_delta, WriteDelta::Yes) {
let l0_flush_state = L0FlushGlobalState::new(L0FlushConfig::Direct {
max_concurrency: NonZeroUsize::new(1).unwrap(),
});
let (_desc, path) = layer
.write_to_disk(&ctx, None, l0_flush_state.inner())
.await?
.unwrap();
tokio::fs::remove_file(path).await?;
}
Ok(())
}
/// Wrapper to instantiate a tokio runtime
fn ingest_main(
conf: &'static PageServerConf,
put_size: usize,
put_count: usize,
key_layout: KeyLayout,
write_delta: WriteDelta,
) {
let runtime = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()
.unwrap();
runtime.block_on(async move {
let r = ingest(conf, put_size, put_count, key_layout, write_delta).await;
if let Err(e) = r {
panic!("{e:?}");
}
});
}
/// Declare a series of benchmarks for the Pageserver's ingest write path.
///
/// This benchmark does not include WAL decode: it starts at InMemoryLayer::put_value, and ends either
/// at freezing the ephemeral layer, or writing the ephemeral layer out to an L0 (depending on whether WriteDelta is set).
///
/// Genuine disk I/O is used, so expect results to differ depending on storage. However, when running on
/// a fast disk, CPU is the bottleneck at time of writing.
fn criterion_benchmark(c: &mut Criterion) {
let temp_dir_parent: Utf8PathBuf = env::current_dir().unwrap().try_into().unwrap();
let temp_dir = camino_tempfile::tempdir_in(temp_dir_parent).unwrap();
eprintln!("Data directory: {}", temp_dir.path());
let conf: &'static PageServerConf = Box::leak(Box::new(
pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()),
));
virtual_file::init(16384, virtual_file::io_engine_for_bench());
page_cache::init(conf.page_cache_size);
{
let mut group = c.benchmark_group("ingest-small-values");
let put_size = 100usize;
let put_count = 128 * 1024 * 1024 / put_size;
group.throughput(criterion::Throughput::Bytes((put_size * put_count) as u64));
group.sample_size(10);
group.bench_function("ingest 128MB/100b seq", |b| {
b.iter(|| {
ingest_main(
conf,
put_size,
put_count,
KeyLayout::Sequential,
WriteDelta::Yes,
)
})
});
group.bench_function("ingest 128MB/100b rand", |b| {
b.iter(|| {
ingest_main(
conf,
put_size,
put_count,
KeyLayout::Random,
WriteDelta::Yes,
)
})
});
group.bench_function("ingest 128MB/100b rand-1024keys", |b| {
b.iter(|| {
ingest_main(
conf,
put_size,
put_count,
KeyLayout::RandomReuse(0x3ff),
WriteDelta::Yes,
)
})
});
group.bench_function("ingest 128MB/100b seq, no delta", |b| {
b.iter(|| {
ingest_main(
conf,
put_size,
put_count,
KeyLayout::Sequential,
WriteDelta::No,
)
})
});
}
{
let mut group = c.benchmark_group("ingest-big-values");
let put_size = 8192usize;
let put_count = 128 * 1024 * 1024 / put_size;
group.throughput(criterion::Throughput::Bytes((put_size * put_count) as u64));
group.sample_size(10);
group.bench_function("ingest 128MB/8k seq", |b| {
b.iter(|| {
ingest_main(
conf,
put_size,
put_count,
KeyLayout::Sequential,
WriteDelta::Yes,
)
})
});
group.bench_function("ingest 128MB/8k seq, no delta", |b| {
b.iter(|| {
ingest_main(
conf,
put_size,
put_count,
KeyLayout::Sequential,
WriteDelta::No,
)
})
});
}
}
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);
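Assuming the `[[bench]] name = "bench_ingest"` entry added to the Cargo.toml above, these benchmarks should be runnable with `cargo bench --bench bench_ingest`, and a single group can presumably be selected with a criterion filter such as `cargo bench --bench bench_ingest -- ingest-small-values`.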

View File

@@ -123,6 +123,7 @@ fn main() -> anyhow::Result<()> {
// after setting up logging, log the effective IO engine choice and read path implementations
info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings");
info!(?conf.get_impl, "starting with get page implementation");
info!(?conf.get_vectored_impl, "starting with vectored get page implementation");
info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access");

View File

@@ -300,6 +300,9 @@ pub struct PageServerConf {
/// This flag is temporary and will be removed after gradual rollout.
/// See <https://github.com/neondatabase/neon/issues/8184>.
pub compact_level0_phase1_value_access: CompactL0Phase1ValueAccess,
/// Direct IO settings
pub virtual_file_direct_io: virtual_file::DirectIoMode,
}
/// We do not want to store this in a PageServerConf because the latter may be logged
@@ -408,6 +411,8 @@ struct PageServerConfigBuilder {
l0_flush: BuilderValue<L0FlushConfig>,
compact_level0_phase1_value_access: BuilderValue<CompactL0Phase1ValueAccess>,
virtual_file_direct_io: BuilderValue<virtual_file::DirectIoMode>,
}
impl PageServerConfigBuilder {
@@ -498,6 +503,7 @@ impl PageServerConfigBuilder {
ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
l0_flush: Set(L0FlushConfig::default()),
compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()),
virtual_file_direct_io: Set(virtual_file::DirectIoMode::default()),
}
}
}
@@ -685,6 +691,10 @@ impl PageServerConfigBuilder {
self.compact_level0_phase1_value_access = BuilderValue::Set(value);
}
pub fn virtual_file_direct_io(&mut self, value: virtual_file::DirectIoMode) {
self.virtual_file_direct_io = BuilderValue::Set(value);
}
pub fn build(self, id: NodeId) -> anyhow::Result<PageServerConf> {
let default = Self::default_values();
@@ -743,6 +753,7 @@ impl PageServerConfigBuilder {
ephemeral_bytes_per_memory_kb,
l0_flush,
compact_level0_phase1_value_access,
virtual_file_direct_io,
}
CUSTOM LOGIC
{
@@ -1018,6 +1029,9 @@ impl PageServerConf {
"compact_level0_phase1_value_access" => {
builder.compact_level0_phase1_value_access(utils::toml_edit_ext::deserialize_item(item).context("compact_level0_phase1_value_access")?)
}
"virtual_file_direct_io" => {
builder.virtual_file_direct_io(utils::toml_edit_ext::deserialize_item(item).context("virtual_file_direct_io")?)
}
_ => bail!("unrecognized pageserver option '{key}'"),
}
}
@@ -1103,6 +1117,7 @@ impl PageServerConf {
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
l0_flush: L0FlushConfig::default(),
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
virtual_file_direct_io: virtual_file::DirectIoMode::default(),
}
}
}
@@ -1345,6 +1360,7 @@ background_task_maximum_delay = '334 s'
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
l0_flush: L0FlushConfig::default(),
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
virtual_file_direct_io: virtual_file::DirectIoMode::default(),
},
"Correct defaults should be used when no config values are provided"
);
@@ -1420,6 +1436,7 @@ background_task_maximum_delay = '334 s'
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
l0_flush: L0FlushConfig::default(),
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
virtual_file_direct_io: virtual_file::DirectIoMode::default(),
},
"Should be able to parse all basic config values correctly"
);

View File

@@ -308,6 +308,45 @@ paths:
application/json:
schema:
type: string
/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/block_gc:
parameters:
- name: tenant_shard_id
in: path
required: true
schema:
type: string
- name: timeline_id
in: path
required: true
schema:
type: string
format: hex
post:
description: Persistently block gc at the tenant level on behalf of this timeline
responses:
"200":
description: OK
/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/unblock_gc:
parameters:
- name: tenant_shard_id
in: path
required: true
schema:
type: string
- name: timeline_id
in: path
required: true
schema:
type: string
format: hex
post:
description: Persistently remove the tenant-level gc block for this timeline
responses:
"200":
description: OK
/v1/tenant/{tenant_shard_id}/location_config:
parameters:
- name: tenant_shard_id
@@ -893,7 +932,7 @@ components:
description: Whether to poll remote storage for layers to download. If false, secondary locations don't download anything.
ArchivalConfigRequest:
type: object
required
required:
- state
properties:
state:

View File

@@ -935,6 +935,7 @@ async fn tenant_list_handler(
generation: (*gen)
.into()
.expect("Tenants are always attached with a generation"),
gc_blocking: None,
})
.collect::<Vec<TenantInfo>>();
@@ -986,6 +987,7 @@ async fn tenant_status(
.generation()
.into()
.expect("Tenants are always attached with a generation"),
gc_blocking: tenant.gc_block.summary().map(|x| format!("{x:?}")),
},
walredo: tenant.wal_redo_manager_status(),
timelines: tenant.list_timeline_ids(),
@@ -1160,7 +1162,10 @@ async fn layer_map_info_handler(
let timeline =
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
let layer_map_info = timeline.layer_map_info(reset).await;
let layer_map_info = timeline
.layer_map_info(reset)
.await
.map_err(|_shutdown| ApiError::ShuttingDown)?;
json_response(StatusCode::OK, layer_map_info)
}
@@ -1226,6 +1231,72 @@ async fn evict_timeline_layer_handler(
}
}
async fn timeline_gc_blocking_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
block_or_unblock_gc(request, true).await
}
async fn timeline_gc_unblocking_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
block_or_unblock_gc(request, false).await
}
/// Adding a block is `POST ../block_gc`, removing a block is `POST ../unblock_gc`.
///
/// Both are technically unsafe because they might fire off index uploads, thus they are POST.
async fn block_or_unblock_gc(
request: Request<Body>,
block: bool,
) -> Result<Response<Body>, ApiError> {
use crate::tenant::{
remote_timeline_client::WaitCompletionError, upload_queue::NotInitialized,
};
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let state = get_state(&request);
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
let timeline = tenant.get_timeline(timeline_id, true)?;
let fut = async {
if block {
timeline.block_gc(&tenant).await.map(|_| ())
} else {
timeline.unblock_gc(&tenant).await
}
};
let span = tracing::info_span!(
"block_or_unblock_gc",
tenant_id = %tenant_shard_id.tenant_id,
shard_id = %tenant_shard_id.shard_slug(),
timeline_id = %timeline_id,
block = block,
);
let res = fut.instrument(span).await;
res.map_err(|e| {
if e.is::<NotInitialized>() || e.is::<WaitCompletionError>() {
ApiError::ShuttingDown
} else {
ApiError::InternalServerError(e)
}
})?;
json_response(StatusCode::OK, ())
}
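For illustration, a minimal client-side sketch of exercising these routes. `reqwest`, the placeholder base URL and ids, and the omission of authentication are assumptions of the example, not part of this change; the paths and plain 200 responses follow the OpenAPI additions above:

```rust
use reqwest::Client;

// Hypothetical helper: toggles the tenant-level gc block via the new endpoints.
// `base_url`, `tenant_shard_id` and `timeline_id` are placeholders; auth is omitted.
async fn set_gc_block(
    client: &Client,
    base_url: &str,
    tenant_shard_id: &str,
    timeline_id: &str,
    block: bool,
) -> reqwest::Result<()> {
    let suffix = if block { "block_gc" } else { "unblock_gc" };
    let url = format!("{base_url}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/{suffix}");
    // Both routes are POST and reply 200 OK on success.
    client.post(url).send().await?.error_for_status()?;
    Ok(())
}
```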
/// Get tenant_size SVG graph along with the JSON data.
fn synthetic_size_html_response(
inputs: ModelInputs,
@@ -2904,6 +2975,14 @@ pub fn make_router(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name",
|r| api_handler(r, evict_timeline_layer_handler),
)
.post(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/block_gc",
|r| api_handler(r, timeline_gc_blocking_handler),
)
.post(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/unblock_gc",
|r| api_handler(r, timeline_gc_unblocking_handler),
)
.post("/v1/tenant/:tenant_shard_id/heatmap_upload", |r| {
api_handler(r, secondary_upload_handler)
})

View File

@@ -24,7 +24,7 @@ impl Default for L0FlushConfig {
#[derive(Clone)]
pub struct L0FlushGlobalState(Arc<Inner>);
pub(crate) enum Inner {
pub enum Inner {
PageCached,
Direct { semaphore: tokio::sync::Semaphore },
}
@@ -40,7 +40,7 @@ impl L0FlushGlobalState {
}
}
pub(crate) fn inner(&self) -> &Arc<Inner> {
pub fn inner(&self) -> &Arc<Inner> {
&self.0
}
}

View File

@@ -56,7 +56,6 @@ impl Statvfs {
}
pub mod mock {
use anyhow::Context;
use camino::Utf8Path;
use regex::Regex;
use tracing::log::info;
@@ -135,14 +134,30 @@ pub mod mock {
{
continue;
}
total += entry
.metadata()
.with_context(|| format!("get metadata of {:?}", entry.path()))?
.len();
let m = match entry.metadata() {
Ok(m) => m,
Err(e) if is_not_found(&e) => {
// some temp file which got removed right as we are walking
continue;
}
Err(e) => {
return Err(anyhow::Error::new(e)
.context(format!("get metadata of {:?}", entry.path())))
}
};
total += m.len();
}
Ok(total)
}
fn is_not_found(e: &walkdir::Error) -> bool {
let Some(io_error) = e.io_error() else {
return false;
};
let kind = io_error.kind();
matches!(kind, std::io::ErrorKind::NotFound)
}
pub struct Statvfs {
pub blocks: u64,
pub blocks_available: u64,

View File

@@ -148,6 +148,7 @@ pub(crate) mod timeline;
pub mod size;
mod gc_block;
pub(crate) mod throttle;
pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
@@ -303,6 +304,12 @@ pub struct Tenant {
/// An ongoing timeline detach must be checked during attempts to GC or compact a timeline.
ongoing_timeline_detach: std::sync::Mutex<Option<(TimelineId, utils::completion::Barrier)>>,
/// `index_part.json` based gc blocking reason tracking.
///
/// A new gc iteration must acquire the guard from `GcBlock::start` before proceeding.
pub(crate) gc_block: gc_block::GcBlock,
l0_flush_global_state: L0FlushGlobalState,
}
@@ -594,6 +601,12 @@ impl From<PageReconstructError> for GcError {
}
}
impl From<timeline::layer_manager::Shutdown> for GcError {
fn from(_: timeline::layer_manager::Shutdown) -> Self {
GcError::TimelineCancelled
}
}
#[derive(thiserror::Error, Debug)]
pub(crate) enum LoadConfigError {
#[error("TOML deserialization error: '{0}'")]
@@ -703,6 +716,7 @@ impl Tenant {
.read()
.await
.layer_map()
.expect("currently loading, layer manager cannot be shutdown already")
.iter_historic_layers()
.next()
.is_some(),
@@ -1036,6 +1050,8 @@ impl Tenant {
}
}
let mut gc_blocks = HashMap::new();
// For every timeline, download the metadata file, scan the local directory,
// and build a layer map that contains an entry for each remote and local
// layer file.
@@ -1045,6 +1061,16 @@ impl Tenant {
.remove(&timeline_id)
.expect("just put it in above");
if let Some(blocking) = index_part.gc_blocking.as_ref() {
// could just filter these away, but it helps while testing
anyhow::ensure!(
!blocking.reasons.is_empty(),
"index_part for {timeline_id} is malformed: it should not have gc blocking with zero reasons"
);
let prev = gc_blocks.insert(timeline_id, blocking.reasons);
assert!(prev.is_none());
}
// TODO again handle early failure
self.load_remote_timeline(
timeline_id,
@@ -1089,6 +1115,8 @@ impl Tenant {
// IndexPart is the source of truth.
self.clean_up_timelines(&existent_timelines)?;
self.gc_block.set_scanned(gc_blocks);
fail::fail_point!("attach-before-activate", |_| {
anyhow::bail!("attach-before-activate");
});
@@ -1679,6 +1707,14 @@ impl Tenant {
}
}
let _guard = match self.gc_block.start().await {
Ok(guard) => guard,
Err(reasons) => {
info!("Skipping GC: {reasons}");
return Ok(GcResult::default());
}
};
self.gc_iteration_internal(target_timeline_id, horizon, pitr, cancel, ctx)
.await
}
@@ -2691,6 +2727,7 @@ impl Tenant {
)),
tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
ongoing_timeline_detach: std::sync::Mutex::default(),
gc_block: Default::default(),
l0_flush_global_state,
}
}
@@ -2975,54 +3012,6 @@ impl Tenant {
// because that will stall branch creation.
let gc_cs = self.gc_cs.lock().await;
// Paranoia check: it is critical that GcInfo's list of child timelines is correct, to avoid incorrectly GC'ing data they
// depend on. So although GcInfo is updated continuously by Timeline::new and Timeline::drop, we also calculate it here
// and fail out if it's inaccurate.
// (this can be removed later, it's a risk mitigation for https://github.com/neondatabase/neon/pull/8427)
{
let mut all_branchpoints: BTreeMap<TimelineId, Vec<(Lsn, TimelineId)>> =
BTreeMap::new();
timelines.iter().for_each(|timeline| {
if let Some(ancestor_timeline_id) = &timeline.get_ancestor_timeline_id() {
let ancestor_children =
all_branchpoints.entry(*ancestor_timeline_id).or_default();
ancestor_children.push((timeline.get_ancestor_lsn(), timeline.timeline_id));
}
});
for timeline in &timelines {
let mut branchpoints: Vec<(Lsn, TimelineId)> = all_branchpoints
.remove(&timeline.timeline_id)
.unwrap_or_default();
branchpoints.sort_by_key(|b| b.0);
let target = timeline.gc_info.read().unwrap();
// We require that retain_lsns contains everything in `branchpoints`, but not that
// they are exactly equal: timeline deletions can race with us, so retain_lsns
// may contain some extra stuff. It is safe to have extra timelines in there, because it
// just means that we retain slightly more data than we otherwise might.
let have_branchpoints = target.retain_lsns.iter().copied().collect::<HashSet<_>>();
for b in &branchpoints {
if !have_branchpoints.contains(b) {
tracing::error!(
"Bug: `retain_lsns` is set incorrectly. Expected be {:?}, but found {:?}",
branchpoints,
target.retain_lsns
);
debug_assert!(false);
// Do not GC based on bad information!
// (ab-use an existing GcError type rather than adding a new one, since this is a
// "should never happen" check that will be removed soon).
return Err(GcError::Remote(anyhow::anyhow!(
"retain_lsns failed validation!"
)));
}
}
}
}
// Ok, we now know all the branch points.
// Update the GC information for each timeline.
let mut gc_timelines = Vec::with_capacity(timelines.len());
@@ -4092,7 +4081,7 @@ pub(crate) mod harness {
#[cfg(test)]
mod tests {
use std::collections::BTreeMap;
use std::collections::{BTreeMap, BTreeSet};
use super::*;
use crate::keyspace::KeySpaceAccum;
@@ -4644,10 +4633,10 @@ mod tests {
let layer_map = tline.layers.read().await;
let level0_deltas = layer_map
.layer_map()
.get_level0_deltas()
.into_iter()
.map(|desc| layer_map.get_from_desc(&desc))
.layer_map()?
.level0_deltas()
.iter()
.map(|desc| layer_map.get_from_desc(desc))
.collect::<Vec<_>>();
assert!(!level0_deltas.is_empty());
@@ -4767,7 +4756,7 @@ mod tests {
lsn: Lsn,
repeat: usize,
key_count: usize,
) -> anyhow::Result<()> {
) -> anyhow::Result<HashMap<Key, BTreeSet<Lsn>>> {
let compact = true;
bulk_insert_maybe_compact_gc(tenant, timeline, ctx, lsn, repeat, key_count, compact).await
}
@@ -4780,7 +4769,9 @@ mod tests {
repeat: usize,
key_count: usize,
compact: bool,
) -> anyhow::Result<()> {
) -> anyhow::Result<HashMap<Key, BTreeSet<Lsn>>> {
let mut inserted: HashMap<Key, BTreeSet<Lsn>> = Default::default();
let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
let mut blknum = 0;
@@ -4801,6 +4792,7 @@ mod tests {
ctx,
)
.await?;
inserted.entry(test_key).or_default().insert(lsn);
writer.finish_write(lsn);
drop(writer);
@@ -4825,7 +4817,7 @@ mod tests {
assert_eq!(res.layers_removed, 0, "this never removes anything");
}
Ok(())
Ok(inserted)
}
//
@@ -4872,14 +4864,16 @@ mod tests {
.await?;
let lsn = Lsn(0x10);
bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?;
let inserted = bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?;
let guard = tline.layers.read().await;
guard.layer_map().dump(true, &ctx).await?;
let lm = guard.layer_map()?;
lm.dump(true, &ctx).await?;
let mut reads = Vec::new();
let mut prev = None;
guard.layer_map().iter_historic_layers().for_each(|desc| {
lm.iter_historic_layers().for_each(|desc| {
if !desc.is_delta() {
prev = Some(desc.clone());
return;
@@ -4933,9 +4927,39 @@ mod tests {
&ctx,
)
.await;
tline
.validate_get_vectored_impl(&vectored_res, read, reads_lsn, &ctx)
.await;
let mut expected_lsns: HashMap<Key, Lsn> = Default::default();
let mut expect_missing = false;
let mut key = read.start().unwrap();
while key != read.end().unwrap() {
if let Some(lsns) = inserted.get(&key) {
let expected_lsn = lsns.iter().rfind(|lsn| **lsn <= reads_lsn);
match expected_lsn {
Some(lsn) => {
expected_lsns.insert(key, *lsn);
}
None => {
expect_missing = true;
break;
}
}
} else {
expect_missing = true;
break;
}
key = key.next();
}
if expect_missing {
assert!(matches!(vectored_res, Err(GetVectoredError::MissingKey(_))));
} else {
for (key, image) in vectored_res? {
let expected_lsn = expected_lsns.get(&key).expect("determined above");
let expected_image = test_img(&format!("{} at {}", key.field6, expected_lsn));
assert_eq!(image?, expected_image);
}
}
}
Ok(())
@@ -4985,10 +5009,6 @@ mod tests {
)
.await;
child_timeline
.validate_get_vectored_impl(&vectored_res, aux_keyspace, read_lsn, &ctx)
.await;
let images = vectored_res?;
assert!(images.is_empty());
Ok(())
@@ -5859,23 +5879,12 @@ mod tests {
tline.freeze_and_flush().await?; // force create a delta layer
}
let before_num_l0_delta_files = tline
.layers
.read()
.await
.layer_map()
.get_level0_deltas()
.len();
let before_num_l0_delta_files =
tline.layers.read().await.layer_map()?.level0_deltas().len();
tline.compact(&cancel, EnumSet::empty(), &ctx).await?;
let after_num_l0_delta_files = tline
.layers
.read()
.await
.layer_map()
.get_level0_deltas()
.len();
let after_num_l0_delta_files = tline.layers.read().await.layer_map()?.level0_deltas().len();
assert!(after_num_l0_delta_files < before_num_l0_delta_files, "after_num_l0_delta_files={after_num_l0_delta_files}, before_num_l0_delta_files={before_num_l0_delta_files}");
@@ -6899,7 +6908,10 @@ mod tests {
}
let cancel = CancellationToken::new();
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
tline
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
for (idx, expected) in expected_result.iter().enumerate() {
assert_eq!(
@@ -6993,7 +7005,10 @@ mod tests {
guard.cutoffs.time = Lsn(0x40);
guard.cutoffs.space = Lsn(0x40);
}
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
tline
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
Ok(())
}
@@ -7327,7 +7342,10 @@ mod tests {
}
let cancel = CancellationToken::new();
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
tline
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
for idx in 0..10 {
assert_eq!(
@@ -7353,7 +7371,10 @@ mod tests {
guard.cutoffs.time = Lsn(0x40);
guard.cutoffs.space = Lsn(0x40);
}
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
tline
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
Ok(())
}
@@ -7898,11 +7919,28 @@ mod tests {
verify_result().await;
let cancel = CancellationToken::new();
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
let mut dryrun_flags = EnumSet::new();
dryrun_flags.insert(CompactFlags::DryRun);
tline
.compact_with_gc(&cancel, dryrun_flags, &ctx)
.await
.unwrap();
// We expect the layer map to be the same because of the dry run flag, but we don't know whether there will be other background jobs
// cleaning things up, and therefore we don't do sanity checks on the layer map during unit tests.
verify_result().await;
tline
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
verify_result().await;
// compact again
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
tline
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
verify_result().await;
// increase GC horizon and compact again
@@ -7912,11 +7950,17 @@ mod tests {
guard.cutoffs.time = Lsn(0x38);
guard.cutoffs.space = Lsn(0x38);
}
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
tline
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
verify_result().await; // no wals between 0x30 and 0x38, so we should obtain the same result
// not increasing the GC horizon and compact again
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
tline
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
verify_result().await;
Ok(())
@@ -8097,7 +8141,10 @@ mod tests {
verify_result().await;
let cancel = CancellationToken::new();
branch_tline.compact_with_gc(&cancel, &ctx).await.unwrap();
branch_tline
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
verify_result().await;

View File

@@ -29,6 +29,7 @@ impl EphemeralFile {
conf: &PageServerConf,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
gate_guard: utils::sync::gate::GateGuard,
ctx: &RequestContext,
) -> Result<EphemeralFile, io::Error> {
static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1);
@@ -51,10 +52,12 @@ impl EphemeralFile {
)
.await?;
let prewarm = conf.l0_flush.prewarm_on_write();
Ok(EphemeralFile {
_tenant_shard_id: tenant_shard_id,
_timeline_id: timeline_id,
rw: page_caching::RW::new(file, conf.l0_flush.prewarm_on_write()),
rw: page_caching::RW::new(file, prewarm, gate_guard),
})
}
@@ -161,7 +164,11 @@ mod tests {
async fn test_ephemeral_blobs() -> Result<(), io::Error> {
let (conf, tenant_id, timeline_id, ctx) = harness("ephemeral_blobs")?;
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &ctx).await?;
let gate = utils::sync::gate::Gate::default();
let entered = gate.enter().unwrap();
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, entered, &ctx).await?;
let pos_foo = file.write_blob(b"foo", &ctx).await?;
assert_eq!(
@@ -215,4 +222,38 @@ mod tests {
Ok(())
}
#[tokio::test]
async fn ephemeral_file_holds_gate_open() {
const FOREVER: std::time::Duration = std::time::Duration::from_secs(5);
let (conf, tenant_id, timeline_id, ctx) =
harness("ephemeral_file_holds_gate_open").unwrap();
let gate = utils::sync::gate::Gate::default();
let file = EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
.await
.unwrap();
let mut closing = tokio::task::spawn(async move {
gate.close().await;
});
// gate is entered until the ephemeral file is dropped
// do not start paused: tokio-epoll-uring has a sleep loop
tokio::time::pause();
tokio::time::timeout(FOREVER, &mut closing)
.await
.expect_err("closing cannot complete before dropping");
// this is a requirement of the reset_tenant functionality: we have to be able to restart a
// tenant fast, and for that, we need all tenant_dir operations to be guarded by entering a gate
drop(file);
tokio::time::timeout(FOREVER, &mut closing)
.await
.expect("closing completes right away")
.expect("closing does not panic");
}
}
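The comments above spell out the gate contract: `close()` must not complete while any guard is alive. As a reading aid, here is a minimal sketch of that idiom built on a tokio mpsc channel; it is not the `utils::sync::gate` implementation, just the pattern it relies on:

```rust
use tokio::sync::mpsc;

// Sketch: each guard holds a Sender clone; close() finishes only once every guard is dropped.
struct Gate {
    keep_open: mpsc::Sender<()>,
    closed: mpsc::Receiver<()>,
}

struct GateGuard {
    _keep_open: mpsc::Sender<()>,
}

impl Gate {
    fn new() -> Self {
        // The channel never carries messages; it only signals when all Sender clones are gone.
        let (tx, rx) = mpsc::channel(1);
        Gate { keep_open: tx, closed: rx }
    }

    fn enter(&self) -> GateGuard {
        GateGuard { _keep_open: self.keep_open.clone() }
    }

    async fn close(mut self) {
        drop(self.keep_open); // stop handing out new guards
        let _ = self.closed.recv().await; // resolves with None once every guard is dropped
    }
}

#[tokio::main]
async fn main() {
    let gate = Gate::new();
    let guard = gate.enter();
    let closing = tokio::spawn(gate.close());
    drop(guard); // like dropping the EphemeralFile in the test above
    closing.await.unwrap(); // close() can now complete
}
```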

View File

@@ -18,6 +18,8 @@ use super::zero_padded_read_write;
pub struct RW {
page_cache_file_id: page_cache::FileId,
rw: super::zero_padded_read_write::RW<PreWarmingWriter>,
/// The gate guard is held for as long as we need to do operations on the path (delete on drop).
_gate_guard: utils::sync::gate::GateGuard,
}
/// When we flush a block to the underlying [`crate::virtual_file::VirtualFile`],
@@ -29,7 +31,11 @@ pub enum PrewarmOnWrite {
}
impl RW {
pub fn new(file: VirtualFile, prewarm_on_write: PrewarmOnWrite) -> Self {
pub fn new(
file: VirtualFile,
prewarm_on_write: PrewarmOnWrite,
_gate_guard: utils::sync::gate::GateGuard,
) -> Self {
let page_cache_file_id = page_cache::next_file_id();
Self {
page_cache_file_id,
@@ -38,6 +44,7 @@ impl RW {
file,
prewarm_on_write,
)),
_gate_guard,
}
}
@@ -145,6 +152,7 @@ impl Drop for RW {
// We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed.
// unlink the file
// we are clear to do this, because we have entered a gate
let res = std::fs::remove_file(&self.rw.as_writer().file.path);
if let Err(e) = res {
if e.kind() != std::io::ErrorKind::NotFound {

View File

@@ -0,0 +1,213 @@
use std::collections::HashMap;
use utils::id::TimelineId;
use super::remote_timeline_client::index::GcBlockingReason;
type Storage = HashMap<TimelineId, enumset::EnumSet<GcBlockingReason>>;
#[derive(Default)]
pub(crate) struct GcBlock {
/// The timelines which have current reasons to block gc.
///
/// LOCK ORDER: this is held locked while scheduling the next index_part update. This is done
/// to keep this field up to date with RemoteTimelineClient `upload_queue.dirty`.
reasons: std::sync::Mutex<Storage>,
blocking: tokio::sync::Mutex<()>,
}
impl GcBlock {
/// Start another gc iteration.
///
/// Returns a guard to be held for the duration of the gc iteration to allow synchronizing with
/// its ending, or if not currently possible, a value describing the reasons why not.
///
/// Cancellation safe.
pub(super) async fn start(&self) -> Result<Guard<'_>, BlockingReasons> {
let reasons = {
let g = self.reasons.lock().unwrap();
// TODO: the assumption is that this method gets called periodically. in prod, we use 1h, in
// tests, we use everything. we should warn if the gc has been consecutively blocked
// for more than 1h (within single tenant session?).
BlockingReasons::clean_and_summarize(g)
};
if let Some(reasons) = reasons {
Err(reasons)
} else {
Ok(Guard {
_inner: self.blocking.lock().await,
})
}
}
pub(crate) fn summary(&self) -> Option<BlockingReasons> {
let g = self.reasons.lock().unwrap();
BlockingReasons::summarize(&g)
}
/// Start blocking gc for this one timeline for the given reason.
///
/// This is not a guard-based API; instead it mimics a set API. The returned future will not
/// resolve until an existing gc round has completed.
///
/// Returns true if this block was new, false if gc was already blocked for this reason.
///
/// Cancellation safe: cancelling after the first poll will still leave the gc blocking reason
/// in place.
pub(crate) async fn insert(
&self,
timeline: &super::Timeline,
reason: GcBlockingReason,
) -> anyhow::Result<bool> {
let (added, uploaded) = {
let mut g = self.reasons.lock().unwrap();
let set = g.entry(timeline.timeline_id).or_default();
let added = set.insert(reason);
// LOCK ORDER: intentionally hold the lock, see self.reasons.
let uploaded = timeline
.remote_client
.schedule_insert_gc_block_reason(reason)?;
(added, uploaded)
};
uploaded.await?;
// ensure that any ongoing gc iteration has completed
drop(self.blocking.lock().await);
Ok(added)
}
/// Remove blocking gc for this one timeline and the given reason.
pub(crate) async fn remove(
&self,
timeline: &super::Timeline,
reason: GcBlockingReason,
) -> anyhow::Result<()> {
use std::collections::hash_map::Entry;
super::span::debug_assert_current_span_has_tenant_and_timeline_id();
let (remaining_blocks, uploaded) = {
let mut g = self.reasons.lock().unwrap();
match g.entry(timeline.timeline_id) {
Entry::Occupied(mut oe) => {
let set = oe.get_mut();
set.remove(reason);
if set.is_empty() {
oe.remove();
}
}
Entry::Vacant(_) => {
// we must still do the index_part.json update regardless, in case we had earlier
// been cancelled
}
}
let remaining_blocks = g.len();
// LOCK ORDER: intentionally hold the lock while scheduling; see self.reasons
let uploaded = timeline
.remote_client
.schedule_remove_gc_block_reason(reason)?;
(remaining_blocks, uploaded)
};
uploaded.await?;
// no need to synchronize with gc iteration again
if remaining_blocks > 0 {
tracing::info!(remaining_blocks, removed=?reason, "gc blocking removed, but gc remains blocked");
} else {
tracing::info!("gc is now unblocked for the tenant");
}
Ok(())
}
pub(crate) fn before_delete(&self, timeline: &super::Timeline) {
let unblocked = {
let mut g = self.reasons.lock().unwrap();
if g.is_empty() {
return;
}
g.remove(&timeline.timeline_id);
BlockingReasons::clean_and_summarize(g).is_none()
};
if unblocked {
tracing::info!("gc is now unblocked following deletion");
}
}
/// Initialize with the non-deleted timelines of this tenant.
pub(crate) fn set_scanned(&self, scanned: Storage) {
let mut g = self.reasons.lock().unwrap();
assert!(g.is_empty());
g.extend(scanned.into_iter().filter(|(_, v)| !v.is_empty()));
if let Some(reasons) = BlockingReasons::clean_and_summarize(g) {
tracing::info!(summary=?reasons, "initialized with gc blocked");
}
}
}
pub(super) struct Guard<'a> {
_inner: tokio::sync::MutexGuard<'a, ()>,
}
#[derive(Debug)]
pub(crate) struct BlockingReasons {
timelines: usize,
reasons: enumset::EnumSet<GcBlockingReason>,
}
impl std::fmt::Display for BlockingReasons {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"{} timelines block for {:?}",
self.timelines, self.reasons
)
}
}
impl BlockingReasons {
fn clean_and_summarize(mut g: std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
let mut reasons = enumset::EnumSet::empty();
g.retain(|_key, value| {
reasons = reasons.union(*value);
!value.is_empty()
});
if !g.is_empty() {
Some(BlockingReasons {
timelines: g.len(),
reasons,
})
} else {
None
}
}
fn summarize(g: &std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
if g.is_empty() {
None
} else {
let reasons = g
.values()
.fold(enumset::EnumSet::empty(), |acc, next| acc.union(*next));
Some(BlockingReasons {
timelines: g.len(),
reasons,
})
}
}
}
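As a reading aid, a stripped-down sketch of the locking pattern used here: a synchronous mutex guards the reason map, while an async mutex stands in for "a gc iteration is in flight" so that blockers can wait out an ongoing iteration. Types are simplified (static strings and bit flags instead of `TimelineId`/`enumset`), and the index_part upload scheduling is omitted:

```rust
use std::collections::HashMap;
use std::sync::Mutex;

#[derive(Default)]
struct GcBlockSketch {
    // reason bits per timeline, protected by a cheap synchronous mutex
    reasons: Mutex<HashMap<&'static str, u32>>,
    // held for the duration of a gc iteration
    running: tokio::sync::Mutex<()>,
}

impl GcBlockSketch {
    /// Start a gc iteration, or report how many timelines currently block it.
    async fn start(&self) -> Result<tokio::sync::MutexGuard<'_, ()>, usize> {
        let blocked = self.reasons.lock().unwrap().len();
        if blocked > 0 {
            Err(blocked)
        } else {
            Ok(self.running.lock().await)
        }
    }

    /// Record a blocking reason, then wait out any gc iteration already in flight.
    async fn insert(&self, timeline: &'static str, reason_bit: u32) {
        {
            let mut g = self.reasons.lock().unwrap();
            *g.entry(timeline).or_default() |= reason_bit;
        }
        drop(self.running.lock().await);
    }
}
```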

View File

@@ -846,8 +846,8 @@ impl LayerMap {
}
/// Return all L0 delta layers
pub fn get_level0_deltas(&self) -> Vec<Arc<PersistentLayerDesc>> {
self.l0_delta_layers.to_vec()
pub fn level0_deltas(&self) -> &Vec<Arc<PersistentLayerDesc>> {
&self.l0_delta_layers
}
/// debugging function to print out the contents of the layer map

View File

@@ -13,7 +13,7 @@ use pageserver_api::upcall_api::ReAttachResponseTenant;
use rand::{distributions::Alphanumeric, Rng};
use std::borrow::Cow;
use std::cmp::Ordering;
use std::collections::{BTreeMap, HashMap};
use std::collections::{BTreeMap, HashMap, HashSet};
use std::ops::Deref;
use std::sync::Arc;
use std::time::Duration;
@@ -224,21 +224,8 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<U
}
/// See [`Self::spawn`].
#[derive(Clone)]
pub struct BackgroundPurges(Arc<std::sync::Mutex<BackgroundPurgesInner>>);
enum BackgroundPurgesInner {
Open(tokio::task::JoinSet<()>),
// we use the async mutex for coalescing
ShuttingDown(Arc<tokio::sync::Mutex<tokio::task::JoinSet<()>>>),
}
impl Default for BackgroundPurges {
fn default() -> Self {
Self(Arc::new(std::sync::Mutex::new(
BackgroundPurgesInner::Open(JoinSet::new()),
)))
}
}
#[derive(Clone, Default)]
pub struct BackgroundPurges(tokio_util::task::TaskTracker);
impl BackgroundPurges {
/// When we have moved a tenant's content to a temporary directory, we may delete it lazily in
@@ -247,24 +234,32 @@ impl BackgroundPurges {
/// Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
/// Hence the [`BackgroundPurges`] type, which keeps track of these tasks.
pub fn spawn(&self, tmp_path: Utf8PathBuf) {
let mut guard = self.0.lock().unwrap();
let jset = match &mut *guard {
BackgroundPurgesInner::Open(ref mut jset) => jset,
BackgroundPurgesInner::ShuttingDown(_) => {
warn!("trying to spawn background purge during shutdown, ignoring");
return;
// because on shutdown we close and wait, we are misusing TaskTracker a bit.
//
// so first acquire a token, then check if the tracker has been closed. the tracker might get closed
// right after, but at least the shutdown will wait for what we are spawning next.
let token = self.0.token();
if self.0.is_closed() {
warn!(
%tmp_path,
"trying to spawn background purge during shutdown, ignoring"
);
return;
}
let span = info_span!(parent: None, "background_purge", %tmp_path);
let task = move || {
let _token = token;
let _entered = span.entered();
if let Err(error) = std::fs::remove_dir_all(tmp_path.as_path()) {
// should we fatal_io_error here?
warn!(%error, "failed to purge tenant directory");
}
};
jset.spawn_on(
async move {
if let Err(error) = fs::remove_dir_all(tmp_path.as_path()).await {
// should we fatal_io_error here?
warn!(%error, path=%tmp_path, "failed to purge tenant directory");
}
}
.instrument(info_span!(parent: None, "background_purge")),
BACKGROUND_RUNTIME.handle(),
);
BACKGROUND_RUNTIME.spawn_blocking(task);
}
/// When this future completes, all background purges have completed.
@@ -278,42 +273,9 @@ impl BackgroundPurges {
/// instances of this future will continue to be correct.
#[instrument(skip_all)]
pub async fn shutdown(&self) {
let jset = {
let mut guard = self.0.lock().unwrap();
match &mut *guard {
BackgroundPurgesInner::Open(jset) => {
*guard = BackgroundPurgesInner::ShuttingDown(Arc::new(tokio::sync::Mutex::new(
std::mem::take(jset),
)))
}
BackgroundPurgesInner::ShuttingDown(_) => {
// calling shutdown multiple times is most likely a bug in pageserver shutdown code
warn!("already shutting down");
}
};
match &mut *guard {
BackgroundPurgesInner::ShuttingDown(ref mut jset) => jset.clone(),
BackgroundPurgesInner::Open(_) => {
unreachable!("above code transitions into shut down state");
}
}
};
let mut jset = jset.lock().await; // concurrent callers coalesce here
while let Some(res) = jset.join_next().await {
match res {
Ok(()) => {}
Err(e) if e.is_panic() => {
// If it panicked, the error is already logged by the panic hook.
}
Err(e) if e.is_cancelled() => {
unreachable!("we don't cancel the joinset or runtime")
}
Err(e) => {
// No idea when this can happen, but let's log it.
warn!(%e, "background purge task failed or panicked");
}
}
}
// forbid new tasks (can be called many times)
self.0.close();
self.0.wait().await;
}
}
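As a reading aid, the token-then-check idiom above in isolation; the filesystem work is a placeholder and the function name is made up:

```rust
use tokio_util::task::TaskTracker;

// Take a token first, then check for shutdown: if close() races in between,
// wait() still blocks until this token is dropped.
fn spawn_purge(tracker: &TaskTracker) {
    let token = tracker.token();
    if tracker.is_closed() {
        return; // token dropped here, nothing spawned
    }
    tokio::task::spawn_blocking(move || {
        let _token = token; // held until the (placeholder) purge finishes
        // std::fs::remove_dir_all(tmp_path) would go here
    });
}

#[tokio::main]
async fn main() {
    let tracker = TaskTracker::new();
    spawn_purge(&tracker);
    tracker.close(); // refuse new tasks (idempotent)
    tracker.wait().await; // completes once every outstanding token is dropped
}
```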
@@ -1767,14 +1729,9 @@ impl TenantManager {
let parent_timelines = timelines.keys().cloned().collect::<Vec<_>>();
for timeline in timelines.values() {
tracing::info!(timeline_id=%timeline.timeline_id, "Loading list of layers to hardlink");
let timeline_layers = timeline
.layers
.read()
.await
.likely_resident_layers()
.collect::<Vec<_>>();
let layers = timeline.layers.read().await;
for layer in timeline_layers {
for layer in layers.likely_resident_layers() {
let relative_path = layer
.local_path()
.strip_prefix(&parent_path)
@@ -1971,7 +1928,8 @@ impl TenantManager {
timeline_id: TimelineId,
prepared: PreparedTimelineDetach,
ctx: &RequestContext,
) -> Result<Vec<TimelineId>, anyhow::Error> {
) -> Result<HashSet<TimelineId>, anyhow::Error> {
// FIXME: this is unnecessary, slotguard already has these semantics
struct RevertOnDropSlot(Option<SlotGuard>);
impl Drop for RevertOnDropSlot {

View File

@@ -800,6 +800,123 @@ impl RemoteTimelineClient {
.context("wait completion")
}
/// Adds a gc blocking reason for this timeline if one does not exist already.
///
/// A retryable step of timeline detach ancestor.
///
/// Returns a future which waits until the completion of the upload.
pub(crate) fn schedule_insert_gc_block_reason(
self: &Arc<Self>,
reason: index::GcBlockingReason,
) -> Result<impl std::future::Future<Output = Result<(), WaitCompletionError>>, NotInitialized>
{
let maybe_barrier = {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
if let index::GcBlockingReason::DetachAncestor = reason {
if upload_queue.dirty.metadata.ancestor_timeline().is_none() {
drop(guard);
panic!("cannot start detach ancestor if there is nothing to detach from");
}
}
let wanted = |x: Option<&index::GcBlocking>| x.is_some_and(|x| x.blocked_by(reason));
let current = upload_queue.dirty.gc_blocking.as_ref();
let uploaded = upload_queue.clean.0.gc_blocking.as_ref();
match (current, uploaded) {
(x, y) if wanted(x) && wanted(y) => None,
(x, y) if wanted(x) && !wanted(y) => Some(self.schedule_barrier0(upload_queue)),
// Usual case: !wanted(x) && !wanted(y)
//
// Unusual: !wanted(x) && wanted(y) which means we have two processes waiting to
// turn on and off some reason.
(x, y) => {
if !wanted(x) && wanted(y) {
// this could be avoided by having external in-memory synchronization, like
// timeline detach ancestor
warn!(?reason, op="insert", "unexpected: two racing processes to enable and disable a gc blocking reason");
}
// at this point, the metadata must always show that there is a parent
upload_queue.dirty.gc_blocking = current
.map(|x| x.with_reason(reason))
.or_else(|| Some(index::GcBlocking::started_now_for(reason)));
self.schedule_index_upload(upload_queue)?;
Some(self.schedule_barrier0(upload_queue))
}
}
};
Ok(async move {
if let Some(barrier) = maybe_barrier {
Self::wait_completion0(barrier).await?;
}
Ok(())
})
}
/// Removes a gc blocking reason for this timeline if one exists.
///
/// A retryable step of timeline detach ancestor.
///
/// Returns a future which waits until the completion of the upload.
pub(crate) fn schedule_remove_gc_block_reason(
self: &Arc<Self>,
reason: index::GcBlockingReason,
) -> Result<impl std::future::Future<Output = Result<(), WaitCompletionError>>, NotInitialized>
{
let maybe_barrier = {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
if let index::GcBlockingReason::DetachAncestor = reason {
if !upload_queue
.clean
.0
.lineage
.is_detached_from_original_ancestor()
{
drop(guard);
panic!("cannot complete timeline_ancestor_detach while not detached");
}
}
let wanted = |x: Option<&index::GcBlocking>| {
x.is_none() || x.is_some_and(|b| !b.blocked_by(reason))
};
let current = upload_queue.dirty.gc_blocking.as_ref();
let uploaded = upload_queue.clean.0.gc_blocking.as_ref();
match (current, uploaded) {
(x, y) if wanted(x) && wanted(y) => None,
(x, y) if wanted(x) && !wanted(y) => Some(self.schedule_barrier0(upload_queue)),
(x, y) => {
if !wanted(x) && wanted(y) {
warn!(?reason, op="remove", "unexpected: two racing processes to enable and disable a gc blocking reason (remove)");
}
upload_queue.dirty.gc_blocking =
current.as_ref().and_then(|x| x.without_reason(reason));
assert!(wanted(upload_queue.dirty.gc_blocking.as_ref()));
// FIXME: bogus ?
self.schedule_index_upload(upload_queue)?;
Some(self.schedule_barrier0(upload_queue))
}
}
};
Ok(async move {
if let Some(barrier) = maybe_barrier {
Self::wait_completion0(barrier).await?;
}
Ok(())
})
}
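As a reading aid, the `(current, uploaded)` match in both scheduling functions above reduces to the following decision; the names are illustrative (`dirty` is the queued index_part state, `clean` the last uploaded one):

```rust
// Illustrative reduction of the match arms above.
#[derive(Debug, PartialEq)]
enum Action {
    Nothing,                    // dirty and clean are both already in the wanted state
    WaitForQueuedUpload,        // dirty already wanted; just wait behind a barrier
    ScheduleIndexUploadAndWait, // mutate dirty, schedule an index upload, then wait
}

fn decide(dirty_wanted: bool, clean_wanted: bool) -> Action {
    match (dirty_wanted, clean_wanted) {
        (true, true) => Action::Nothing,
        (true, false) => Action::WaitForQueuedUpload,
        // (false, true) is the unexpected "two racing processes" case the code warns about,
        // but it is handled the same way as (false, false).
        (false, _) => Action::ScheduleIndexUploadAndWait,
    }
}

fn main() {
    assert_eq!(decide(true, false), Action::WaitForQueuedUpload);
}
```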
/// Launch an upload operation in the background; the file is added to be included in next
/// `index_part.json` upload.
pub(crate) fn schedule_layer_file_upload(

View File

@@ -60,6 +60,9 @@ pub struct IndexPart {
#[serde(default)]
pub(crate) lineage: Lineage,
#[serde(skip_serializing_if = "Option::is_none", default)]
pub(crate) gc_blocking: Option<GcBlocking>,
/// Describes the kind of aux files stored in the timeline.
///
/// The value is modified during file ingestion when the latest wanted value communicated via tenant config is applied if it is acceptable.
@@ -85,10 +88,11 @@ impl IndexPart {
/// - 6: last_aux_file_policy is added.
/// - 7: metadata_bytes is no longer written, but still read
/// - 8: added `archived_at`
const LATEST_VERSION: usize = 8;
/// - 9: +gc_blocking
const LATEST_VERSION: usize = 9;
// Versions we may see when reading from a bucket.
pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8];
pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9];
pub const FILE_NAME: &'static str = "index_part.json";
@@ -101,6 +105,7 @@ impl IndexPart {
deleted_at: None,
archived_at: None,
lineage: Default::default(),
gc_blocking: None,
last_aux_file_policy: None,
}
}
@@ -251,6 +256,64 @@ impl Lineage {
}
}
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub(crate) struct GcBlocking {
pub(crate) started_at: NaiveDateTime,
pub(crate) reasons: enumset::EnumSet<GcBlockingReason>,
}
#[derive(Debug, enumset::EnumSetType, serde::Serialize, serde::Deserialize)]
#[enumset(serialize_repr = "list")]
pub(crate) enum GcBlockingReason {
Manual,
DetachAncestor,
}
impl GcBlocking {
pub(super) fn started_now_for(reason: GcBlockingReason) -> Self {
GcBlocking {
started_at: chrono::Utc::now().naive_utc(),
reasons: enumset::EnumSet::only(reason),
}
}
/// Returns true if the given reason is one of the reasons why the gc is blocked.
pub(crate) fn blocked_by(&self, reason: GcBlockingReason) -> bool {
self.reasons.contains(reason)
}
/// Returns a version of self with the given reason.
pub(super) fn with_reason(&self, reason: GcBlockingReason) -> Self {
assert!(!self.blocked_by(reason));
let mut reasons = self.reasons;
reasons.insert(reason);
Self {
started_at: self.started_at,
reasons,
}
}
/// Returns a version of self without the given reason. Assumption is that if
/// there are no more reasons, we can unblock the gc by returning `None`.
pub(super) fn without_reason(&self, reason: GcBlockingReason) -> Option<Self> {
assert!(self.blocked_by(reason));
if self.reasons.len() == 1 {
None
} else {
let mut reasons = self.reasons;
assert!(reasons.remove(reason));
assert!(!reasons.is_empty());
Some(Self {
started_at: self.started_at,
reasons,
})
}
}
}
#[cfg(test)]
mod tests {
use super::*;
@@ -292,6 +355,7 @@ mod tests {
deleted_at: None,
archived_at: None,
lineage: Lineage::default(),
gc_blocking: None,
last_aux_file_policy: None,
};
@@ -335,6 +399,7 @@ mod tests {
deleted_at: None,
archived_at: None,
lineage: Lineage::default(),
gc_blocking: None,
last_aux_file_policy: None,
};
@@ -379,6 +444,7 @@ mod tests {
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
archived_at: None,
lineage: Lineage::default(),
gc_blocking: None,
last_aux_file_policy: None,
};
@@ -426,6 +492,7 @@ mod tests {
deleted_at: None,
archived_at: None,
lineage: Lineage::default(),
gc_blocking: None,
last_aux_file_policy: None,
};
@@ -468,6 +535,7 @@ mod tests {
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
archived_at: None,
lineage: Lineage::default(),
gc_blocking: None,
last_aux_file_policy: None,
};
@@ -513,6 +581,7 @@ mod tests {
reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))),
},
gc_blocking: None,
last_aux_file_policy: None,
};
@@ -563,6 +632,7 @@ mod tests {
reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))),
},
gc_blocking: None,
last_aux_file_policy: Some(AuxFilePolicy::V2),
};
@@ -618,6 +688,7 @@ mod tests {
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
archived_at: None,
lineage: Default::default(),
gc_blocking: None,
last_aux_file_policy: Default::default(),
};
@@ -674,6 +745,7 @@ mod tests {
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
archived_at: Some(parse_naive_datetime("2023-04-29T09:00:00.123000000")),
lineage: Default::default(),
gc_blocking: None,
last_aux_file_policy: Default::default(),
};
@@ -681,6 +753,68 @@ mod tests {
assert_eq!(part, expected);
}
#[test]
fn v9_indexpart_is_parsed() {
let example = r#"{
"version": 9,
"layer_metadata":{
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
},
"disk_consistent_lsn":"0/16960E8",
"metadata": {
"disk_consistent_lsn": "0/16960E8",
"prev_record_lsn": "0/1696070",
"ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e",
"ancestor_lsn": "0/0",
"latest_gc_cutoff_lsn": "0/1696070",
"initdb_lsn": "0/1696070",
"pg_version": 14
},
"gc_blocking": {
"started_at": "2024-07-19T09:00:00.123",
"reasons": ["DetachAncestor"]
}
}"#;
let expected = IndexPart {
version: 9,
layer_metadata: HashMap::from([
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
file_size: 25600000,
generation: Generation::none(),
shard: ShardIndex::unsharded()
}),
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
file_size: 9007199254741001,
generation: Generation::none(),
shard: ShardIndex::unsharded()
})
]),
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata: TimelineMetadata::new(
Lsn::from_str("0/16960E8").unwrap(),
Some(Lsn::from_str("0/1696070").unwrap()),
Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()),
Lsn::INVALID,
Lsn::from_str("0/1696070").unwrap(),
Lsn::from_str("0/1696070").unwrap(),
14,
).with_recalculated_checksum().unwrap(),
deleted_at: None,
lineage: Default::default(),
gc_blocking: Some(GcBlocking {
started_at: parse_naive_datetime("2024-07-19T09:00:00.123000000"),
reasons: enumset::EnumSet::from_iter([GcBlockingReason::DetachAncestor]),
}),
last_aux_file_policy: Default::default(),
archived_at: None,
};
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
assert_eq!(part, expected);
}
fn parse_naive_datetime(s: &str) -> NaiveDateTime {
chrono::NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S.%f").unwrap()
}

View File

@@ -55,7 +55,7 @@ use tokio_util::sync::CancellationToken;
use tracing::{info_span, instrument, warn, Instrument};
use utils::{
backoff, completion::Barrier, crashsafe::path_with_suffix_extension, failpoint_support, fs_ext,
id::TimelineId, serde_system_time,
id::TimelineId, pausable_failpoint, serde_system_time,
};
use super::{
@@ -1146,12 +1146,14 @@ impl<'a> TenantDownloader<'a> {
layer: HeatMapLayer,
ctx: &RequestContext,
) -> Result<Option<HeatMapLayer>, UpdateError> {
// Failpoint for simulating slow remote storage
// Failpoints for simulating slow remote storage
failpoint_support::sleep_millis_async!(
"secondary-layer-download-sleep",
&self.secondary_state.cancel
);
pausable_failpoint!("secondary-layer-download-pausable");
let local_path = local_layer_path(
self.conf,
tenant_shard_id,


@@ -435,21 +435,6 @@ impl ReadableLayer {
}
}
/// Return value from [`Layer::get_value_reconstruct_data`]
#[derive(Clone, Copy, Debug)]
pub enum ValueReconstructResult {
/// Got all the data needed to reconstruct the requested page
Complete,
/// This layer didn't contain all the required data, the caller should look up
/// the predecessor layer at the returned LSN and collect more data from there.
Continue,
/// This layer didn't contain data needed to reconstruct the page version at
/// the returned LSN. This is usually considered an error, but might be OK
/// in some circumstances.
Missing,
}
/// Layers contain a hint indicating whether they are likely to be used for reads. This is a hint rather
/// than an authoritative value, so that we do not have to update it synchronously when changing the visibility
/// of layers (for example when creating a branch that makes some previously covered layers visible). It should
@@ -554,19 +539,25 @@ impl LayerAccessStats {
self.record_residence_event_at(SystemTime::now())
}
pub(crate) fn record_access_at(&self, now: SystemTime) {
fn record_access_at(&self, now: SystemTime) -> bool {
let (mut mask, mut value) = Self::to_low_res_timestamp(Self::ATIME_SHIFT, now);
// A layer which is accessed must be visible.
mask |= 0x1 << Self::VISIBILITY_SHIFT;
value |= 0x1 << Self::VISIBILITY_SHIFT;
self.write_bits(mask, value);
let old_bits = self.write_bits(mask, value);
!matches!(
self.decode_visibility(old_bits),
LayerVisibilityHint::Visible
)
}
pub(crate) fn record_access(&self, ctx: &RequestContext) {
/// Returns true if we modified the layer's visibility to set it to Visible implicitly
/// as a result of this access
pub(crate) fn record_access(&self, ctx: &RequestContext) -> bool {
if ctx.access_stats_behavior() == AccessStatsBehavior::Skip {
return;
return false;
}
self.record_access_at(SystemTime::now())
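For context on the hunk above: `record_access_at` now reports whether the access implicitly flipped the layer to Visible. The stats are packed into a single 8-byte word (the `layer_size` test further down asserts `size_of::<LayerAccessStats>() == 8`), so recording an access is one masked atomic write whose previous value tells us whether the visibility bit was already set. Below is a standalone sketch of that idea; the bit layout, constants, and `write_bits` helper are illustrative stand-ins, not the pageserver's actual values.
use std::sync::atomic::{AtomicU64, Ordering};
use std::time::{SystemTime, UNIX_EPOCH};
// Illustrative layout: low 32 bits hold a coarse access timestamp, bit 32 holds
// the visibility flag. The real shifts and field widths differ.
const ATIME_MASK: u64 = 0xFFFF_FFFF;
const VISIBILITY_SHIFT: u32 = 32;
struct AccessStats(AtomicU64);
impl AccessStats {
    fn new() -> Self {
        AccessStats(AtomicU64::new(0))
    }
    // Merge `value` into the packed word under `mask`, returning the previous word.
    fn write_bits(&self, mask: u64, value: u64) -> u64 {
        let mut old = self.0.load(Ordering::Relaxed);
        loop {
            let new = (old & !mask) | (value & mask);
            match self
                .0
                .compare_exchange_weak(old, new, Ordering::Relaxed, Ordering::Relaxed)
            {
                Ok(prev) => return prev,
                Err(prev) => old = prev,
            }
        }
    }
    // Record an access; returns true if this access flipped the layer from
    // "not visible" to "visible", mirroring the new return value in the diff.
    fn record_access_at(&self, now: SystemTime) -> bool {
        let secs = now.duration_since(UNIX_EPOCH).unwrap().as_secs() & ATIME_MASK;
        let mask = ATIME_MASK | (1u64 << VISIBILITY_SHIFT);
        let value = secs | (1u64 << VISIBILITY_SHIFT);
        let old = self.write_bits(mask, value);
        (old & (1u64 << VISIBILITY_SHIFT)) == 0
    }
}
fn main() {
    let stats = AccessStats::new();
    // The first access makes the layer visible and reports the transition...
    assert!(stats.record_access_at(SystemTime::now()));
    // ...subsequent accesses find it already visible.
    assert!(!stats.record_access_at(SystemTime::now()));
}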


@@ -36,13 +36,12 @@ use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, Fi
use crate::tenant::disk_btree::{
DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection,
};
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::vectored_blob_io::{
BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
VectoredReadPlanner,
};
use crate::tenant::{PageReconstructError, Timeline};
use crate::tenant::PageReconstructError;
use crate::virtual_file::{self, VirtualFile};
use crate::{walrecord, TEMP_FILE_SUFFIX};
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
@@ -72,10 +71,7 @@ use utils::{
lsn::Lsn,
};
use super::{
AsLayerDesc, LayerAccessStats, LayerName, PersistentLayerDesc, ResidentLayer,
ValuesReconstructState,
};
use super::{AsLayerDesc, LayerName, PersistentLayerDesc, ValuesReconstructState};
///
/// Header stored in the beginning of the file
@@ -200,7 +196,6 @@ impl DeltaKey {
pub struct DeltaLayer {
path: Utf8PathBuf,
pub desc: PersistentLayerDesc,
access_stats: LayerAccessStats,
inner: OnceCell<Arc<DeltaLayerInner>>,
}
@@ -299,7 +294,6 @@ impl DeltaLayer {
/// not loaded already.
///
async fn load(&self, ctx: &RequestContext) -> Result<&Arc<DeltaLayerInner>> {
self.access_stats.record_access(ctx);
// Quick exit if already loaded
self.inner
.get_or_try_init(|| self.load_inner(ctx))
@@ -350,7 +344,6 @@ impl DeltaLayer {
summary.lsn_range,
metadata.len(),
),
access_stats: Default::default(),
inner: OnceCell::new(),
})
}
@@ -373,7 +366,6 @@ impl DeltaLayer {
/// 3. Call `finish`.
///
struct DeltaLayerWriterInner {
conf: &'static PageServerConf,
pub path: Utf8PathBuf,
timeline_id: TimelineId,
tenant_shard_id: TenantShardId,
@@ -384,6 +376,9 @@ struct DeltaLayerWriterInner {
tree: DiskBtreeBuilder<BlockBuf, DELTA_KEY_SIZE>,
blob_writer: BlobWriter<true>,
// Number of key-lsns in the layer.
num_keys: usize,
}
impl DeltaLayerWriterInner {
@@ -417,7 +412,6 @@ impl DeltaLayerWriterInner {
let tree_builder = DiskBtreeBuilder::new(block_buf);
Ok(Self {
conf,
path,
timeline_id,
tenant_shard_id,
@@ -425,6 +419,7 @@ impl DeltaLayerWriterInner {
lsn_range,
tree: tree_builder,
blob_writer,
num_keys: 0,
})
}
@@ -475,6 +470,9 @@ impl DeltaLayerWriterInner {
let delta_key = DeltaKey::from_key_lsn(&key, lsn);
let res = self.tree.append(&delta_key.0, blob_ref.0);
self.num_keys += 1;
(val, res.map_err(|e| anyhow::anyhow!(e)))
}
@@ -488,11 +486,10 @@ impl DeltaLayerWriterInner {
async fn finish(
self,
key_end: Key,
timeline: &Arc<Timeline>,
ctx: &RequestContext,
) -> anyhow::Result<ResidentLayer> {
) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
let temp_path = self.path.clone();
let result = self.finish0(key_end, timeline, ctx).await;
let result = self.finish0(key_end, ctx).await;
if result.is_err() {
tracing::info!(%temp_path, "cleaning up temporary file after error during writing");
if let Err(e) = std::fs::remove_file(&temp_path) {
@@ -505,9 +502,8 @@ impl DeltaLayerWriterInner {
async fn finish0(
self,
key_end: Key,
timeline: &Arc<Timeline>,
ctx: &RequestContext,
) -> anyhow::Result<ResidentLayer> {
) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
let index_start_blk =
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
@@ -572,11 +568,9 @@ impl DeltaLayerWriterInner {
// fsync the file
file.sync_all().await?;
let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;
trace!("created delta layer {}", self.path);
trace!("created delta layer {}", layer.local_path());
Ok(layer)
Ok((desc, self.path))
}
}
@@ -677,14 +671,20 @@ impl DeltaLayerWriter {
pub(crate) async fn finish(
mut self,
key_end: Key,
timeline: &Arc<Timeline>,
ctx: &RequestContext,
) -> anyhow::Result<ResidentLayer> {
self.inner
.take()
.unwrap()
.finish(key_end, timeline, ctx)
.await
) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
self.inner.take().unwrap().finish(key_end, ctx).await
}
#[cfg(test)]
pub(crate) fn num_keys(&self) -> usize {
self.inner.as_ref().unwrap().num_keys
}
#[cfg(test)]
pub(crate) fn estimated_size(&self) -> u64 {
let inner = self.inner.as_ref().unwrap();
inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64
}
}
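The hunks above change `DeltaLayerWriter::finish` so that it no longer takes a `&Arc<Timeline>` and no longer returns a `ResidentLayer`: it yields a `(PersistentLayerDesc, Utf8PathBuf)` pair, and the call sites (the tests and compaction code later in this diff) register the file themselves via `Layer::finish_creating`. A minimal sketch of that two-phase shape follows, using stand-in types rather than the pageserver's own.
// Stand-in types; the real ones are PersistentLayerDesc, Utf8PathBuf, Layer, etc.
struct LayerDesc {
    file_size: u64,
}
struct FinishedFile {
    desc: LayerDesc,
    path: std::path::PathBuf,
}
struct Writer {
    bytes_written: u64,
    path: std::path::PathBuf,
}
impl Writer {
    // Phase 1: finish writing the file. Note that no timeline is needed here.
    fn finish(self) -> FinishedFile {
        FinishedFile {
            desc: LayerDesc {
                file_size: self.bytes_written,
            },
            path: self.path,
        }
    }
}
// Phase 2: the caller, which does hold a timeline, turns the finished file into
// a registered layer (rename into place, metrics, etc. are elided here).
fn finish_creating(timeline: &str, f: FinishedFile) -> (String, LayerDesc) {
    (format!("{timeline}:{}", f.path.display()), f.desc)
}
fn main() {
    let writer = Writer {
        bytes_written: 4096,
        path: "layer.tmp".into(),
    };
    let finished = writer.finish();
    let (name, desc) = finish_creating("timeline-1", finished);
    assert_eq!(desc.file_size, 4096);
    println!("registered {name}");
}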
@@ -808,95 +808,6 @@ impl DeltaLayerInner {
})
}
pub(super) async fn get_value_reconstruct_data(
&self,
key: Key,
lsn_range: Range<Lsn>,
reconstruct_state: &mut ValueReconstructState,
ctx: &RequestContext,
) -> anyhow::Result<ValueReconstructResult> {
let mut need_image = true;
// Scan the page versions backwards, starting from `lsn`.
let block_reader = FileBlockReader::new(&self.file, self.file_id);
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
self.index_start_blk,
self.index_root_blk,
&block_reader,
);
let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1));
let mut offsets: Vec<(Lsn, u64)> = Vec::new();
tree_reader
.visit(
&search_key.0,
VisitDirection::Backwards,
|key, value| {
let blob_ref = BlobRef(value);
if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] {
return false;
}
let entry_lsn = DeltaKey::extract_lsn_from_buf(key);
if entry_lsn < lsn_range.start {
return false;
}
offsets.push((entry_lsn, blob_ref.pos()));
!blob_ref.will_init()
},
&RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::DeltaLayerBtreeNode)
.build(),
)
.await?;
let ctx = &RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::DeltaLayerValue)
.build();
// Ok, 'offsets' now contains the offsets of all the entries we need to read
let cursor = block_reader.block_cursor();
let mut buf = Vec::new();
for (entry_lsn, pos) in offsets {
cursor
.read_blob_into_buf(pos, &mut buf, ctx)
.await
.with_context(|| {
format!("Failed to read blob from virtual file {}", self.file.path)
})?;
let val = Value::des(&buf).with_context(|| {
format!(
"Failed to deserialize file blob from virtual file {}",
self.file.path
)
})?;
match val {
Value::Image(img) => {
reconstruct_state.img = Some((entry_lsn, img));
need_image = false;
break;
}
Value::WalRecord(rec) => {
let will_init = rec.will_init();
reconstruct_state.records.push((entry_lsn, rec));
if will_init {
// This WAL record initializes the page, so no need to go further back
need_image = false;
break;
}
}
}
}
// If an older page image is needed to reconstruct the page, let the
// caller know.
if need_image {
Ok(ValueReconstructResult::Continue)
} else {
Ok(ValueReconstructResult::Complete)
}
}
// Look up the keys in the provided keyspace and update
// the reconstruct state with whatever is found.
//
@@ -1669,8 +1580,9 @@ pub(crate) mod test {
use super::*;
use crate::repository::Value;
use crate::tenant::harness::TIMELINE_ID;
use crate::tenant::storage_layer::{Layer, ResidentLayer};
use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner;
use crate::tenant::Tenant;
use crate::tenant::{Tenant, Timeline};
use crate::{
context::DownloadBehavior,
task_mgr::TaskKind,
@@ -1964,9 +1876,8 @@ pub(crate) mod test {
res?;
}
let resident = writer
.finish(entries_meta.key_range.end, &timeline, &ctx)
.await?;
let (desc, path) = writer.finish(entries_meta.key_range.end, &ctx).await?;
let resident = Layer::finish_creating(harness.conf, &timeline, desc, &path)?;
let inner = resident.get_as_delta(&ctx).await?;
@@ -2046,6 +1957,7 @@ pub(crate) mod test {
.await
.likely_resident_layers()
.next()
.cloned()
.unwrap();
{
@@ -2120,7 +2032,8 @@ pub(crate) mod test {
.read()
.await
.likely_resident_layers()
.find(|x| x != &initdb_layer)
.find(|&x| x != &initdb_layer)
.cloned()
.unwrap();
// create a copy for the timeline, so we don't overwrite the file
@@ -2155,7 +2068,8 @@ pub(crate) mod test {
.await
.unwrap();
let copied_layer = writer.finish(Key::MAX, &branch, ctx).await.unwrap();
let (desc, path) = writer.finish(Key::MAX, ctx).await.unwrap();
let copied_layer = Layer::finish_creating(tenant.conf, &branch, desc, &path).unwrap();
copied_layer.get_as_delta(ctx).await.unwrap();
@@ -2283,7 +2197,9 @@ pub(crate) mod test {
for (key, lsn, value) in deltas {
writer.put_value(key, lsn, value, ctx).await?;
}
let delta_layer = writer.finish(key_end, tline, ctx).await?;
let (desc, path) = writer.finish(key_end, ctx).await?;
let delta_layer = Layer::finish_creating(tenant.conf, tline, desc, &path)?;
Ok::<_, anyhow::Error>(delta_layer)
}


@@ -32,9 +32,6 @@ use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
use crate::tenant::disk_btree::{
DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection,
};
use crate::tenant::storage_layer::{
LayerAccessStats, ValueReconstructResult, ValueReconstructState,
};
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::vectored_blob_io::{
BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
@@ -137,7 +134,6 @@ pub struct ImageLayer {
pub desc: PersistentLayerDesc,
// This entry contains an image of all pages as of this LSN, should be the same as desc.lsn
pub lsn: Lsn,
access_stats: LayerAccessStats,
inner: OnceCell<ImageLayerInner>,
}
@@ -255,7 +251,6 @@ impl ImageLayer {
/// not loaded already.
///
async fn load(&self, ctx: &RequestContext) -> Result<&ImageLayerInner> {
self.access_stats.record_access(ctx);
self.inner
.get_or_try_init(|| self.load_inner(ctx))
.await
@@ -306,7 +301,6 @@ impl ImageLayer {
metadata.len(),
), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
lsn: summary.lsn,
access_stats: Default::default(),
inner: OnceCell::new(),
})
}
@@ -429,46 +423,6 @@ impl ImageLayerInner {
})
}
pub(super) async fn get_value_reconstruct_data(
&self,
key: Key,
reconstruct_state: &mut ValueReconstructState,
ctx: &RequestContext,
) -> anyhow::Result<ValueReconstructResult> {
let block_reader = FileBlockReader::new(&self.file, self.file_id);
let tree_reader =
DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader);
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
key.write_to_byte_slice(&mut keybuf);
if let Some(offset) = tree_reader
.get(
&keybuf,
&RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::ImageLayerBtreeNode)
.build(),
)
.await?
{
let blob = block_reader
.block_cursor()
.read_blob(
offset,
&RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::ImageLayerValue)
.build(),
)
.await
.with_context(|| format!("failed to read value from offset {}", offset))?;
let value = Bytes::from(blob);
reconstruct_state.img = Some((self.lsn, value));
Ok(ValueReconstructResult::Complete)
} else {
Ok(ValueReconstructResult::Missing)
}
}
// Look up the keys in the provided keyspace and update
// the reconstruct state with whatever is found.
pub(super) async fn get_values_reconstruct_data(
@@ -753,6 +707,10 @@ struct ImageLayerWriterInner {
}
impl ImageLayerWriterInner {
fn size(&self) -> u64 {
self.tree.borrow_writer().size() + self.blob_writer.size()
}
///
/// Start building a new image layer.
///
@@ -1044,6 +1002,10 @@ impl ImageLayerWriter {
.finish(timeline, ctx, Some(end_key))
.await
}
pub(crate) fn size(&self) -> u64 {
self.inner.as_ref().unwrap().size()
}
}
impl Drop for ImageLayerWriter {


@@ -10,11 +10,11 @@ use crate::page_cache::PAGE_SZ;
use crate::repository::{Key, Value};
use crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef};
use crate::tenant::ephemeral_file::EphemeralFile;
use crate::tenant::storage_layer::ValueReconstructResult;
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::{PageReconstructError, Timeline};
use crate::tenant::PageReconstructError;
use crate::{l0_flush, page_cache, walrecord};
use anyhow::{anyhow, ensure, Result};
use anyhow::{anyhow, Result};
use camino::Utf8PathBuf;
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::InMemoryLayerInfo;
use pageserver_api::shard::TenantShardId;
@@ -34,8 +34,7 @@ use std::sync::atomic::{AtomicU64, AtomicUsize};
use tokio::sync::{RwLock, RwLockWriteGuard};
use super::{
DeltaLayerWriter, ResidentLayer, ValueReconstructSituation, ValueReconstructState,
ValuesReconstructState,
DeltaLayerWriter, PersistentLayerDesc, ValueReconstructSituation, ValuesReconstructState,
};
#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)]
@@ -55,9 +54,6 @@ pub struct InMemoryLayer {
/// Writes are only allowed when this is `None`.
pub(crate) end_lsn: OnceLock<Lsn>,
/// Used for traversal path. Cached representation of the in-memory layer before frozen.
local_path_str: Arc<str>,
/// Used for traversal path. Cached representation of the in-memory layer after frozen.
frozen_local_path_str: OnceLock<Arc<str>>,
@@ -248,12 +244,6 @@ impl InMemoryLayer {
self.start_lsn..self.end_lsn_or_max()
}
pub(crate) fn local_path_str(&self) -> &Arc<str> {
self.frozen_local_path_str
.get()
.unwrap_or(&self.local_path_str)
}
/// debugging function to print out the contents of the layer
///
/// this is likely completely unused
@@ -303,60 +293,6 @@ impl InMemoryLayer {
Ok(())
}
/// Look up given value in the layer.
pub(crate) async fn get_value_reconstruct_data(
&self,
key: Key,
lsn_range: Range<Lsn>,
reconstruct_state: &mut ValueReconstructState,
ctx: &RequestContext,
) -> anyhow::Result<ValueReconstructResult> {
ensure!(lsn_range.start >= self.start_lsn);
let mut need_image = true;
let ctx = RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::InMemoryLayer)
.build();
let inner = self.inner.read().await;
let reader = inner.file.block_cursor();
// Scan the page versions backwards, starting from `lsn`.
if let Some(vec_map) = inner.index.get(&key) {
let slice = vec_map.slice_range(lsn_range);
for (entry_lsn, pos) in slice.iter().rev() {
let buf = reader.read_blob(*pos, &ctx).await?;
let value = Value::des(&buf)?;
match value {
Value::Image(img) => {
reconstruct_state.img = Some((*entry_lsn, img));
return Ok(ValueReconstructResult::Complete);
}
Value::WalRecord(rec) => {
let will_init = rec.will_init();
reconstruct_state.records.push((*entry_lsn, rec));
if will_init {
// This WAL record initializes the page, so no need to go further back
need_image = false;
break;
}
}
}
}
}
// release lock on 'inner'
// If an older page image is needed to reconstruct the page, let the
// caller know.
if need_image {
Ok(ValueReconstructResult::Continue)
} else {
Ok(ValueReconstructResult::Complete)
}
}
// Look up the keys in the provided keyspace and update
// the reconstruct state with whatever is found.
//
@@ -449,20 +385,17 @@ impl InMemoryLayer {
timeline_id: TimelineId,
tenant_shard_id: TenantShardId,
start_lsn: Lsn,
gate_guard: utils::sync::gate::GateGuard,
ctx: &RequestContext,
) -> Result<InMemoryLayer> {
trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");
let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id, ctx).await?;
let file =
EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate_guard, ctx).await?;
let key = InMemoryLayerFileId(file.page_cache_file_id());
Ok(InMemoryLayer {
file_id: key,
local_path_str: {
let mut buf = String::new();
inmem_layer_log_display(&mut buf, timeline_id, start_lsn, Lsn::MAX).unwrap();
buf.into()
},
frozen_local_path_str: OnceLock::new(),
conf,
timeline_id,
@@ -482,8 +415,7 @@ impl InMemoryLayer {
/// Common subroutine of the public put_wal_record() and put_page_image() functions.
/// Adds the page version to the in-memory tree
pub(crate) async fn put_value(
pub async fn put_value(
&self,
key: Key,
lsn: Lsn,
@@ -548,8 +480,6 @@ impl InMemoryLayer {
/// Records the end_lsn for non-dropped layers.
/// `end_lsn` is exclusive
pub async fn freeze(&self, end_lsn: Lsn) {
let inner = self.inner.write().await;
assert!(
self.start_lsn < end_lsn,
"{} >= {}",
@@ -567,9 +497,13 @@ impl InMemoryLayer {
})
.expect("frozen_local_path_str set only once");
for vec_map in inner.index.values() {
for (lsn, _pos) in vec_map.as_slice() {
assert!(*lsn < end_lsn);
#[cfg(debug_assertions)]
{
let inner = self.inner.write().await;
for vec_map in inner.index.values() {
for (lsn, _pos) in vec_map.as_slice() {
assert!(*lsn < end_lsn);
}
}
}
}
@@ -579,12 +513,12 @@ impl InMemoryLayer {
/// if there are no matching keys.
///
/// Returns a new delta layer with all the same data as this in-memory layer
pub(crate) async fn write_to_disk(
pub async fn write_to_disk(
&self,
timeline: &Arc<Timeline>,
ctx: &RequestContext,
key_range: Option<Range<Key>>,
) -> Result<Option<ResidentLayer>> {
l0_flush_global_state: &l0_flush::Inner,
) -> Result<Option<(PersistentLayerDesc, Utf8PathBuf)>> {
// Grab the lock in read-mode. We hold it over the I/O, but because this
// layer is not writeable anymore, no one should be trying to acquire the
// write lock on it, so we shouldn't block anyone. There's one exception
@@ -596,9 +530,8 @@ impl InMemoryLayer {
// rare though, so we just accept the potential latency hit for now.
let inner = self.inner.read().await;
let l0_flush_global_state = timeline.l0_flush_global_state.inner().clone();
use l0_flush::Inner;
let _concurrency_permit = match &*l0_flush_global_state {
let _concurrency_permit = match l0_flush_global_state {
Inner::PageCached => None,
Inner::Direct { semaphore, .. } => Some(semaphore.acquire().await),
};
@@ -628,7 +561,7 @@ impl InMemoryLayer {
)
.await?;
match &*l0_flush_global_state {
match l0_flush_global_state {
l0_flush::Inner::PageCached => {
let ctx = RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::InMemoryLayer)
@@ -693,7 +626,7 @@ impl InMemoryLayer {
}
// MAX is used here because we identify L0 layers by full key range
let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, ctx).await?;
let (desc, path) = delta_layer_writer.finish(Key::MAX, ctx).await?;
// Hold the permit until all the IO is done, including the fsync in `delta_layer_writer.finish()`.
//
@@ -705,6 +638,6 @@ impl InMemoryLayer {
// we dirtied when writing to the filesystem have been flushed and marked !dirty.
drop(_concurrency_permit);
Ok(Some(delta_layer))
Ok(Some((desc, path)))
}
}


@@ -24,8 +24,7 @@ use super::delta_layer::{self, DeltaEntry};
use super::image_layer::{self};
use super::{
AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName,
LayerVisibilityHint, PersistentLayerDesc, ValueReconstructResult, ValueReconstructState,
ValuesReconstructState,
LayerVisibilityHint, PersistentLayerDesc, ValuesReconstructState,
};
use utils::generation::Generation;
@@ -301,42 +300,6 @@ impl Layer {
self.0.delete_on_drop();
}
/// Return data needed to reconstruct given page at LSN.
///
/// It is up to the caller to collect more data from the previous layer and
/// perform WAL redo, if necessary.
///
/// # Cancellation-Safety
///
/// This method is cancellation-safe.
pub(crate) async fn get_value_reconstruct_data(
&self,
key: Key,
lsn_range: Range<Lsn>,
reconstruct_data: &mut ValueReconstructState,
ctx: &RequestContext,
) -> anyhow::Result<ValueReconstructResult> {
use anyhow::ensure;
let layer = self.0.get_or_maybe_download(true, Some(ctx)).await?;
self.0.access_stats.record_access(ctx);
if self.layer_desc().is_delta {
ensure!(lsn_range.start >= self.layer_desc().lsn_range.start);
ensure!(self.layer_desc().key_range.contains(&key));
} else {
ensure!(self.layer_desc().key_range.contains(&key));
ensure!(lsn_range.start >= self.layer_desc().image_layer_lsn());
ensure!(lsn_range.end >= self.layer_desc().image_layer_lsn());
}
layer
.get_value_reconstruct_data(key, lsn_range, reconstruct_data, &self.0, ctx)
.instrument(tracing::debug_span!("get_value_reconstruct_data", layer=%self))
.await
.with_context(|| format!("get_value_reconstruct_data for layer {self}"))
}
pub(crate) async fn get_values_reconstruct_data(
&self,
keyspace: KeySpace,
@@ -353,7 +316,7 @@ impl Layer {
other => GetVectoredError::Other(anyhow::anyhow!(other)),
})?;
self.0.access_stats.record_access(ctx);
self.record_access(ctx);
layer
.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, &self.0, ctx)
@@ -433,18 +396,18 @@ impl Layer {
self.0.info(reset)
}
pub(crate) fn access_stats(&self) -> &LayerAccessStats {
&self.0.access_stats
pub(crate) fn latest_activity(&self) -> SystemTime {
self.0.access_stats.latest_activity()
}
pub(crate) fn visibility(&self) -> LayerVisibilityHint {
self.0.access_stats.visibility()
}
pub(crate) fn local_path(&self) -> &Utf8Path {
&self.0.path
}
pub(crate) fn debug_str(&self) -> &Arc<str> {
&self.0.debug_str
}
pub(crate) fn metadata(&self) -> LayerFileMetadata {
self.0.metadata()
}
@@ -488,13 +451,31 @@ impl Layer {
}
}
fn record_access(&self, ctx: &RequestContext) {
if self.0.access_stats.record_access(ctx) {
// Visibility was modified to Visible
tracing::info!(
"Layer {} became visible as a result of access",
self.0.desc.key()
);
if let Some(tl) = self.0.timeline.upgrade() {
tl.metrics
.visible_physical_size_gauge
.add(self.0.desc.file_size)
}
}
}
pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) {
let old_visibility = self.access_stats().set_visibility(visibility.clone());
let old_visibility = self.0.access_stats.set_visibility(visibility.clone());
use LayerVisibilityHint::*;
match (old_visibility, visibility) {
(Visible, Covered) => {
// Subtract this layer's contribution to the visible size metric
if let Some(tl) = self.0.timeline.upgrade() {
debug_assert!(
tl.metrics.visible_physical_size_gauge.get() >= self.0.desc.file_size
);
tl.metrics
.visible_physical_size_gauge
.sub(self.0.desc.file_size)
@@ -519,7 +500,7 @@ impl Layer {
///
/// However when we want something evicted, we cannot evict it right away as there might be current
/// reads happening on it. For example: it has been searched from [`LayerMap::search`] but not yet
/// read with [`Layer::get_value_reconstruct_data`].
/// read with [`Layer::get_values_reconstruct_data`].
///
/// [`LayerMap::search`]: crate::tenant::layer_map::LayerMap::search
#[derive(Debug)]
@@ -600,9 +581,6 @@ struct LayerInner {
/// Full path to the file; unclear if this should exist anymore.
path: Utf8PathBuf,
/// String representation of the layer, used for traversal id.
debug_str: Arc<str>,
desc: PersistentLayerDesc,
/// Timeline access is needed for remote timeline client and metrics.
@@ -715,6 +693,9 @@ impl Drop for LayerInner {
}
if matches!(self.access_stats.visibility(), LayerVisibilityHint::Visible) {
debug_assert!(
timeline.metrics.visible_physical_size_gauge.get() >= self.desc.file_size
);
timeline
.metrics
.visible_physical_size_gauge
@@ -836,9 +817,6 @@ impl LayerInner {
LayerInner {
conf,
debug_str: {
format!("timelines/{}/{}", timeline.timeline_id, desc.layer_name()).into()
},
path: local_path,
desc,
timeline: Arc::downgrade(timeline),
@@ -1759,28 +1737,6 @@ impl DownloadedLayer {
.map_err(|e| anyhow::anyhow!("layer load failed earlier: {e}"))
}
async fn get_value_reconstruct_data(
&self,
key: Key,
lsn_range: Range<Lsn>,
reconstruct_data: &mut ValueReconstructState,
owner: &Arc<LayerInner>,
ctx: &RequestContext,
) -> anyhow::Result<ValueReconstructResult> {
use LayerKind::*;
match self.get(owner, ctx).await? {
Delta(d) => {
d.get_value_reconstruct_data(key, lsn_range, reconstruct_data, ctx)
.await
}
Image(i) => {
i.get_value_reconstruct_data(key, reconstruct_data, ctx)
.await
}
}
}
async fn get_values_reconstruct_data(
&self,
keyspace: KeySpace,
@@ -1879,7 +1835,7 @@ impl ResidentLayer {
// this is valid because the DownloadedLayer::kind is a OnceCell, not a
// Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
// while it's being held.
owner.access_stats.record_access(ctx);
self.owner.record_access(ctx);
delta_layer::DeltaLayerInner::load_keys(d, ctx)
.await


@@ -39,7 +39,7 @@ async fn smoke_test() {
let layer = {
let mut layers = {
let layers = timeline.layers.read().await;
layers.likely_resident_layers().collect::<Vec<_>>()
layers.likely_resident_layers().cloned().collect::<Vec<_>>()
};
assert_eq!(layers.len(), 1);
@@ -50,13 +50,26 @@ async fn smoke_test() {
// all layers created at pageserver are like `layer`, initialized with strong
// Arc<DownloadedLayer>.
let controlfile_keyspace = KeySpace {
ranges: vec![CONTROLFILE_KEY..CONTROLFILE_KEY.next()],
};
let img_before = {
let mut data = ValueReconstructState::default();
let mut data = ValuesReconstructState::default();
layer
.get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx)
.get_values_reconstruct_data(
controlfile_keyspace.clone(),
Lsn(0x10)..Lsn(0x11),
&mut data,
&ctx,
)
.await
.unwrap();
data.img
data.keys
.remove(&CONTROLFILE_KEY)
.expect("must be present")
.expect("should not error")
.img
.take()
.expect("tenant harness writes the control file")
};
@@ -74,13 +87,24 @@ async fn smoke_test() {
// on accesses when the layer is evicted, it will automatically be downloaded.
let img_after = {
let mut data = ValueReconstructState::default();
let mut data = ValuesReconstructState::default();
layer
.get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx)
.get_values_reconstruct_data(
controlfile_keyspace.clone(),
Lsn(0x10)..Lsn(0x11),
&mut data,
&ctx,
)
.instrument(download_span.clone())
.await
.unwrap();
data.img.take().unwrap()
data.keys
.remove(&CONTROLFILE_KEY)
.expect("must be present")
.expect("should not error")
.img
.take()
.expect("tenant harness writes the control file")
};
assert_eq!(img_before, img_after);
@@ -152,7 +176,7 @@ async fn smoke_test() {
{
let layers = &[layer];
let mut g = timeline.layers.write().await;
g.finish_gc_timeline(layers);
g.open_mut().unwrap().finish_gc_timeline(layers);
// this just updates the remote_physical_size for demonstration purposes
rtc.schedule_gc_update(layers).unwrap();
}
@@ -192,7 +216,7 @@ async fn evict_and_wait_on_wanted_deleted() {
let layer = {
let mut layers = {
let layers = timeline.layers.read().await;
layers.likely_resident_layers().collect::<Vec<_>>()
layers.likely_resident_layers().cloned().collect::<Vec<_>>()
};
assert_eq!(layers.len(), 1);
@@ -236,7 +260,7 @@ async fn evict_and_wait_on_wanted_deleted() {
// the deletion of the layer in remote_storage happens.
{
let mut layers = timeline.layers.write().await;
layers.finish_gc_timeline(&[layer]);
layers.open_mut().unwrap().finish_gc_timeline(&[layer]);
}
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await;
@@ -277,7 +301,7 @@ fn read_wins_pending_eviction() {
let layer = {
let mut layers = {
let layers = timeline.layers.read().await;
layers.likely_resident_layers().collect::<Vec<_>>()
layers.likely_resident_layers().cloned().collect::<Vec<_>>()
};
assert_eq!(layers.len(), 1);
@@ -409,7 +433,7 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
let layer = {
let mut layers = {
let layers = timeline.layers.read().await;
layers.likely_resident_layers().collect::<Vec<_>>()
layers.likely_resident_layers().cloned().collect::<Vec<_>>()
};
assert_eq!(layers.len(), 1);
@@ -578,7 +602,7 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
let layer = {
let mut layers = {
let layers = timeline.layers.read().await;
layers.likely_resident_layers().collect::<Vec<_>>()
layers.likely_resident_layers().cloned().collect::<Vec<_>>()
};
assert_eq!(layers.len(), 1);
@@ -658,7 +682,7 @@ async fn evict_and_wait_does_not_wait_for_download() {
let layer = {
let mut layers = {
let layers = timeline.layers.read().await;
layers.likely_resident_layers().collect::<Vec<_>>()
layers.likely_resident_layers().cloned().collect::<Vec<_>>()
};
assert_eq!(layers.len(), 1);
@@ -777,9 +801,9 @@ async fn eviction_cancellation_on_drop() {
let (evicted_layer, not_evicted) = {
let mut layers = {
let mut guard = timeline.layers.write().await;
let layers = guard.likely_resident_layers().collect::<Vec<_>>();
let layers = guard.likely_resident_layers().cloned().collect::<Vec<_>>();
// remove the layers from layermap
guard.finish_gc_timeline(&layers);
guard.open_mut().unwrap().finish_gc_timeline(&layers);
layers
};
@@ -830,7 +854,7 @@ async fn eviction_cancellation_on_drop() {
fn layer_size() {
assert_eq!(size_of::<LayerAccessStats>(), 8);
assert_eq!(size_of::<PersistentLayerDesc>(), 104);
assert_eq!(size_of::<LayerInner>(), 312);
assert_eq!(size_of::<LayerInner>(), 296);
// it also has the utf8 path
}


@@ -1,12 +1,13 @@
use std::sync::Arc;
use std::{ops::Range, sync::Arc};
use bytes::Bytes;
use pageserver_api::key::{Key, KEY_SIZE};
use utils::{id::TimelineId, lsn::Lsn, shard::TenantShardId};
use crate::{config::PageServerConf, context::RequestContext, tenant::Timeline};
use crate::tenant::storage_layer::Layer;
use crate::{config::PageServerConf, context::RequestContext, repository::Value, tenant::Timeline};
use super::{ImageLayerWriter, ResidentLayer};
use super::{DeltaLayerWriter, ImageLayerWriter, ResidentLayer};
/// An image writer that takes images and produces multiple image layers. The interface does not
/// guarantee atomicity (i.e., if the image layer generation fails, there might be leftover files
@@ -98,6 +99,111 @@ impl SplitImageLayerWriter {
generated_layers.push(inner.finish_with_end_key(tline, end_key, ctx).await?);
Ok(generated_layers)
}
/// When split writer fails, the caller should call this function and handle partially generated layers.
#[allow(dead_code)]
pub(crate) async fn take(self) -> anyhow::Result<(Vec<ResidentLayer>, ImageLayerWriter)> {
Ok((self.generated_layers, self.inner))
}
}
/// A delta writer that takes key-lsn-values and produces multiple delta layers. The interface does not
/// guarantee atomicity (i.e., if the delta layer generation fails, there might be leftover files
/// to be cleaned up).
#[must_use]
pub struct SplitDeltaLayerWriter {
inner: DeltaLayerWriter,
target_layer_size: u64,
generated_layers: Vec<ResidentLayer>,
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_shard_id: TenantShardId,
lsn_range: Range<Lsn>,
}
impl SplitDeltaLayerWriter {
pub async fn new(
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_shard_id: TenantShardId,
start_key: Key,
lsn_range: Range<Lsn>,
target_layer_size: u64,
ctx: &RequestContext,
) -> anyhow::Result<Self> {
Ok(Self {
target_layer_size,
inner: DeltaLayerWriter::new(
conf,
timeline_id,
tenant_shard_id,
start_key,
lsn_range.clone(),
ctx,
)
.await?,
generated_layers: Vec::new(),
conf,
timeline_id,
tenant_shard_id,
lsn_range,
})
}
pub async fn put_value(
&mut self,
key: Key,
lsn: Lsn,
val: Value,
tline: &Arc<Timeline>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
// The current estimation is key size plus LSN size plus value size estimation. This is not an accurate
// number, and therefore the final layer size could be a little bit larger or smaller than the target.
let addition_size_estimation = KEY_SIZE as u64 + 8 /* LSN u64 size */ + 80 /* value size estimation */;
if self.inner.num_keys() >= 1
&& self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size
{
let next_delta_writer = DeltaLayerWriter::new(
self.conf,
self.timeline_id,
self.tenant_shard_id,
key,
self.lsn_range.clone(),
ctx,
)
.await?;
let prev_delta_writer = std::mem::replace(&mut self.inner, next_delta_writer);
let (desc, path) = prev_delta_writer.finish(key, ctx).await?;
let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
self.generated_layers.push(delta_layer);
}
self.inner.put_value(key, lsn, val, ctx).await
}
pub(crate) async fn finish(
self,
tline: &Arc<Timeline>,
ctx: &RequestContext,
end_key: Key,
) -> anyhow::Result<Vec<ResidentLayer>> {
let Self {
mut generated_layers,
inner,
..
} = self;
let (desc, path) = inner.finish(end_key, ctx).await?;
let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
generated_layers.push(delta_layer);
Ok(generated_layers)
}
/// When split writer fails, the caller should call this function and handle partially generated layers.
#[allow(dead_code)]
pub(crate) async fn take(self) -> anyhow::Result<(Vec<ResidentLayer>, DeltaLayerWriter)> {
Ok((self.generated_layers, self.inner))
}
}
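For context on the rollover check in `put_value` above: the split writer finishes the current layer and opens a new one once at least one key has been written and the estimated size after the next entry would reach the target. Below is a toy model of just that arithmetic; the 8- and 80-byte figures mirror the estimation comment in the diff, while the key-size constant is an assumption for illustration only.
// Assumed per-entry key size, for illustration; the real code uses KEY_SIZE.
const KEY_SIZE_ESTIMATE: u64 = 18;
const LSN_SIZE: u64 = 8;
const VALUE_SIZE_ESTIMATE: u64 = 80;
struct SplitWriterModel {
    target_layer_size: u64,
    current_layer_size: u64,
    keys_in_current_layer: u64,
    layers_finished: u64,
}
impl SplitWriterModel {
    fn put_value(&mut self, value_len: u64) {
        let addition = KEY_SIZE_ESTIMATE + LSN_SIZE + VALUE_SIZE_ESTIMATE;
        // Only roll over once at least one key has been written, so no layer is empty.
        if self.keys_in_current_layer >= 1
            && self.current_layer_size + addition >= self.target_layer_size
        {
            self.layers_finished += 1;
            self.current_layer_size = 0;
            self.keys_in_current_layer = 0;
        }
        self.current_layer_size += KEY_SIZE_ESTIMATE + LSN_SIZE + value_len;
        self.keys_in_current_layer += 1;
    }
}
fn main() {
    let mut writer = SplitWriterModel {
        target_layer_size: 4096,
        current_layer_size: 0,
        keys_in_current_layer: 0,
        layers_finished: 0,
    };
    for _ in 0..100 {
        writer.put_value(512);
    }
    // With roughly 538 bytes per entry and a 4 KiB target, the writer rolls over
    // every handful of keys, so several layers get finished.
    assert!(writer.layers_finished > 5);
    println!("finished {} layers", writer.layers_finished);
}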
#[cfg(test)]
@@ -138,7 +244,7 @@ mod tests {
.await
.unwrap();
let mut writer = SplitImageLayerWriter::new(
let mut image_writer = SplitImageLayerWriter::new(
tenant.conf,
tline.timeline_id,
tenant.tenant_shard_id,
@@ -150,11 +256,42 @@ mod tests {
.await
.unwrap();
writer
let mut delta_writer = SplitDeltaLayerWriter::new(
tenant.conf,
tline.timeline_id,
tenant.tenant_shard_id,
get_key(0),
Lsn(0x18)..Lsn(0x20),
4 * 1024 * 1024,
&ctx,
)
.await
.unwrap();
image_writer
.put_image(get_key(0), get_img(0), &tline, &ctx)
.await
.unwrap();
let layers = writer.finish(&tline, &ctx, get_key(10)).await.unwrap();
let layers = image_writer
.finish(&tline, &ctx, get_key(10))
.await
.unwrap();
assert_eq!(layers.len(), 1);
delta_writer
.put_value(
get_key(0),
Lsn(0x18),
Value::Image(get_img(0)),
&tline,
&ctx,
)
.await
.unwrap();
let layers = delta_writer
.finish(&tline, &ctx, get_key(10))
.await
.unwrap();
assert_eq!(layers.len(), 1);
}
@@ -170,7 +307,7 @@ mod tests {
.await
.unwrap();
let mut writer = SplitImageLayerWriter::new(
let mut image_writer = SplitImageLayerWriter::new(
tenant.conf,
tline.timeline_id,
tenant.tenant_shard_id,
@@ -181,26 +318,58 @@ mod tests {
)
.await
.unwrap();
let mut delta_writer = SplitDeltaLayerWriter::new(
tenant.conf,
tline.timeline_id,
tenant.tenant_shard_id,
get_key(0),
Lsn(0x18)..Lsn(0x20),
4 * 1024 * 1024,
&ctx,
)
.await
.unwrap();
const N: usize = 2000;
for i in 0..N {
let i = i as u32;
writer
image_writer
.put_image(get_key(i), get_large_img(), &tline, &ctx)
.await
.unwrap();
delta_writer
.put_value(
get_key(i),
Lsn(0x20),
Value::Image(get_large_img()),
&tline,
&ctx,
)
.await
.unwrap();
}
let layers = writer
let image_layers = image_writer
.finish(&tline, &ctx, get_key(N as u32))
.await
.unwrap();
assert_eq!(layers.len(), N / 512 + 1);
for idx in 0..layers.len() {
assert_ne!(layers[idx].layer_desc().key_range.start, Key::MIN);
assert_ne!(layers[idx].layer_desc().key_range.end, Key::MAX);
let delta_layers = delta_writer
.finish(&tline, &ctx, get_key(N as u32))
.await
.unwrap();
assert_eq!(image_layers.len(), N / 512 + 1);
assert_eq!(delta_layers.len(), N / 512 + 1);
for idx in 0..image_layers.len() {
assert_ne!(image_layers[idx].layer_desc().key_range.start, Key::MIN);
assert_ne!(image_layers[idx].layer_desc().key_range.end, Key::MAX);
assert_ne!(delta_layers[idx].layer_desc().key_range.start, Key::MIN);
assert_ne!(delta_layers[idx].layer_desc().key_range.end, Key::MAX);
if idx > 0 {
assert_eq!(
layers[idx - 1].layer_desc().key_range.end,
layers[idx].layer_desc().key_range.start
image_layers[idx - 1].layer_desc().key_range.end,
image_layers[idx].layer_desc().key_range.start
);
assert_eq!(
delta_layers[idx - 1].layer_desc().key_range.end,
delta_layers[idx].layer_desc().key_range.start
);
}
}
@@ -218,7 +387,7 @@ mod tests {
.await
.unwrap();
let mut writer = SplitImageLayerWriter::new(
let mut image_writer = SplitImageLayerWriter::new(
tenant.conf,
tline.timeline_id,
tenant.tenant_shard_id,
@@ -230,15 +399,56 @@ mod tests {
.await
.unwrap();
writer
let mut delta_writer = SplitDeltaLayerWriter::new(
tenant.conf,
tline.timeline_id,
tenant.tenant_shard_id,
get_key(0),
Lsn(0x18)..Lsn(0x20),
4 * 1024,
&ctx,
)
.await
.unwrap();
image_writer
.put_image(get_key(0), get_img(0), &tline, &ctx)
.await
.unwrap();
writer
image_writer
.put_image(get_key(1), get_large_img(), &tline, &ctx)
.await
.unwrap();
let layers = writer.finish(&tline, &ctx, get_key(10)).await.unwrap();
let layers = image_writer
.finish(&tline, &ctx, get_key(10))
.await
.unwrap();
assert_eq!(layers.len(), 2);
delta_writer
.put_value(
get_key(0),
Lsn(0x18),
Value::Image(get_img(0)),
&tline,
&ctx,
)
.await
.unwrap();
delta_writer
.put_value(
get_key(1),
Lsn(0x1A),
Value::Image(get_large_img()),
&tline,
&ctx,
)
.await
.unwrap();
let layers = delta_writer
.finish(&tline, &ctx, get_key(10))
.await
.unwrap();
assert_eq!(layers.len(), 2);
}
}


@@ -407,9 +407,16 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
error_run_count += 1;
let wait_duration = Duration::from_secs_f64(wait_duration);
error!(
"Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}",
);
if matches!(e, crate::tenant::GcError::TimelineCancelled) {
// Timeline was cancelled during gc. We might either be in an event
// that affects the entire tenant (tenant deletion, pageserver shutdown),
// or in one that affects the timeline only (timeline deletion).
// Therefore, don't exit the loop.
info!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}");
} else {
error!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}");
}
wait_duration
}
}

File diff suppressed because it is too large


@@ -19,8 +19,10 @@ use bytes::Bytes;
use enumset::EnumSet;
use fail::fail_point;
use itertools::Itertools;
use pageserver_api::key::KEY_SIZE;
use pageserver_api::keyspace::ShardedRange;
use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId};
use serde::Serialize;
use tokio_util::sync::CancellationToken;
use tracing::{debug, info, info_span, trace, warn, Instrument};
use utils::id::TimelineId;
@@ -41,6 +43,7 @@ use crate::virtual_file::{MaybeFatalIo, VirtualFile};
use crate::keyspace::KeySpace;
use crate::repository::{Key, Value};
use crate::walrecord::NeonWalRecord;
use utils::lsn::Lsn;
@@ -73,6 +76,7 @@ impl KeyHistoryRetention {
key: Key,
delta_writer: &mut Vec<(Key, Lsn, Value)>,
mut image_writer: Option<&mut ImageLayerWriter>,
stat: &mut CompactionStatistics,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let mut first_batch = true;
@@ -82,6 +86,7 @@ impl KeyHistoryRetention {
let Value::Image(img) = &logs[0].1 else {
unreachable!()
};
stat.produce_image_key(img);
if let Some(image_writer) = image_writer.as_mut() {
image_writer.put_image(key, img.clone(), ctx).await?;
} else {
@@ -89,24 +94,111 @@ impl KeyHistoryRetention {
}
} else {
for (lsn, val) in logs {
stat.produce_key(&val);
delta_writer.push((key, lsn, val));
}
}
first_batch = false;
} else {
for (lsn, val) in logs {
stat.produce_key(&val);
delta_writer.push((key, lsn, val));
}
}
}
let KeyLogAtLsn(above_horizon_logs) = self.above_horizon;
for (lsn, val) in above_horizon_logs {
stat.produce_key(&val);
delta_writer.push((key, lsn, val));
}
Ok(())
}
}
#[derive(Debug, Serialize, Default)]
struct CompactionStatisticsNumSize {
num: u64,
size: u64,
}
#[derive(Debug, Serialize, Default)]
pub struct CompactionStatistics {
delta_layer_visited: CompactionStatisticsNumSize,
image_layer_visited: CompactionStatisticsNumSize,
delta_layer_produced: CompactionStatisticsNumSize,
image_layer_produced: CompactionStatisticsNumSize,
num_delta_layer_discarded: usize,
num_image_layer_discarded: usize,
num_unique_keys_visited: usize,
wal_keys_visited: CompactionStatisticsNumSize,
image_keys_visited: CompactionStatisticsNumSize,
wal_produced: CompactionStatisticsNumSize,
image_produced: CompactionStatisticsNumSize,
}
impl CompactionStatistics {
fn estimated_size_of_value(val: &Value) -> usize {
match val {
Value::Image(img) => img.len(),
Value::WalRecord(NeonWalRecord::Postgres { rec, .. }) => rec.len(),
_ => std::mem::size_of::<NeonWalRecord>(),
}
}
fn estimated_size_of_key() -> usize {
KEY_SIZE // TODO: distinguish image layer and delta layer (count LSN in delta layer)
}
fn visit_delta_layer(&mut self, size: u64) {
self.delta_layer_visited.num += 1;
self.delta_layer_visited.size += size;
}
fn visit_image_layer(&mut self, size: u64) {
self.image_layer_visited.num += 1;
self.image_layer_visited.size += size;
}
fn on_unique_key_visited(&mut self) {
self.num_unique_keys_visited += 1;
}
fn visit_wal_key(&mut self, val: &Value) {
self.wal_keys_visited.num += 1;
self.wal_keys_visited.size +=
Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64;
}
fn visit_image_key(&mut self, val: &Value) {
self.image_keys_visited.num += 1;
self.image_keys_visited.size +=
Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64;
}
fn produce_key(&mut self, val: &Value) {
match val {
Value::Image(img) => self.produce_image_key(img),
Value::WalRecord(_) => self.produce_wal_key(val),
}
}
fn produce_wal_key(&mut self, val: &Value) {
self.wal_produced.num += 1;
self.wal_produced.size +=
Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64;
}
fn produce_image_key(&mut self, val: &Bytes) {
self.image_produced.num += 1;
self.image_produced.size += val.len() as u64 + Self::estimated_size_of_key() as u64;
}
fn discard_delta_layer(&mut self) {
self.num_delta_layer_discarded += 1;
}
fn discard_image_layer(&mut self) {
self.num_image_layer_discarded += 1;
}
fn produce_delta_layer(&mut self, size: u64) {
self.delta_layer_produced.num += 1;
self.delta_layer_produced.size += size;
}
fn produce_image_layer(&mut self, size: u64) {
self.image_layer_produced.num += 1;
self.image_layer_produced.size += size;
}
}
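The statistics struct above derives `serde::Serialize`, and the compaction code later in this diff logs it with `serde_json::to_string` (the "gc-compaction statistics" line). A reduced sketch of that shape follows; it assumes `serde` and `serde_json` as dependencies (which the surrounding code already uses) and keeps only a subset of the fields.
use serde::Serialize;
#[derive(Serialize, Default)]
struct NumSize {
    num: u64,
    size: u64,
}
#[derive(Serialize, Default)]
struct StatsSketch {
    delta_layer_visited: NumSize,
    image_layer_visited: NumSize,
    wal_keys_visited: NumSize,
    image_keys_visited: NumSize,
}
fn main() -> Result<(), serde_json::Error> {
    let mut stat = StatsSketch::default();
    stat.delta_layer_visited.num += 1;
    stat.delta_layer_visited.size += 25_600_000;
    stat.wal_keys_visited.num += 42;
    // Same shape as the `info!("gc-compaction statistics: {}", ...)` line below.
    println!("gc-compaction statistics: {}", serde_json::to_string(&stat)?);
    Ok(())
}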
impl Timeline {
/// TODO: cancellation
///
@@ -118,12 +210,18 @@ impl Timeline {
ctx: &RequestContext,
) -> Result<bool, CompactionError> {
if flags.contains(CompactFlags::EnhancedGcBottomMostCompaction) {
self.compact_with_gc(cancel, ctx)
self.compact_with_gc(cancel, flags, ctx)
.await
.map_err(CompactionError::Other)?;
return Ok(false);
}
if flags.contains(CompactFlags::DryRun) {
return Err(CompactionError::Other(anyhow!(
"dry-run mode is not supported for legacy compaction for now"
)));
}
// High level strategy for compaction / image creation:
//
// 1. First, calculate the desired "partitioning" of the
@@ -273,7 +371,7 @@ impl Timeline {
);
let layers = self.layers.read().await;
for layer_desc in layers.layer_map().iter_historic_layers() {
for layer_desc in layers.layer_map()?.iter_historic_layers() {
let layer = layers.get_from_desc(&layer_desc);
if layer.metadata().shard.shard_count == self.shard_identity.count {
// This layer does not belong to a historic ancestor, no need to re-image it.
@@ -451,7 +549,9 @@ impl Timeline {
///
/// The result may be used as an input to eviction and secondary downloads to de-prioritize layers
/// that we know won't be needed for reads.
pub(super) async fn update_layer_visibility(&self) {
pub(super) async fn update_layer_visibility(
&self,
) -> Result<(), super::layer_manager::Shutdown> {
let head_lsn = self.get_last_record_lsn();
// We will sweep through layers in reverse-LSN order. We only do historic layers. L0 deltas
@@ -459,7 +559,7 @@ impl Timeline {
// Note that L0 deltas _can_ be covered by image layers, but we consider them 'visible' because we anticipate that
// they will be subject to L0->L1 compaction in the near future.
let layer_manager = self.layers.read().await;
let layer_map = layer_manager.layer_map();
let layer_map = layer_manager.layer_map()?;
let readable_points = {
let children = self.gc_info.read().unwrap().retain_lsns.clone();
@@ -482,6 +582,7 @@ impl Timeline {
// TODO: publish our covered KeySpace to our parent, so that when they update their visibility, they can
// avoid assuming that everything at a branch point is visible.
drop(covered);
Ok(())
}
/// Collect a bunch of Level 0 layer files, and compact and reshuffle them as
@@ -535,12 +636,8 @@ impl Timeline {
) -> Result<CompactLevel0Phase1Result, CompactionError> {
stats.read_lock_held_spawn_blocking_startup_micros =
stats.read_lock_acquisition_micros.till_now(); // set by caller
let layers = guard.layer_map();
let level0_deltas = layers.get_level0_deltas();
let mut level0_deltas = level0_deltas
.into_iter()
.map(|x| guard.get_from_desc(&x))
.collect_vec();
let layers = guard.layer_map()?;
let level0_deltas = layers.level0_deltas();
stats.level0_deltas_count = Some(level0_deltas.len());
// Only compact if enough layers have accumulated.
@@ -553,6 +650,11 @@ impl Timeline {
return Ok(CompactLevel0Phase1Result::default());
}
let mut level0_deltas = level0_deltas
.iter()
.map(|x| guard.get_from_desc(x))
.collect::<Vec<_>>();
// Gather the files to compact in this iteration.
//
// Start with the oldest Level 0 delta file, and collect any other
@@ -1006,14 +1108,16 @@ impl Timeline {
|| contains_hole
{
// ... if so, flush previous layer and prepare to write new one
new_layers.push(
writer
.take()
.unwrap()
.finish(prev_key.unwrap().next(), self, ctx)
.await
.map_err(CompactionError::Other)?,
);
let (desc, path) = writer
.take()
.unwrap()
.finish(prev_key.unwrap().next(), ctx)
.await
.map_err(CompactionError::Other)?;
let new_delta = Layer::finish_creating(self.conf, self, desc, &path)
.map_err(CompactionError::Other)?;
new_layers.push(new_delta);
writer = None;
if contains_hole {
@@ -1076,12 +1180,13 @@ impl Timeline {
prev_key = Some(key);
}
if let Some(writer) = writer {
new_layers.push(
writer
.finish(prev_key.unwrap().next(), self, ctx)
.await
.map_err(CompactionError::Other)?,
);
let (desc, path) = writer
.finish(prev_key.unwrap().next(), ctx)
.await
.map_err(CompactionError::Other)?;
let new_delta = Layer::finish_creating(self.conf, self, desc, &path)
.map_err(CompactionError::Other)?;
new_layers.push(new_delta);
}
// Sync layers
@@ -1306,10 +1411,9 @@ impl Timeline {
// Find the top of the historical layers
let end_lsn = {
let guard = self.layers.read().await;
let layers = guard.layer_map();
let layers = guard.layer_map()?;
let l0_deltas = layers.get_level0_deltas();
drop(guard);
let l0_deltas = layers.level0_deltas();
// As an optimization, if we find that there are too few L0 layers,
// bail out early. We know that the compaction algorithm would do
@@ -1641,6 +1745,7 @@ impl Timeline {
pub(crate) async fn compact_with_gc(
self: &Arc<Self>,
cancel: &CancellationToken,
flags: EnumSet<CompactFlags>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
use std::collections::BTreeSet;
@@ -1664,19 +1769,23 @@ impl Timeline {
)
.await?;
info!("running enhanced gc bottom-most compaction");
let dry_run = flags.contains(CompactFlags::DryRun);
info!("running enhanced gc bottom-most compaction, dry_run={dry_run}");
scopeguard::defer! {
info!("done enhanced gc bottom-most compaction");
};
let mut stat = CompactionStatistics::default();
// Step 0: pick all delta layers + image layers below/intersect with the GC horizon.
// The layer selection has the following properties:
// 1. If a layer is in the selection, all layers below it are in the selection.
// 2. Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection.
let (layer_selection, gc_cutoff, retain_lsns_below_horizon) = {
let guard = self.layers.read().await;
let layers = guard.layer_map();
let layers = guard.layer_map()?;
let gc_info = self.gc_info.read().unwrap();
let mut retain_lsns_below_horizon = Vec::new();
let gc_cutoff = gc_info.cutoffs.select_min();
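The Step 0 comment above states two properties of the layer selection. Below is a toy model of property 1, with the selection criterion paraphrased as "LSN range starts at or below the GC cutoff"; that paraphrase is an assumption for illustration, not the exact predicate used here.
struct LayerModel {
    lsn_start: u64,
    lsn_end: u64,
}
fn selected(layer: &LayerModel, gc_cutoff: u64) -> bool {
    layer.lsn_start <= gc_cutoff
}
fn main() {
    let layers = [
        LayerModel { lsn_start: 0x10, lsn_end: 0x20 },
        LayerModel { lsn_start: 0x20, lsn_end: 0x30 },
        LayerModel { lsn_start: 0x30, lsn_end: 0x40 },
    ];
    let gc_cutoff = 0x28;
    for layer in &layers {
        if selected(layer, gc_cutoff) {
            // Property 1: every layer entirely below a selected layer is selected too,
            // so a key's history inside the selection is self-contained.
            for below in layers.iter().filter(|b| b.lsn_end <= layer.lsn_start) {
                assert!(selected(below, gc_cutoff));
            }
        }
    }
    println!("selection property holds for this toy layer set");
}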
@@ -1740,6 +1849,9 @@ impl Timeline {
let key_range = desc.get_key_range();
delta_split_points.insert(key_range.start);
delta_split_points.insert(key_range.end);
stat.visit_delta_layer(desc.file_size());
} else {
stat.visit_image_layer(desc.file_size());
}
}
let mut delta_layers = Vec::new();
@@ -1775,6 +1887,8 @@ impl Timeline {
tline: &Arc<Timeline>,
lowest_retain_lsn: Lsn,
ctx: &RequestContext,
stats: &mut CompactionStatistics,
dry_run: bool,
last_batch: bool,
) -> anyhow::Result<Option<FlushDeltaResult>> {
// Check if we need to split the delta layer. We split at the original delta layer boundary to avoid
@@ -1831,6 +1945,7 @@ impl Timeline {
let layer_generation = guard.get_from_key(&delta_key).metadata().generation;
drop(guard);
if layer_generation == tline.generation {
stats.discard_delta_layer();
// TODO: depending on whether we design this compaction process to run along with
// other compactions, there could be layer map modifications after we drop the
// layer guard, and in case it creates duplicated layer key, we will still error
@@ -1857,9 +1972,16 @@ impl Timeline {
for (key, lsn, val) in deltas {
delta_layer_writer.put_value(key, lsn, val, ctx).await?;
}
let delta_layer = delta_layer_writer
.finish(delta_key.key_range.end, tline, ctx)
stats.produce_delta_layer(delta_layer_writer.size());
if dry_run {
return Ok(None);
}
let (desc, path) = delta_layer_writer
.finish(delta_key.key_range.end, ctx)
.await?;
let delta_layer = Layer::finish_creating(tline.conf, tline, desc, &path)?;
Ok(Some(FlushDeltaResult::CreateResidentLayer(delta_layer)))
}
@@ -1951,6 +2073,13 @@ impl Timeline {
let mut current_delta_split_point = 0;
let mut delta_layers = Vec::new();
while let Some((key, lsn, val)) = merge_iter.next().await? {
if cancel.is_cancelled() {
return Err(anyhow!("cancelled")); // TODO: refactor to CompactionError and pass cancel error
}
match val {
Value::Image(_) => stat.visit_image_key(&val),
Value::WalRecord(_) => stat.visit_wal_key(&val),
}
if last_key.is_none() || last_key.as_ref() == Some(&key) {
if last_key.is_none() {
last_key = Some(key);
@@ -1958,6 +2087,7 @@ impl Timeline {
accumulated_values.push((key, lsn, val));
} else {
let last_key = last_key.as_mut().unwrap();
stat.on_unique_key_visited();
let retention = self
.generate_key_retention(
*last_key,
@@ -1974,6 +2104,7 @@ impl Timeline {
*last_key,
&mut delta_values,
image_layer_writer.as_mut(),
&mut stat,
ctx,
)
.await?;
@@ -1986,6 +2117,8 @@ impl Timeline {
self,
lowest_retain_lsn,
ctx,
&mut stat,
dry_run,
false,
)
.await?,
@@ -1998,6 +2131,7 @@ impl Timeline {
let last_key = last_key.expect("no keys produced during compaction");
// TODO: move this part to the loop body
stat.on_unique_key_visited();
let retention = self
.generate_key_retention(
last_key,
@@ -2014,6 +2148,7 @@ impl Timeline {
last_key,
&mut delta_values,
image_layer_writer.as_mut(),
&mut stat,
ctx,
)
.await?;
@@ -2026,6 +2161,8 @@ impl Timeline {
self,
lowest_retain_lsn,
ctx,
&mut stat,
dry_run,
true,
)
.await?,
@@ -2033,12 +2170,28 @@ impl Timeline {
assert!(delta_values.is_empty(), "unprocessed keys");
let image_layer = if discard_image_layer {
stat.discard_image_layer();
None
} else if let Some(writer) = image_layer_writer {
Some(writer.finish(self, ctx).await?)
stat.produce_image_layer(writer.size());
if !dry_run {
Some(writer.finish(self, ctx).await?)
} else {
None
}
} else {
None
};
info!(
"gc-compaction statistics: {}",
serde_json::to_string(&stat)?
);
if dry_run {
return Ok(());
}
info!(
"produced {} delta layers and {} image layers",
delta_layers.len(),
@@ -2062,10 +2215,13 @@ impl Timeline {
let mut layer_selection = layer_selection;
layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key()));
compact_to.extend(image_layer);
// Step 3: Place back to the layer map.
{
let mut guard = self.layers.write().await;
guard.finish_gc_compaction(&layer_selection, &compact_to, &self.metrics)
guard
.open_mut()?
.finish_gc_compaction(&layer_selection, &compact_to, &self.metrics)
};
self.remote_client
.schedule_compaction_update(&layer_selection, &compact_to)?;
@@ -2145,7 +2301,7 @@ impl CompactionJobExecutor for TimelineAdaptor {
self.flush_updates().await?;
let guard = self.timeline.layers.read().await;
let layer_map = guard.layer_map();
let layer_map = guard.layer_map()?;
let result = layer_map
.iter_historic_layers()
@@ -2268,9 +2424,9 @@ impl CompactionJobExecutor for TimelineAdaptor {
))
});
let new_delta_layer = writer
.finish(prev.unwrap().0.next(), &self.timeline, ctx)
.await?;
let (desc, path) = writer.finish(prev.unwrap().0.next(), ctx).await?;
let new_delta_layer =
Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)?;
self.new_deltas.push(new_delta_layer);
Ok(())


@@ -230,6 +230,8 @@ impl DeleteTimelineFlow {
// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
timeline.shutdown(super::ShutdownMode::Hard).await;
tenant.gc_block.before_delete(&timeline);
fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
Err(anyhow::anyhow!(
"failpoint: timeline-delete-before-index-deleted-at"


@@ -1,4 +1,4 @@
use std::sync::Arc;
use std::{collections::HashSet, sync::Arc};
use super::{layer_manager::LayerManager, FlushLayerError, Timeline};
use crate::{
@@ -74,6 +74,11 @@ impl From<crate::tenant::upload_queue::NotInitialized> for Error {
Error::ShuttingDown
}
}
impl From<super::layer_manager::Shutdown> for Error {
fn from(_: super::layer_manager::Shutdown) -> Self {
Error::ShuttingDown
}
}
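This new `From<layer_manager::Shutdown>` conversion goes with the `guard.layer_map()?` and `guard.open_mut()?` call sites elsewhere in this change set. Below is a minimal sketch of the pattern those call sites assume, with stand-in types rather than the real LayerManager: accessors return `Result<_, Shutdown>`, so a shut-down layer map propagates as an error instead of panicking.
#[derive(Debug)]
struct Shutdown;
enum LayerManagerModel {
    Open(Vec<String>), // stand-in for the real layer map
    Closed,
}
impl LayerManagerModel {
    fn layer_map(&self) -> Result<&Vec<String>, Shutdown> {
        match self {
            LayerManagerModel::Open(layers) => Ok(layers),
            LayerManagerModel::Closed => Err(Shutdown),
        }
    }
    fn open_mut(&mut self) -> Result<&mut Vec<String>, Shutdown> {
        match self {
            LayerManagerModel::Open(layers) => Ok(layers),
            LayerManagerModel::Closed => Err(Shutdown),
        }
    }
}
fn main() -> Result<(), Shutdown> {
    let mut mgr = LayerManagerModel::Open(vec!["delta-1".into()]);
    // Both reads and writes have to acknowledge the possible shutdown state.
    println!("{} layers", mgr.layer_map()?.len());
    mgr.open_mut()?.push("image-1".into());
    // Once shut down, the same call sites surface an error instead of panicking.
    mgr = LayerManagerModel::Closed;
    assert!(mgr.layer_map().is_err());
    Ok(())
}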
impl From<FlushLayerError> for Error {
fn from(value: FlushLayerError) -> Self {
@@ -141,50 +146,9 @@ pub(super) async fn prepare(
}
}
// detached has previously been detached; let's inspect each of the current timelines and
// report back the timelines which have been reparented by our detach
let mut all_direct_children = tenant
.timelines
.lock()
.unwrap()
.values()
.filter(|tl| matches!(tl.ancestor_timeline.as_ref(), Some(ancestor) if Arc::ptr_eq(ancestor, detached)))
.map(|tl| (tl.ancestor_lsn, tl.clone()))
.collect::<Vec<_>>();
let mut any_shutdown = false;
all_direct_children.retain(
|(_, tl)| match tl.remote_client.initialized_upload_queue() {
Ok(accessor) => accessor
.latest_uploaded_index_part()
.lineage
.is_reparented(),
Err(_shutdownalike) => {
// not 100% a shutdown, but let's bail early not to give inconsistent results in
// sharded enviroment.
any_shutdown = true;
true
}
},
);
if any_shutdown {
// it could be one or many being deleted; have client retry
return Err(Error::ShuttingDown);
}
let mut reparented = all_direct_children;
// why this instead of hashset? there is a reason, but I've forgotten it many times.
//
// maybe if this was a hashset we would not be able to distinguish some race condition.
reparented.sort_unstable_by_key(|(lsn, tl)| (*lsn, tl.timeline_id));
let reparented_timelines = reparented_direct_children(detached, tenant)?;
return Ok(Progress::Done(AncestorDetached {
reparented_timelines: reparented
.into_iter()
.map(|(_, tl)| tl.timeline_id)
.collect(),
reparented_timelines,
}));
};
@@ -277,7 +241,7 @@ pub(super) async fn prepare(
// between retries, these can change if compaction or gc ran in between. this will mean
// we have to redo work.
partition_work(ancestor_lsn, &layers)
partition_work(ancestor_lsn, &layers)?
};
// TODO: layers are already sorted by something: use that to determine how much of remote
@@ -381,16 +345,67 @@ pub(super) async fn prepare(
Ok(Progress::Prepared(guard, prepared))
}
fn reparented_direct_children(
detached: &Arc<Timeline>,
tenant: &Tenant,
) -> Result<HashSet<TimelineId>, Error> {
let mut all_direct_children = tenant
.timelines
.lock()
.unwrap()
.values()
.filter_map(|tl| {
let is_direct_child = matches!(tl.ancestor_timeline.as_ref(), Some(ancestor) if Arc::ptr_eq(ancestor, detached));
if is_direct_child {
Some(tl.clone())
} else {
if let Some(timeline) = tl.ancestor_timeline.as_ref() {
assert_ne!(timeline.timeline_id, detached.timeline_id, "we cannot have two timelines with the same timeline_id live");
}
None
}
})
// Collect to avoid lock taking order problem with Tenant::timelines and
// Timeline::remote_client
.collect::<Vec<_>>();
let mut any_shutdown = false;
all_direct_children.retain(|tl| match tl.remote_client.initialized_upload_queue() {
Ok(accessor) => accessor
.latest_uploaded_index_part()
.lineage
.is_reparented(),
Err(_shutdownalike) => {
// not 100% a shutdown, but let's bail out early so as not to give inconsistent results in
// a sharded environment.
any_shutdown = true;
true
}
});
if any_shutdown {
// it could be one or many being deleted; have client retry
return Err(Error::ShuttingDown);
}
Ok(all_direct_children
.into_iter()
.map(|tl| tl.timeline_id)
.collect())
}
fn partition_work(
ancestor_lsn: Lsn,
source_layermap: &LayerManager,
) -> (usize, Vec<Layer>, Vec<Layer>) {
source: &LayerManager,
) -> Result<(usize, Vec<Layer>, Vec<Layer>), Error> {
let mut straddling_branchpoint = vec![];
let mut rest_of_historic = vec![];
let mut later_by_lsn = 0;
for desc in source_layermap.layer_map().iter_historic_layers() {
for desc in source.layer_map()?.iter_historic_layers() {
// off by one chances here:
// - start is inclusive
// - end is exclusive
@@ -409,10 +424,10 @@ fn partition_work(
&mut rest_of_historic
};
target.push(source_layermap.get_from_desc(&desc));
target.push(source.get_from_desc(&desc));
}
(later_by_lsn, straddling_branchpoint, rest_of_historic)
Ok((later_by_lsn, straddling_branchpoint, rest_of_historic))
}
async fn upload_rewritten_layer(
@@ -488,10 +503,12 @@ async fn copy_lsn_prefix(
// reuse the key instead of adding more holes between layers by using the real
// highest key in the layer.
let reused_highest_key = layer.layer_desc().key_range.end;
let copied = writer
.finish(reused_highest_key, target_timeline, ctx)
let (desc, path) = writer
.finish(reused_highest_key, ctx)
.await
.map_err(CopyDeltaPrefix)?;
let copied = Layer::finish_creating(target_timeline.conf, target_timeline, desc, &path)
.map_err(CopyDeltaPrefix)?;
tracing::debug!(%layer, %copied, "new layer produced");
@@ -537,11 +554,12 @@ pub(super) async fn complete(
tenant: &Tenant,
prepared: PreparedTimelineDetach,
_ctx: &RequestContext,
) -> Result<Vec<TimelineId>, anyhow::Error> {
) -> Result<HashSet<TimelineId>, anyhow::Error> {
let PreparedTimelineDetach { layers } = prepared;
let ancestor = detached
.get_ancestor_timeline()
.ancestor_timeline
.as_ref()
.expect("must still have a ancestor");
let ancestor_lsn = detached.get_ancestor_lsn();
@@ -581,7 +599,7 @@ pub(super) async fn complete(
}
let tl_ancestor = tl.ancestor_timeline.as_ref()?;
let is_same = Arc::ptr_eq(&ancestor, tl_ancestor);
let is_same = Arc::ptr_eq(ancestor, tl_ancestor);
let is_earlier = tl.get_ancestor_lsn() <= ancestor_lsn;
let is_deleting = tl
@@ -622,13 +640,18 @@ pub(super) async fn complete(
});
let reparenting_candidates = tasks.len();
let mut reparented = Vec::with_capacity(tasks.len());
let mut reparented = HashSet::with_capacity(tasks.len());
while let Some(res) = tasks.join_next().await {
match res {
Ok(Some(timeline)) => {
tracing::info!(reparented=%timeline.timeline_id, "reparenting done");
reparented.push((timeline.ancestor_lsn, timeline.timeline_id));
assert!(
reparented.insert(timeline.timeline_id),
"duplicate reparenting? timeline_id={}",
timeline.timeline_id
);
}
Ok(None) => {
// let's just ignore this for now. one or all reparented timelines could have
@@ -650,12 +673,5 @@ pub(super) async fn complete(
tracing::info!("failed to reparent some candidates");
}
reparented.sort_unstable();
let reparented = reparented
.into_iter()
.map(|(_, timeline_id)| timeline_id)
.collect();
Ok(reparented)
}

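The direct-child filter in `reparented_direct_children` above keys off `Arc::ptr_eq`, i.e. pointer identity of the ancestor `Arc`, not value equality. A minimal, self-contained sketch of that check (the `is_direct_child` helper and the string payloads are illustrative, not pageserver code):

```
use std::sync::Arc;

/// Illustrative only: a candidate is a direct child of `detached` iff its
/// ancestor is the very same Arc allocation, compared by pointer identity
/// rather than by value.
fn is_direct_child<T>(candidate_ancestor: Option<&Arc<T>>, detached: &Arc<T>) -> bool {
    matches!(candidate_ancestor, Some(ancestor) if Arc::ptr_eq(ancestor, detached))
}

fn main() {
    let detached = Arc::new("timeline A");
    let child_ancestor = Arc::clone(&detached);
    let unrelated = Arc::new("timeline A"); // equal value, different allocation

    assert!(is_direct_child(Some(&child_ancestor), &detached));
    assert!(!is_direct_child(Some(&unrelated), &detached));
    assert!(!is_direct_child(None, &detached));
}
```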

@@ -213,51 +213,45 @@ impl Timeline {
let mut js = tokio::task::JoinSet::new();
{
let guard = self.layers.read().await;
let layers = guard.layer_map();
for layer in layers.iter_historic_layers() {
let layer = guard.get_from_desc(&layer);
// guard against eviction while we inspect it; it might be that eviction_task and
// disk_usage_eviction_task both select the same layers to be evicted, and
// seemingly free up double the space. both succeeding is of no consequence.
guard
.likely_resident_layers()
.filter(|layer| {
let last_activity_ts = layer.latest_activity();
if !layer.is_likely_resident() {
continue;
}
let no_activity_for = match now.duration_since(last_activity_ts) {
Ok(d) => d,
Err(_e) => {
// We reach here if `now` < `last_activity_ts`, which can legitimately
// happen if there is an access between us getting `now`, and us getting
// the access stats from the layer.
//
// The other reason why it can happen is system clock skew because
// SystemTime::now() is not monotonic, so, even if there is no access
// to the layer after we get `now` at the beginning of this function,
// it could be that `now` < `last_activity_ts`.
//
// To distinguish the cases, we would need to record `Instant`s in the
// access stats (i.e., monotonic timestamps), but then, the timestamps
// values in the access stats would need to be `Instant`'s, and hence
// they would be meaningless outside of the pageserver process.
// At the time of writing, the trade-off is that access stats are more
// valuable than detecting clock skew.
return false;
}
};
let last_activity_ts = layer.access_stats().latest_activity();
let no_activity_for = match now.duration_since(last_activity_ts) {
Ok(d) => d,
Err(_e) => {
// We reach here if `now` < `last_activity_ts`, which can legitimately
// happen if there is an access between us getting `now`, and us getting
// the access stats from the layer.
//
// The other reason why it can happen is system clock skew because
// SystemTime::now() is not monotonic, so, even if there is no access
// to the layer after we get `now` at the beginning of this function,
// it could be that `now` < `last_activity_ts`.
//
// To distinguish the cases, we would need to record `Instant`s in the
// access stats (i.e., monotonic timestamps), but then, the timestamps
// values in the access stats would need to be `Instant`'s, and hence
// they would be meaningless outside of the pageserver process.
// At the time of writing, the trade-off is that access stats are more
// valuable than detecting clock skew.
continue;
}
};
if no_activity_for > p.threshold {
no_activity_for > p.threshold
})
.cloned()
.for_each(|layer| {
js.spawn(async move {
layer
.evict_and_wait(std::time::Duration::from_secs(5))
.await
});
stats.candidates += 1;
}
}
});
};
let join_all = async move {

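The long comment above explains why `SystemTime::duration_since` can return `Err` (a racing access or clock skew makes `last_activity_ts` appear to be in the future) and why that case is treated as recent activity rather than an error. A standalone sketch of that policy, assuming nothing from the pageserver beyond the threshold comparison shown above:

```
use std::time::{Duration, SystemTime};

/// Illustrative only: mirrors the eviction filter's handling of
/// `SystemTime::duration_since`. An `Err` means `last_activity` appears to be
/// in the future (an access raced with sampling `now`, or the clock skewed),
/// so we report no idle time and the layer is not an eviction candidate.
fn no_activity_for(now: SystemTime, last_activity: SystemTime) -> Option<Duration> {
    now.duration_since(last_activity).ok()
}

fn is_eviction_candidate(now: SystemTime, last_activity: SystemTime, threshold: Duration) -> bool {
    no_activity_for(now, last_activity).is_some_and(|idle| idle > threshold)
}

fn main() {
    let now = SystemTime::now();
    let old = now - Duration::from_secs(3600);
    let future = now + Duration::from_secs(1); // "future" access: skipped, not an error

    assert!(is_eviction_candidate(now, old, Duration::from_secs(600)));
    assert!(!is_eviction_candidate(now, future, Duration::from_secs(600)));
}
```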

@@ -1,4 +1,4 @@
use anyhow::{bail, ensure, Context, Result};
use anyhow::{bail, ensure, Context};
use itertools::Itertools;
use pageserver_api::shard::TenantShardId;
use std::{collections::HashMap, sync::Arc};
@@ -24,39 +24,142 @@ use crate::{
use super::TimelineWriterState;
/// Provides semantic APIs to manipulate the layer map.
#[derive(Default)]
pub(crate) struct LayerManager {
layer_map: LayerMap,
layer_fmgr: LayerFileManager<Layer>,
pub(crate) enum LayerManager {
/// Open, as in not shut down: we still have in-memory layers and we can manipulate
/// the layers.
Open(OpenLayerManager),
/// Shutdown layer manager where there are no more in-memory layers and persistent layers are
/// read-only.
Closed {
layers: HashMap<PersistentLayerKey, Layer>,
},
}
impl Default for LayerManager {
fn default() -> Self {
LayerManager::Open(OpenLayerManager::default())
}
}
impl LayerManager {
pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer {
self.layer_fmgr.get_from_desc(desc)
pub(crate) fn get_from_key(&self, key: &PersistentLayerKey) -> Layer {
// The assumption for the `expect()` is that all code maintains the following invariant:
// A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
self.layers()
.get(key)
.with_context(|| format!("get layer from key: {key}"))
.expect("not found")
.clone()
}
pub(crate) fn get_from_key(&self, desc: &PersistentLayerKey) -> Layer {
self.layer_fmgr.get_from_key(desc)
pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer {
self.get_from_key(&desc.key())
}
/// Get an immutable reference to the layer map.
///
/// We expect users to only get an immutable reference to the layer map. If users want to make modifications,
/// they should use the semantic APIs below. This design moves us a step closer to an immutable storage state.
pub(crate) fn layer_map(&self) -> &LayerMap {
&self.layer_map
pub(crate) fn layer_map(&self) -> Result<&LayerMap, Shutdown> {
use LayerManager::*;
match self {
Open(OpenLayerManager { layer_map, .. }) => Ok(layer_map),
Closed { .. } => Err(Shutdown),
}
}
pub(crate) fn open_mut(&mut self) -> Result<&mut OpenLayerManager, Shutdown> {
use LayerManager::*;
match self {
Open(open) => Ok(open),
Closed { .. } => Err(Shutdown),
}
}
/// LayerManager shutdown. The in-memory layers do cleanup on drop, so we must drop them in
/// order to allow shutdown to complete.
///
/// If the in-memory layers needed to be flushed, that must have happened earlier.
pub(crate) fn shutdown(&mut self, writer_state: &mut Option<TimelineWriterState>) {
use LayerManager::*;
match self {
Open(OpenLayerManager {
layer_map,
layer_fmgr: LayerFileManager(hashmap),
}) => {
let open = layer_map.open_layer.take();
let frozen = layer_map.frozen_layers.len();
let taken_writer_state = writer_state.take();
tracing::info!(open = open.is_some(), frozen, "dropped inmemory layers");
let layers = std::mem::take(hashmap);
*self = Closed { layers };
assert_eq!(open.is_some(), taken_writer_state.is_some());
}
Closed { .. } => {
tracing::debug!("ignoring multiple shutdowns on layer manager")
}
}
}
/// Sum up the historic layer sizes
pub(crate) fn layer_size_sum(&self) -> u64 {
self.layers()
.values()
.map(|l| l.layer_desc().file_size)
.sum()
}
pub(crate) fn likely_resident_layers(&self) -> impl Iterator<Item = &'_ Layer> + '_ {
self.layers().values().filter(|l| l.is_likely_resident())
}
pub(crate) fn contains(&self, layer: &Layer) -> bool {
self.contains_key(&layer.layer_desc().key())
}
pub(crate) fn contains_key(&self, key: &PersistentLayerKey) -> bool {
self.layers().contains_key(key)
}
pub(crate) fn all_persistent_layers(&self) -> Vec<PersistentLayerKey> {
self.layers().keys().cloned().collect_vec()
}
fn layers(&self) -> &HashMap<PersistentLayerKey, Layer> {
use LayerManager::*;
match self {
Open(OpenLayerManager { layer_fmgr, .. }) => &layer_fmgr.0,
Closed { layers } => layers,
}
}
}
#[derive(Default)]
pub(crate) struct OpenLayerManager {
layer_map: LayerMap,
layer_fmgr: LayerFileManager<Layer>,
}
impl std::fmt::Debug for OpenLayerManager {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("OpenLayerManager")
.field("layer_count", &self.layer_fmgr.0.len())
.finish()
}
}
#[derive(Debug, thiserror::Error)]
#[error("layer manager has been shutdown")]
pub(crate) struct Shutdown;
impl OpenLayerManager {
/// Called from `load_layer_map`. Initialize the layer manager with:
/// 1. all on-disk layers
/// 2. next open layer (with the disk_consistent_lsn LSN)
pub(crate) fn initialize_local_layers(
&mut self,
on_disk_layers: Vec<Layer>,
next_open_layer_at: Lsn,
) {
pub(crate) fn initialize_local_layers(&mut self, layers: Vec<Layer>, next_open_layer_at: Lsn) {
let mut updates = self.layer_map.batch_update();
for layer in on_disk_layers {
for layer in layers {
Self::insert_historic_layer(layer, &mut updates, &mut self.layer_fmgr);
}
updates.flush();
@@ -68,26 +171,19 @@ impl LayerManager {
self.layer_map.next_open_layer_at = Some(next_open_layer_at);
}
/// Open a new writable layer to append data if there is no open layer, otherwise return the current open layer,
/// called within `get_layer_for_write`.
/// Open a new writable layer to append data if there is no open layer, otherwise return the
/// current open layer, called within `get_layer_for_write`.
pub(crate) async fn get_layer_for_write(
&mut self,
lsn: Lsn,
last_record_lsn: Lsn,
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_shard_id: TenantShardId,
gate_guard: utils::sync::gate::GateGuard,
ctx: &RequestContext,
) -> Result<Arc<InMemoryLayer>> {
) -> anyhow::Result<Arc<InMemoryLayer>> {
ensure!(lsn.is_aligned());
ensure!(
lsn > last_record_lsn,
"cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})",
lsn,
last_record_lsn,
);
// Do we have a layer open for writing already?
let layer = if let Some(open_layer) = &self.layer_map.open_layer {
if open_layer.get_lsn_range().start > lsn {
@@ -113,8 +209,15 @@ impl LayerManager {
lsn
);
let new_layer =
InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn, ctx).await?;
let new_layer = InMemoryLayer::create(
conf,
timeline_id,
tenant_shard_id,
start_lsn,
gate_guard,
ctx,
)
.await?;
let layer = Arc::new(new_layer);
self.layer_map.open_layer = Some(layer.clone());
@@ -168,7 +271,7 @@ impl LayerManager {
froze
}
/// Add image layers to the layer map, called from `create_image_layers`.
/// Add image layers to the layer map, called from [`super::Timeline::create_image_layers`].
pub(crate) fn track_new_image_layers(
&mut self,
image_layers: &[ResidentLayer],
@@ -241,7 +344,7 @@ impl LayerManager {
self.finish_compact_l0(compact_from, compact_to, metrics)
}
/// Called when compaction is completed.
/// Called post-compaction when some previous generation image layers were trimmed.
pub(crate) fn rewrite_layers(
&mut self,
rewrite_layers: &[(Layer, ResidentLayer)],
@@ -259,13 +362,10 @@ impl LayerManager {
new_layer.layer_desc().lsn_range
);
// Transfer visibilty hint from old to new layer, since the new layer covers the same key space. This is not guaranteed to
// Transfer visibility hint from old to new layer, since the new layer covers the same key space. This is not guaranteed to
// be accurate (as the new layer may cover a different subset of the key range), but is a sensible default, and prevents
// always marking rewritten layers as visible.
new_layer
.as_ref()
.access_stats()
.set_visibility(old_layer.access_stats().visibility());
new_layer.as_ref().set_visibility(old_layer.visibility());
// Safety: we may never rewrite the same file in-place. Callers are responsible
// for ensuring that they only rewrite layers after something changes the path,
@@ -333,31 +433,6 @@ impl LayerManager {
mapping.remove(layer);
layer.delete_on_drop();
}
pub(crate) fn likely_resident_layers(&self) -> impl Iterator<Item = Layer> + '_ {
// for small layer maps, we most likely have all resident, but for larger more are likely
// to be evicted assuming lots of layers correlated with longer lifespan.
self.layer_map().iter_historic_layers().filter_map(|desc| {
self.layer_fmgr
.0
.get(&desc.key())
.filter(|l| l.is_likely_resident())
.cloned()
})
}
pub(crate) fn contains(&self, layer: &Layer) -> bool {
self.layer_fmgr.contains(layer)
}
pub(crate) fn contains_key(&self, key: &PersistentLayerKey) -> bool {
self.layer_fmgr.contains_key(key)
}
pub(crate) fn all_persistent_layers(&self) -> Vec<PersistentLayerKey> {
self.layer_fmgr.0.keys().cloned().collect_vec()
}
}
pub(crate) struct LayerFileManager<T>(HashMap<PersistentLayerKey, T>);
@@ -369,24 +444,6 @@ impl<T> Default for LayerFileManager<T> {
}
impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
fn get_from_key(&self, key: &PersistentLayerKey) -> T {
// The assumption for the `expect()` is that all code maintains the following invariant:
// A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
self.0
.get(key)
.with_context(|| format!("get layer from key: {}", key))
.expect("not found")
.clone()
}
fn get_from_desc(&self, desc: &PersistentLayerDesc) -> T {
self.get_from_key(&desc.key())
}
fn contains_key(&self, key: &PersistentLayerKey) -> bool {
self.0.contains_key(key)
}
pub(crate) fn insert(&mut self, layer: T) {
let present = self.0.insert(layer.layer_desc().key(), layer.clone());
if present.is_some() && cfg!(debug_assertions) {
@@ -394,10 +451,6 @@ impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
}
}
pub(crate) fn contains(&self, layer: &T) -> bool {
self.0.contains_key(&layer.layer_desc().key())
}
pub(crate) fn remove(&mut self, layer: &T) {
let present = self.0.remove(&layer.layer_desc().key());
if present.is_none() && cfg!(debug_assertions) {

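A short sketch of how callers are expected to use the new `LayerManager` shape: reads go through `layer_map()`, mutations through `open_mut()`, and both return `Shutdown` once the manager has been shut down, while `layer_size_sum()` keeps working in either state. Only the types and methods named here come from the diff above; the helper functions are illustrative and assume in-crate context:

```
// Illustrative helpers; LayerManager, OpenLayerManager, Shutdown, layer_map(),
// open_mut(), iter_historic_layers() and layer_size_sum() are the items above.

/// Read path: once the manager is shut down, `layer_map()` surfaces `Shutdown`
/// instead of handing out a stale map.
fn historic_layer_count(layers: &LayerManager) -> Result<usize, Shutdown> {
    Ok(layers.layer_map()?.iter_historic_layers().count())
}

/// Write path: mutations must first obtain the open state; a shut-down manager
/// rejects them, and callers typically map `Shutdown` into their own
/// "shutting down" error (as the `From<layer_manager::Shutdown>` impl earlier
/// in this diff does).
fn open_for_write(layers: &mut LayerManager) -> Result<&mut OpenLayerManager, Shutdown> {
    layers.open_mut()
}

/// Size accounting works in both Open and Closed states, because both variants
/// keep the map of persistent layers.
fn total_layer_bytes(layers: &LayerManager) -> u64 {
    layers.layer_size_sum()
}
```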

@@ -122,6 +122,10 @@ impl CurrentLogicalSize {
Self::Exact(_) => Accuracy::Exact,
}
}
pub(crate) fn is_exact(&self) -> bool {
matches!(self, Self::Exact(_))
}
}
impl LogicalSize {


@@ -30,10 +30,12 @@ use tokio::time::Instant;
pub use pageserver_api::models::virtual_file as api;
pub(crate) mod io_engine;
pub use io_engine::feature_test as io_engine_feature_test;
pub use io_engine::io_engine_for_bench;
pub use io_engine::FeatureTestResult as IoEngineFeatureTestResult;
mod metadata;
mod open_options;
use self::owned_buffers_io::write::OwnedAsyncWriter;
pub(crate) use api::DirectIoMode;
pub(crate) use io_engine::IoEngineKind;
pub(crate) use metadata::Metadata;
pub(crate) use open_options::*;


@@ -328,3 +328,29 @@ pub fn feature_test() -> anyhow::Result<FeatureTestResult> {
.join()
.unwrap()
}
/// For use in benchmark binaries only.
///
/// Benchmarks which initialize `virtual_file` need to know what engine to use, but we also
/// don't want to silently fall back to slower I/O engines in a benchmark: this could waste
/// developer time trying to figure out why it's slow.
///
/// In practice, this method will either return IoEngineKind::TokioEpollUring, or panic.
pub fn io_engine_for_bench() -> IoEngineKind {
#[cfg(not(target_os = "linux"))]
{
panic!("This benchmark does I/O and can only give a representative result on Linux");
}
#[cfg(target_os = "linux")]
{
match feature_test().unwrap() {
FeatureTestResult::PlatformPreferred(engine) => engine,
FeatureTestResult::Worse {
engine: _engine,
remark,
} => {
panic!("This benchmark does I/O can requires the preferred I/O engine: {remark}");
}
}
}
}


@@ -45,6 +45,7 @@ static const char *jwt_token = NULL;
/* GUCs */
static char *ConsoleURL = NULL;
static bool ForwardDDL = true;
static bool RegressTestMode = false;
/*
* CURL docs say that this buffer must exist until we call curl_easy_cleanup
@@ -802,6 +803,14 @@ NeonProcessUtility(
case T_DropRoleStmt:
HandleDropRole(castNode(DropRoleStmt, parseTree));
break;
case T_CreateTableSpaceStmt:
if (!RegressTestMode)
{
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("CREATE TABLESPACE is not supported on Neon")));
}
break;
default:
break;
}
@@ -864,6 +873,18 @@ InitControlPlaneConnector()
NULL,
NULL);
DefineCustomBoolVariable(
"neon.regress_test_mode",
"Controls whether we are running in the regression test mode",
NULL,
&RegressTestMode,
false,
PGC_SUSET,
0,
NULL,
NULL,
NULL);
jwt_token = getenv("NEON_CONTROL_PLANE_TOKEN");
if (!jwt_token)
{

poetry.lock (generated, 184 changed lines)

@@ -1,91 +1,103 @@
# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
[[package]]
name = "aiohappyeyeballs"
version = "2.3.5"
description = "Happy Eyeballs for asyncio"
optional = false
python-versions = ">=3.8"
files = [
{file = "aiohappyeyeballs-2.3.5-py3-none-any.whl", hash = "sha256:4d6dea59215537dbc746e93e779caea8178c866856a721c9c660d7a5a7b8be03"},
{file = "aiohappyeyeballs-2.3.5.tar.gz", hash = "sha256:6fa48b9f1317254f122a07a131a86b71ca6946ca989ce6326fff54a99a920105"},
]
[[package]]
name = "aiohttp"
version = "3.9.4"
version = "3.10.2"
description = "Async http client/server framework (asyncio)"
optional = false
python-versions = ">=3.8"
files = [
{file = "aiohttp-3.9.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:76d32588ef7e4a3f3adff1956a0ba96faabbdee58f2407c122dd45aa6e34f372"},
{file = "aiohttp-3.9.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:56181093c10dbc6ceb8a29dfeea1e815e1dfdc020169203d87fd8d37616f73f9"},
{file = "aiohttp-3.9.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c7a5b676d3c65e88b3aca41816bf72831898fcd73f0cbb2680e9d88e819d1e4d"},
{file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d1df528a85fb404899d4207a8d9934cfd6be626e30e5d3a5544a83dbae6d8a7e"},
{file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f595db1bceabd71c82e92df212dd9525a8a2c6947d39e3c994c4f27d2fe15b11"},
{file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9c0b09d76e5a4caac3d27752027fbd43dc987b95f3748fad2b924a03fe8632ad"},
{file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:689eb4356649ec9535b3686200b231876fb4cab4aca54e3bece71d37f50c1d13"},
{file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a3666cf4182efdb44d73602379a66f5fdfd5da0db5e4520f0ac0dcca644a3497"},
{file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b65b0f8747b013570eea2f75726046fa54fa8e0c5db60f3b98dd5d161052004a"},
{file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:a1885d2470955f70dfdd33a02e1749613c5a9c5ab855f6db38e0b9389453dce7"},
{file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:0593822dcdb9483d41f12041ff7c90d4d1033ec0e880bcfaf102919b715f47f1"},
{file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:47f6eb74e1ecb5e19a78f4a4228aa24df7fbab3b62d4a625d3f41194a08bd54f"},
{file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:c8b04a3dbd54de6ccb7604242fe3ad67f2f3ca558f2d33fe19d4b08d90701a89"},
{file = "aiohttp-3.9.4-cp310-cp310-win32.whl", hash = "sha256:8a78dfb198a328bfb38e4308ca8167028920fb747ddcf086ce706fbdd23b2926"},
{file = "aiohttp-3.9.4-cp310-cp310-win_amd64.whl", hash = "sha256:e78da6b55275987cbc89141a1d8e75f5070e577c482dd48bd9123a76a96f0bbb"},
{file = "aiohttp-3.9.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c111b3c69060d2bafc446917534150fd049e7aedd6cbf21ba526a5a97b4402a5"},
{file = "aiohttp-3.9.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:efbdd51872cf170093998c87ccdf3cb5993add3559341a8e5708bcb311934c94"},
{file = "aiohttp-3.9.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7bfdb41dc6e85d8535b00d73947548a748e9534e8e4fddd2638109ff3fb081df"},
{file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2bd9d334412961125e9f68d5b73c1d0ab9ea3f74a58a475e6b119f5293eee7ba"},
{file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:35d78076736f4a668d57ade00c65d30a8ce28719d8a42471b2a06ccd1a2e3063"},
{file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:824dff4f9f4d0f59d0fa3577932ee9a20e09edec8a2f813e1d6b9f89ced8293f"},
{file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:52b8b4e06fc15519019e128abedaeb56412b106ab88b3c452188ca47a25c4093"},
{file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eae569fb1e7559d4f3919965617bb39f9e753967fae55ce13454bec2d1c54f09"},
{file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:69b97aa5792428f321f72aeb2f118e56893371f27e0b7d05750bcad06fc42ca1"},
{file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4d79aad0ad4b980663316f26d9a492e8fab2af77c69c0f33780a56843ad2f89e"},
{file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:d6577140cd7db19e430661e4b2653680194ea8c22c994bc65b7a19d8ec834403"},
{file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:9860d455847cd98eb67897f5957b7cd69fbcb436dd3f06099230f16a66e66f79"},
{file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:69ff36d3f8f5652994e08bd22f093e11cfd0444cea310f92e01b45a4e46b624e"},
{file = "aiohttp-3.9.4-cp311-cp311-win32.whl", hash = "sha256:e27d3b5ed2c2013bce66ad67ee57cbf614288bda8cdf426c8d8fe548316f1b5f"},
{file = "aiohttp-3.9.4-cp311-cp311-win_amd64.whl", hash = "sha256:d6a67e26daa686a6fbdb600a9af8619c80a332556245fa8e86c747d226ab1a1e"},
{file = "aiohttp-3.9.4-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:c5ff8ff44825736a4065d8544b43b43ee4c6dd1530f3a08e6c0578a813b0aa35"},
{file = "aiohttp-3.9.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:d12a244627eba4e9dc52cbf924edef905ddd6cafc6513849b4876076a6f38b0e"},
{file = "aiohttp-3.9.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:dcad56c8d8348e7e468899d2fb3b309b9bc59d94e6db08710555f7436156097f"},
{file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f7e69a7fd4b5ce419238388e55abd220336bd32212c673ceabc57ccf3d05b55"},
{file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4870cb049f10d7680c239b55428916d84158798eb8f353e74fa2c98980dcc0b"},
{file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3b2feaf1b7031ede1bc0880cec4b0776fd347259a723d625357bb4b82f62687b"},
{file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:939393e8c3f0a5bcd33ef7ace67680c318dc2ae406f15e381c0054dd658397de"},
{file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d2334e387b2adcc944680bebcf412743f2caf4eeebd550f67249c1c3696be04"},
{file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:e0198ea897680e480845ec0ffc5a14e8b694e25b3f104f63676d55bf76a82f1a"},
{file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:e40d2cd22914d67c84824045861a5bb0fb46586b15dfe4f046c7495bf08306b2"},
{file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:aba80e77c227f4234aa34a5ff2b6ff30c5d6a827a91d22ff6b999de9175d71bd"},
{file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:fb68dc73bc8ac322d2e392a59a9e396c4f35cb6fdbdd749e139d1d6c985f2527"},
{file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:f3460a92638dce7e47062cf088d6e7663adb135e936cb117be88d5e6c48c9d53"},
{file = "aiohttp-3.9.4-cp312-cp312-win32.whl", hash = "sha256:32dc814ddbb254f6170bca198fe307920f6c1308a5492f049f7f63554b88ef36"},
{file = "aiohttp-3.9.4-cp312-cp312-win_amd64.whl", hash = "sha256:63f41a909d182d2b78fe3abef557fcc14da50c7852f70ae3be60e83ff64edba5"},
{file = "aiohttp-3.9.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:c3770365675f6be220032f6609a8fbad994d6dcf3ef7dbcf295c7ee70884c9af"},
{file = "aiohttp-3.9.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:305edae1dea368ce09bcb858cf5a63a064f3bff4767dec6fa60a0cc0e805a1d3"},
{file = "aiohttp-3.9.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6f121900131d116e4a93b55ab0d12ad72573f967b100e49086e496a9b24523ea"},
{file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b71e614c1ae35c3d62a293b19eface83d5e4d194e3eb2fabb10059d33e6e8cbf"},
{file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:419f009fa4cfde4d16a7fc070d64f36d70a8d35a90d71aa27670bba2be4fd039"},
{file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7b39476ee69cfe64061fd77a73bf692c40021f8547cda617a3466530ef63f947"},
{file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b33f34c9c7decdb2ab99c74be6443942b730b56d9c5ee48fb7df2c86492f293c"},
{file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c78700130ce2dcebb1a8103202ae795be2fa8c9351d0dd22338fe3dac74847d9"},
{file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:268ba22d917655d1259af2d5659072b7dc11b4e1dc2cb9662fdd867d75afc6a4"},
{file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:17e7c051f53a0d2ebf33013a9cbf020bb4e098c4bc5bce6f7b0c962108d97eab"},
{file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:7be99f4abb008cb38e144f85f515598f4c2c8932bf11b65add0ff59c9c876d99"},
{file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:d58a54d6ff08d2547656356eea8572b224e6f9bbc0cf55fa9966bcaac4ddfb10"},
{file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:7673a76772bda15d0d10d1aa881b7911d0580c980dbd16e59d7ba1422b2d83cd"},
{file = "aiohttp-3.9.4-cp38-cp38-win32.whl", hash = "sha256:e4370dda04dc8951012f30e1ce7956a0a226ac0714a7b6c389fb2f43f22a250e"},
{file = "aiohttp-3.9.4-cp38-cp38-win_amd64.whl", hash = "sha256:eb30c4510a691bb87081192a394fb661860e75ca3896c01c6d186febe7c88530"},
{file = "aiohttp-3.9.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:84e90494db7df3be5e056f91412f9fa9e611fbe8ce4aaef70647297f5943b276"},
{file = "aiohttp-3.9.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7d4845f8501ab28ebfdbeab980a50a273b415cf69e96e4e674d43d86a464df9d"},
{file = "aiohttp-3.9.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:69046cd9a2a17245c4ce3c1f1a4ff8c70c7701ef222fce3d1d8435f09042bba1"},
{file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b73a06bafc8dcc508420db43b4dd5850e41e69de99009d0351c4f3007960019"},
{file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:418bb0038dfafeac923823c2e63226179976c76f981a2aaad0ad5d51f2229bca"},
{file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:71a8f241456b6c2668374d5d28398f8e8cdae4cce568aaea54e0f39359cd928d"},
{file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:935c369bf8acc2dc26f6eeb5222768aa7c62917c3554f7215f2ead7386b33748"},
{file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:74e4e48c8752d14ecfb36d2ebb3d76d614320570e14de0a3aa7a726ff150a03c"},
{file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:916b0417aeddf2c8c61291238ce25286f391a6acb6f28005dd9ce282bd6311b6"},
{file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:9b6787b6d0b3518b2ee4cbeadd24a507756ee703adbac1ab6dc7c4434b8c572a"},
{file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:221204dbda5ef350e8db6287937621cf75e85778b296c9c52260b522231940ed"},
{file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:10afd99b8251022ddf81eaed1d90f5a988e349ee7d779eb429fb07b670751e8c"},
{file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:2506d9f7a9b91033201be9ffe7d89c6a54150b0578803cce5cb84a943d075bc3"},
{file = "aiohttp-3.9.4-cp39-cp39-win32.whl", hash = "sha256:e571fdd9efd65e86c6af2f332e0e95dad259bfe6beb5d15b3c3eca3a6eb5d87b"},
{file = "aiohttp-3.9.4-cp39-cp39-win_amd64.whl", hash = "sha256:7d29dd5319d20aa3b7749719ac9685fbd926f71ac8c77b2477272725f882072d"},
{file = "aiohttp-3.9.4.tar.gz", hash = "sha256:6ff71ede6d9a5a58cfb7b6fffc83ab5d4a63138276c771ac91ceaaddf5459644"},
{file = "aiohttp-3.10.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:95213b3d79c7e387144e9cb7b9d2809092d6ff2c044cb59033aedc612f38fb6d"},
{file = "aiohttp-3.10.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1aa005f060aff7124cfadaa2493f00a4e28ed41b232add5869e129a2e395935a"},
{file = "aiohttp-3.10.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:eabe6bf4c199687592f5de4ccd383945f485779c7ffb62a9b9f1f8a3f9756df8"},
{file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:96e010736fc16d21125c7e2dc5c350cd43c528b85085c04bf73a77be328fe944"},
{file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:99f81f9c1529fd8e03be4a7bd7df32d14b4f856e90ef6e9cbad3415dbfa9166c"},
{file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d611d1a01c25277bcdea06879afbc11472e33ce842322496b211319aa95441bb"},
{file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e00191d38156e09e8c81ef3d75c0d70d4f209b8381e71622165f22ef7da6f101"},
{file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:74c091a5ded6cb81785de2d7a8ab703731f26de910dbe0f3934eabef4ae417cc"},
{file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:18186a80ec5a701816adbf1d779926e1069392cf18504528d6e52e14b5920525"},
{file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:5a7ceb2a0d2280f23a02c64cd0afdc922079bb950400c3dd13a1ab2988428aac"},
{file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:8bd7be6ff6c162a60cb8fce65ee879a684fbb63d5466aba3fa5b9288eb04aefa"},
{file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:fae962b62944eaebff4f4fddcf1a69de919e7b967136a318533d82d93c3c6bd1"},
{file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:a0fde16d284efcacbe15fb0c1013f0967b6c3e379649239d783868230bf1db42"},
{file = "aiohttp-3.10.2-cp310-cp310-win32.whl", hash = "sha256:f81cd85a0e76ec7b8e2b6636fe02952d35befda4196b8c88f3cec5b4fb512839"},
{file = "aiohttp-3.10.2-cp310-cp310-win_amd64.whl", hash = "sha256:54ba10eb5a3481c28282eb6afb5f709aedf53cf9c3a31875ffbdc9fc719ffd67"},
{file = "aiohttp-3.10.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:87fab7f948e407444c2f57088286e00e2ed0003ceaf3d8f8cc0f60544ba61d91"},
{file = "aiohttp-3.10.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ec6ad66ed660d46503243cbec7b2b3d8ddfa020f984209b3b8ef7d98ce69c3f2"},
{file = "aiohttp-3.10.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a4be88807283bd96ae7b8e401abde4ca0bab597ba73b5e9a2d98f36d451e9aac"},
{file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:01c98041f90927c2cbd72c22a164bb816fa3010a047d264969cf82e1d4bcf8d1"},
{file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:54e36c67e1a9273ecafab18d6693da0fb5ac48fd48417e4548ac24a918c20998"},
{file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7de3ddb6f424af54535424082a1b5d1ae8caf8256ebd445be68c31c662354720"},
{file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7dd9c7db94b4692b827ce51dcee597d61a0e4f4661162424faf65106775b40e7"},
{file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e57e21e1167705f8482ca29cc5d02702208d8bf4aff58f766d94bcd6ead838cd"},
{file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:a1a50e59b720060c29e2951fd9f13c01e1ea9492e5a527b92cfe04dd64453c16"},
{file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:686c87782481fda5ee6ba572d912a5c26d9f98cc5c243ebd03f95222af3f1b0f"},
{file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:dafb4abb257c0ed56dc36f4e928a7341b34b1379bd87e5a15ce5d883c2c90574"},
{file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:494a6f77560e02bd7d1ab579fdf8192390567fc96a603f21370f6e63690b7f3d"},
{file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6fe8503b1b917508cc68bf44dae28823ac05e9f091021e0c41f806ebbb23f92f"},
{file = "aiohttp-3.10.2-cp311-cp311-win32.whl", hash = "sha256:4ddb43d06ce786221c0dfd3c91b4892c318eaa36b903f7c4278e7e2fa0dd5102"},
{file = "aiohttp-3.10.2-cp311-cp311-win_amd64.whl", hash = "sha256:ca2f5abcb0a9a47e56bac173c01e9f6c6e7f27534d91451c5f22e6a35a5a2093"},
{file = "aiohttp-3.10.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:14eb6b17f6246959fb0b035d4f4ae52caa870c4edfb6170aad14c0de5bfbf478"},
{file = "aiohttp-3.10.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:465e445ec348d4e4bd349edd8b22db75f025da9d7b6dc1369c48e7935b85581e"},
{file = "aiohttp-3.10.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:341f8ece0276a828d95b70cd265d20e257f5132b46bf77d759d7f4e0443f2906"},
{file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c01fbb87b5426381cd9418b3ddcf4fc107e296fa2d3446c18ce6c76642f340a3"},
{file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2c474af073e1a6763e1c5522bbb2d85ff8318197e4c6c919b8d7886e16213345"},
{file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d9076810a5621236e29b2204e67a68e1fe317c8727ee4c9abbfbb1083b442c38"},
{file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8f515d6859e673940e08de3922b9c4a2249653b0ac181169313bd6e4b1978ac"},
{file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:655e583afc639bef06f3b2446972c1726007a21003cd0ef57116a123e44601bc"},
{file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8da9449a575133828cc99985536552ea2dcd690e848f9d41b48d8853a149a959"},
{file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:19073d57d0feb1865d12361e2a1f5a49cb764bf81a4024a3b608ab521568093a"},
{file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:c8e98e1845805f184d91fda6f9ab93d7c7b0dddf1c07e0255924bfdb151a8d05"},
{file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:377220a5efde6f9497c5b74649b8c261d3cce8a84cb661be2ed8099a2196400a"},
{file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:92f7f4a4dc9cdb5980973a74d43cdbb16286dacf8d1896b6c3023b8ba8436f8e"},
{file = "aiohttp-3.10.2-cp312-cp312-win32.whl", hash = "sha256:9bb2834a6f11d65374ce97d366d6311a9155ef92c4f0cee543b2155d06dc921f"},
{file = "aiohttp-3.10.2-cp312-cp312-win_amd64.whl", hash = "sha256:518dc3cb37365255708283d1c1c54485bbacccd84f0a0fb87ed8917ba45eda5b"},
{file = "aiohttp-3.10.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:7f98e70bbbf693086efe4b86d381efad8edac040b8ad02821453083d15ec315f"},
{file = "aiohttp-3.10.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9f6f0b252a009e98fe84028a4ec48396a948e7a65b8be06ccfc6ef68cf1f614d"},
{file = "aiohttp-3.10.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:9360e3ffc7b23565600e729e8c639c3c50d5520e05fdf94aa2bd859eef12c407"},
{file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3988044d1635c7821dd44f0edfbe47e9875427464e59d548aece447f8c22800a"},
{file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:30a9d59da1543a6f1478c3436fd49ec59be3868bca561a33778b4391005e499d"},
{file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f9f49bdb94809ac56e09a310a62f33e5f22973d6fd351aac72a39cd551e98194"},
{file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddfd2dca3f11c365d6857a07e7d12985afc59798458a2fdb2ffa4a0332a3fd43"},
{file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:685c1508ec97b2cd3e120bfe309a4ff8e852e8a7460f1ef1de00c2c0ed01e33c"},
{file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:49904f38667c44c041a0b44c474b3ae36948d16a0398a8f8cd84e2bb3c42a069"},
{file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:352f3a4e5f11f3241a49b6a48bc5b935fabc35d1165fa0d87f3ca99c1fcca98b"},
{file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:fc61f39b534c5d5903490478a0dd349df397d2284a939aa3cbaa2fb7a19b8397"},
{file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:ad2274e707be37420d0b6c3d26a8115295fe9d8e6e530fa6a42487a8ca3ad052"},
{file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:c836bf3c7512100219fe1123743fd8dd9a2b50dd7cfb0c3bb10d041309acab4b"},
{file = "aiohttp-3.10.2-cp38-cp38-win32.whl", hash = "sha256:53e8898adda402be03ff164b0878abe2d884e3ea03a4701e6ad55399d84b92dc"},
{file = "aiohttp-3.10.2-cp38-cp38-win_amd64.whl", hash = "sha256:7cc8f65f5b22304693de05a245b6736b14cb5bc9c8a03da6e2ae9ef15f8b458f"},
{file = "aiohttp-3.10.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:9dfc906d656e14004c5bc672399c1cccc10db38df2b62a13fb2b6e165a81c316"},
{file = "aiohttp-3.10.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:91b10208b222ddf655c3a3d5b727879d7163db12b634492df41a9182a76edaae"},
{file = "aiohttp-3.10.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9fd16b5e1a7bdd14668cd6bde60a2a29b49147a535c74f50d8177d11b38433a7"},
{file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b2bfdda4971bd79201f59adbad24ec2728875237e1c83bba5221284dbbf57bda"},
{file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:69d73f869cf29e8a373127fc378014e2b17bcfbe8d89134bc6fb06a2f67f3cb3"},
{file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:df59f8486507c421c0620a2c3dce81fbf1d54018dc20ff4fecdb2c106d6e6abc"},
{file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0df930015db36b460aa9badbf35eccbc383f00d52d4b6f3de2ccb57d064a6ade"},
{file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:562b1153ab7f766ee6b8b357ec777a302770ad017cf18505d34f1c088fccc448"},
{file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:d984db6d855de58e0fde1ef908d48fe9a634cadb3cf715962722b4da1c40619d"},
{file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:14dc3fcb0d877911d775d511eb617a486a8c48afca0a887276e63db04d3ee920"},
{file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:b52a27a5c97275e254704e1049f4b96a81e67d6205f52fa37a4777d55b0e98ef"},
{file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:cd33d9de8cfd006a0d0fe85f49b4183c57e91d18ffb7e9004ce855e81928f704"},
{file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:1238fc979160bc03a92fff9ad021375ff1c8799c6aacb0d8ea1b357ea40932bb"},
{file = "aiohttp-3.10.2-cp39-cp39-win32.whl", hash = "sha256:e2f43d238eae4f0b04f58d4c0df4615697d4ca3e9f9b1963d49555a94f0f5a04"},
{file = "aiohttp-3.10.2-cp39-cp39-win_amd64.whl", hash = "sha256:947847f07a8f81d7b39b2d0202fd73e61962ebe17ac2d8566f260679e467da7b"},
{file = "aiohttp-3.10.2.tar.gz", hash = "sha256:4d1f694b5d6e459352e5e925a42e05bac66655bfde44d81c59992463d2897014"},
]
[package.dependencies]
aiohappyeyeballs = ">=2.3.0"
aiosignal = ">=1.1.2"
async-timeout = {version = ">=4.0,<5.0", markers = "python_version < \"3.11\""}
attrs = ">=17.3.0"
@@ -94,7 +106,7 @@ multidict = ">=4.5,<7.0"
yarl = ">=1.0,<2.0"
[package.extras]
speedups = ["Brotli", "aiodns", "brotlicffi"]
speedups = ["Brotli", "aiodns (>=3.2.0)", "brotlicffi"]
[[package]]
name = "aiopg"
@@ -1514,6 +1526,20 @@ files = [
[package.dependencies]
six = "*"
[[package]]
name = "kafka-python"
version = "2.0.2"
description = "Pure Python client for Apache Kafka"
optional = false
python-versions = "*"
files = [
{file = "kafka-python-2.0.2.tar.gz", hash = "sha256:04dfe7fea2b63726cd6f3e79a2d86e709d608d74406638c5da33a01d45a9d7e3"},
{file = "kafka_python-2.0.2-py2.py3-none-any.whl", hash = "sha256:2d92418c7cb1c298fa6c7f0fb3519b520d0d7526ac6cb7ae2a4fc65a51a94b6e"},
]
[package.extras]
crc32c = ["crc32c"]
[[package]]
name = "lazy-object-proxy"
version = "1.10.0"
@@ -3357,4 +3383,4 @@ cffi = ["cffi (>=1.11)"]
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
content-hash = "7cee6a8c30bc7f4bfb0a87c6bad3952dfb4da127fad853d2710a93ac3eab8a00"
content-hash = "c09bcb333ab550958b33dbf4fec968c500d8e701fd4c96402cddbd9bb8048055"


@@ -1,5 +1,5 @@
[package]
name = "proxy"
name = "proxy-core"
version = "0.1.0"
edition.workspace = true
license.workspace = true
@@ -9,8 +9,11 @@ default = []
testing = []
[dependencies]
proxy-sasl = { version = "0.1", path = "../sasl" }
ahash.workspace = true
anyhow.workspace = true
arc-swap.workspace = true
async-compression.workspace = true
async-trait.workspace = true
atomic-take.workspace = true
@@ -30,7 +33,6 @@ dashmap.workspace = true
env_logger.workspace = true
framed-websockets.workspace = true
futures.workspace = true
git-version.workspace = true
hashbrown.workspace = true
hashlink.workspace = true
hex.workspace = true
@@ -51,17 +53,15 @@ md5.workspace = true
measured = { workspace = true, features = ["lasso"] }
metrics.workspace = true
once_cell.workspace = true
opentelemetry.workspace = true
parking_lot.workspace = true
parquet.workspace = true
parquet_derive.workspace = true
pin-project-lite.workspace = true
postgres_backend.workspace = true
pq_proto.workspace = true
prometheus.workspace = true
rand.workspace = true
regex.workspace = true
remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
remote_storage = { version = "0.1", path = "../../libs/remote_storage/" }
reqwest.workspace = true
reqwest-middleware = { workspace = true, features = ["json"] }
reqwest-retry.workspace = true
@@ -73,14 +73,13 @@ rustls.workspace = true
scopeguard.workspace = true
serde.workspace = true
serde_json.workspace = true
sha2 = { workspace = true, features = ["asm"] }
sha2 = { workspace = true, features = ["asm", "oid"] }
smol_str.workspace = true
smallvec.workspace = true
socket2.workspace = true
subtle.workspace = true
task-local-extensions.workspace = true
thiserror.workspace = true
tikv-jemallocator.workspace = true
tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] }
tokio-postgres.workspace = true
tokio-postgres-rustls.workspace = true
@@ -92,6 +91,7 @@ tracing-opentelemetry.workspace = true
tracing-subscriber.workspace = true
tracing-utils.workspace = true
tracing.workspace = true
try-lock.workspace = true
typed-json.workspace = true
url.workspace = true
urlencoding.workspace = true
@@ -102,6 +102,14 @@ x509-parser.workspace = true
postgres-protocol.workspace = true
redis.workspace = true
# jwt stuff
jose-jwa = "0.1.2"
jose-jwk = { version = "0.1.2", features = ["p256", "p384", "rsa"] }
signature = "2"
ecdsa = "0.16"
p256 = "0.13"
rsa = "0.9"
workspace_hack.workspace = true
[dev-dependencies]


@@ -38,7 +38,7 @@ pub enum AuthErrorImpl {
/// SASL protocol errors (includes [SCRAM](crate::scram)).
#[error(transparent)]
Sasl(#[from] crate::sasl::Error),
Sasl(#[from] proxy_sasl::sasl::Error),
#[error("Unsupported authentication method: {0}")]
BadAuthMethod(Box<str>),
@@ -148,3 +148,28 @@ impl ReportableError for AuthError {
}
}
}
impl UserFacingError for proxy_sasl::sasl::Error {
fn to_string_client(&self) -> String {
match self {
proxy_sasl::sasl::Error::ChannelBindingFailed(m) => m.to_string(),
proxy_sasl::sasl::Error::ChannelBindingBadMethod(m) => {
format!("unsupported channel binding method {m}")
}
_ => "authentication protocol violation".to_string(),
}
}
}
impl ReportableError for proxy_sasl::sasl::Error {
fn get_error_kind(&self) -> crate::error::ErrorKind {
match self {
proxy_sasl::sasl::Error::ChannelBindingFailed(_) => crate::error::ErrorKind::User,
proxy_sasl::sasl::Error::ChannelBindingBadMethod(_) => crate::error::ErrorKind::User,
proxy_sasl::sasl::Error::BadClientMessage(_) => crate::error::ErrorKind::User,
proxy_sasl::sasl::Error::MissingBinding => crate::error::ErrorKind::Service,
proxy_sasl::sasl::Error::Base64(_) => crate::error::ErrorKind::ControlPlane,
proxy_sasl::sasl::Error::Io(_) => crate::error::ErrorKind::ClientDisconnect,
}
}
}


@@ -1,5 +1,6 @@
mod classic;
mod hacks;
pub mod jwt;
mod link;
use std::net::IpAddr;
@@ -8,6 +9,7 @@ use std::time::Duration;
use ipnet::{Ipv4Net, Ipv6Net};
pub use link::LinkAuthError;
use proxy_sasl::scram;
use tokio::io::{AsyncRead, AsyncWrite};
use tokio_postgres::config::AuthKeys;
use tracing::{info, warn};
@@ -35,7 +37,7 @@ use crate::{
},
stream, url,
};
use crate::{scram, EndpointCacheKey, EndpointId, RoleName};
use crate::{EndpointCacheKey, EndpointId, RoleName};
/// Alternative to [`std::borrow::Cow`] but doesn't need `T: ToOwned` as we don't need that functionality
pub enum MaybeOwned<'a, T> {
@@ -218,7 +220,7 @@ impl RateBucketInfo {
impl AuthenticationConfig {
pub fn check_rate_limit(
&self,
ctx: &mut RequestMonitoring,
ctx: &RequestMonitoring,
config: &AuthenticationConfig,
secret: AuthSecret,
endpoint: &EndpointId,
@@ -243,7 +245,7 @@ impl AuthenticationConfig {
let limit_not_exceeded = self.rate_limiter.check(
(
endpoint_int,
MaskedIp::new(ctx.peer_addr, config.rate_limit_ip_subnet),
MaskedIp::new(ctx.peer_addr(), config.rate_limit_ip_subnet),
),
password_weight,
);
@@ -274,7 +276,7 @@ impl AuthenticationConfig {
///
/// All authentication flows will emit an AuthenticationOk message if successful.
async fn auth_quirks(
ctx: &mut RequestMonitoring,
ctx: &RequestMonitoring,
api: &impl console::Api,
user_info: ComputeUserInfoMaybeEndpoint,
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
@@ -303,8 +305,8 @@ async fn auth_quirks(
let (allowed_ips, maybe_secret) = api.get_allowed_ips_and_secret(ctx, &info).await?;
// check allowed list
if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) {
return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr));
if !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) {
return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr()));
}
if !endpoint_rate_limiter.check(info.endpoint.clone().into(), 1) {
@@ -356,7 +358,7 @@ async fn auth_quirks(
}
async fn authenticate_with_secret(
ctx: &mut RequestMonitoring,
ctx: &RequestMonitoring,
secret: AuthSecret,
info: ComputeUserInfo,
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
@@ -370,8 +372,8 @@ async fn authenticate_with_secret(
let auth_outcome =
validate_password_and_exchange(&config.thread_pool, ep, &password, secret).await?;
let keys = match auth_outcome {
crate::sasl::Outcome::Success(key) => key,
crate::sasl::Outcome::Failure(reason) => {
proxy_sasl::sasl::Outcome::Success(key) => key,
proxy_sasl::sasl::Outcome::Failure(reason) => {
info!("auth backend failed with an error: {reason}");
return Err(auth::AuthError::auth_failed(&*info.user));
}
@@ -421,7 +423,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
#[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)]
pub async fn authenticate(
self,
ctx: &mut RequestMonitoring,
ctx: &RequestMonitoring,
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
allow_cleartext: bool,
config: &'static AuthenticationConfig,
@@ -467,7 +469,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
impl BackendType<'_, ComputeUserInfo, &()> {
pub async fn get_role_secret(
&self,
ctx: &mut RequestMonitoring,
ctx: &RequestMonitoring,
) -> Result<CachedRoleSecret, GetAuthInfoError> {
use BackendType::*;
match self {
@@ -478,7 +480,7 @@ impl BackendType<'_, ComputeUserInfo, &()> {
pub async fn get_allowed_ips_and_secret(
&self,
ctx: &mut RequestMonitoring,
ctx: &RequestMonitoring,
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
use BackendType::*;
match self {
@@ -492,7 +494,7 @@ impl BackendType<'_, ComputeUserInfo, &()> {
impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> {
async fn wake_compute(
&self,
ctx: &mut RequestMonitoring,
ctx: &RequestMonitoring,
) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
use BackendType::*;
@@ -514,7 +516,7 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> {
impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> {
async fn wake_compute(
&self,
ctx: &mut RequestMonitoring,
ctx: &RequestMonitoring,
) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
use BackendType::*;
@@ -557,9 +559,9 @@ mod tests {
context::RequestMonitoring,
proxy::NeonOptions,
rate_limiter::{EndpointRateLimiter, RateBucketInfo},
scram::{threadpool::ThreadPool, ServerSecret},
stream::{PqStream, Stream},
};
use proxy_sasl::scram::{threadpool::ThreadPool, ServerSecret};
use super::{auth_quirks, AuthRateLimiter};
@@ -571,7 +573,7 @@ mod tests {
impl console::Api for Auth {
async fn get_role_secret(
&self,
_ctx: &mut RequestMonitoring,
_ctx: &RequestMonitoring,
_user_info: &super::ComputeUserInfo,
) -> Result<CachedRoleSecret, console::errors::GetAuthInfoError> {
Ok(CachedRoleSecret::new_uncached(Some(self.secret.clone())))
@@ -579,7 +581,7 @@ mod tests {
async fn get_allowed_ips_and_secret(
&self,
_ctx: &mut RequestMonitoring,
_ctx: &RequestMonitoring,
_user_info: &super::ComputeUserInfo,
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), console::errors::GetAuthInfoError>
{
@@ -591,7 +593,7 @@ mod tests {
async fn wake_compute(
&self,
_ctx: &mut RequestMonitoring,
_ctx: &RequestMonitoring,
_user_info: &super::ComputeUserInfo,
) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
unimplemented!()
@@ -665,10 +667,14 @@ mod tests {
let (mut client, server) = tokio::io::duplex(1024);
let mut stream = PqStream::new(Stream::from_raw(server));
let mut ctx = RequestMonitoring::test();
let ctx = RequestMonitoring::test();
let api = Auth {
ips: vec![],
secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
secret: AuthSecret::Scram(
ServerSecret::build_test_secret("my-secret-password")
.await
.unwrap(),
),
};
let user_info = ComputeUserInfoMaybeEndpoint {
@@ -723,7 +729,7 @@ mod tests {
));
let _creds = auth_quirks(
&mut ctx,
&ctx,
&api,
user_info,
&mut stream,
@@ -742,10 +748,14 @@ mod tests {
let (mut client, server) = tokio::io::duplex(1024);
let mut stream = PqStream::new(Stream::from_raw(server));
let mut ctx = RequestMonitoring::test();
let ctx = RequestMonitoring::test();
let api = Auth {
ips: vec![],
secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
secret: AuthSecret::Scram(
ServerSecret::build_test_secret("my-secret-password")
.await
.unwrap(),
),
};
let user_info = ComputeUserInfoMaybeEndpoint {
@@ -775,7 +785,7 @@ mod tests {
));
let _creds = auth_quirks(
&mut ctx,
&ctx,
&api,
user_info,
&mut stream,
@@ -794,10 +804,14 @@ mod tests {
let (mut client, server) = tokio::io::duplex(1024);
let mut stream = PqStream::new(Stream::from_raw(server));
let mut ctx = RequestMonitoring::test();
let ctx = RequestMonitoring::test();
let api = Auth {
ips: vec![],
secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
secret: AuthSecret::Scram(
ServerSecret::build_test_secret("my-secret-password")
.await
.unwrap(),
),
};
let user_info = ComputeUserInfoMaybeEndpoint {
@@ -828,7 +842,7 @@ mod tests {
));
let creds = auth_quirks(
&mut ctx,
&ctx,
&api,
user_info,
&mut stream,


@@ -5,14 +5,14 @@ use crate::{
config::AuthenticationConfig,
console::AuthSecret,
context::RequestMonitoring,
sasl,
stream::{PqStream, Stream},
};
use proxy_sasl::sasl;
use tokio::io::{AsyncRead, AsyncWrite};
use tracing::{info, warn};
pub(super) async fn authenticate(
ctx: &mut RequestMonitoring,
ctx: &RequestMonitoring,
creds: ComputeUserInfo,
client: &mut PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
config: &'static AuthenticationConfig,
@@ -27,7 +27,7 @@ pub(super) async fn authenticate(
}
AuthSecret::Scram(secret) => {
info!("auth endpoint chooses SCRAM");
let scram = auth::Scram(&secret, &mut *ctx);
let scram = auth::Scram(&secret, ctx);
let auth_outcome = tokio::time::timeout(
config.scram_protocol_timeout,


@@ -7,9 +7,9 @@ use crate::{
console::AuthSecret,
context::RequestMonitoring,
intern::EndpointIdInt,
sasl,
stream::{self, Stream},
};
use proxy_sasl::sasl;
use tokio::io::{AsyncRead, AsyncWrite};
use tracing::{info, warn};
@@ -18,7 +18,7 @@ use tracing::{info, warn};
/// These properties are beneficial for serverless JS workers, so we
/// use this mechanism for websocket connections.
pub async fn authenticate_cleartext(
ctx: &mut RequestMonitoring,
ctx: &RequestMonitoring,
info: ComputeUserInfo,
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
secret: AuthSecret,
@@ -28,7 +28,7 @@ pub async fn authenticate_cleartext(
ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
// pause the timer while we communicate with the client
let paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client);
let paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
let ep = EndpointIdInt::from(&info.endpoint);
@@ -60,7 +60,7 @@ pub async fn authenticate_cleartext(
/// Similar to [`authenticate_cleartext`], but there's a specific password format,
/// and passwords are not yet validated (we don't know how to validate them!)
pub async fn password_hack_no_authentication(
ctx: &mut RequestMonitoring,
ctx: &RequestMonitoring,
info: ComputeUserInfoNoEndpoint,
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
) -> auth::Result<ComputeCredentials> {
@@ -68,7 +68,7 @@ pub async fn password_hack_no_authentication(
ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
// pause the timer while we communicate with the client
let _paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client);
let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
let payload = AuthFlow::new(client)
.begin(auth::PasswordHack)

View File

@@ -0,0 +1,554 @@
use std::{future::Future, sync::Arc, time::Duration};
use anyhow::{bail, ensure, Context};
use arc_swap::ArcSwapOption;
use dashmap::DashMap;
use jose_jwk::crypto::KeyInfo;
use signature::Verifier;
use tokio::time::Instant;
use crate::{http::parse_json_body_with_limit, intern::EndpointIdInt};
// TODO(conrad): make these configurable.
const MIN_RENEW: Duration = Duration::from_secs(30);
const AUTO_RENEW: Duration = Duration::from_secs(300);
const MAX_RENEW: Duration = Duration::from_secs(3600);
const MAX_JWK_BODY_SIZE: usize = 64 * 1024;
/// How to get the JWT auth rules
pub trait FetchAuthRules: Clone + Send + Sync + 'static {
fn fetch_auth_rules(&self) -> impl Future<Output = anyhow::Result<AuthRules>> + Send;
}
#[derive(Clone)]
struct FetchAuthRulesFromCplane {
#[allow(dead_code)]
endpoint: EndpointIdInt,
}
impl FetchAuthRules for FetchAuthRulesFromCplane {
async fn fetch_auth_rules(&self) -> anyhow::Result<AuthRules> {
Err(anyhow::anyhow!("not yet implemented"))
}
}
pub struct AuthRules {
jwks_urls: Vec<url::Url>,
}
#[derive(Default)]
pub struct JwkCache {
client: reqwest::Client,
map: DashMap<EndpointIdInt, Arc<JwkCacheEntryLock>>,
}
pub struct JwkCacheEntryLock {
cached: ArcSwapOption<JwkCacheEntry>,
lookup: tokio::sync::Semaphore,
}
impl Default for JwkCacheEntryLock {
fn default() -> Self {
JwkCacheEntryLock {
cached: ArcSwapOption::empty(),
lookup: tokio::sync::Semaphore::new(1),
}
}
}
pub struct JwkCacheEntry {
/// Should refetch at least every hour to verify when old keys have been removed.
/// Should refetch when new key IDs are seen only every 5 minutes or so
last_retrieved: Instant,
/// cplane will return multiple JWKs urls that we need to scrape.
key_sets: ahash::HashMap<url::Url, jose_jwk::JwkSet>,
}
impl JwkCacheEntryLock {
async fn acquire_permit<'a>(self: &'a Arc<Self>) -> JwkRenewalPermit<'a> {
JwkRenewalPermit::acquire_permit(self).await
}
fn try_acquire_permit<'a>(self: &'a Arc<Self>) -> Option<JwkRenewalPermit<'a>> {
JwkRenewalPermit::try_acquire_permit(self)
}
async fn renew_jwks<F: FetchAuthRules>(
&self,
_permit: JwkRenewalPermit<'_>,
client: &reqwest::Client,
auth_rules: &F,
) -> anyhow::Result<Arc<JwkCacheEntry>> {
// double check that no one beat us to updating the cache.
let now = Instant::now();
let guard = self.cached.load_full();
if let Some(cached) = guard {
let last_update = now.duration_since(cached.last_retrieved);
if last_update < Duration::from_secs(300) {
return Ok(cached);
}
}
let rules = auth_rules.fetch_auth_rules().await?;
let mut key_sets = ahash::HashMap::with_capacity_and_hasher(
rules.jwks_urls.len(),
ahash::RandomState::new(),
);
// TODO(conrad): run concurrently
for url in rules.jwks_urls {
let req = client.get(url.clone());
// TODO(conrad): eventually switch to using reqwest_middleware/`new_client_with_timeout`.
match req.send().await.and_then(|r| r.error_for_status()) {
// todo: should we re-insert JWKs if we want to keep this JWKs URL?
// I expect these failures would be quite sparse.
Err(e) => tracing::warn!(?url, error=?e, "could not fetch JWKs"),
Ok(r) => {
let resp: http::Response<reqwest::Body> = r.into();
match parse_json_body_with_limit::<jose_jwk::JwkSet>(
resp.into_body(),
MAX_JWK_BODY_SIZE,
)
.await
{
Err(e) => tracing::warn!(?url, error=?e, "could not decode JWKs"),
Ok(jwks) => {
key_sets.insert(url, jwks);
}
}
}
}
}
let entry = Arc::new(JwkCacheEntry {
last_retrieved: now,
key_sets,
});
self.cached.swap(Some(Arc::clone(&entry)));
Ok(entry)
}
async fn get_or_update_jwk_cache<F: FetchAuthRules>(
self: &Arc<Self>,
client: &reqwest::Client,
fetch: &F,
) -> Result<Arc<JwkCacheEntry>, anyhow::Error> {
let now = Instant::now();
let guard = self.cached.load_full();
// if we have no cached JWKs, try and get some
let Some(cached) = guard else {
let permit = self.acquire_permit().await;
return self.renew_jwks(permit, client, fetch).await;
};
let last_update = now.duration_since(cached.last_retrieved);
// check if the cached JWKs need updating.
if last_update > MAX_RENEW {
let permit = self.acquire_permit().await;
// it's been too long since we checked the keys. wait for them to update.
return self.renew_jwks(permit, client, fetch).await;
}
// every 5 minutes we should spawn a job to eagerly update the token.
if last_update > AUTO_RENEW {
if let Some(permit) = self.try_acquire_permit() {
tracing::debug!("JWKs should be renewed. Renewal permit acquired");
let permit = permit.into_owned();
let entry = self.clone();
let client = client.clone();
let fetch = fetch.clone();
tokio::spawn(async move {
if let Err(e) = entry.renew_jwks(permit, &client, &fetch).await {
tracing::warn!(error=?e, "could not fetch JWKs in background job");
}
});
} else {
tracing::debug!("JWKs should be renewed. Renewal permit already taken, skipping");
}
}
Ok(cached)
}
async fn check_jwt<F: FetchAuthRules>(
self: &Arc<Self>,
jwt: String,
client: &reqwest::Client,
fetch: &F,
) -> Result<(), anyhow::Error> {
// JWT compact form is defined to be
// <B64(Header)> || . || <B64(Payload)> || . || <B64(Signature)>
// where Signature = alg(<B64(Header)> || . || <B64(Payload)>);
let (header_payload, signature) = jwt
.rsplit_once(".")
.context("not a valid compact JWT encoding")?;
let (header, _payload) = header_payload
.split_once(".")
.context("not a valid compact JWT encoding")?;
let header = base64::decode_config(header, base64::URL_SAFE_NO_PAD)
.context("not a valid compact JWT encoding")?;
let header = serde_json::from_slice::<JWTHeader>(&header)
.context("not a valid compact JWT encoding")?;
ensure!(header.typ == "JWT");
let kid = header.kid.context("missing key id")?;
let mut guard = self.get_or_update_jwk_cache(client, fetch).await?;
// get the key from the JWKs if possible. If not, wait for the keys to update.
let jwk = loop {
let jwk = guard
.key_sets
.values()
.flat_map(|jwks| &jwks.keys)
.find(|jwk| jwk.prm.kid.as_deref() == Some(kid));
match jwk {
Some(jwk) => break jwk,
None if guard.last_retrieved.elapsed() > MIN_RENEW => {
let permit = self.acquire_permit().await;
guard = self.renew_jwks(permit, client, fetch).await?;
}
_ => {
bail!("jwk not found");
}
}
};
ensure!(
jwk.is_supported(&header.alg),
"signature algorithm not supported"
);
let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD)
.context("not a valid compact JWT encoding")?;
match &jwk.key {
jose_jwk::Key::Ec(key) => {
verify_ec_signature(header_payload.as_bytes(), &sig, key)?;
}
jose_jwk::Key::Rsa(key) => {
verify_rsa_signature(header_payload.as_bytes(), &sig, key, &jwk.prm.alg)?;
}
key => bail!("unsupported key type {key:?}"),
};
// TODO(conrad): verify iss, exp, nbf, etc...
Ok(())
}
}
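The constants near the top of this file drive a tiered refresh policy inside `get_or_update_jwk_cache` and `check_jwt`. The snippet below is only an illustrative summary of that decision logic under the default thresholds above, not code from this change.

// Illustrative only: how the age of the cached JWKS maps onto renewal behaviour.
use std::time::Duration;

enum Renewal {
    UseCached,       // fresh enough: verify against the cached key sets
    SpawnBackground, // older than AUTO_RENEW (5 min): refresh eagerly, off the hot path
    BlockAndRefetch, // older than MAX_RENEW (1 h): callers wait for fresh keys
}

fn renewal_action(age: Duration) -> Renewal {
    const AUTO_RENEW: Duration = Duration::from_secs(300);
    const MAX_RENEW: Duration = Duration::from_secs(3600);
    if age > MAX_RENEW {
        Renewal::BlockAndRefetch
    } else if age > AUTO_RENEW {
        Renewal::SpawnBackground
    } else {
        Renewal::UseCached
    }
}

MIN_RENEW (30 s) is a separate floor: when a token carries an unknown `kid`, `check_jwt` refetches on demand only if the cached set is at least that old; otherwise it rejects the token.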
impl JwkCache {
pub async fn check_jwt(
&self,
endpoint: EndpointIdInt,
jwt: String,
) -> Result<(), anyhow::Error> {
// try with just a read lock first
let entry = self.map.get(&endpoint).as_deref().map(Arc::clone);
let entry = match entry {
Some(entry) => entry,
None => {
// acquire a write lock after to insert.
let entry = self.map.entry(endpoint).or_default();
Arc::clone(&*entry)
}
};
let fetch = FetchAuthRulesFromCplane { endpoint };
entry.check_jwt(jwt, &self.client, &fetch).await
}
}
fn verify_ec_signature(data: &[u8], sig: &[u8], key: &jose_jwk::Ec) -> anyhow::Result<()> {
use ecdsa::Signature;
use signature::Verifier;
match key.crv {
jose_jwk::EcCurves::P256 => {
let pk =
p256::PublicKey::try_from(key).map_err(|_| anyhow::anyhow!("invalid P256 key"))?;
let key = p256::ecdsa::VerifyingKey::from(&pk);
let sig = Signature::from_slice(sig)?;
key.verify(data, &sig)?;
}
key => bail!("unsupported ec key type {key:?}"),
}
Ok(())
}
fn verify_rsa_signature(
data: &[u8],
sig: &[u8],
key: &jose_jwk::Rsa,
alg: &Option<jose_jwa::Algorithm>,
) -> anyhow::Result<()> {
use jose_jwa::{Algorithm, Signing};
use rsa::{
pkcs1v15::{Signature, VerifyingKey},
RsaPublicKey,
};
let key = RsaPublicKey::try_from(key).map_err(|_| anyhow::anyhow!("invalid RSA key"))?;
match alg {
Some(Algorithm::Signing(Signing::Rs256)) => {
let key = VerifyingKey::<sha2::Sha256>::new(key);
let sig = Signature::try_from(sig)?;
key.verify(data, &sig)?;
}
_ => bail!("invalid RSA signing algorithm"),
};
Ok(())
}
/// <https://datatracker.ietf.org/doc/html/rfc7515#section-4.1>
#[derive(serde::Deserialize, serde::Serialize)]
struct JWTHeader<'a> {
/// must be "JWT"
typ: &'a str,
/// must be a supported alg
alg: jose_jwa::Algorithm,
/// key id, must be provided for our usecase
kid: Option<&'a str>,
}
struct JwkRenewalPermit<'a> {
inner: Option<JwkRenewalPermitInner<'a>>,
}
enum JwkRenewalPermitInner<'a> {
Owned(Arc<JwkCacheEntryLock>),
Borrowed(&'a Arc<JwkCacheEntryLock>),
}
impl JwkRenewalPermit<'_> {
fn into_owned(mut self) -> JwkRenewalPermit<'static> {
JwkRenewalPermit {
inner: self.inner.take().map(JwkRenewalPermitInner::into_owned),
}
}
async fn acquire_permit(from: &Arc<JwkCacheEntryLock>) -> JwkRenewalPermit {
match from.lookup.acquire().await {
Ok(permit) => {
permit.forget();
JwkRenewalPermit {
inner: Some(JwkRenewalPermitInner::Borrowed(from)),
}
}
Err(_) => panic!("semaphore should not be closed"),
}
}
fn try_acquire_permit(from: &Arc<JwkCacheEntryLock>) -> Option<JwkRenewalPermit> {
match from.lookup.try_acquire() {
Ok(permit) => {
permit.forget();
Some(JwkRenewalPermit {
inner: Some(JwkRenewalPermitInner::Borrowed(from)),
})
}
Err(tokio::sync::TryAcquireError::NoPermits) => None,
Err(tokio::sync::TryAcquireError::Closed) => panic!("semaphore should not be closed"),
}
}
}
impl JwkRenewalPermitInner<'_> {
fn into_owned(self) -> JwkRenewalPermitInner<'static> {
match self {
JwkRenewalPermitInner::Owned(p) => JwkRenewalPermitInner::Owned(p),
JwkRenewalPermitInner::Borrowed(p) => JwkRenewalPermitInner::Owned(Arc::clone(p)),
}
}
}
impl Drop for JwkRenewalPermit<'_> {
fn drop(&mut self) {
let entry = match &self.inner {
None => return,
Some(JwkRenewalPermitInner::Owned(p)) => p,
Some(JwkRenewalPermitInner::Borrowed(p)) => *p,
};
entry.lookup.add_permits(1);
}
}
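The permit type above makes JWKS renewal single-flight while still letting the permit be moved into a spawned `'static` task. A minimal sketch of the underlying semaphore pattern (illustrative, not part of the diff):

// Detach the permit from its guard so it can outlive the borrow, then return
// it to the semaphore by hand (JwkRenewalPermit does the add_permits(1) in its Drop impl).
async fn single_flight(sem: &tokio::sync::Semaphore) {
    let permit = sem.acquire().await.expect("semaphore should not be closed");
    permit.forget(); // no automatic release when the guard drops
    // ... perform the renewal while holding exclusivity ...
    sem.add_permits(1);
}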
#[cfg(test)]
mod tests {
use super::*;
use std::{future::IntoFuture, net::SocketAddr, time::SystemTime};
use base64::URL_SAFE_NO_PAD;
use bytes::Bytes;
use http::Response;
use http_body_util::Full;
use hyper1::service::service_fn;
use hyper_util::rt::TokioIo;
use rand::rngs::OsRng;
use signature::Signer;
use tokio::net::TcpListener;
fn new_ec_jwk(kid: String) -> (p256::SecretKey, jose_jwk::Jwk) {
let sk = p256::SecretKey::random(&mut OsRng);
let pk = sk.public_key().into();
let jwk = jose_jwk::Jwk {
key: jose_jwk::Key::Ec(pk),
prm: jose_jwk::Parameters {
kid: Some(kid),
alg: Some(jose_jwa::Algorithm::Signing(jose_jwa::Signing::Es256)),
..Default::default()
},
};
(sk, jwk)
}
fn new_rsa_jwk(kid: String) -> (rsa::RsaPrivateKey, jose_jwk::Jwk) {
let sk = rsa::RsaPrivateKey::new(&mut OsRng, 2048).unwrap();
let pk = sk.to_public_key().into();
let jwk = jose_jwk::Jwk {
key: jose_jwk::Key::Rsa(pk),
prm: jose_jwk::Parameters {
kid: Some(kid),
alg: Some(jose_jwa::Algorithm::Signing(jose_jwa::Signing::Rs256)),
..Default::default()
},
};
(sk, jwk)
}
fn build_jwt_payload(kid: String, sig: jose_jwa::Signing) -> String {
let header = JWTHeader {
typ: "JWT",
alg: jose_jwa::Algorithm::Signing(sig),
kid: Some(&kid),
};
let body = typed_json::json! {{
"exp": SystemTime::now().duration_since(SystemTime::UNIX_EPOCH).unwrap().as_secs() + 3600,
}};
let header =
base64::encode_config(serde_json::to_string(&header).unwrap(), URL_SAFE_NO_PAD);
let body = base64::encode_config(body.to_string(), URL_SAFE_NO_PAD);
format!("{header}.{body}")
}
fn new_ec_jwt(kid: String, key: p256::SecretKey) -> String {
use p256::ecdsa::{Signature, SigningKey};
let payload = build_jwt_payload(kid, jose_jwa::Signing::Es256);
let sig: Signature = SigningKey::from(key).sign(payload.as_bytes());
let sig = base64::encode_config(sig.to_bytes(), URL_SAFE_NO_PAD);
format!("{payload}.{sig}")
}
fn new_rsa_jwt(kid: String, key: rsa::RsaPrivateKey) -> String {
use rsa::pkcs1v15::SigningKey;
use rsa::signature::SignatureEncoding;
let payload = build_jwt_payload(kid, jose_jwa::Signing::Rs256);
let sig = SigningKey::<sha2::Sha256>::new(key).sign(payload.as_bytes());
let sig = base64::encode_config(sig.to_bytes(), URL_SAFE_NO_PAD);
format!("{payload}.{sig}")
}
#[tokio::test]
async fn renew() {
let (rs1, jwk1) = new_rsa_jwk("1".into());
let (rs2, jwk2) = new_rsa_jwk("2".into());
let (ec1, jwk3) = new_ec_jwk("3".into());
let (ec2, jwk4) = new_ec_jwk("4".into());
let jwt1 = new_rsa_jwt("1".into(), rs1);
let jwt2 = new_rsa_jwt("2".into(), rs2);
let jwt3 = new_ec_jwt("3".into(), ec1);
let jwt4 = new_ec_jwt("4".into(), ec2);
let foo_jwks = jose_jwk::JwkSet {
keys: vec![jwk1, jwk3],
};
let bar_jwks = jose_jwk::JwkSet {
keys: vec![jwk2, jwk4],
};
let service = service_fn(move |req| {
let foo_jwks = foo_jwks.clone();
let bar_jwks = bar_jwks.clone();
async move {
let jwks = match req.uri().path() {
"/foo" => &foo_jwks,
"/bar" => &bar_jwks,
_ => {
return Response::builder()
.status(404)
.body(Full::new(Bytes::new()));
}
};
let body = serde_json::to_vec(jwks).unwrap();
Response::builder()
.status(200)
.body(Full::new(Bytes::from(body)))
}
});
let listener = TcpListener::bind("0.0.0.0:0").await.unwrap();
let server = hyper1::server::conn::http1::Builder::new();
let addr = listener.local_addr().unwrap();
tokio::spawn(async move {
loop {
let (s, _) = listener.accept().await.unwrap();
let serve = server.serve_connection(TokioIo::new(s), service.clone());
tokio::spawn(serve.into_future());
}
});
let client = reqwest::Client::new();
#[derive(Clone)]
struct Fetch(SocketAddr);
impl FetchAuthRules for Fetch {
async fn fetch_auth_rules(&self) -> anyhow::Result<AuthRules> {
Ok(AuthRules {
jwks_urls: vec![
format!("http://{}/foo", self.0).parse().unwrap(),
format!("http://{}/bar", self.0).parse().unwrap(),
],
})
}
}
let jwk_cache = Arc::new(JwkCacheEntryLock::default());
jwk_cache
.check_jwt(jwt1, &client, &Fetch(addr))
.await
.unwrap();
jwk_cache
.check_jwt(jwt2, &client, &Fetch(addr))
.await
.unwrap();
jwk_cache
.check_jwt(jwt3, &client, &Fetch(addr))
.await
.unwrap();
jwk_cache
.check_jwt(jwt4, &client, &Fetch(addr))
.await
.unwrap();
}
}
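For orientation, a hedged sketch of how the public `JwkCache` entry point might be called once cplane fetching is wired up; the function and argument names below are placeholders, not part of this change.

// Hypothetical call site; JwkCache implements Default, so one instance can be shared.
async fn authorize(cache: &JwkCache, endpoint: EndpointIdInt, token: String) -> bool {
    // Resolves the per-endpoint JWKS (renewing per the policy above) and verifies
    // the compact JWT's signature against the key whose `kid` matches the header.
    cache.check_jwt(endpoint, token).await.is_ok()
}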

View File

@@ -57,7 +57,7 @@ pub fn new_psql_session_id() -> String {
}
pub(super) async fn authenticate(
ctx: &mut RequestMonitoring,
ctx: &RequestMonitoring,
link_uri: &reqwest::Url,
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
) -> auth::Result<NodeInfo> {

View File

@@ -84,7 +84,7 @@ pub fn endpoint_sni(
impl ComputeUserInfoMaybeEndpoint {
pub fn parse(
ctx: &mut RequestMonitoring,
ctx: &RequestMonitoring,
params: &StartupMessageParams,
sni: Option<&str>,
common_names: Option<&HashSet<String>>,
@@ -249,8 +249,8 @@ mod tests {
fn parse_bare_minimum() -> anyhow::Result<()> {
// According to postgresql, only `user` should be required.
let options = StartupMessageParams::new([("user", "john_doe")]);
let mut ctx = RequestMonitoring::test();
let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
let ctx = RequestMonitoring::test();
let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
assert_eq!(user_info.user, "john_doe");
assert_eq!(user_info.endpoint_id, None);
@@ -264,8 +264,8 @@ mod tests {
("database", "world"), // should be ignored
("foo", "bar"), // should be ignored
]);
let mut ctx = RequestMonitoring::test();
let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
let ctx = RequestMonitoring::test();
let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
assert_eq!(user_info.user, "john_doe");
assert_eq!(user_info.endpoint_id, None);
@@ -279,9 +279,9 @@ mod tests {
let sni = Some("foo.localhost");
let common_names = Some(["localhost".into()].into());
let mut ctx = RequestMonitoring::test();
let ctx = RequestMonitoring::test();
let user_info =
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?;
assert_eq!(user_info.user, "john_doe");
assert_eq!(user_info.endpoint_id.as_deref(), Some("foo"));
assert_eq!(user_info.options.get_cache_key("foo"), "foo");
@@ -296,8 +296,8 @@ mod tests {
("options", "-ckey=1 project=bar -c geqo=off"),
]);
let mut ctx = RequestMonitoring::test();
let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
let ctx = RequestMonitoring::test();
let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
assert_eq!(user_info.user, "john_doe");
assert_eq!(user_info.endpoint_id.as_deref(), Some("bar"));
@@ -311,8 +311,8 @@ mod tests {
("options", "-ckey=1 endpoint=bar -c geqo=off"),
]);
let mut ctx = RequestMonitoring::test();
let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
let ctx = RequestMonitoring::test();
let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
assert_eq!(user_info.user, "john_doe");
assert_eq!(user_info.endpoint_id.as_deref(), Some("bar"));
@@ -329,8 +329,8 @@ mod tests {
),
]);
let mut ctx = RequestMonitoring::test();
let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
let ctx = RequestMonitoring::test();
let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
assert_eq!(user_info.user, "john_doe");
assert!(user_info.endpoint_id.is_none());
@@ -344,8 +344,8 @@ mod tests {
("options", "-ckey=1 endpoint=bar project=foo -c geqo=off"),
]);
let mut ctx = RequestMonitoring::test();
let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
let ctx = RequestMonitoring::test();
let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
assert_eq!(user_info.user, "john_doe");
assert!(user_info.endpoint_id.is_none());
@@ -359,9 +359,9 @@ mod tests {
let sni = Some("baz.localhost");
let common_names = Some(["localhost".into()].into());
let mut ctx = RequestMonitoring::test();
let ctx = RequestMonitoring::test();
let user_info =
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?;
assert_eq!(user_info.user, "john_doe");
assert_eq!(user_info.endpoint_id.as_deref(), Some("baz"));
@@ -374,16 +374,16 @@ mod tests {
let common_names = Some(["a.com".into(), "b.com".into()].into());
let sni = Some("p1.a.com");
let mut ctx = RequestMonitoring::test();
let ctx = RequestMonitoring::test();
let user_info =
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?;
assert_eq!(user_info.endpoint_id.as_deref(), Some("p1"));
let common_names = Some(["a.com".into(), "b.com".into()].into());
let sni = Some("p1.b.com");
let mut ctx = RequestMonitoring::test();
let ctx = RequestMonitoring::test();
let user_info =
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?;
assert_eq!(user_info.endpoint_id.as_deref(), Some("p1"));
Ok(())
@@ -397,10 +397,9 @@ mod tests {
let sni = Some("second.localhost");
let common_names = Some(["localhost".into()].into());
let mut ctx = RequestMonitoring::test();
let err =
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())
.expect_err("should fail");
let ctx = RequestMonitoring::test();
let err = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())
.expect_err("should fail");
match err {
InconsistentProjectNames { domain, option } => {
assert_eq!(option, "first");
@@ -417,10 +416,9 @@ mod tests {
let sni = Some("project.localhost");
let common_names = Some(["example.com".into()].into());
let mut ctx = RequestMonitoring::test();
let err =
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())
.expect_err("should fail");
let ctx = RequestMonitoring::test();
let err = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())
.expect_err("should fail");
match err {
UnknownCommonName { cn } => {
assert_eq!(cn, "localhost");
@@ -438,9 +436,9 @@ mod tests {
let sni = Some("project.localhost");
let common_names = Some(["localhost".into()].into());
let mut ctx = RequestMonitoring::test();
let ctx = RequestMonitoring::test();
let user_info =
ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?;
assert_eq!(user_info.endpoint_id.as_deref(), Some("project"));
assert_eq!(
user_info.options.get_cache_key("project"),

View File

@@ -2,16 +2,17 @@
use super::{backend::ComputeCredentialKeys, AuthErrorImpl, PasswordHackPayload};
use crate::{
config::TlsServerEndPoint,
console::AuthSecret,
context::RequestMonitoring,
intern::EndpointIdInt,
sasl,
scram::{self, threadpool::ThreadPool},
stream::{PqStream, Stream},
};
use postgres_protocol::authentication::sasl::{SCRAM_SHA_256, SCRAM_SHA_256_PLUS};
use pq_proto::{BeAuthenticationSaslMessage, BeMessage, BeMessage as Be};
use proxy_sasl::{
sasl,
scram::{self, threadpool::ThreadPool, TlsServerEndPoint},
};
use std::{io, sync::Arc};
use tokio::io::{AsyncRead, AsyncWrite};
use tracing::info;
@@ -27,7 +28,7 @@ pub trait AuthMethod {
pub struct Begin;
/// Use [SCRAM](crate::scram)-based auth in [`AuthFlow`].
pub struct Scram<'a>(pub &'a scram::ServerSecret, pub &'a mut RequestMonitoring);
pub struct Scram<'a>(pub &'a scram::ServerSecret, pub &'a RequestMonitoring);
impl AuthMethod for Scram<'_> {
#[inline(always)]
@@ -56,7 +57,7 @@ impl AuthMethod for PasswordHack {
/// Use clear-text password auth called `password` in docs
/// <https://www.postgresql.org/docs/current/auth-password.html>
pub struct CleartextPassword {
pub pool: Arc<ThreadPool>,
pub pool: Arc<ThreadPool<EndpointIdInt>>,
pub endpoint: EndpointIdInt,
pub secret: AuthSecret,
}
@@ -155,7 +156,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
let Scram(secret, ctx) = self.state;
// pause the timer while we communicate with the client
let _paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client);
let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
// Initial client message contains the chosen auth method's name.
let msg = self.stream.read_password_message().await?;
@@ -168,15 +169,13 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
}
match sasl.method {
SCRAM_SHA_256 => ctx.auth_method = Some(crate::context::AuthMethod::ScramSha256),
SCRAM_SHA_256_PLUS => {
ctx.auth_method = Some(crate::context::AuthMethod::ScramSha256Plus)
}
SCRAM_SHA_256 => ctx.set_auth_method(crate::context::AuthMethod::ScramSha256),
SCRAM_SHA_256_PLUS => ctx.set_auth_method(crate::context::AuthMethod::ScramSha256Plus),
_ => {}
}
info!("client chooses {}", sasl.method);
let outcome = sasl::SaslStream::new(self.stream, sasl.message)
let outcome = sasl::SaslStream::new(&mut self.stream.framed, sasl.message)
.authenticate(scram::Exchange::new(
secret,
rand::random,
@@ -193,7 +192,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
}
pub(crate) async fn validate_password_and_exchange(
pool: &ThreadPool,
pool: &ThreadPool<EndpointIdInt>,
endpoint: EndpointIdInt,
password: &[u8],
secret: AuthSecret,
@@ -208,7 +207,8 @@ pub(crate) async fn validate_password_and_exchange(
}
// perform scram authentication as both client and server to validate the keys
AuthSecret::Scram(scram_secret) => {
let outcome = crate::scram::exchange(pool, endpoint, &scram_secret, password).await?;
let outcome =
proxy_sasl::scram::exchange(pool, endpoint, &scram_secret, password).await?;
let client_key = match outcome {
sasl::Outcome::Success(client_key) => client_key,

View File

@@ -68,7 +68,7 @@ impl EndpointsCache {
ready: AtomicBool::new(false),
}
}
pub async fn is_valid(&self, ctx: &mut RequestMonitoring, endpoint: &EndpointId) -> bool {
pub async fn is_valid(&self, ctx: &RequestMonitoring, endpoint: &EndpointId) -> bool {
if !self.ready.load(Ordering::Acquire) {
return true;
}

View File

@@ -371,7 +371,8 @@ impl Cache for ProjectInfoCacheImpl {
#[cfg(test)]
mod tests {
use super::*;
use crate::{scram::ServerSecret, ProjectId};
use crate::ProjectId;
use proxy_sasl::scram::ServerSecret;
#[tokio::test]
async fn test_project_info_cache_settings() {

View File

@@ -103,8 +103,12 @@ impl ConnCfg {
/// Reuse password or auth keys from the other config.
pub fn reuse_password(&mut self, other: Self) {
if let Some(password) = other.get_auth() {
self.auth(password);
if let Some(password) = other.get_password() {
self.password(password);
}
if let Some(keys) = other.get_auth_keys() {
self.auth_keys(keys);
}
}
@@ -120,64 +124,48 @@ impl ConnCfg {
/// Apply startup message params to the connection config.
pub fn set_startup_params(&mut self, params: &StartupMessageParams) {
let mut client_encoding = false;
for (k, v) in params.iter() {
match k {
"user" => {
// Only set `user` if it's not present in the config.
// Link auth flow takes username from the console's response.
if self.get_user().is_none() {
self.user(v);
}
// Only set `user` if it's not present in the config.
// Link auth flow takes username from the console's response.
if let (None, Some(user)) = (self.get_user(), params.get("user")) {
self.user(user);
}
// Only set `dbname` if it's not present in the config.
// Link auth flow takes dbname from the console's response.
if let (None, Some(dbname)) = (self.get_dbname(), params.get("database")) {
self.dbname(dbname);
}
// Don't add `options` if they were only used for specifying a project.
// Connection pools don't support `options`, because they affect backend startup.
if let Some(options) = filtered_options(params) {
self.options(&options);
}
if let Some(app_name) = params.get("application_name") {
self.application_name(app_name);
}
// TODO: This is especially ugly...
if let Some(replication) = params.get("replication") {
use tokio_postgres::config::ReplicationMode;
match replication {
"true" | "on" | "yes" | "1" => {
self.replication_mode(ReplicationMode::Physical);
}
"database" => {
// Only set `dbname` if it's not present in the config.
// Link auth flow takes dbname from the console's response.
if self.get_dbname().is_none() {
self.dbname(v);
}
}
"options" => {
// Don't add `options` if they were only used for specifying a project.
// Connection pools don't support `options`, because they affect backend startup.
if let Some(options) = filtered_options(v) {
self.options(&options);
}
}
// the special ones in tokio-postgres that we don't want being set by the user
"dbname" => {}
"password" => {}
"sslmode" => {}
"host" => {}
"port" => {}
"connect_timeout" => {}
"keepalives" => {}
"keepalives_idle" => {}
"keepalives_interval" => {}
"keepalives_retries" => {}
"target_session_attrs" => {}
"channel_binding" => {}
"max_backend_message_size" => {}
"client_encoding" => {
client_encoding = true;
// only error should be from bad null bytes,
// but we've already checked for those.
_ = self.param("client_encoding", v);
}
_ => {
// only error should be from bad null bytes,
// but we've already checked for those.
_ = self.param(k, v);
self.replication_mode(ReplicationMode::Logical);
}
_other => {}
}
}
if !client_encoding {
// for compatibility since we removed it from tokio-postgres
self.param("client_encoding", "UTF8").unwrap();
}
// TODO: extend the list of the forwarded startup parameters.
// Currently, tokio-postgres doesn't allow us to pass
// arbitrary parameters, but the ones above are a good start.
//
// This and the reverse params problem can be better addressed
// in a bespoke connection machinery (a new library for that sake).
}
}
@@ -288,12 +276,12 @@ impl ConnCfg {
/// Connect to a corresponding compute node.
pub async fn connect(
&self,
ctx: &mut RequestMonitoring,
ctx: &RequestMonitoring,
allow_self_signed_compute: bool,
aux: MetricsAuxInfo,
timeout: Duration,
) -> Result<PostgresConnection, ConnectionError> {
let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute);
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
let (socket_addr, stream, host) = self.connect_raw(timeout).await?;
drop(pause);
@@ -316,14 +304,14 @@ impl ConnCfg {
)?;
// connect_raw() will not use TLS if sslmode is "disable"
let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute);
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
let (client, connection) = self.0.connect_raw(stream, tls).await?;
drop(pause);
tracing::Span::current().record("pid", tracing::field::display(client.get_process_id()));
let stream = connection.stream.into_inner();
info!(
cold_start_info = ctx.cold_start_info.as_str(),
cold_start_info = ctx.cold_start_info().as_str(),
"connected to compute node at {host} ({socket_addr}) sslmode={:?}",
self.0.get_ssl_mode()
);
@@ -342,7 +330,7 @@ impl ConnCfg {
params,
cancel_closure,
aux,
_guage: Metrics::get().proxy.db_connections.guard(ctx.protocol),
_guage: Metrics::get().proxy.db_connections.guard(ctx.protocol()),
};
Ok(connection)
@@ -350,9 +338,10 @@ impl ConnCfg {
}
/// Retrieve `options` from a startup message, dropping all proxy-specific flags.
fn filtered_options(options: &str) -> Option<String> {
fn filtered_options(params: &StartupMessageParams) -> Option<String> {
#[allow(unstable_name_collisions)]
let options: String = StartupMessageParams::parse_options_raw(options)
let options: String = params
.options_raw()?
.filter(|opt| parse_endpoint_param(opt).is_none() && neon_option(opt).is_none())
.intersperse(" ") // TODO: use impl from std once it's stabilized
.collect();
@@ -424,23 +413,27 @@ mod tests {
#[test]
fn test_filtered_options() {
// Empty options is unlikely to be useful anyway.
assert_eq!(filtered_options(""), None);
let params = StartupMessageParams::new([("options", "")]);
assert_eq!(filtered_options(&params), None);
// It's likely that clients will only use options to specify endpoint/project.
let params = "project=foo";
assert_eq!(filtered_options(params), None);
let params = StartupMessageParams::new([("options", "project=foo")]);
assert_eq!(filtered_options(&params), None);
// Same, because unescaped whitespaces are no-op.
let params = " project=foo ";
assert_eq!(filtered_options(params), None);
let params = StartupMessageParams::new([("options", " project=foo ")]);
assert_eq!(filtered_options(&params).as_deref(), None);
let params = r"\ project=foo \ ";
assert_eq!(filtered_options(params).as_deref(), Some(r"\ \ "));
let params = StartupMessageParams::new([("options", r"\ project=foo \ ")]);
assert_eq!(filtered_options(&params).as_deref(), Some(r"\ \ "));
let params = "project = foo";
assert_eq!(filtered_options(params).as_deref(), Some("project = foo"));
let params = StartupMessageParams::new([("options", "project = foo")]);
assert_eq!(filtered_options(&params).as_deref(), Some("project = foo"));
let params = "project = foo neon_endpoint_type:read_write neon_lsn:0/2";
assert_eq!(filtered_options(params).as_deref(), Some("project = foo"));
let params = StartupMessageParams::new([(
"options",
"project = foo neon_endpoint_type:read_write neon_lsn:0/2",
)]);
assert_eq!(filtered_options(&params).as_deref(), Some("project = foo"));
}
}

View File

@@ -1,27 +1,26 @@
use crate::{
auth::{self, backend::AuthRateLimiter},
console::locks::ApiLocks,
intern::EndpointIdInt,
rate_limiter::{RateBucketInfo, RateLimitAlgorithm, RateLimiterConfig},
scram::threadpool::ThreadPool,
serverless::{cancel_set::CancelSet, GlobalConnPoolOptions},
Host,
};
use anyhow::{bail, ensure, Context, Ok};
use itertools::Itertools;
use proxy_sasl::scram::{threadpool::ThreadPool, TlsServerEndPoint};
use remote_storage::RemoteStorageConfig;
use rustls::{
crypto::ring::sign,
pki_types::{CertificateDer, PrivateKeyDer},
};
use sha2::{Digest, Sha256};
use std::{
collections::{HashMap, HashSet},
str::FromStr,
sync::Arc,
time::Duration,
};
use tracing::{error, info};
use x509_parser::oid_registry;
pub struct ProxyConfig {
pub tls_config: Option<TlsConfig>,
@@ -31,11 +30,8 @@ pub struct ProxyConfig {
pub http_config: HttpConfig,
pub authentication_config: AuthenticationConfig,
pub require_client_ip: bool,
pub disable_ip_check_for_http: bool,
pub redis_rps_limit: Vec<RateBucketInfo>,
pub region: String,
pub handshake_timeout: Duration,
pub aws_region: String,
pub wake_compute_retry_config: RetryConfig,
pub connect_compute_locks: ApiLocks<Host>,
pub connect_to_compute_retry_config: RetryConfig,
@@ -55,14 +51,13 @@ pub struct TlsConfig {
}
pub struct HttpConfig {
pub request_timeout: tokio::time::Duration,
pub pool_options: GlobalConnPoolOptions,
pub cancel_set: CancelSet,
pub client_conn_threshold: u64,
}
pub struct AuthenticationConfig {
pub thread_pool: Arc<ThreadPool>,
pub thread_pool: Arc<ThreadPool<EndpointIdInt>>,
pub scram_protocol_timeout: tokio::time::Duration,
pub rate_limiter_enabled: bool,
pub rate_limiter: AuthRateLimiter,
@@ -130,66 +125,6 @@ pub fn configure_tls(
})
}
/// Channel binding parameter
///
/// <https://www.rfc-editor.org/rfc/rfc5929#section-4>
/// Description: The hash of the TLS server's certificate as it
/// appears, octet for octet, in the server's Certificate message. Note
/// that the Certificate message contains a certificate_list, in which
/// the first element is the server's certificate.
///
/// The hash function is to be selected as follows:
///
/// * if the certificate's signatureAlgorithm uses a single hash
/// function, and that hash function is either MD5 or SHA-1, then use SHA-256;
///
/// * if the certificate's signatureAlgorithm uses a single hash
/// function and that hash function neither MD5 nor SHA-1, then use
/// the hash function associated with the certificate's
/// signatureAlgorithm;
///
/// * if the certificate's signatureAlgorithm uses no hash functions or
/// uses multiple hash functions, then this channel binding type's
/// channel bindings are undefined at this time (updates to is channel
/// binding type may occur to address this issue if it ever arises).
#[derive(Debug, Clone, Copy)]
pub enum TlsServerEndPoint {
Sha256([u8; 32]),
Undefined,
}
impl TlsServerEndPoint {
pub fn new(cert: &CertificateDer) -> anyhow::Result<Self> {
let sha256_oids = [
// I'm explicitly not adding MD5 or SHA1 here... They're bad.
oid_registry::OID_SIG_ECDSA_WITH_SHA256,
oid_registry::OID_PKCS1_SHA256WITHRSA,
];
let pem = x509_parser::parse_x509_certificate(cert)
.context("Failed to parse PEM object from cerficiate")?
.1;
info!(subject = %pem.subject, "parsing TLS certificate");
let reg = oid_registry::OidRegistry::default().with_all_crypto();
let oid = pem.signature_algorithm.oid();
let alg = reg.get(oid);
if sha256_oids.contains(oid) {
let tls_server_end_point: [u8; 32] = Sha256::new().chain_update(cert).finalize().into();
info!(subject = %pem.subject, signature_algorithm = alg.map(|a| a.description()), tls_server_end_point = %base64::encode(tls_server_end_point), "determined channel binding");
Ok(Self::Sha256(tls_server_end_point))
} else {
error!(subject = %pem.subject, signature_algorithm = alg.map(|a| a.description()), "unknown channel binding");
Ok(Self::Undefined)
}
}
pub fn supported(&self) -> bool {
!matches!(self, TlsServerEndPoint::Undefined)
}
}
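The tls-server-end-point value being moved out of this file is a SHA-256 digest of the certificate's DER bytes whenever the certificate's own signature algorithm permits it (RFC 5929, section 4.1). A minimal sketch of that hash, assuming the `sha2` API used above:

// Illustrative: channel-binding hash for a SHA-256-signed certificate.
fn tls_server_end_point_sha256(cert_der: &[u8]) -> [u8; 32] {
    use sha2::{Digest, Sha256};
    Sha256::new().chain_update(cert_der).finalize().into()
}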
#[derive(Default, Debug)]
pub struct CertResolver {
certs: HashMap<String, (Arc<rustls::sign::CertifiedKey>, TlsServerEndPoint)>,

View File

@@ -16,9 +16,10 @@ use crate::{
intern::ProjectIdInt,
metrics::ApiLockMetrics,
rate_limiter::{DynamicLimiter, Outcome, RateLimiterConfig, Token},
scram, EndpointCacheKey,
EndpointCacheKey,
};
use dashmap::DashMap;
use proxy_sasl::scram;
use std::{hash::Hash, sync::Arc, time::Duration};
use tokio::time::Instant;
use tracing::info;
@@ -292,7 +293,7 @@ pub struct NodeInfo {
impl NodeInfo {
pub async fn connect(
&self,
ctx: &mut RequestMonitoring,
ctx: &RequestMonitoring,
timeout: Duration,
) -> Result<compute::PostgresConnection, compute::ConnectionError> {
self.config
@@ -330,20 +331,20 @@ pub(crate) trait Api {
/// We still have to mock the scram to avoid leaking information that user doesn't exist.
async fn get_role_secret(
&self,
ctx: &mut RequestMonitoring,
ctx: &RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<CachedRoleSecret, errors::GetAuthInfoError>;
async fn get_allowed_ips_and_secret(
&self,
ctx: &mut RequestMonitoring,
ctx: &RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError>;
/// Wake up the compute node and return the corresponding connection info.
async fn wake_compute(
&self,
ctx: &mut RequestMonitoring,
ctx: &RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<CachedNodeInfo, errors::WakeComputeError>;
}
@@ -363,7 +364,7 @@ pub enum ConsoleBackend {
impl Api for ConsoleBackend {
async fn get_role_secret(
&self,
ctx: &mut RequestMonitoring,
ctx: &RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<CachedRoleSecret, errors::GetAuthInfoError> {
use ConsoleBackend::*;
@@ -378,7 +379,7 @@ impl Api for ConsoleBackend {
async fn get_allowed_ips_and_secret(
&self,
ctx: &mut RequestMonitoring,
ctx: &RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError> {
use ConsoleBackend::*;
@@ -393,7 +394,7 @@ impl Api for ConsoleBackend {
async fn wake_compute(
&self,
ctx: &mut RequestMonitoring,
ctx: &RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<CachedNodeInfo, errors::WakeComputeError> {
use ConsoleBackend::*;
@@ -469,15 +470,15 @@ impl<K: Hash + Eq + Clone> ApiLocks<K> {
timeout: Duration,
epoch: std::time::Duration,
metrics: &'static ApiLockMetrics,
) -> prometheus::Result<Self> {
Ok(Self {
) -> Self {
Self {
name,
node_locks: DashMap::with_shard_amount(shards),
config,
timeout,
epoch,
metrics,
})
}
}
pub async fn get_permit(&self, key: &K) -> Result<WakeComputePermit, ApiLockError> {

View File

@@ -5,7 +5,7 @@ use super::{
AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo,
};
use crate::context::RequestMonitoring;
use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl};
use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, url::ApiUrl};
use crate::{auth::IpPattern, cache::Cached};
use crate::{
console::{
@@ -15,6 +15,7 @@ use crate::{
BranchId, EndpointId, ProjectId,
};
use futures::TryFutureExt;
use proxy_sasl::scram;
use std::{str::FromStr, sync::Arc};
use thiserror::Error;
use tokio_postgres::{config::SslMode, Client};
@@ -158,7 +159,7 @@ impl super::Api for Api {
#[tracing::instrument(skip_all)]
async fn get_role_secret(
&self,
_ctx: &mut RequestMonitoring,
_ctx: &RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<CachedRoleSecret, GetAuthInfoError> {
Ok(CachedRoleSecret::new_uncached(
@@ -168,7 +169,7 @@ impl super::Api for Api {
async fn get_allowed_ips_and_secret(
&self,
_ctx: &mut RequestMonitoring,
_ctx: &RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
Ok((
@@ -182,7 +183,7 @@ impl super::Api for Api {
#[tracing::instrument(skip_all)]
async fn wake_compute(
&self,
_ctx: &mut RequestMonitoring,
_ctx: &RequestMonitoring,
_user_info: &ComputeUserInfo,
) -> Result<CachedNodeInfo, WakeComputeError> {
self.do_wake_compute().map_ok(Cached::new_uncached).await

View File

@@ -13,10 +13,11 @@ use crate::{
http,
metrics::{CacheOutcome, Metrics},
rate_limiter::WakeComputeRateLimiter,
scram, EndpointCacheKey,
EndpointCacheKey,
};
use crate::{cache::Cached, context::RequestMonitoring};
use futures::TryFutureExt;
use proxy_sasl::scram;
use std::{sync::Arc, time::Duration};
use tokio::time::Instant;
use tokio_postgres::config::SslMode;
@@ -57,7 +58,7 @@ impl Api {
async fn do_get_auth_info(
&self,
ctx: &mut RequestMonitoring,
ctx: &RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<AuthInfo, GetAuthInfoError> {
if !self
@@ -69,7 +70,7 @@ impl Api {
info!("endpoint is not valid, skipping the request");
return Ok(AuthInfo::default());
}
let request_id = ctx.session_id.to_string();
let request_id = ctx.session_id().to_string();
let application_name = ctx.console_application_name();
async {
let request = self
@@ -77,7 +78,7 @@ impl Api {
.get("proxy_get_role_secret")
.header("X-Request-ID", &request_id)
.header("Authorization", format!("Bearer {}", &self.jwt))
.query(&[("session_id", ctx.session_id)])
.query(&[("session_id", ctx.session_id())])
.query(&[
("application_name", application_name.as_str()),
("project", user_info.endpoint.as_str()),
@@ -87,7 +88,7 @@ impl Api {
info!(url = request.url().as_str(), "sending http request");
let start = Instant::now();
let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Cplane);
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane);
let response = self.endpoint.execute(request).await?;
drop(pause);
info!(duration = ?start.elapsed(), "received http response");
@@ -130,10 +131,10 @@ impl Api {
async fn do_wake_compute(
&self,
ctx: &mut RequestMonitoring,
ctx: &RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<NodeInfo, WakeComputeError> {
let request_id = ctx.session_id.to_string();
let request_id = ctx.session_id().to_string();
let application_name = ctx.console_application_name();
async {
let mut request_builder = self
@@ -141,7 +142,7 @@ impl Api {
.get("proxy_wake_compute")
.header("X-Request-ID", &request_id)
.header("Authorization", format!("Bearer {}", &self.jwt))
.query(&[("session_id", ctx.session_id)])
.query(&[("session_id", ctx.session_id())])
.query(&[
("application_name", application_name.as_str()),
("project", user_info.endpoint.as_str()),
@@ -156,7 +157,7 @@ impl Api {
info!(url = request.url().as_str(), "sending http request");
let start = Instant::now();
let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Cplane);
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane);
let response = self.endpoint.execute(request).await?;
drop(pause);
info!(duration = ?start.elapsed(), "received http response");
@@ -192,7 +193,7 @@ impl super::Api for Api {
#[tracing::instrument(skip_all)]
async fn get_role_secret(
&self,
ctx: &mut RequestMonitoring,
ctx: &RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<CachedRoleSecret, GetAuthInfoError> {
let normalized_ep = &user_info.endpoint.normalize();
@@ -226,7 +227,7 @@ impl super::Api for Api {
async fn get_allowed_ips_and_secret(
&self,
ctx: &mut RequestMonitoring,
ctx: &RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
let normalized_ep = &user_info.endpoint.normalize();
@@ -268,7 +269,7 @@ impl super::Api for Api {
#[tracing::instrument(skip_all)]
async fn wake_compute(
&self,
ctx: &mut RequestMonitoring,
ctx: &RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<CachedNodeInfo, WakeComputeError> {
let key = user_info.endpoint_cache_key();

View File

@@ -7,13 +7,14 @@ use smol_str::SmolStr;
use std::net::IpAddr;
use tokio::sync::mpsc;
use tracing::{field::display, info, info_span, Span};
use try_lock::TryLock;
use uuid::Uuid;
use crate::{
console::messages::{ColdStartInfo, MetricsAuxInfo},
error::ErrorKind,
intern::{BranchIdInt, ProjectIdInt},
metrics::{ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol},
metrics::{ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol, Waiting},
DbName, EndpointId, RoleName,
};
@@ -28,7 +29,15 @@ pub static LOG_CHAN_DISCONNECT: OnceCell<mpsc::WeakUnboundedSender<RequestData>>
///
/// This data should **not** be used for connection logic, only for observability and limiting purposes.
/// All connection logic should instead use strongly typed state machines, not a bunch of Options.
pub struct RequestMonitoring {
pub struct RequestMonitoring(
/// To allow easier use of the ctx object, we have interior mutability.
/// I would typically use a RefCell but that would break the `Send` requirements
/// so we need something with thread-safety. `TryLock` is a cheap alternative
/// that offers similar semantics to a `RefCell` but with synchronisation.
TryLock<RequestMonitoringInner>,
);
struct RequestMonitoringInner {
pub peer_addr: IpAddr,
pub session_id: Uuid,
pub protocol: Protocol,
@@ -85,7 +94,7 @@ impl RequestMonitoring {
role = tracing::field::Empty,
);
Self {
let inner = RequestMonitoringInner {
peer_addr,
session_id,
protocol,
@@ -110,7 +119,9 @@ impl RequestMonitoring {
disconnect_sender: LOG_CHAN_DISCONNECT.get().and_then(|tx| tx.upgrade()),
latency_timer: LatencyTimer::new(protocol),
disconnect_timestamp: None,
}
};
Self(TryLock::new(inner))
}
#[cfg(test)]
@@ -119,48 +130,177 @@ impl RequestMonitoring {
}
pub fn console_application_name(&self) -> String {
let this = self.0.try_lock().expect("should not deadlock");
format!(
"{}/{}",
self.application.as_deref().unwrap_or_default(),
self.protocol
this.application.as_deref().unwrap_or_default(),
this.protocol
)
}
pub fn set_rejected(&mut self, rejected: bool) {
self.rejected = Some(rejected);
pub fn set_rejected(&self, rejected: bool) {
let mut this = self.0.try_lock().expect("should not deadlock");
this.rejected = Some(rejected);
}
pub fn set_cold_start_info(&mut self, info: ColdStartInfo) {
pub fn set_cold_start_info(&self, info: ColdStartInfo) {
self.0
.try_lock()
.expect("should not deadlock")
.set_cold_start_info(info);
}
pub fn set_db_options(&self, options: StartupMessageParams) {
let mut this = self.0.try_lock().expect("should not deadlock");
this.set_application(options.get("application_name").map(SmolStr::from));
if let Some(user) = options.get("user") {
this.set_user(user.into());
}
if let Some(dbname) = options.get("database") {
this.set_dbname(dbname.into());
}
this.pg_options = Some(options);
}
pub fn set_project(&self, x: MetricsAuxInfo) {
let mut this = self.0.try_lock().expect("should not deadlock");
if this.endpoint_id.is_none() {
this.set_endpoint_id(x.endpoint_id.as_str().into())
}
this.branch = Some(x.branch_id);
this.project = Some(x.project_id);
this.set_cold_start_info(x.cold_start_info);
}
pub fn set_project_id(&self, project_id: ProjectIdInt) {
let mut this = self.0.try_lock().expect("should not deadlock");
this.project = Some(project_id);
}
pub fn set_endpoint_id(&self, endpoint_id: EndpointId) {
self.0
.try_lock()
.expect("should not deadlock")
.set_endpoint_id(endpoint_id);
}
pub fn set_dbname(&self, dbname: DbName) {
self.0
.try_lock()
.expect("should not deadlock")
.set_dbname(dbname);
}
pub fn set_user(&self, user: RoleName) {
self.0
.try_lock()
.expect("should not deadlock")
.set_user(user);
}
pub fn set_auth_method(&self, auth_method: AuthMethod) {
let mut this = self.0.try_lock().expect("should not deadlock");
this.auth_method = Some(auth_method);
}
pub fn has_private_peer_addr(&self) -> bool {
self.0
.try_lock()
.expect("should not deadlock")
.has_private_peer_addr()
}
pub fn set_error_kind(&self, kind: ErrorKind) {
let mut this = self.0.try_lock().expect("should not deadlock");
// Do not record errors from the private address to metrics.
if !this.has_private_peer_addr() {
Metrics::get().proxy.errors_total.inc(kind);
}
if let Some(ep) = &this.endpoint_id {
let metric = &Metrics::get().proxy.endpoints_affected_by_errors;
let label = metric.with_labels(kind);
metric.get_metric(label).measure(ep);
}
this.error_kind = Some(kind);
}
pub fn set_success(&self) {
let mut this = self.0.try_lock().expect("should not deadlock");
this.success = true;
}
pub fn log_connect(&self) {
self.0
.try_lock()
.expect("should not deadlock")
.log_connect();
}
pub fn protocol(&self) -> Protocol {
self.0.try_lock().expect("should not deadlock").protocol
}
pub fn span(&self) -> Span {
self.0.try_lock().expect("should not deadlock").span.clone()
}
pub fn session_id(&self) -> Uuid {
self.0.try_lock().expect("should not deadlock").session_id
}
pub fn peer_addr(&self) -> IpAddr {
self.0.try_lock().expect("should not deadlock").peer_addr
}
pub fn cold_start_info(&self) -> ColdStartInfo {
self.0
.try_lock()
.expect("should not deadlock")
.cold_start_info
}
pub fn latency_timer_pause(&self, waiting_for: Waiting) -> LatencyTimerPause {
LatencyTimerPause {
ctx: self,
start: tokio::time::Instant::now(),
waiting_for,
}
}
pub fn success(&self) {
self.0
.try_lock()
.expect("should not deadlock")
.latency_timer
.success()
}
}
pub struct LatencyTimerPause<'a> {
ctx: &'a RequestMonitoring,
start: tokio::time::Instant,
waiting_for: Waiting,
}
impl Drop for LatencyTimerPause<'_> {
fn drop(&mut self) {
self.ctx
.0
.try_lock()
.expect("should not deadlock")
.latency_timer
.unpause(self.start, self.waiting_for);
}
}
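The guard returned by `latency_timer_pause` replaces the old `latency_timer.pause(...)` call sites; the elapsed time is attributed to the chosen `Waiting` bucket when the guard is dropped. A usage sketch matching the pattern seen throughout this diff (the awaited work is elided):

// Pause the per-request latency clock while waiting on an external party.
async fn example(ctx: &RequestMonitoring) {
    let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
    // ... await the compute / cplane / client round-trip here ...
    drop(pause); // unpause() runs in the guard's Drop impl
}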
impl RequestMonitoringInner {
fn set_cold_start_info(&mut self, info: ColdStartInfo) {
self.cold_start_info = info;
self.latency_timer.cold_start_info(info);
}
pub fn set_db_options(&mut self, options: StartupMessageParams) {
self.set_application(options.get("application_name").map(SmolStr::from));
if let Some(user) = options.get("user") {
self.set_user(user.into());
}
if let Some(dbname) = options.get("database") {
self.set_dbname(dbname.into());
}
self.pg_options = Some(options);
}
pub fn set_project(&mut self, x: MetricsAuxInfo) {
if self.endpoint_id.is_none() {
self.set_endpoint_id(x.endpoint_id.as_str().into())
}
self.branch = Some(x.branch_id);
self.project = Some(x.project_id);
self.set_cold_start_info(x.cold_start_info);
}
pub fn set_project_id(&mut self, project_id: ProjectIdInt) {
self.project = Some(project_id);
}
pub fn set_endpoint_id(&mut self, endpoint_id: EndpointId) {
fn set_endpoint_id(&mut self, endpoint_id: EndpointId) {
if self.endpoint_id.is_none() {
self.span.record("ep", display(&endpoint_id));
let metric = &Metrics::get().proxy.connecting_endpoints;
@@ -176,44 +316,23 @@ impl RequestMonitoring {
}
}
pub fn set_dbname(&mut self, dbname: DbName) {
fn set_dbname(&mut self, dbname: DbName) {
self.dbname = Some(dbname);
}
pub fn set_user(&mut self, user: RoleName) {
fn set_user(&mut self, user: RoleName) {
self.span.record("role", display(&user));
self.user = Some(user);
}
pub fn set_auth_method(&mut self, auth_method: AuthMethod) {
self.auth_method = Some(auth_method);
}
pub fn has_private_peer_addr(&self) -> bool {
fn has_private_peer_addr(&self) -> bool {
match self.peer_addr {
IpAddr::V4(ip) => ip.is_private(),
_ => false,
}
}
pub fn set_error_kind(&mut self, kind: ErrorKind) {
// Do not record errors from the private address to metrics.
if !self.has_private_peer_addr() {
Metrics::get().proxy.errors_total.inc(kind);
}
if let Some(ep) = &self.endpoint_id {
let metric = &Metrics::get().proxy.endpoints_affected_by_errors;
let label = metric.with_labels(kind);
metric.get_metric(label).measure(ep);
}
self.error_kind = Some(kind);
}
pub fn set_success(&mut self) {
self.success = true;
}
pub fn log_connect(&mut self) {
fn log_connect(&mut self) {
let outcome = if self.success {
ConnectOutcome::Success
} else {
@@ -256,7 +375,7 @@ impl RequestMonitoring {
}
}
impl Drop for RequestMonitoring {
impl Drop for RequestMonitoringInner {
fn drop(&mut self) {
if self.sender.is_some() {
self.log_connect();

View File

@@ -23,7 +23,7 @@ use utils::backoff;
use crate::{config::remote_storage_from_toml, context::LOG_CHAN_DISCONNECT};
use super::{RequestMonitoring, LOG_CHAN};
use super::{RequestMonitoringInner, LOG_CHAN};
#[derive(clap::Args, Clone, Debug)]
pub struct ParquetUploadArgs {
@@ -118,8 +118,8 @@ impl<'a> serde::Serialize for Options<'a> {
}
}
impl From<&RequestMonitoring> for RequestData {
fn from(value: &RequestMonitoring) -> Self {
impl From<&RequestMonitoringInner> for RequestData {
fn from(value: &RequestMonitoringInner) -> Self {
Self {
session_id: value.session_id,
peer_addr: value.peer_addr.to_string(),

View File

@@ -6,6 +6,12 @@ pub mod health_server;
use std::time::Duration;
use anyhow::bail;
use bytes::Bytes;
use http_body_util::BodyExt;
use hyper1::body::Body;
use serde::de::DeserializeOwned;
pub use reqwest::{Request, Response, StatusCode};
pub use reqwest_middleware::{ClientWithMiddleware, Error};
pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
@@ -96,6 +102,33 @@ impl Endpoint {
}
}
pub async fn parse_json_body_with_limit<D: DeserializeOwned>(
mut b: impl Body<Data = Bytes, Error = reqwest::Error> + Unpin,
limit: usize,
) -> anyhow::Result<D> {
// We could use `b.limited().collect().await.to_bytes()` here
// but this ends up being slightly more efficient as far as I can tell.
// check the lower bound of the size hint.
// in reqwest, this value is influenced by the Content-Length header.
let lower_bound = match usize::try_from(b.size_hint().lower()) {
Ok(bound) if bound <= limit => bound,
_ => bail!("content length exceeds limit"),
};
let mut bytes = Vec::with_capacity(lower_bound);
while let Some(frame) = b.frame().await.transpose()? {
if let Ok(data) = frame.into_data() {
if bytes.len() + data.len() > limit {
bail!("content length exceeds limit")
}
bytes.extend_from_slice(&data);
}
}
Ok(serde_json::from_slice::<D>(&bytes)?)
}
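A hedged usage sketch for the helper above, mirroring how the JWKs cache calls it; the wrapper name is made up, and the 64 KiB limit mirrors MAX_JWK_BODY_SIZE from that code.

// Hypothetical wrapper: fetch a URL and decode at most 64 KiB of JSON.
async fn fetch_json_limited<D: serde::de::DeserializeOwned>(
    client: &reqwest::Client,
    url: url::Url,
) -> anyhow::Result<D> {
    let resp = client.get(url).send().await?.error_for_status()?;
    // reqwest's Response converts into an http::Response whose body implements
    // the hyper 1.0 Body trait that parse_json_body_with_limit expects.
    let resp: http::Response<reqwest::Body> = resp.into();
    parse_json_body_with_limit(resp.into_body(), 64 * 1024).await
}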
#[cfg(test)]
mod tests {
use super::*;

View File

@@ -21,13 +21,13 @@ pub mod intern;
pub mod jemalloc;
pub mod logging;
pub mod metrics;
pub mod parse;
// pub mod parse;
pub mod protocol2;
pub mod proxy;
pub mod rate_limiter;
pub mod redis;
pub mod sasl;
pub mod scram;
// pub mod sasl;
// pub mod scram;
pub mod serverless;
pub mod stream;
pub mod url;

View File

@@ -2,13 +2,14 @@ use std::sync::{Arc, OnceLock};
use lasso::ThreadedRodeo;
use measured::{
label::{FixedCardinalitySet, LabelGroupSet, LabelName, LabelSet, LabelValue, StaticLabelSet},
label::StaticLabelSet,
metric::{histogram::Thresholds, name::MetricName},
Counter, CounterVec, FixedCardinalityLabel, Gauge, GaugeVec, Histogram, HistogramVec,
LabelGroup, MetricGroup,
Counter, CounterVec, FixedCardinalityLabel, Gauge, Histogram, HistogramVec, LabelGroup,
MetricGroup,
};
use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec};
use proxy_sasl::scram::threadpool::ThreadPoolMetrics;
use tokio::time::{self, Instant};
use crate::console::messages::ColdStartInfo;
@@ -370,6 +371,7 @@ pub struct CancellationRequest {
pub kind: CancellationOutcome,
}
#[derive(Clone, Copy)]
pub enum Waiting {
Cplane,
Client,
@@ -398,12 +400,6 @@ pub struct LatencyTimer {
outcome: ConnectOutcome,
}
pub struct LatencyTimerPause<'a> {
timer: &'a mut LatencyTimer,
start: time::Instant,
waiting_for: Waiting,
}
impl LatencyTimer {
pub fn new(protocol: Protocol) -> Self {
Self {
@@ -417,11 +413,13 @@ impl LatencyTimer {
}
}
pub fn pause(&mut self, waiting_for: Waiting) -> LatencyTimerPause<'_> {
LatencyTimerPause {
timer: self,
start: Instant::now(),
waiting_for,
pub fn unpause(&mut self, start: Instant, waiting_for: Waiting) {
let dur = start.elapsed();
match waiting_for {
Waiting::Cplane => self.accumulated.cplane += dur,
Waiting::Client => self.accumulated.client += dur,
Waiting::Compute => self.accumulated.compute += dur,
Waiting::RetryTimeout => self.accumulated.retry += dur,
}
}
@@ -438,18 +436,6 @@ impl LatencyTimer {
}
}
impl Drop for LatencyTimerPause<'_> {
fn drop(&mut self) {
let dur = self.start.elapsed();
match self.waiting_for {
Waiting::Cplane => self.timer.accumulated.cplane += dur,
Waiting::Client => self.timer.accumulated.client += dur,
Waiting::Compute => self.timer.accumulated.compute += dur,
Waiting::RetryTimeout => self.timer.accumulated.retry += dur,
}
}
}
#[derive(FixedCardinalityLabel, Clone, Copy, Debug)]
pub enum ConnectOutcome {
Success,
@@ -561,78 +547,3 @@ pub enum RedisEventsCount {
PasswordUpdate,
AllowedIpsUpdate,
}
pub struct ThreadPoolWorkers(usize);
pub struct ThreadPoolWorkerId(pub usize);
impl LabelValue for ThreadPoolWorkerId {
fn visit<V: measured::label::LabelVisitor>(&self, v: V) -> V::Output {
v.write_int(self.0 as i64)
}
}
impl LabelGroup for ThreadPoolWorkerId {
fn visit_values(&self, v: &mut impl measured::label::LabelGroupVisitor) {
v.write_value(LabelName::from_str("worker"), self);
}
}
impl LabelGroupSet for ThreadPoolWorkers {
type Group<'a> = ThreadPoolWorkerId;
fn cardinality(&self) -> Option<usize> {
Some(self.0)
}
fn encode_dense(&self, value: Self::Unique) -> Option<usize> {
Some(value)
}
fn decode_dense(&self, value: usize) -> Self::Group<'_> {
ThreadPoolWorkerId(value)
}
type Unique = usize;
fn encode(&self, value: Self::Group<'_>) -> Option<Self::Unique> {
Some(value.0)
}
fn decode(&self, value: &Self::Unique) -> Self::Group<'_> {
ThreadPoolWorkerId(*value)
}
}
impl LabelSet for ThreadPoolWorkers {
type Value<'a> = ThreadPoolWorkerId;
fn dynamic_cardinality(&self) -> Option<usize> {
Some(self.0)
}
fn encode(&self, value: Self::Value<'_>) -> Option<usize> {
(value.0 < self.0).then_some(value.0)
}
fn decode(&self, value: usize) -> Self::Value<'_> {
ThreadPoolWorkerId(value)
}
}
impl FixedCardinalitySet for ThreadPoolWorkers {
fn cardinality(&self) -> usize {
self.0
}
}
#[derive(MetricGroup)]
#[metric(new(workers: usize))]
pub struct ThreadPoolMetrics {
pub injector_queue_depth: Gauge,
#[metric(init = GaugeVec::with_label_set(ThreadPoolWorkers(workers)))]
pub worker_queue_depth: GaugeVec<ThreadPoolWorkers>,
#[metric(init = CounterVec::with_label_set(ThreadPoolWorkers(workers)))]
pub worker_task_turns_total: CounterVec<ThreadPoolWorkers>,
#[metric(init = CounterVec::with_label_set(ThreadPoolWorkers(workers)))]
pub worker_task_skips_total: CounterVec<ThreadPoolWorkers>,
}
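
The `LatencyTimer` hunks earlier in this file replace the timer-owned RAII `LatencyTimerPause` guard with an explicit `unpause(start, waiting_for)`, and the call sites later in this diff go through `ctx.latency_timer_pause(..)` on a shared `&RequestMonitoring`. A minimal sketch of how such a context-owned guard could drive the new API, assuming a `Mutex`-wrapped timer (the real synchronisation and field layout may differ):

```
use std::sync::Mutex;
use std::time::Duration;
use tokio::time::Instant;

#[derive(Clone, Copy)]
enum Waiting {
    Cplane,
    Client,
    Compute,
    RetryTimeout,
}

// Simplified: the real timer keeps these in an `accumulated` sub-struct.
#[derive(Default)]
struct LatencyTimer {
    cplane: Duration,
    client: Duration,
    compute: Duration,
    retry: Duration,
}

impl LatencyTimer {
    fn unpause(&mut self, start: Instant, waiting_for: Waiting) {
        let dur = start.elapsed();
        match waiting_for {
            Waiting::Cplane => self.cplane += dur,
            Waiting::Client => self.client += dur,
            Waiting::Compute => self.compute += dur,
            Waiting::RetryTimeout => self.retry += dur,
        }
    }
}

struct RequestMonitoring {
    latency_timer: Mutex<LatencyTimer>,
}

struct LatencyTimerPause<'a> {
    ctx: &'a RequestMonitoring,
    start: Instant,
    waiting_for: Waiting,
}

impl RequestMonitoring {
    // Callers only need a shared reference; the pause duration is recorded when
    // the returned guard is dropped.
    fn latency_timer_pause(&self, waiting_for: Waiting) -> LatencyTimerPause<'_> {
        LatencyTimerPause {
            ctx: self,
            start: Instant::now(),
            waiting_for,
        }
    }
}

impl Drop for LatencyTimerPause<'_> {
    fn drop(&mut self) {
        // Interior mutability is what allows the &mut-free call sites in the hunks below.
        self.ctx
            .latency_timer
            .lock()
            .unwrap()
            .unpause(self.start, self.waiting_for);
    }
}
```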

View File

@@ -113,18 +113,18 @@ pub async fn task_main(
}
};
let mut ctx = RequestMonitoring::new(
let ctx = RequestMonitoring::new(
session_id,
peer_addr,
crate::metrics::Protocol::Tcp,
&config.region,
);
let span = ctx.span.clone();
let span = ctx.span();
let startup = Box::pin(
handle_client(
config,
&mut ctx,
&ctx,
cancellation_handler,
socket,
ClientMode::Tcp,
@@ -240,7 +240,7 @@ impl ReportableError for ClientRequestError {
pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
config: &'static ProxyConfig,
ctx: &mut RequestMonitoring,
ctx: &RequestMonitoring,
cancellation_handler: Arc<CancellationHandlerMain>,
stream: S,
mode: ClientMode,
@@ -248,25 +248,25 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
conn_gauge: NumClientConnectionsGuard<'static>,
) -> Result<Option<ProxyPassthrough<CancellationHandlerMainInternal, S>>, ClientRequestError> {
info!(
protocol = %ctx.protocol,
protocol = %ctx.protocol(),
"handling interactive connection from client"
);
let metrics = &Metrics::get().proxy;
let proto = ctx.protocol;
let proto = ctx.protocol();
let _request_gauge = metrics.connection_requests.guard(proto);
let tls = config.tls_config.as_ref();
let record_handshake_error = !ctx.has_private_peer_addr();
let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Client);
let do_handshake = handshake(stream, mode.handshake_tls(tls), record_handshake_error);
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
let do_handshake = handshake(ctx, stream, mode.handshake_tls(tls), record_handshake_error);
let (mut stream, params) =
match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? {
HandshakeData::Startup(stream, params) => (stream, params),
HandshakeData::Cancel(cancel_key_data) => {
return Ok(cancellation_handler
.cancel_session(cancel_key_data, ctx.session_id)
.cancel_session(cancel_key_data, ctx.session_id())
.await
.map(|()| None)?)
}

View File

@@ -46,7 +46,7 @@ pub trait ConnectMechanism {
type Error: From<Self::ConnectError>;
async fn connect_once(
&self,
ctx: &mut RequestMonitoring,
ctx: &RequestMonitoring,
node_info: &console::CachedNodeInfo,
timeout: time::Duration,
) -> Result<Self::Connection, Self::ConnectError>;
@@ -58,7 +58,7 @@ pub trait ConnectMechanism {
pub trait ComputeConnectBackend {
async fn wake_compute(
&self,
ctx: &mut RequestMonitoring,
ctx: &RequestMonitoring,
) -> Result<CachedNodeInfo, console::errors::WakeComputeError>;
fn get_keys(&self) -> Option<&ComputeCredentialKeys>;
@@ -81,7 +81,7 @@ impl ConnectMechanism for TcpMechanism<'_> {
#[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)]
async fn connect_once(
&self,
ctx: &mut RequestMonitoring,
ctx: &RequestMonitoring,
node_info: &console::CachedNodeInfo,
timeout: time::Duration,
) -> Result<PostgresConnection, Self::Error> {
@@ -98,7 +98,7 @@ impl ConnectMechanism for TcpMechanism<'_> {
/// Try to connect to the compute node, retrying if necessary.
#[tracing::instrument(skip_all)]
pub async fn connect_to_compute<M: ConnectMechanism, B: ComputeConnectBackend>(
ctx: &mut RequestMonitoring,
ctx: &RequestMonitoring,
mechanism: &M,
user_info: &B,
allow_self_signed_compute: bool,
@@ -126,7 +126,7 @@ where
.await
{
Ok(res) => {
ctx.latency_timer.success();
ctx.success();
Metrics::get().proxy.retries_metric.observe(
RetriesMetricGroup {
outcome: ConnectOutcome::Success,
@@ -178,7 +178,7 @@ where
.await
{
Ok(res) => {
ctx.latency_timer.success();
ctx.success();
Metrics::get().proxy.retries_metric.observe(
RetriesMetricGroup {
outcome: ConnectOutcome::Success,
@@ -209,9 +209,7 @@ where
let wait_duration = retry_after(num_retries, connect_to_compute_retry_config);
num_retries += 1;
let pause = ctx
.latency_timer
.pause(crate::metrics::Waiting::RetryTimeout);
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::RetryTimeout);
time::sleep(wait_duration).await;
drop(pause);
}
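
The retry hunks above keep the overall shape: try to connect, record success, otherwise back off and try again with the latency timer paused. A stripped-down sketch of that control flow, with a hypothetical exponential backoff standing in for `retry_after` and the wake-compute plumbing omitted:

```
use std::{future::Future, time::Duration};
use tokio::time;

/// Retry an async connect attempt up to `max_retries` times.
async fn connect_with_retry<T, E, F, Fut>(
    mut attempt: F,
    max_retries: u32,
    base_delay: Duration,
) -> Result<T, E>
where
    F: FnMut() -> Fut,
    Fut: Future<Output = Result<T, E>>,
{
    let mut num_retries = 0;
    loop {
        match attempt().await {
            // On success the proxy also calls ctx.success() and records a retries metric.
            Ok(conn) => return Ok(conn),
            Err(err) if num_retries >= max_retries => return Err(err),
            Err(_) => {
                // Hypothetical backoff; the real code computes the delay via retry_after(..)
                // and pauses the latency timer (Waiting::RetryTimeout) around the sleep.
                let wait = base_delay * 2u32.pow(num_retries);
                num_retries += 1;
                time::sleep(wait).await;
            }
        }
    }
}
```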

View File

@@ -10,6 +10,7 @@ use tracing::{info, warn};
use crate::{
auth::endpoint_sni,
config::{TlsConfig, PG_ALPN_PROTOCOL},
context::RequestMonitoring,
error::ReportableError,
metrics::Metrics,
proxy::ERR_INSECURE_CONNECTION,
@@ -67,6 +68,7 @@ pub enum HandshakeData<S> {
/// We also take extra care to propagate only select handshake errors to the client.
#[tracing::instrument(skip_all)]
pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
ctx: &RequestMonitoring,
stream: S,
mut tls: Option<&TlsConfig>,
record_handshake_error: bool,
@@ -80,8 +82,6 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
let mut stream = PqStream::new(Stream::from_raw(stream));
loop {
let msg = stream.read_startup_packet().await?;
info!("received {msg:?}");
use FeStartupPacket::*;
match msg {
SslRequest { direct } => match stream.get_ref() {
@@ -145,16 +145,20 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
let conn_info = tls_stream.get_ref().1;
// try parse endpoint
let ep = conn_info
.server_name()
.and_then(|sni| endpoint_sni(sni, &tls.common_names).ok().flatten());
if let Some(ep) = ep {
ctx.set_endpoint_id(ep);
}
// check the ALPN, if exists, as required.
match conn_info.alpn_protocol() {
None | Some(PG_ALPN_PROTOCOL) => {}
Some(other) => {
// try parse ep for better error
let ep = conn_info.server_name().and_then(|sni| {
endpoint_sni(sni, &tls.common_names).ok().flatten()
});
let alpn = String::from_utf8_lossy(other);
warn!(?ep, %alpn, "unexpected ALPN");
warn!(%alpn, "unexpected ALPN");
return Err(HandshakeError::ProtocolViolation);
}
}
@@ -198,7 +202,12 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
.await?;
}
info!(?version, session_type = "normal", "successful handshake");
info!(
?version,
?params,
session_type = "normal",
"successful handshake"
);
break Ok(HandshakeData::Startup(stream, params));
}
// downgrade protocol version
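
The handshake hunk above parses the endpoint out of SNI once, stores it on the request context, and then enforces the ALPN check, so the ALPN error path no longer needs its own SNI re-parse. A rough sketch of those two checks against a `rustls::ServerConnection`, with a hypothetical `parse_endpoint` standing in for `endpoint_sni`:

```
/// Post-TLS checks: recover the endpoint from SNI (if any) and enforce the expected ALPN.
fn check_tls_metadata(
    conn: &rustls::ServerConnection,
    expected_alpn: &[u8],
) -> Result<Option<String>, &'static str> {
    // SNI-derived endpoint; the real code stores this on the request context.
    let endpoint = conn.server_name().and_then(parse_endpoint);

    // No ALPN at all is tolerated; anything other than the expected protocol is rejected.
    match conn.alpn_protocol() {
        None => {}
        Some(alpn) if alpn == expected_alpn => {}
        Some(_) => return Err("unexpected ALPN"),
    }

    Ok(endpoint)
}

/// Hypothetical stand-in for endpoint_sni: the real helper validates the SNI
/// against the configured common names before accepting it.
fn parse_endpoint(sni: &str) -> Option<String> {
    sni.split('.').next().map(str::to_owned)
}
```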

Some files were not shown because too many files have changed in this diff.