Compare commits


19 Commits

Author          SHA1        Date                        Message
Conrad Ludgate  18303e4d68  2024-08-13 15:08:57 +01:00  clean up
Conrad Ludgate  3df6d368e3  2024-08-13 15:08:57 +01:00  split out binaries
Conrad Ludgate  b62e7c0138  2024-08-13 15:08:54 +01:00  proxy: experiment with idea to split crates
Conrad Ludgate  a2968c6cf8  2024-08-13 15:01:48 +01:00  move proxy to proxy/code
Conrad Ludgate  bae1288671  2024-08-13 11:08:25 +01:00  make jwk renewal permits a bit more type safe
Conrad Ludgate  1254d8f56e  2024-08-13 10:24:14 +01:00  address some comments
Conrad Ludgate  073508493c  2024-08-12 16:14:53 +01:00  remove async_trait for FetchAuthRules
Conrad Ludgate  7cb2349296  2024-08-12 11:48:57 +01:00  add jwks size limiter
Conrad Ludgate  87151f9efd  2024-08-12 09:01:30 +01:00  ignore marvin vuln
Conrad Ludgate  96fe084c57  2024-08-12 09:01:04 +01:00  compact mock server
Conrad Ludgate  20fdf3e19f  2024-08-12 09:01:04 +01:00  extract fetch/update routine
Conrad Ludgate  c6b36d8171  2024-08-12 09:01:04 +01:00  fix lints
Conrad Ludgate  0e8a848937  2024-08-12 09:01:04 +01:00  finish happy path test
Conrad Ludgate  db4085fe22  2024-08-12 09:01:04 +01:00  mock tests for jwk renewal
Conrad Ludgate  0d895ba002  2024-08-12 09:01:04 +01:00  strip down supported algorithms to just RS256 and ES256
Conrad Ludgate  103f34e954  2024-08-12 09:01:04 +01:00  flesh out JWKs cache
Conrad Ludgate  262378e561  2024-08-12 09:01:04 +01:00  flesh out jwt code
Conrad Ludgate  9f38ab39c6  2024-08-12 09:01:04 +01:00  stash jwts
Conrad Ludgate  fa92328423  2024-08-12 09:01:04 +01:00  start stubbing jwt
159 changed files with 2139 additions and 3252 deletions


@@ -1,36 +0,0 @@
name: "Set custom docker config directory"
description: "Create a directory for docker config and set DOCKER_CONFIG"
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
runs:
using: "composite"
steps:
- name: Show warning on GitHub-hosted runners
if: runner.environment == 'github-hosted'
shell: bash -euo pipefail {0}
run: |
# Using the following environment variables to find a path to the workflow file
# ${GITHUB_WORKFLOW_REF} - octocat/hello-world/.github/workflows/my-workflow.yml@refs/heads/my_branch
# ${GITHUB_REPOSITORY} - octocat/hello-world
# ${GITHUB_REF} - refs/heads/my_branch
# From https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/variables
filename_with_ref=${GITHUB_WORKFLOW_REF#"$GITHUB_REPOSITORY/"}
filename=${filename_with_ref%"@$GITHUB_REF"}
# https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/workflow-commands-for-github-actions#setting-a-warning-message
title='Unnecessary usage of `.github/actions/set-docker-config-dir`'
message='No need to use `.github/actions/set-docker-config-dir` action on GitHub-hosted runners'
echo "::warning file=${filename},title=${title}::${message}"
- uses: pyTooling/Actions/with-post-step@74afc5a42a17a046c90c68cb5cfa627e5c6c5b6b # v1.0.7
env:
DOCKER_CONFIG: .docker-custom-${{ github.run_id }}-${{ github.run_attempt }}
with:
main: |
mkdir -p "${DOCKER_CONFIG}"
echo DOCKER_CONFIG=${DOCKER_CONFIG} | tee -a $GITHUB_ENV
post: |
if [ -d "${DOCKER_CONFIG}" ]; then
rm -r "${DOCKER_CONFIG}"
fi


@@ -1,152 +0,0 @@
name: Prepare benchmarking databases by restoring dumps
on:
workflow_call:
# no inputs needed
defaults:
run:
shell: bash -euxo pipefail {0}
jobs:
setup-databases:
strategy:
fail-fast: false
matrix:
platform: [ aws-rds-postgres, aws-aurora-serverless-v2-postgres, neon ]
database: [ clickbench, tpch, userexample ]
env:
LD_LIBRARY_PATH: /tmp/neon/pg_install/v16/lib
PLATFORM: ${{ matrix.platform }}
PG_BINARIES: /tmp/neon/pg_install/v16/bin
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
options: --init
steps:
- name: Set up Connection String
id: set-up-prep-connstr
run: |
case "${PLATFORM}" in
neon)
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
;;
aws-rds-postgres)
CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }}
;;
aws-aurora-serverless-v2-postgres)
CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_CONNSTR }}
;;
*)
echo >&2 "Unknown PLATFORM=${PLATFORM}"
exit 1
;;
esac
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
- name: Download Neon artifact
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
path: /tmp/neon/
prefix: latest
# we create a table that has one row for each database that we want to restore with the status whether the restore is done
- name: Create benchmark_restore_status table if it does not exist
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }}
DATABASE_NAME: ${{ matrix.database }}
# to avoid a race condition of multiple jobs trying to create the table at the same time,
# we use an advisory lock
run: |
${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -c "
SELECT pg_advisory_lock(4711);
CREATE TABLE IF NOT EXISTS benchmark_restore_status (
databasename text primary key,
restore_done boolean
);
SELECT pg_advisory_unlock(4711);
"
- name: Check if restore is already done
id: check-restore-done
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }}
DATABASE_NAME: ${{ matrix.database }}
run: |
skip=false
if ${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -tAc "SELECT 1 FROM benchmark_restore_status WHERE databasename='${{ env.DATABASE_NAME }}' AND restore_done=true;" | grep -q 1; then
echo "Restore already done for database ${{ env.DATABASE_NAME }} on platform ${{ env.PLATFORM }}. Skipping this database."
skip=true
fi
echo "skip=${skip}" | tee -a $GITHUB_OUTPUT
- name: Check and create database if it does not exist
if: steps.check-restore-done.outputs.skip != 'true'
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }}
DATABASE_NAME: ${{ matrix.database }}
run: |
DB_EXISTS=$(${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -tAc "SELECT 1 FROM pg_database WHERE datname='${{ env.DATABASE_NAME }}'")
if [ "$DB_EXISTS" != "1" ]; then
echo "Database ${{ env.DATABASE_NAME }} does not exist. Creating it..."
${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -c "CREATE DATABASE \"${{ env.DATABASE_NAME }}\";"
else
echo "Database ${{ env.DATABASE_NAME }} already exists."
fi
- name: Download dump from S3 to /tmp/dumps
if: steps.check-restore-done.outputs.skip != 'true'
env:
DATABASE_NAME: ${{ matrix.database }}
run: |
mkdir -p /tmp/dumps
aws s3 cp s3://neon-github-dev/performance/pgdumps/$DATABASE_NAME/$DATABASE_NAME.pg_dump /tmp/dumps/
- name: Replace database name in connection string
if: steps.check-restore-done.outputs.skip != 'true'
id: replace-dbname
env:
DATABASE_NAME: ${{ matrix.database }}
BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }}
run: |
# Extract the part before the database name
base_connstr="${BENCHMARK_CONNSTR%/*}"
# Extract the query parameters (if any) after the database name
query_params="${BENCHMARK_CONNSTR#*\?}"
# Reconstruct the new connection string
if [ "$query_params" != "$BENCHMARK_CONNSTR" ]; then
new_connstr="${base_connstr}/${DATABASE_NAME}?${query_params}"
else
new_connstr="${base_connstr}/${DATABASE_NAME}"
fi
echo "database_connstr=${new_connstr}" >> $GITHUB_OUTPUT
- name: Restore dump
if: steps.check-restore-done.outputs.skip != 'true'
env:
DATABASE_NAME: ${{ matrix.database }}
DATABASE_CONNSTR: ${{ steps.replace-dbname.outputs.database_connstr }}
# the following works only with larger computes:
# PGOPTIONS: "-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7"
# we add the || true because:
# the dumps were created with Neon and contain neon extensions that are not
# available in RDS, so we will always report an error, but we can ignore it
run: |
${PG_BINARIES}/pg_restore --clean --if-exists --no-owner --jobs=4 \
-d "${DATABASE_CONNSTR}" /tmp/dumps/${DATABASE_NAME}.pg_dump || true
- name: Update benchmark_restore_status table
if: steps.check-restore-done.outputs.skip != 'true'
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }}
DATABASE_NAME: ${{ matrix.database }}
run: |
${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -c "
INSERT INTO benchmark_restore_status (databasename, restore_done) VALUES ('${{ env.DATABASE_NAME }}', true)
ON CONFLICT (databasename) DO UPDATE SET restore_done = true;
"


@@ -280,9 +280,8 @@ jobs:
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }]
}'
if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" },
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "rds-aurora", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }]')
if [ "$(date +%A)" = "Saturday" ]; then
matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }]')
fi
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -322,13 +321,9 @@ jobs:
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
prepare_AWS_RDS_databases:
uses: ./.github/workflows/_benchmarking_preparation.yml
secrets: inherit
pgbench-compare:
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
needs: [ generate-matrices, prepare_AWS_RDS_databases ]
needs: [ generate-matrices ]
permissions:
contents: write
statuses: write
@@ -600,7 +595,7 @@ jobs:
# *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows
# *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB
if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
needs: [ generate-matrices, pgbench-compare, prepare_AWS_RDS_databases ]
needs: [ generate-matrices, pgbench-compare ]
strategy:
fail-fast: false
@@ -608,7 +603,7 @@ jobs:
env:
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
DEFAULT_PG_VERSION: 16
DEFAULT_PG_VERSION: 14
TEST_OUTPUT: /tmp/test_output
TEST_OLAP_COLLECT_EXPLAIN: ${{ github.event.inputs.collect_olap_explain }}
TEST_OLAP_COLLECT_PG_STAT_STATEMENTS: ${{ github.event.inputs.collect_pg_stat_statements }}
@@ -660,7 +655,6 @@ jobs:
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_clickbench
pg_version: ${{ env.DEFAULT_PG_VERSION }}
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
@@ -690,7 +684,7 @@ jobs:
#
# *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB)
if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
needs: [ generate-matrices, clickbench-compare, prepare_AWS_RDS_databases ]
needs: [ generate-matrices, clickbench-compare ]
strategy:
fail-fast: false
@@ -698,7 +692,7 @@ jobs:
env:
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
DEFAULT_PG_VERSION: 16
DEFAULT_PG_VERSION: 14
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
@@ -730,7 +724,7 @@ jobs:
ENV_PLATFORM=RDS_AURORA_TPCH
;;
rds-postgres)
ENV_PLATFORM=RDS_POSTGRES_TPCH
ENV_PLATFORM=RDS_AURORA_TPCH
;;
*)
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'"
@@ -756,7 +750,6 @@ jobs:
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_tpch
pg_version: ${{ env.DEFAULT_PG_VERSION }}
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
@@ -778,7 +771,7 @@ jobs:
user-examples-compare:
if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
needs: [ generate-matrices, tpch-compare, prepare_AWS_RDS_databases ]
needs: [ generate-matrices, tpch-compare ]
strategy:
fail-fast: false
@@ -786,7 +779,7 @@ jobs:
env:
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
DEFAULT_PG_VERSION: 16
DEFAULT_PG_VERSION: 14
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}


@@ -56,7 +56,13 @@ jobs:
- uses: actions/checkout@v4
- uses: ./.github/actions/set-docker-config-dir
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
# The default value is ~/.docker
- name: Set custom docker config directory
run: |
mkdir -p /tmp/.docker-custom
echo DOCKER_CONFIG=/tmp/.docker-custom >> $GITHUB_ENV
- uses: docker/setup-buildx-action@v3
with:
cache-binary: false
@@ -83,6 +89,11 @@ jobs:
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/build-tools:cache-{0},mode=max', matrix.arch) || '' }}
tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }}
- name: Remove custom docker config directory
if: always()
run: |
rm -rf /tmp/.docker-custom
merge-images:
needs: [ build-image ]
runs-on: ubuntu-22.04


@@ -484,7 +484,12 @@ jobs:
submodules: true
fetch-depth: 0
- uses: ./.github/actions/set-docker-config-dir
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
# The default value is ~/.docker
- name: Set custom docker config directory
run: |
mkdir -p .docker-custom
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
- uses: docker/setup-buildx-action@v3
with:
cache-binary: false
@@ -516,6 +521,11 @@ jobs:
tags: |
neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
- name: Remove custom docker config directory
if: always()
run: |
rm -rf .docker-custom
neon-image:
needs: [ neon-image-arch, tag ]
runs-on: ubuntu-22.04
@@ -560,7 +570,12 @@ jobs:
submodules: true
fetch-depth: 0
- uses: ./.github/actions/set-docker-config-dir
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
# The default value is ~/.docker
- name: Set custom docker config directory
run: |
mkdir -p .docker-custom
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
- uses: docker/setup-buildx-action@v3
with:
cache-binary: false
@@ -643,6 +658,11 @@ jobs:
tags: |
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
- name: Remove custom docker config directory
if: always()
run: |
rm -rf .docker-custom
compute-node-image:
needs: [ compute-node-image-arch, tag ]
runs-on: ubuntu-22.04
@@ -715,7 +735,13 @@ jobs:
curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder
chmod +x vm-builder
- uses: ./.github/actions/set-docker-config-dir
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
# The default value is ~/.docker
- name: Set custom docker config directory
run: |
mkdir -p .docker-custom
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
- uses: docker/login-action@v3
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
@@ -738,6 +764,11 @@ jobs:
run: |
docker push neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
- name: Remove custom docker config directory
if: always()
run: |
rm -rf .docker-custom
test-images:
needs: [ check-permissions, tag, neon-image, compute-node-image ]
strategy:
@@ -753,7 +784,13 @@ jobs:
with:
fetch-depth: 0
- uses: ./.github/actions/set-docker-config-dir
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
# The default value is ~/.docker
- name: Set custom docker config directory
run: |
mkdir -p .docker-custom
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
- uses: docker/login-action@v3
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
@@ -793,6 +830,11 @@ jobs:
docker compose -f ./docker-compose/docker-compose.yml logs || 0
docker compose -f ./docker-compose/docker-compose.yml down
- name: Remove custom docker config directory
if: always()
run: |
rm -rf .docker-custom
promote-images:
permissions:
contents: read # This is required for actions/checkout


@@ -1,35 +0,0 @@
name: Add `external` label to issues and PRs created by external users

on:
  issues:
    types:
      - opened
  pull_request:
    types:
      - opened

# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
permissions: {}

env:
  LABEL: external

jobs:
  add-label:
    # This workflow uses `author_association` for PRs and issues to determine if the user is an external user.
    # Possible values for `author_association`: https://docs.github.com/en/graphql/reference/enums#commentauthorassociation
    if: ${{ !contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event[github.event_name == 'pull_request' && 'pull_request' || 'issue'].author_association) }}
    runs-on: ubuntu-22.04
    permissions:
      pull-requests: write
      issues: write
    steps:
      - name: Label new ${{ github.event_name }}
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          ITEM_NUMBER: ${{ github.event[github.event_name == 'pull_request' && 'pull_request' || 'issue'].number }}
          GH_CLI_COMMAND: ${{ github.event_name == 'pull_request' && 'pr' || 'issue' }}
        run: |
          gh ${GH_CLI_COMMAND} --repo ${GITHUB_REPOSITORY} edit --add-label=${LABEL} ${ITEM_NUMBER}

Cargo.lock (generated)

@@ -484,7 +484,7 @@ dependencies = [
"http 0.2.9",
"http 1.1.0",
"once_cell",
"p256",
"p256 0.11.1",
"percent-encoding",
"ring 0.17.6",
"sha2",
@@ -848,6 +848,12 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "349a06037c7bf932dd7e7d1f653678b2038b9ad46a74102f1fc7bd7872678cce"
[[package]]
name = "base16ct"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c7f02d4ea65f2c1853089ffd8d2787bdbc63de2f0d29dedbcf8ccdfa0ccd4cf"
[[package]]
name = "base64"
version = "0.13.1"
@@ -971,9 +977,9 @@ checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1"
[[package]]
name = "bytemuck"
version = "1.16.0"
version = "1.16.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78834c15cb5d5efe3452d58b1e8ba890dd62d21907f867f383358198e56ebca5"
checksum = "102087e286b4677862ea56cf8fc58bb2cdfa8725c40ffb80fe3a008eb7f2fc83"
[[package]]
name = "byteorder"
@@ -1526,8 +1532,10 @@ version = "0.5.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0dc92fb57ca44df6db8059111ab3af99a63d5d0f8375d9972e319a379c6bab76"
dependencies = [
"generic-array",
"rand_core 0.6.4",
"subtle",
"zeroize",
]
[[package]]
@@ -1621,6 +1629,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fffa369a668c8af7dbf8b5e56c9f744fbd399949ed171606040001947de40b1c"
dependencies = [
"const-oid",
"pem-rfc7468",
"zeroize",
]
@@ -1720,6 +1729,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
dependencies = [
"block-buffer",
"const-oid",
"crypto-common",
"subtle",
]
@@ -1771,11 +1781,25 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "413301934810f597c1d19ca71c8710e99a3f1ba28a0d2ebc01551a2daeea3c5c"
dependencies = [
"der 0.6.1",
"elliptic-curve",
"rfc6979",
"elliptic-curve 0.12.3",
"rfc6979 0.3.1",
"signature 1.6.4",
]
[[package]]
name = "ecdsa"
version = "0.16.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ee27f32b5c5292967d2d4a9d7f1e0b0aed2c15daded5a60300e4abb9d8020bca"
dependencies = [
"der 0.7.8",
"digest",
"elliptic-curve 0.13.8",
"rfc6979 0.4.0",
"signature 2.2.0",
"spki 0.7.3",
]
[[package]]
name = "either"
version = "1.8.1"
@@ -1788,16 +1812,36 @@ version = "0.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e7bb888ab5300a19b8e5bceef25ac745ad065f3c9f7efc6de1b91958110891d3"
dependencies = [
"base16ct",
"base16ct 0.1.1",
"crypto-bigint 0.4.9",
"der 0.6.1",
"digest",
"ff",
"ff 0.12.1",
"generic-array",
"group",
"pkcs8",
"group 0.12.1",
"pkcs8 0.9.0",
"rand_core 0.6.4",
"sec1",
"sec1 0.3.0",
"subtle",
"zeroize",
]
[[package]]
name = "elliptic-curve"
version = "0.13.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5e6043086bf7973472e0c7dff2142ea0b680d30e18d9cc40f267efbf222bd47"
dependencies = [
"base16ct 0.2.0",
"crypto-bigint 0.5.5",
"digest",
"ff 0.13.0",
"generic-array",
"group 0.13.0",
"pem-rfc7468",
"pkcs8 0.10.2",
"rand_core 0.6.4",
"sec1 0.7.3",
"subtle",
"zeroize",
]
@@ -1951,6 +1995,16 @@ dependencies = [
"subtle",
]
[[package]]
name = "ff"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ded41244b729663b1e574f1b4fb731469f69f79c17667b5d776b16cda0479449"
dependencies = [
"rand_core 0.6.4",
"subtle",
]
[[package]]
name = "filetime"
version = "0.2.22"
@@ -2148,6 +2202,7 @@ checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a"
dependencies = [
"typenum",
"version_check",
"zeroize",
]
[[package]]
@@ -2214,7 +2269,18 @@ version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5dfbfb3a6cfbd390d5c9564ab283a0349b9b9fcd46a706c1eb10e0db70bfbac7"
dependencies = [
"ff",
"ff 0.12.1",
"rand_core 0.6.4",
"subtle",
]
[[package]]
name = "group"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f0f9ef7462f7c099f518d754361858f86d8a07af53ba9af0fe635bbccb151a63"
dependencies = [
"ff 0.13.0",
"rand_core 0.6.4",
"subtle",
]
@@ -2776,6 +2842,42 @@ dependencies = [
"libc",
]
[[package]]
name = "jose-b64"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bec69375368709666b21c76965ce67549f2d2db7605f1f8707d17c9656801b56"
dependencies = [
"base64ct",
"serde",
"subtle",
"zeroize",
]
[[package]]
name = "jose-jwa"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ab78e053fe886a351d67cf0d194c000f9d0dcb92906eb34d853d7e758a4b3a7"
dependencies = [
"serde",
]
[[package]]
name = "jose-jwk"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "280fa263807fe0782ecb6f2baadc28dffc04e00558a58e33bfdb801d11fd58e7"
dependencies = [
"jose-b64",
"jose-jwa",
"p256 0.13.2",
"p384",
"rsa",
"serde",
"zeroize",
]
[[package]]
name = "js-sys"
version = "0.3.69"
@@ -2835,6 +2937,9 @@ name = "lazy_static"
version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
dependencies = [
"spin 0.5.2",
]
[[package]]
name = "lazycell"
@@ -3204,6 +3309,23 @@ dependencies = [
"num-traits",
]
[[package]]
name = "num-bigint-dig"
version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc84195820f291c7697304f3cbdadd1cb7199c0efc917ff5eafd71225c136151"
dependencies = [
"byteorder",
"lazy_static",
"libm",
"num-integer",
"num-iter",
"num-traits",
"rand 0.8.5",
"smallvec",
"zeroize",
]
[[package]]
name = "num-complex"
version = "0.4.4"
@@ -3481,11 +3603,33 @@ version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "51f44edd08f51e2ade572f141051021c5af22677e42b7dd28a88155151c33594"
dependencies = [
"ecdsa",
"elliptic-curve",
"ecdsa 0.14.8",
"elliptic-curve 0.12.3",
"sha2",
]
[[package]]
name = "p256"
version = "0.13.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c9863ad85fa8f4460f9c48cb909d38a0d689dba1f6f6988a5e3e0d31071bcd4b"
dependencies = [
"ecdsa 0.16.9",
"elliptic-curve 0.13.8",
"primeorder",
"sha2",
]
[[package]]
name = "p384"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "70786f51bcc69f6a4c0360e063a4cac5419ef7c5cd5b3c99ad70f3be5ba79209"
dependencies = [
"elliptic-curve 0.13.8",
"primeorder",
]
[[package]]
name = "pagebench"
version = "0.1.0"
@@ -3847,6 +3991,15 @@ dependencies = [
"serde",
]
[[package]]
name = "pem-rfc7468"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "88b39c9bfcfc231068454382784bb460aae594343fb030d46e9f50a645418412"
dependencies = [
"base64ct",
]
[[package]]
name = "percent-encoding"
version = "2.2.0"
@@ -3863,6 +4016,29 @@ dependencies = [
"indexmap 1.9.3",
]
[[package]]
name = "pg_sni_router"
version = "0.1.0"
dependencies = [
"anyhow",
"clap",
"futures",
"git-version",
"itertools 0.10.5",
"pq_proto",
"proxy-core",
"proxy-sasl",
"rustls 0.22.4",
"rustls-pemfile 2.1.1",
"socket2 0.5.5",
"tokio",
"tokio-util",
"tracing",
"tracing-utils",
"utils",
"uuid",
]
[[package]]
name = "phf"
version = "0.11.1"
@@ -3913,6 +4089,17 @@ version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
[[package]]
name = "pkcs1"
version = "0.7.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8ffb9f10fa047879315e6625af03c164b16962a5368d724ed16323b68ace47f"
dependencies = [
"der 0.7.8",
"pkcs8 0.10.2",
"spki 0.7.3",
]
[[package]]
name = "pkcs8"
version = "0.9.0"
@@ -3923,6 +4110,16 @@ dependencies = [
"spki 0.6.0",
]
[[package]]
name = "pkcs8"
version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7"
dependencies = [
"der 0.7.8",
"spki 0.7.3",
]
[[package]]
name = "pkg-config"
version = "0.3.27"
@@ -4116,6 +4313,15 @@ dependencies = [
"syn 2.0.52",
]
[[package]]
name = "primeorder"
version = "0.13.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "353e1ca18966c16d9deb1c69278edbc5f194139612772bd9537af60ac231e1e6"
dependencies = [
"elliptic-curve 0.13.8",
]
[[package]]
name = "proc-macro-hack"
version = "0.5.20+deprecated"
@@ -4230,9 +4436,38 @@ dependencies = [
[[package]]
name = "proxy"
version = "0.1.0"
dependencies = [
"anyhow",
"aws-config",
"clap",
"futures",
"git-version",
"humantime",
"itertools 0.10.5",
"metrics",
"pq_proto",
"proxy-core",
"proxy-sasl",
"remote_storage",
"rustls 0.22.4",
"rustls-pemfile 2.1.1",
"socket2 0.5.5",
"tikv-jemallocator",
"tokio",
"tokio-util",
"tracing",
"tracing-utils",
"utils",
"uuid",
]
[[package]]
name = "proxy-core"
version = "0.1.0"
dependencies = [
"ahash",
"anyhow",
"arc-swap",
"async-compression",
"async-trait",
"atomic-take",
@@ -4250,11 +4485,11 @@ dependencies = [
"consumption_metrics",
"crossbeam-deque",
"dashmap",
"ecdsa 0.16.9",
"env_logger",
"fallible-iterator",
"framed-websockets",
"futures",
"git-version",
"hashbrown 0.14.5",
"hashlink",
"hex",
@@ -4270,12 +4505,14 @@ dependencies = [
"indexmap 2.0.1",
"ipnet",
"itertools 0.10.5",
"jose-jwa",
"jose-jwk",
"lasso",
"md5",
"measured",
"metrics",
"once_cell",
"opentelemetry",
"p256 0.13.2",
"parking_lot 0.12.1",
"parquet",
"parquet_derive",
@@ -4284,7 +4521,7 @@ dependencies = [
"postgres-protocol",
"postgres_backend",
"pq_proto",
"prometheus",
"proxy-sasl",
"rand 0.8.5",
"rand_distr",
"rcgen",
@@ -4296,6 +4533,7 @@ dependencies = [
"reqwest-retry",
"reqwest-tracing",
"routerify",
"rsa",
"rstest",
"rustc-hash",
"rustls 0.22.4",
@@ -4305,6 +4543,7 @@ dependencies = [
"serde",
"serde_json",
"sha2",
"signature 2.2.0",
"smallvec",
"smol_str",
"socket2 0.5.5",
@@ -4312,7 +4551,6 @@ dependencies = [
"task-local-extensions",
"thiserror",
"tikv-jemalloc-ctl",
"tikv-jemallocator",
"tokio",
"tokio-postgres",
"tokio-postgres-rustls",
@@ -4335,6 +4573,35 @@ dependencies = [
"x509-parser",
]
[[package]]
name = "proxy-sasl"
version = "0.1.0"
dependencies = [
"ahash",
"anyhow",
"base64 0.13.1",
"bytes",
"crossbeam-deque",
"hmac",
"itertools 0.10.5",
"lasso",
"measured",
"parking_lot 0.12.1",
"pbkdf2",
"postgres-protocol",
"pq_proto",
"rand 0.8.5",
"rustls 0.22.4",
"sha2",
"subtle",
"thiserror",
"tokio",
"tracing",
"uuid",
"workspace_hack",
"x509-parser",
]
[[package]]
name = "quick-xml"
version = "0.31.0"
@@ -4807,6 +5074,16 @@ dependencies = [
"zeroize",
]
[[package]]
name = "rfc6979"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8dd2a808d456c4a54e300a23e9f5a67e122c3024119acbfd73e3bf664491cb2"
dependencies = [
"hmac",
"subtle",
]
[[package]]
name = "ring"
version = "0.16.20"
@@ -4867,6 +5144,26 @@ dependencies = [
"archery",
]
[[package]]
name = "rsa"
version = "0.9.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5d0e5124fcb30e76a7e79bfee683a2746db83784b86289f6251b54b7950a0dfc"
dependencies = [
"const-oid",
"digest",
"num-bigint-dig",
"num-integer",
"num-traits",
"pkcs1",
"pkcs8 0.10.2",
"rand_core 0.6.4",
"signature 2.2.0",
"spki 0.7.3",
"subtle",
"zeroize",
]
[[package]]
name = "rstest"
version = "0.18.2"
@@ -5195,10 +5492,24 @@ version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3be24c1842290c45df0a7bf069e0c268a747ad05a192f2fd7dcfdbc1cba40928"
dependencies = [
"base16ct",
"base16ct 0.1.1",
"der 0.6.1",
"generic-array",
"pkcs8",
"pkcs8 0.9.0",
"subtle",
"zeroize",
]
[[package]]
name = "sec1"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3e97a565f76233a6003f9f5c54be1d9c5bdfa3eccfb189469f11ec4901c47dc"
dependencies = [
"base16ct 0.2.0",
"der 0.7.8",
"generic-array",
"pkcs8 0.10.2",
"subtle",
"zeroize",
]
@@ -5545,6 +5856,7 @@ version = "2.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de"
dependencies = [
"digest",
"rand_core 0.6.4",
]
@@ -7379,13 +7691,17 @@ dependencies = [
"clap",
"clap_builder",
"crossbeam-utils",
"crypto-bigint 0.5.5",
"der 0.7.8",
"deranged",
"digest",
"either",
"fail",
"futures-channel",
"futures-executor",
"futures-io",
"futures-util",
"generic-array",
"getrandom 0.2.11",
"hashbrown 0.14.5",
"hex",
@@ -7393,6 +7709,7 @@ dependencies = [
"hyper 0.14.26",
"indexmap 1.9.3",
"itertools 0.10.5",
"lazy_static",
"libc",
"log",
"memchr",
@@ -7416,7 +7733,9 @@ dependencies = [
"serde",
"serde_json",
"sha2",
"signature 2.2.0",
"smallvec",
"spki 0.7.3",
"subtle",
"syn 1.0.109",
"syn 2.0.52",
@@ -7527,6 +7846,7 @@ version = "1.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "525b4ec142c6b68a2d10f01f7bbf6755599ca3f81ea53b8431b7dd348f5fdb2d"
dependencies = [
"serde",
"zeroize_derive",
]


@@ -9,7 +9,10 @@ members = [
"pageserver/ctl",
"pageserver/client",
"pageserver/pagebench",
"proxy",
"proxy/core",
"proxy/sasl",
"proxy/proxy",
"proxy/pg_sni_router",
"safekeeper",
"storage_broker",
"storage_controller",


@@ -824,12 +824,11 @@ impl Endpoint {
// cleanup work to do after postgres stops, like syncing safekeepers,
// etc.
//
// If destroying or stop mode is immediate, send it SIGTERM before
// waiting. Sometimes we do *not* want this cleanup: tests intentionally
// do stop when majority of safekeepers is down, so sync-safekeepers
// would hang otherwise. This could be a separate flag though.
let send_sigterm = destroy || mode == "immediate";
self.wait_for_compute_ctl_to_exit(send_sigterm)?;
// If destroying, send it SIGTERM before waiting. Sometimes we do *not*
// want this cleanup: tests intentionally do stop when majority of
// safekeepers is down, so sync-safekeepers would hang otherwise. This
// could be a separate flag though.
self.wait_for_compute_ctl_to_exit(destroy)?;
if destroy {
println!(
"Destroying postgres data directory '{}'",


@@ -22,7 +22,10 @@ feature-depth = 1
[advisories]
db-urls = ["https://github.com/rustsec/advisory-db"]
yanked = "warn"
ignore = []
[[advisories.ignore]]
id = "RUSTSEC-2023-0071"
reason = "the marvin attack only affects private key decryption, not public key signature verification"
# This section is considered when running `cargo deny check licenses`
# More documentation for the licenses section can be found here:


@@ -1,495 +0,0 @@
# Safekeeper dynamic membership change
To quickly recover from safekeeper node failures and do rebalancing we need to
be able to change the set of safekeepers a timeline resides on. The procedure must
be safe (not lose committed log) regardless of safekeeper and compute state. It
should be able to make progress as long as any majority of the old safekeeper set, any
majority of the new safekeeper set and the compute are up and connected. This is known
as a consensus membership change. It always involves two phases: 1) switch the old
majority to the old + new configuration, preventing commits that are not acknowledged by
the new set; 2) bootstrap the new set by ensuring that a majority of the new set has all
data which could ever have been committed before the first phase completed;
after that the switch is safe to finish. Without the two phases, a direct switch to a new
set whose quorums might not intersect with quorums of the old set is unsafe (the typical
ABC -> ABD switch is an example of that, because quorums AC and BD don't
intersect). Furthermore, the procedure is typically carried out by the consensus
leader, and so the enumeration of configurations which establishes an order between
them is done through the consensus log.
In our case the consensus leader is the compute (walproposer), and we don't want to wake
up all computes for the change. Nor do we want to fully reimplement the leader
logic a second time outside the compute. Because of that, the proposed algorithm relies
for issuing configurations on an external fault-tolerant (distributed), strongly
consistent storage with a simple API: CAS (compare-and-swap) on a single key.
Properly configured postgres suits this.

In the system, consensus is implemented at the timeline level, so the algorithm below
applies to a single timeline.
## Algorithm
### Definitions
A configuration is
```
struct Configuration {
    generation: Generation, // a number uniquely identifying configuration
    sk_set: Vec<NodeId>,    // current safekeeper set
    new_sk_set: Optional<Vec<NodeId>>,
}
```
A configuration with `new_sk_set` present is used for the intermediate step during
the change and is called a joint configuration. Generations establish an order on
configurations: we say `c1` is higher than `c2` if `c1.generation` >
`c2.generation`.
### Persistently stored data changes
The safekeeper starts storing its current configuration in the control file. The update
is atomic, so the in-memory value always matches the persistent one.

The external CAS-providing storage (let's call it the configuration storage here) also
stores the configuration for each timeline. It is initialized with generation 1 and the
initial set of safekeepers during timeline creation. An executed CAS on it must
never be lost.
### Compute <-> safekeeper protocol changes
`ProposerGreeting` message carries walproposer's configuration if it is already
established (see below), else null. `AcceptorGreeting` message carries
safekeeper's current `Configuration`. All further messages (`VoteRequest`,
`VoteResponse`, `ProposerElected`, `AppendRequest`, `AppendResponse`) carry
generation number, of walproposer in case of wp->sk message or of safekeeper in
case of sk->wp message.
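As a rough illustration of these message changes, here is a sketch in Rust; the field
names, numeric types and omitted fields are assumptions for the example, not the actual
wire format of the safekeeper protocol.
```
// Illustrative message shapes only; the real protocol structs differ in detail.
type NodeId = u64;
type Term = u64;
type Lsn = u64;

#[derive(Clone)]
struct Configuration {
    generation: u32,
    sk_set: Vec<NodeId>,
    new_sk_set: Option<Vec<NodeId>>,
}

struct ProposerGreeting {
    // None until the walproposer has established a configuration.
    configuration: Option<Configuration>,
}

struct AcceptorGreeting {
    // The safekeeper's current configuration.
    configuration: Configuration,
}

// All later messages only need to carry the sender's generation number.
struct AppendRequest {
    generation: u32, // walproposer's configuration generation
    term: Term,
    begin_lsn: Lsn,
    // WAL payload and the remaining existing fields are omitted here.
}

struct AppendResponse {
    generation: u32, // safekeeper's configuration generation
    flush_lsn: Lsn,
}
```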
### Safekeeper changes
The basic rule: once a safekeeper observes a configuration higher than its own, it
immediately switches to it. It must refuse all messages with a generation lower
than its own. It also refuses messages if it is not a member of the current configuration
(that is, of either `sk_set` or `new_sk_set`), though it is likely not unsafe to
process them (walproposer should ignore the result anyway).

If there is a non-null configuration in `ProposerGreeting` and it is higher than the
safekeeper's current one, the safekeeper switches to it.

The safekeeper sends its current configuration in its first message to the walproposer,
`AcceptorGreeting`. It refuses all other walproposer messages if the
configuration generation in them is less than its current one. Namely, it
refuses to vote, to truncate WAL in `handle_elected` and to accept WAL. In
response it sends its current configuration generation to let the walproposer know.

The safekeeper gets `PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configuration`
accepting a `Configuration`. The safekeeper switches to the given conf if it is higher
than its current one and ignores it otherwise. In any case it replies with
```
struct ConfigurationSwitchResponse {
    conf: Configuration,
    term: Term,
    last_log_term: Term,
    flush_lsn: Lsn,
}
```
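A minimal sketch of the safekeeper-side rules above, using integer stand-ins for
`Generation`, `Term` and `Lsn`; the method names and surrounding state are hypothetical,
and persistence and error handling are elided.
```
type NodeId = u64;

#[derive(Clone)]
struct Configuration {
    generation: u32,
    sk_set: Vec<NodeId>,
    new_sk_set: Option<Vec<NodeId>>,
}

struct ConfigurationSwitchResponse {
    conf: Configuration,
    term: u64,
    last_log_term: u64,
    flush_lsn: u64,
}

struct SafekeeperTimeline {
    conf: Configuration,
    term: u64,
    last_log_term: u64,
    flush_lsn: u64,
}

impl SafekeeperTimeline {
    /// PUT .../configuration: switch only to a strictly higher configuration
    /// (in the real implementation the control file would be rewritten
    /// atomically here), and always reply with the current state.
    fn handle_configuration_put(&mut self, proposed: Configuration) -> ConfigurationSwitchResponse {
        if proposed.generation > self.conf.generation {
            self.conf = proposed;
        }
        ConfigurationSwitchResponse {
            conf: self.conf.clone(),
            term: self.term,
            last_log_term: self.last_log_term,
            flush_lsn: self.flush_lsn,
        }
    }

    /// Check applied to VoteRequest / ProposerElected / AppendRequest:
    /// refuse anything carrying a lower generation, returning our own
    /// generation so the walproposer learns about the switch.
    fn check_generation(&self, msg_generation: u32) -> Result<(), u32> {
        if msg_generation < self.conf.generation {
            Err(self.conf.generation)
        } else {
            Ok(())
        }
    }
}
```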
### Compute (walproposer) changes
The basic rule is that a joint configuration requires votes from majorities in
both `sk_set` and `new_sk_set`.

The compute receives the list of safekeepers to connect to from the control plane as it
does currently and tries to communicate with all of them. However, the list does not
define the consensus members. Instead, on start the walproposer tracks the highest
configuration it receives from `AcceptorGreeting`s. Once it assembles greetings
from a majority of `sk_set` and a majority of `new_sk_set` (if it is present), it
establishes this configuration as its own and moves to voting.
It should stop talking to safekeepers not listed in the configuration at this
point, though it is not unsafe to continue doing so.

To be elected it must receive votes from both majorities if `new_sk_set` is present.
Similarly, to commit WAL it must receive flush acknowledgements from both majorities.

If the walproposer hears from a safekeeper a configuration higher than its own (i.e. a
refusal to accept due to a configuration change), it simply restarts.
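A sketch of the joint-quorum rule the walproposer applies both for election and for
commit; this is a simplified model for illustration, not the actual walproposer code.
```
type NodeId = u64;

struct Configuration {
    generation: u32,
    sk_set: Vec<NodeId>,
    new_sk_set: Option<Vec<NodeId>>,
}

/// True iff `acks` contains a majority of `members`.
fn has_majority(members: &[NodeId], acks: &[NodeId]) -> bool {
    let count = members.iter().filter(|&m| acks.contains(m)).count();
    count > members.len() / 2
}

/// Votes (or flush acknowledgements) suffice only if they form a majority of
/// `sk_set` and, for a joint configuration, a majority of `new_sk_set` too.
fn quorum_reached(conf: &Configuration, acks: &[NodeId]) -> bool {
    has_majority(&conf.sk_set, acks)
        && conf
            .new_sk_set
            .as_ref()
            .map_or(true, |new_set| has_majority(new_set, acks))
}

fn main() {
    // ABC -> ABD switch (nodes 1, 2, 3 -> 1, 2, 4): acks from {1, 3} form a
    // majority of the old set but not of the new one, so the joint
    // configuration refuses to commit on them.
    let joint = Configuration {
        generation: 2,
        sk_set: vec![1, 2, 3],
        new_sk_set: Some(vec![1, 2, 4]),
    };
    assert!(!quorum_reached(&joint, &[1, 3]));
    assert!(quorum_reached(&joint, &[1, 2]));
}
```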
### Change algorithm
The following algorithm can be executed anywhere having access to the configuration
storage and the safekeepers. It is safe to interrupt / restart it and to run multiple
instances of it concurrently, though likely one of them won't make
progress then. It accepts `desired_set: Vec<NodeId>` as input.
The algorithm will refuse to make the change if it encounters a previously interrupted
change attempt, but in this case it will try to finish it.
It will eventually converge if the old majority, the new majority and the configuration
storage are reachable.
1) Fetch current timeline configuration from the configuration storage.
2) If it is already a joint one and `new_sk_set` differs from `desired_set`,
   refuse to change. However, assign the joint conf to the (in-memory) var
   `joint_conf` and proceed to step 4 to finish the ongoing change.
3) Else, create a joint `joint_conf: Configuration`: increment the current conf generation
   `n` and put `desired_set` into `new_sk_set`. Persist it in the configuration
   storage by doing a CAS on the current generation: the change happens only if the
   current generation is still `n` (a sketch of this CAS is given after this list).
   Apart from guaranteeing uniqueness of configurations, the CAS linearizes them,
   ensuring that a new configuration is created only on top of the previous one, when we
   know that the transition is safe. A failed CAS aborts the procedure.
4) Call `PUT` `configuration` on safekeepers from the current set,
delivering them `joint_conf`. Collecting responses from majority is required
to proceed. If any response returned generation higher than
`joint_conf.generation`, abort (another switch raced us). Otherwise, choose
max `<last_log_term, flush_lsn>` among responses and establish it as
(in memory) `sync_position`. Also choose max `term` and establish it as (in
memory) `sync_term`. We can't finish the switch until majority of the new set
catches up to this `sync_position` because data before it could be committed
without ack from the new set. Similarly, we'll bump term on new majority
to `sync_term` so that two computes with the same term are never elected.
4) Initialize timeline on safekeeper(s) from `new_sk_set` where it
doesn't exist yet by doing `pull_timeline` from the majority of the
current set. Doing that on majority of `new_sk_set` is enough to
proceed, but it is reasonable to ensure that all `new_sk_set` members
are initialized -- if some of them are down why are we migrating there?
5) Call `POST` `bump_term(sync_term)` on safekeepers from the new set.
Success on majority is enough.
6) Repeatedly call `PUT` `configuration` on safekeepers from the new set,
   delivering them `joint_conf` and collecting their positions. This switches them
   to `joint_conf`, which generally isn't needed because `pull_timeline` already
   includes it and additionally it would be broadcast by the compute. More importantly,
   we may proceed to the next step only when `<last_log_term, flush_lsn>` on the
   majority of the new set has reached `sync_position`. Similarly, on the happy path
   no waiting is needed because `pull_timeline` already includes it. However, we should
   double check to be safe. For example, the timeline could have been created earlier,
   e.g. manually or after a try-to-migrate, abort, try-to-migrate-again sequence.
7) Create `new_conf: Configuration`, incrementing `joint_conf`'s generation and having
   the new safekeeper set as `sk_set` and `new_sk_set` as None. Write it to the
   configuration storage under one more CAS.
8) Call `PUT` `configuration` on safekeepers from the new set,
delivering them `new_conf`. It is enough to deliver it to the majority
of the new set; the rest can be updated by compute.
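To make the CAS in steps 3 and 7 concrete, here is a sketch against an in-memory
stand-in for the configuration storage; with a Postgres-backed storage the same
operation is a single conditional UPDATE, as shown in the comment. Names and types are
illustrative only.
```
use std::collections::HashMap;
use std::sync::Mutex;

type NodeId = u64;

#[derive(Clone)]
struct Configuration {
    generation: u32,
    sk_set: Vec<NodeId>,
    new_sk_set: Option<Vec<NodeId>>,
}

/// In-memory model of the configuration storage, keyed by (tenant, timeline).
/// In Postgres the equivalent CAS would be something like:
///   UPDATE timelines SET generation = $new, sk_set = $sks, new_sk_set = $new_sks
///   WHERE tenant_id = $tid AND timeline_id = $tlid AND generation = $expected;
/// and it succeeded iff exactly one row was updated.
struct ConfigStorage {
    inner: Mutex<HashMap<(String, String), Configuration>>,
}

impl ConfigStorage {
    /// Install `new` only if the stored generation still equals `expected`.
    /// Returns false if somebody else won the race; the caller then aborts.
    fn cas(&self, key: &(String, String), expected: u32, new: Configuration) -> bool {
        let mut map = self.inner.lock().unwrap();
        match map.get(key) {
            Some(current) if current.generation == expected => {
                map.insert(key.clone(), new);
                true
            }
            _ => false,
        }
    }
}
```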
I haven't put a huge effort into making the description above very precise, because it
is natural language, prone to interpretation anyway. Instead I'd like to make a TLA+
spec of it.

The description above focuses on safety. To make the flow practical and live, here are
a few more considerations.
1) It makes sense to ping the new set to ensure we are migrating to live node(s) before
   step 3.
2) If e.g. an accidentally wrong new sk set has been specified, it is safe to roll back
   to the old conf with one more CAS before the CAS in step `6` is completed.
3) At step 4 the timeline might already be created on members of the new set for various
   reasons; the simplest is a restart of the procedure. There are more complicated
   scenarios like the one mentioned in step 5. Deleting and re-doing `pull_timeline` is
   generally unsafe without involving generations, so it seems simpler to treat an
   existing timeline as success. However, this also has a disadvantage: you might
   imagine an exceedingly unlikely schedule where the condition in step 5 is never
   reached until the compute is (re)awakened to synchronize the new member(s).
   I don't think we'll observe this in practice, but we can add waking up the compute if needed.
4) In the end the timeline should be locally deleted on the safekeeper(s) which are
   in the old set but not in the new one, unless they are unreachable. To be
   safe this should also be done under the generation number (the deletion proceeds only
   if the current configuration is <= the one in the request and the safekeeper is not a member of it).
5) If the current conf fetched at step 1 is already not joint and its members equal
   `desired_set`, jump to step 7, using it as `new_conf`.
## Implementation
The procedure ought to be driven from somewhere. Obvious candidates are the control
plane and the storage_controller; and as each of them already has a db, we don't want
yet another storage. I propose to manage safekeepers in the storage_controller
because 1) it is in Rust, which simplifies simulation testing (more on this
below) and 2) it already manages pageservers.

This assumes that migration will be fully usable only after we migrate all
tenants/timelines to the storage_controller. It is debatable whether we also want
to manage pageserver attachments for all of these, but likely we do.

This requires us to define the storcon <-> cplane interface.
### storage_controller <-> control plane interface
First of all, the control plane should
[change](https://neondb.slack.com/archives/C03438W3FLZ/p1719226543199829)
to storing safekeepers per timeline instead of per tenant, because we can't migrate
tenants atomically.
The important question is how the updated configuration is delivered from the
storage_controller to the control plane to provide it to computes. As always, there
are two options, pull and push. Let's do the same push as with pageserver
`/notify-attach`, because 1) it keeps the storage_controller out of the critical compute
start path, 2) it provides an easier upgrade: there won't be such a thing as a 'timeline
managed by control plane / storcon', cplane just takes the value out of its db
when needed, and 3) uniformity. It makes the storage_controller responsible for retrying
the notification of the control plane until it succeeds.

So, cplane `/notify-safekeepers` for the timeline accepts a `Configuration` and
updates it in the db if the provided conf generation is higher (the cplane db
should also store generations for this). Similarly to [`/notify-attach`](https://www.notion.so/neondatabase/Storage-Controller-Control-Plane-interface-6de56dd310a043bfa5c2f5564fa98365), it
should update the db, which makes the call successful, and then try to schedule
`apply_config` if possible; it is ok if that doesn't happen. The storage_controller
should rate limit calling the endpoint, but likely this won't be needed, as migration
throughput is limited by `pull_timeline`.
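A hedged sketch of what the `/notify-safekeepers` payload could look like; the field
names and shape are assumptions for illustration, the actual cplane API is to be defined
analogously to `/notify-attach`.
```
use serde::{Deserialize, Serialize};

/// Hypothetical body of the cplane /notify-safekeepers call. cplane applies
/// it only if `generation` is higher than what its db already stores for the
/// timeline, which makes the endpoint idempotent and safe to retry.
#[derive(Serialize, Deserialize, Debug)]
struct NotifySafekeepersRequest {
    tenant_id: String,
    timeline_id: String,
    generation: u32,
    // Safekeepers the compute should connect to; for a joint configuration
    // this would cover both sk_set and new_sk_set.
    safekeepers: Vec<u64>,
}

impl NotifySafekeepersRequest {
    /// cplane-side rule: only strictly newer generations are applied.
    fn should_apply(&self, stored_generation: u32) -> bool {
        self.generation > stored_generation
    }
}
```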
Timeline (branch) creation in cplane should call storage_controller POST
`tenant/:tenant_id/timeline` like it currently does for sharded tenants.
The response should be augmented with `safekeeper_conf: Configuration`. The call
should be retried until it succeeds.
Timeline deletion and tenant deletion in cplane should call appropriate
storage_controller endpoints like it currently does for sharded tenants. The
calls should be retried until they succeed.
### storage_controller implementation
Current 'load everything on startup and keep in memory' easy design is fine.
Single timeline shouldn't take more than 100 bytes (it's 16 byte tenant_id, 16
byte timeline_id, int generation, vec of ~3 safekeeper ids plus some flags), so
10^6 of timelines shouldn't take more than 100MB.
Similar to pageserver attachment Intents, the storage_controller would have an in-memory
`MigrationRequest` (or its absence) for each timeline and a pool of tasks trying
to make these requests reality; this ensures one instance of storage_controller
won't do several migrations on the same timeline concurrently. In the first
version it is simpler to have more manual control and no retries, i.e. a migration
failure removes the request. Later we can build retries and automatic
scheduling/migration. `MigrationRequest` is
```
enum MigrationRequest {
    To(Vec<NodeId>),
    FinishPending,
}
```
`FinishPending` requests to run the procedure to ensure the state is clean: the current
configuration is not joint and the majority of safekeepers are aware of it, but without
attempting to migrate anywhere. If the current configuration fetched at step 1 is
not joint, it jumps to step 7. It should be run at startup for all timelines (but
similarly, in the first version it is ok to trigger it manually).
#### Schema
A `safekeepers` table mirroring the current `nodes` table should be added, except for
the `scheduling_policy` field (`status` seems like a better name for it): at least in
the beginning it is enough to have only 3 values: 1) `active` 2) `offline` 3)
`decommissioned`.
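A possible shape for that table, in the same style as the `timelines` definition below;
the exact column set is a sketch, not final.
```
table! {
    // node_id is the primary key; the other columns are illustrative.
    safekeepers (node_id) {
        node_id -> Int4,
        host -> Varchar,
        port -> Int4,
        http_port -> Int4,
        availability_zone -> Varchar,
        // `active`, `offline` or `decommissioned`
        status -> Varchar,
    }
}
```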
`timelines` table:
```
table! {
    // (tenant_id, timeline_id) is the primary key
    timelines (tenant_id, timeline_id) {
        timeline_id -> Varchar,
        tenant_id -> Varchar,
        generation -> Int4,
        sk_set -> Array<Int4>, // list of safekeeper ids
        new_sk_set -> Nullable<Array<Int4>>, // list of safekeeper ids, null if not joint conf
        cplane_notified_generation -> Int4,
    }
}
```
#### API
Node management is similar to pageserver:
1) POST `/control/v1/safekeepers` upserts safekeeper.
2) GET `/control/v1/safekeepers` lists safekeepers.
3) GET `/control/v1/safekeepers/:node_id` gets safekeeper.
4) PUT `/control/v1/safekeepers/:node_id/status` changes the status to e.g.
`offline` or `decommissioned`. Initially it is simpler not to schedule any
migrations here.
Safekeeper deploy scripts should register the safekeeper at the storage_controller as
they currently do with cplane, under the same id.
Timeline creation/deletion: already existing POST `tenant/:tenant_id/timeline`
would 1) choose initial set of safekeepers; 2) write to the db initial
`Configuration` with `INSERT ON CONFLICT DO NOTHING` returning existing row in
case of conflict; 3) create timeline on the majority of safekeepers (already
created is ok).
We don't want to block timeline creation when one safekeeper is down. Currently
this is solved by compute implicitly creating timeline on any safekeeper it is
connected to. This creates ugly timeline state on safekeeper when timeline is
created, but start LSN is not defined yet. It would be nice to remove this; to
do that, controller can in the background retry to create timeline on
safekeeper(s) which missed that during initial creation call. It can do that
through `pull_timeline` from majority so it doesn't need to remember
`parent_lsn` in its db.
Timeline deletion removes the row from the db and forwards deletion to the
current configuration members. Without additional actions deletions might leak,
see below on this; initially let's ignore these, reporting to cplane success if
at least one safekeeper deleted the timeline (this will remove s3 data).
Tenant deletion repeats timeline deletion for all timelines.
Migration API: the first version is the simplest and the most imperative:
1) PUT `/control/v1/safekeepers/migrate` schedules `MigrationRequest`s to move
all timelines from one safekeeper to another. It accepts json
```
{
"src_sk": u32,
"dst_sk": u32,
"limit": Optional<u32>,
}
```
Returns list of scheduled requests.
2) PUT `/control/v1/tenant/:tenant_id/timeline/:timeline_id/safekeeper_migrate` schedules `MigrationRequest`
to move single timeline to given set of safekeepers:
```
{
"desired_set": Vec<u32>,
}
```
Returns scheduled request.
Similar call should be added for the tenant.
It would be great to have some way of subscribing to the results (apart from
looking at logs/metrics).
Migration is executed as described above. One subtlety is that the (local) deletion on
the source safekeeper might fail, which is not a problem if we are going to
decommission the node but leaves garbage otherwise. I'd propose in the first version:
1) Don't attempt deletion at all if the node status is `offline`.
2) If it failed, just issue a warning.
And add a PUT `/control/v1/safekeepers/:node_id/scrub` endpoint for manual use which
would find and remove garbage timelines. It will 1) list all timelines on the
safekeeper, 2) compare each one against the configuration storage: if the timeline
doesn't exist at all (it had been deleted), it can be deleted. Otherwise, it can
be deleted under the generation number if the node is not a member of the current generation.
Automating this is nontrivial; we'd need to register all potential missing
deletions <tenant_id, timeline_id, generation, node_id> in the same transaction
which switches configurations. Similarly, when a timeline is fully deleted, the
deletion should also be registered, to prevent the cplane operation from blocking
when some safekeeper is not available.
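A sketch of the scrub decision rule described above; the types and helper are invented
for illustration.
```
type NodeId = u64;

struct Configuration {
    generation: u32,
    sk_set: Vec<NodeId>,
    new_sk_set: Option<Vec<NodeId>>,
}

/// What the scrubber decides for one timeline found on safekeeper `node`.
enum ScrubAction {
    /// No configuration stored at all: the timeline was deleted, so the
    /// local copy can be removed unconditionally.
    DeleteLocal,
    /// The node is not a member of the current configuration: delete locally,
    /// but only under this generation number (the safekeeper refuses the
    /// deletion if it has already seen a higher generation).
    DeleteLocalUnderGeneration(u32),
    /// The node is still a member; leave the timeline alone.
    Keep,
}

fn scrub_decision(stored: Option<&Configuration>, node: NodeId) -> ScrubAction {
    match stored {
        None => ScrubAction::DeleteLocal,
        Some(conf) => {
            let member = conf.sk_set.contains(&node)
                || conf
                    .new_sk_set
                    .as_ref()
                    .map_or(false, |s| s.contains(&node));
            if member {
                ScrubAction::Keep
            } else {
                ScrubAction::DeleteLocalUnderGeneration(conf.generation)
            }
        }
    }
}
```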
One more task pool should infinitely retry notifying control plane about changed
safekeeper sets.
3) GET `/control/v1/tenant/:tenant_id/timeline/:timeline_id/` should return
current in memory state of the timeline and pending `MigrationRequest`,
if any.
4) PUT `/control/v1/tenant/:tenant_id/timeline/:timeline_id/safekeeper_migrate_abort` tries to abort the
migration by switching configuration from the joint to the one with (previous) `sk_set` under CAS
(incrementing generation as always).
#### Dealing with multiple instances of storage_controller
The operations described above, when executed concurrently, might create some errors but
do not prevent progress, so while we normally don't want to run multiple instances
of the storage_controller, it is fine to have that temporarily, e.g. during a redeploy.
Any interaction with the db updates the in-memory controller state; e.g. if a migration
request failed because a different one is in progress, the controller remembers that
and tries to finish it.
## Testing
`neon_local` should be switched to use the storage_controller, playing the role of the
control plane.

There should be the following layers of tests:
1) Model checked TLA+ spec specifies the algorithm and verifies its basic safety.
2) To cover real code and at the same time test many schedules we should have
simulation tests. For that, the configuration storage, the storage_controller <->
safekeeper communication and pull_timeline need to be mocked, and the main switch
procedure wrapped as a node (thread) in the simulation tests, using these
mocks. The test would inject migrations like it currently injects
safekeeper/walproposer restarts. The main assert is the same -- committed WAL must
not be lost.
3) Since simulation testing injects at relatively high level points (not
syscalls), it omits some code, in particular `pull_timeline`. Thus it is
better to have basic tests covering the whole system as well. An extended version of
`test_restarts_under_load` would do: start background load and do a migration
under it, then restart the endpoint and check that no reported commits
have been lost. I'd also add one more test creating a classic network split scenario,
with one compute talking to AC and another to BD while a migration from nodes ABC to ABD
happens.
4) Simple e2e test should ensure that full flow including cplane notification works.
## Order of implementation and rollout
Note that
- Control plane parts and integration with it is fully independent from everything else
(tests would use simulation and neon_local).
- There is a lot of infra work making storage_controller aware of timelines and safekeepers
and its impl/rollout should be separate from migration itself.
- Initially the walproposer can just stop working while it observes a joint configuration.
Such a window would typically be very short anyway.
To roll out smoothly, both walproposer and safekeeper should have a flag
`configurations_enabled`; when set to false, they would work as they do currently, i.e.
the walproposer is able to commit on whatever safekeeper set it is provided. Until
all timelines are managed by storcon we'd need to use the current script to migrate
and update/drop entries in the storage_controller database if it has any.

Safekeepers would need to be able to talk both the current and the new protocol version
with the compute to reduce the number of computes restarted in prod once the v2 protocol is
deployed (though before completely switching we'd need to force this).
Let's have the following rollout order:
- storage_controller becomes aware of safekeepers;
- storage_controller gets timeline creation for new timelines and deletion requests, but
doesn't manage all timelines yet. Migration can be tested on these new timelines.
To keep control plane and storage_controller databases in sync while control
plane still chooses the safekeepers initially (until all timelines are imported
it can choose better), `TimelineCreateRequest` can get optional safekeepers
field with safekeepers chosen by cplane.
- Then we can import all existing timelines from control plane to
storage_controller and gradually enable configurations region by region.
Very rough implementation order:
- Add concept of configurations to safekeepers (including control file),
implement v3 protocol.
- Implement walproposer changes, including protocol.
- Implement the storcon part. Use it in neon_local (and pytest).
- Make cplane store safekeepers per timeline instead of per tenant.
- Implement cplane/storcon integration. Route branch creation/deletion
through storcon. Then we can test migration of new branches.
- Finally import existing branches. Then we can drop cplane
safekeeper selection code. Gradually enable configurations at
computes and safekeepers. Before that, all computes must talk only
v3 protocol version.
## Integration with evicted timelines
Currently, `pull_timeline` doesn't work correctly with evicted timelines because the
copy would point to the original partial file. To fix this, let's just do an s3 copy of
the file. It is a bit stupid as it is generally unnecessary work, but it makes sense to
implement proper migration before doing smarter timeline archival. [Issue](https://github.com/neondatabase/neon/issues/8542)
## Possible optimizations
The steps above imply a walproposer restart (with re-election) and thus a reconnection
to safekeepers. Since by bumping the term on the new majority we ensure that leader
terms are unique even across generation switches, it is possible to preserve
connections. However, it is more complicated, reconnection is very fast, and it
is much more important to avoid a compute restart than a millisecond-order write
stall.

Multiple joint consensus: the algorithm above rejects an attempt to change membership
while another attempt is in progress. It is possible to overlay them, and AFAIK
Aurora does this, but similarly I don't think this is needed.
## Misc
We should use the Compute <-> safekeeper protocol change to include other (long
yearned for) modifications:
- send data in network order to make ARM work.
- remove term_start_lsn from AppendRequest
- add horizon to TermHistory
- add to ProposerGreeting the number of the connection from this wp to this sk


@@ -22,11 +22,6 @@ pub struct Key {
pub field6: u32,
}
/// When working with large numbers of Keys in-memory, it is more efficient to handle them as i128 than as
/// a struct of fields.
#[derive(Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd)]
pub struct CompactKey(i128);
/// The storage key size.
pub const KEY_SIZE: usize = 18;
@@ -135,14 +130,6 @@ impl Key {
}
}
pub fn to_compact(&self) -> CompactKey {
CompactKey(self.to_i128())
}
pub fn from_compact(k: CompactKey) -> Self {
Self::from_i128(k.0)
}
pub const fn next(&self) -> Key {
self.add(1)
}
@@ -212,13 +199,6 @@ impl fmt::Display for Key {
}
}
impl fmt::Display for CompactKey {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let k = Key::from_compact(*self);
k.fmt(f)
}
}
impl Key {
pub const MIN: Key = Key {
field1: u8::MIN,

View File

@@ -1,5 +1,4 @@
use std::time::SystemTime;
use utils::{serde_percent::Percent, serde_system_time};
use utils::serde_system_time::SystemTime;
/// Pageserver current utilization and scoring for how good candidate the pageserver would be for
/// the next tenant.
@@ -10,88 +9,19 @@ use utils::{serde_percent::Percent, serde_system_time};
/// not handle full u64 values properly.
#[derive(serde::Serialize, serde::Deserialize, Debug, Clone)]
pub struct PageserverUtilization {
/// Used disk space (physical, ground truth from statfs())
/// Used disk space
#[serde(serialize_with = "ser_saturating_u63")]
pub disk_usage_bytes: u64,
/// Free disk space
#[serde(serialize_with = "ser_saturating_u63")]
pub free_space_bytes: u64,
/// Wanted disk space, based on the tenant shards currently present on this pageserver: this
/// is like disk_usage_bytes, but it is stable and does not change with the cache state of
/// tenants, whereas disk_usage_bytes may reach the disk eviction `max_usage_pct` and stay
/// there, or may be unrealistically low if the pageserver has attached tenants which haven't
/// downloaded layers yet.
#[serde(serialize_with = "ser_saturating_u63", default)]
pub disk_wanted_bytes: u64,
// What proportion of total disk space will this pageserver use before it starts evicting data?
#[serde(default = "unity_percent")]
pub disk_usable_pct: Percent,
// How many shards are currently on this node?
#[serde(default)]
pub shard_count: u32,
// How many shards should this node be able to handle at most?
#[serde(default)]
pub max_shard_count: u32,
/// Cached result of [`Self::score`]
/// Lower is better score for how good candidate for a next tenant would this pageserver be.
#[serde(serialize_with = "ser_saturating_u63")]
pub utilization_score: u64,
/// When was this snapshot captured, pageserver local time.
///
/// Use millis to give confidence that the value is regenerated often enough.
pub captured_at: serde_system_time::SystemTime,
}
fn unity_percent() -> Percent {
Percent::new(0).unwrap()
}
impl PageserverUtilization {
const UTILIZATION_FULL: u64 = 1000000;
/// Calculate a utilization score. The result is to be interpreted as a fraction of
/// Self::UTILIZATION_FULL.
///
/// Lower values are more affine to scheduling more work on this node.
/// - UTILIZATION_FULL represents an ideal node which is fully utilized but should not receive any more work.
/// - 0.0 represents an empty node.
/// - Negative values are forbidden
/// - Values over UTILIZATION_FULL indicate an overloaded node, which may show degraded performance due to
/// layer eviction.
pub fn score(&self) -> u64 {
let disk_usable_capacity = ((self.disk_usage_bytes + self.free_space_bytes)
* self.disk_usable_pct.get() as u64)
/ 100;
let disk_utilization_score =
self.disk_wanted_bytes * Self::UTILIZATION_FULL / disk_usable_capacity;
let shard_utilization_score =
self.shard_count as u64 * Self::UTILIZATION_FULL / self.max_shard_count as u64;
std::cmp::max(disk_utilization_score, shard_utilization_score)
}
pub fn refresh_score(&mut self) {
self.utilization_score = self.score();
}
/// A utilization structure that has a full utilization score: use this as a placeholder when
/// you need a utilization but don't have real values yet.
pub fn full() -> Self {
Self {
disk_usage_bytes: 1,
free_space_bytes: 0,
disk_wanted_bytes: 1,
disk_usable_pct: Percent::new(100).unwrap(),
shard_count: 1,
max_shard_count: 1,
utilization_score: Self::UTILIZATION_FULL,
captured_at: serde_system_time::SystemTime(SystemTime::now()),
}
}
pub captured_at: SystemTime,
}
/// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients.
@@ -119,19 +49,15 @@ mod tests {
let doc = PageserverUtilization {
disk_usage_bytes: u64::MAX,
free_space_bytes: 0,
disk_wanted_bytes: u64::MAX,
utilization_score: 13,
disk_usable_pct: Percent::new(90).unwrap(),
shard_count: 100,
max_shard_count: 200,
captured_at: serde_system_time::SystemTime(
utilization_score: u64::MAX,
captured_at: SystemTime(
std::time::SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779),
),
};
let s = serde_json::to_string(&doc).unwrap();
let expected = "{\"disk_usage_bytes\":9223372036854775807,\"free_space_bytes\":0,\"disk_wanted_bytes\":9223372036854775807,\"disk_usable_pct\":90,\"shard_count\":100,\"max_shard_count\":200,\"utilization_score\":13,\"captured_at\":\"2024-02-21T10:02:59.000Z\"}";
let expected = r#"{"disk_usage_bytes":9223372036854775807,"free_space_bytes":0,"utilization_score":9223372036854775807,"captured_at":"2024-02-21T10:02:59.000Z"}"#;
assert_eq!(s, expected);
}

View File

@@ -5,40 +5,13 @@ use tokio_util::task::{task_tracker::TaskTrackerToken, TaskTracker};
/// Can be cloned, moved and kept around in futures as "guard objects".
#[derive(Clone)]
pub struct Completion {
token: TaskTrackerToken,
}
impl std::fmt::Debug for Completion {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("Completion")
.field("siblings", &self.token.task_tracker().len())
.finish()
}
}
impl Completion {
/// Returns true if this completion is associated with the given barrier.
pub fn blocks(&self, barrier: &Barrier) -> bool {
TaskTracker::ptr_eq(self.token.task_tracker(), &barrier.0)
}
pub fn barrier(&self) -> Barrier {
Barrier(self.token.task_tracker().clone())
}
_token: TaskTrackerToken,
}
/// Barrier will wait until all clones of [`Completion`] have been dropped.
#[derive(Clone)]
pub struct Barrier(TaskTracker);
impl std::fmt::Debug for Barrier {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("Barrier")
.field("remaining", &self.0.len())
.finish()
}
}
impl Default for Barrier {
fn default() -> Self {
let (_, rx) = channel();
@@ -78,5 +51,5 @@ pub fn channel() -> (Completion, Barrier) {
tracker.close();
let token = tracker.token();
(Completion { token }, Barrier(tracker))
(Completion { _token: token }, Barrier(tracker))
}

View File

@@ -95,7 +95,7 @@ async fn ingest(
}
}
layer.put_value(key.to_compact(), lsn, &data, &ctx).await?;
layer.put_value(key, lsn, &data, &ctx).await?;
}
layer.freeze(lsn + 1).await;

View File

@@ -124,6 +124,8 @@ fn main() -> anyhow::Result<()> {
// after setting up logging, log the effective IO engine choice and read path implementations
info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings");
info!(?conf.get_impl, "starting with get page implementation");
info!(?conf.get_vectored_impl, "starting with vectored get page implementation");
info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access");
let tenants_path = conf.tenants_path();

View File

@@ -29,12 +29,12 @@ use utils::{
logging::LogFormat,
};
use crate::l0_flush::L0FlushConfig;
use crate::tenant::config::TenantConfOpt;
use crate::tenant::timeline::compaction::CompactL0Phase1ValueAccess;
use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
use crate::tenant::{config::TenantConfOpt, timeline::GetImpl};
use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine};
use crate::{l0_flush::L0FlushConfig, tenant::timeline::GetVectoredImpl};
use crate::{tenant::config::TenantConf, virtual_file};
use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX};
@@ -133,8 +133,14 @@ pub mod defaults {
#virtual_file_io_engine = '{DEFAULT_VIRTUAL_FILE_IO_ENGINE}'
#get_vectored_impl = '{DEFAULT_GET_VECTORED_IMPL}'
#get_impl = '{DEFAULT_GET_IMPL}'
#max_vectored_read_bytes = '{DEFAULT_MAX_VECTORED_READ_BYTES}'
#validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}'
[tenant_config]
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -272,8 +278,14 @@ pub struct PageServerConf {
pub virtual_file_io_engine: virtual_file::IoEngineKind,
pub get_vectored_impl: GetVectoredImpl,
pub get_impl: GetImpl,
pub max_vectored_read_bytes: MaxVectoredReadBytes,
pub validate_vectored_get: bool,
pub image_compression: ImageCompressionAlgorithm,
/// How many bytes of ephemeral layer content will we allow per kilobyte of RAM. When this
@@ -384,8 +396,14 @@ struct PageServerConfigBuilder {
virtual_file_io_engine: BuilderValue<virtual_file::IoEngineKind>,
get_vectored_impl: BuilderValue<GetVectoredImpl>,
get_impl: BuilderValue<GetImpl>,
max_vectored_read_bytes: BuilderValue<MaxVectoredReadBytes>,
validate_vectored_get: BuilderValue<bool>,
image_compression: BuilderValue<ImageCompressionAlgorithm>,
ephemeral_bytes_per_memory_kb: BuilderValue<usize>,
@@ -475,10 +493,13 @@ impl PageServerConfigBuilder {
virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()),
get_vectored_impl: Set(DEFAULT_GET_VECTORED_IMPL.parse().unwrap()),
get_impl: Set(DEFAULT_GET_IMPL.parse().unwrap()),
max_vectored_read_bytes: Set(MaxVectoredReadBytes(
NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
)),
image_compression: Set(DEFAULT_IMAGE_COMPRESSION),
validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
l0_flush: Set(L0FlushConfig::default()),
compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()),
@@ -638,10 +659,22 @@ impl PageServerConfigBuilder {
self.virtual_file_io_engine = BuilderValue::Set(value);
}
pub fn get_vectored_impl(&mut self, value: GetVectoredImpl) {
self.get_vectored_impl = BuilderValue::Set(value);
}
pub fn get_impl(&mut self, value: GetImpl) {
self.get_impl = BuilderValue::Set(value);
}
pub fn get_max_vectored_read_bytes(&mut self, value: MaxVectoredReadBytes) {
self.max_vectored_read_bytes = BuilderValue::Set(value);
}
pub fn get_validate_vectored_get(&mut self, value: bool) {
self.validate_vectored_get = BuilderValue::Set(value);
}
pub fn get_image_compression(&mut self, value: ImageCompressionAlgorithm) {
self.image_compression = BuilderValue::Set(value);
}
@@ -712,7 +745,10 @@ impl PageServerConfigBuilder {
heatmap_upload_concurrency,
secondary_download_concurrency,
ingest_batch_size,
get_vectored_impl,
get_impl,
max_vectored_read_bytes,
validate_vectored_get,
image_compression,
ephemeral_bytes_per_memory_kb,
l0_flush,
@@ -966,12 +1002,21 @@ impl PageServerConf {
"virtual_file_io_engine" => {
builder.virtual_file_io_engine(parse_toml_from_str("virtual_file_io_engine", item)?)
}
"get_vectored_impl" => {
builder.get_vectored_impl(parse_toml_from_str("get_vectored_impl", item)?)
}
"get_impl" => {
builder.get_impl(parse_toml_from_str("get_impl", item)?)
}
"max_vectored_read_bytes" => {
let bytes = parse_toml_u64("max_vectored_read_bytes", item)? as usize;
builder.get_max_vectored_read_bytes(
MaxVectoredReadBytes(
NonZeroUsize::new(bytes).expect("Max byte size of vectored read must be greater than 0")))
}
"validate_vectored_get" => {
builder.get_validate_vectored_get(parse_toml_bool("validate_vectored_get", item)?)
}
"image_compression" => {
builder.get_image_compression(parse_toml_from_str("image_compression", item)?)
}
@@ -1061,11 +1106,14 @@ impl PageServerConf {
secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(),
max_vectored_read_bytes: MaxVectoredReadBytes(
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
.expect("Invalid default constant"),
),
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
l0_flush: L0FlushConfig::default(),
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
@@ -1301,10 +1349,13 @@ background_task_maximum_delay = '334 s'
secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(),
max_vectored_read_bytes: MaxVectoredReadBytes(
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
.expect("Invalid default constant")
),
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
l0_flush: L0FlushConfig::default(),
@@ -1374,10 +1425,13 @@ background_task_maximum_delay = '334 s'
secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
ingest_batch_size: 100,
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(),
max_vectored_read_bytes: MaxVectoredReadBytes(
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
.expect("Invalid default constant")
),
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
l0_flush: L0FlushConfig::default(),

View File

@@ -1787,11 +1787,9 @@ async fn timeline_checkpoint_handler(
}
if wait_until_uploaded {
tracing::info!("Waiting for uploads to complete...");
timeline.remote_client.wait_completion().await
// XXX map to correct ApiError for the cases where it's due to shutdown
.context("wait completion").map_err(ApiError::InternalServerError)?;
tracing::info!("Uploads completed up to {}", timeline.get_remote_consistent_lsn_projected().unwrap_or(Lsn(0)));
}
json_response(StatusCode::OK, ())
@@ -1889,7 +1887,7 @@ async fn timeline_detach_ancestor_handler(
// drop(tenant);
let resp = match progress {
detach_ancestor::Progress::Prepared(attempt, prepared) => {
detach_ancestor::Progress::Prepared(_guard, prepared) => {
// it would be great to tag the guard on to the tenant activation future
let reparented_timelines = state
.tenant_manager
@@ -1897,10 +1895,10 @@ async fn timeline_detach_ancestor_handler(
tenant_shard_id,
timeline_id,
prepared,
attempt,
ctx,
)
.await
.context("timeline detach ancestor completion")
.map_err(ApiError::InternalServerError)?;
AncestorDetached {
@@ -2359,9 +2357,8 @@ async fn get_utilization(
// regenerate at most 1Hz to allow polling at any rate.
if !still_valid {
let path = state.conf.tenants_path();
let doc =
crate::utilization::regenerate(state.conf, path.as_std_path(), &state.tenant_manager)
.map_err(ApiError::InternalServerError)?;
let doc = crate::utilization::regenerate(path.as_std_path())
.map_err(ApiError::InternalServerError)?;
let mut buf = Vec::new();
serde_json::to_writer(&mut buf, &doc)

View File

@@ -41,7 +41,6 @@ use tokio::sync::watch;
use tokio::task::JoinSet;
use tokio_util::sync::CancellationToken;
use tracing::*;
use upload_queue::NotInitialized;
use utils::backoff;
use utils::circuit_breaker::CircuitBreaker;
use utils::completion;
@@ -302,11 +301,7 @@ pub struct Tenant {
pub(crate) timeline_get_throttle:
Arc<throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>>,
/// An ongoing timeline detach concurrency limiter.
///
/// As a tenant will likely be restarted as part of timeline detach ancestor it makes no sense
/// to have two running at the same time. A different one can be started if an earlier one
/// has failed for whatever reason.
/// An ongoing timeline detach must be checked during attempts to GC or compact a timeline.
ongoing_timeline_detach: std::sync::Mutex<Option<(TimelineId, utils::completion::Barrier)>>,
/// `index_part.json` based gc blocking reason tracking.
@@ -606,15 +601,6 @@ impl From<PageReconstructError> for GcError {
}
}
impl From<NotInitialized> for GcError {
fn from(value: NotInitialized) -> Self {
match value {
NotInitialized::Uninitialized => GcError::Remote(value.into()),
NotInitialized::Stopped | NotInitialized::ShuttingDown => GcError::TimelineCancelled,
}
}
}
impl From<timeline::layer_manager::Shutdown> for GcError {
fn from(_: timeline::layer_manager::Shutdown) -> Self {
GcError::TimelineCancelled
@@ -837,9 +823,9 @@ impl Tenant {
// The Stopping case is for when we have passed control on to DeleteTenantFlow:
// if it errors, we will call make_broken when tenant is already in Stopping.
assert!(
matches!(*state, TenantState::Attaching | TenantState::Stopping { .. }),
"the attach task owns the tenant state until activation is complete"
);
matches!(*state, TenantState::Attaching | TenantState::Stopping { .. }),
"the attach task owns the tenant state until activation is complete"
);
*state = TenantState::broken_from_reason(err.to_string());
});
@@ -3736,19 +3722,6 @@ impl Tenant {
pub(crate) fn get_tenant_conf(&self) -> TenantConfOpt {
self.tenant_conf.load().tenant_conf.clone()
}
/// How much local storage would this tenant like to have? It can cope with
/// less than this (via eviction and on-demand downloads), but this function enables
/// the Tenant to advertise how much storage it would prefer to have to provide fast I/O
/// by keeping important things on local disk.
pub(crate) fn local_storage_wanted(&self) -> u64 {
let mut wanted = 0;
let timelines = self.timelines.lock().unwrap();
for timeline in timelines.values() {
wanted += timeline.metrics.visible_physical_size_gauge.get();
}
wanted
}
}
/// Create the cluster temporarily in 'initdbpath' directory inside the repository

View File

@@ -285,15 +285,12 @@ impl TimelineMetadata {
}
/// When reparenting, the `ancestor_lsn` does not change.
///
/// Returns true if anything was changed.
pub fn reparent(&mut self, timeline: &TimelineId) {
assert!(self.body.ancestor_timeline.is_some());
// no assertion for redoing this: it's fine, we may have to repeat this multiple times over
self.body.ancestor_timeline = Some(*timeline);
}
/// Returns true if anything was changed
pub fn detach_from_ancestor(&mut self, branchpoint: &(TimelineId, Lsn)) {
if let Some(ancestor) = self.body.ancestor_timeline {
assert_eq!(ancestor, branchpoint.0);

View File

@@ -54,7 +54,7 @@ use utils::id::{TenantId, TimelineId};
use super::remote_timeline_client::remote_tenant_path;
use super::secondary::SecondaryTenant;
use super::timeline::detach_ancestor::{self, PreparedTimelineDetach};
use super::timeline::detach_ancestor::PreparedTimelineDetach;
use super::{GlobalShutDown, TenantSharedResources};
/// For a tenant that appears in TenantsMap, it may either be
@@ -1927,10 +1927,8 @@ impl TenantManager {
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
prepared: PreparedTimelineDetach,
mut attempt: detach_ancestor::Attempt,
ctx: &RequestContext,
) -> Result<HashSet<TimelineId>, anyhow::Error> {
use crate::tenant::timeline::detach_ancestor::Error;
// FIXME: this is unnecessary, slotguard already has these semantics
struct RevertOnDropSlot(Option<SlotGuard>);
@@ -1979,98 +1977,43 @@ impl TenantManager {
let timeline = tenant.get_timeline(timeline_id, true)?;
let resp = timeline
.detach_from_ancestor_and_reparent(&tenant, prepared, ctx)
let reparented = timeline
.complete_detaching_timeline_ancestor(&tenant, prepared, ctx)
.await?;
let mut slot_guard = slot_guard.into_inner();
let tenant = if resp.reset_tenant_required() {
attempt.before_reset_tenant();
let (_guard, progress) = utils::completion::channel();
match tenant.shutdown(progress, ShutdownMode::Hard).await {
Ok(()) => {
slot_guard.drop_old_value()?;
}
Err(_barrier) => {
slot_guard.revert();
// this really should not happen, at all, unless shutdown was already going?
anyhow::bail!("Cannot restart Tenant, already shutting down");
}
let (_guard, progress) = utils::completion::channel();
match tenant.shutdown(progress, ShutdownMode::Hard).await {
Ok(()) => {
slot_guard.drop_old_value()?;
}
let tenant_path = self.conf.tenant_path(&tenant_shard_id);
let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?;
let shard_identity = config.shard;
let tenant = tenant_spawn(
self.conf,
tenant_shard_id,
&tenant_path,
self.resources.clone(),
AttachedTenantConf::try_from(config)?,
shard_identity,
None,
SpawnMode::Eager,
ctx,
)?;
{
let mut g = tenant.ongoing_timeline_detach.lock().unwrap();
assert!(
g.is_none(),
"there cannot be any new timeline detach ancestor on newly created tenant"
);
*g = Some((attempt.timeline_id, attempt.new_barrier()));
Err(_barrier) => {
slot_guard.revert();
// this really should not happen, at all, unless shutdown was already going?
anyhow::bail!("Cannot restart Tenant, already shutting down");
}
slot_guard.upsert(TenantSlot::Attached(tenant.clone()))?;
tenant
} else {
tracing::info!("skipping tenant_reset as no changes made required it");
tenant
};
if let Some(reparented) = resp.completed() {
// finally ask the restarted tenant to complete the detach
//
// rationale for 9999s: we don't really have a timetable here; if retried, the caller
// will get a 503.
tenant
.wait_to_become_active(std::time::Duration::from_secs(9999))
.await
.map_err(|e| {
use pageserver_api::models::TenantState;
use GetActiveTenantError::{Cancelled, WillNotBecomeActive};
match e {
Cancelled | WillNotBecomeActive(TenantState::Stopping { .. }) => {
Error::ShuttingDown
}
other => Error::Unexpected(other.into()),
}
})?;
utils::pausable_failpoint!(
"timeline-detach-ancestor::after_activating_before_finding-pausable"
);
let timeline = tenant
.get_timeline(attempt.timeline_id, true)
.map_err(|_| Error::DetachedNotFoundAfterRestart)?;
timeline
.complete_detaching_timeline_ancestor(&tenant, attempt, ctx)
.await
.map(|()| reparented)
.map_err(|e| e.into())
} else {
// at least the latest versions have now been downloaded and refreshed; be ready to
// retry another time.
Err(anyhow::anyhow!(
"failed to reparent all candidate timelines, please retry"
))
}
let tenant_path = self.conf.tenant_path(&tenant_shard_id);
let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?;
let shard_identity = config.shard;
let tenant = tenant_spawn(
self.conf,
tenant_shard_id,
&tenant_path,
self.resources.clone(),
AttachedTenantConf::try_from(config)?,
shard_identity,
None,
SpawnMode::Eager,
ctx,
)?;
slot_guard.upsert(TenantSlot::Attached(tenant))?;
Ok(reparented)
}
/// A page service client sends a TenantId, and to look up the correct Tenant we must
@@ -2142,57 +2085,6 @@ impl TenantManager {
}
}
}
/// Calculate the tenant shards' contributions to this pageserver's utilization metrics. The
/// returned values are:
/// - the number of bytes of local disk space this pageserver's shards are requesting, i.e.
/// how much space they would use if not impacted by disk usage eviction.
/// - the number of tenant shards currently on this pageserver, including attached
/// and secondary.
///
/// This function is quite expensive: callers are expected to cache the result and
/// limit how often they call it.
pub(crate) fn calculate_utilization(&self) -> Result<(u64, u32), TenantMapListError> {
let tenants = self.tenants.read().unwrap();
let m = match &*tenants {
TenantsMap::Initializing => return Err(TenantMapListError::Initializing),
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m,
};
let shard_count = m.len();
let mut wanted_bytes = 0;
for tenant_slot in m.values() {
match tenant_slot {
TenantSlot::InProgress(_barrier) => {
// While a slot is being changed, we can't know how much storage it wants. This
// means this function's output can fluctuate if a lot of changes are going on
// (such as transitions from secondary to attached).
//
// We could wait for the barrier and retry, but it's important that the utilization
// API is responsive, and the data quality impact is not very significant.
continue;
}
TenantSlot::Attached(tenant) => {
wanted_bytes += tenant.local_storage_wanted();
}
TenantSlot::Secondary(secondary) => {
let progress = secondary.progress.lock().unwrap();
wanted_bytes += if progress.heatmap_mtime.is_some() {
// If we have heatmap info, then we will 'want' the sum
// of the size of layers in the heatmap: this is how much space
// we would use if not doing any eviction.
progress.bytes_total
} else {
// In the absence of heatmap info, assume that the secondary location simply
// needs as much space as it is currently using.
secondary.resident_size_metric.get()
}
}
}
}
Ok((wanted_bytes, shard_count as u32))
}
}
#[derive(Debug, thiserror::Error)]

View File

@@ -736,13 +736,12 @@ impl RemoteTimelineClient {
Ok(())
}
/// Reparent this timeline to a new parent.
///
/// A retryable step of timeline ancestor detach.
pub(crate) async fn schedule_reparenting_and_wait(
self: &Arc<Self>,
new_parent: &TimelineId,
) -> anyhow::Result<()> {
// FIXME: because of how Timeline::schedule_uploads works when called from layer flushing
// and reads the in-memory part we cannot do the detaching like this
let receiver = {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
@@ -753,25 +752,17 @@ impl RemoteTimelineClient {
));
};
let uploaded = &upload_queue.clean.0.metadata;
upload_queue.dirty.metadata.reparent(new_parent);
upload_queue.dirty.lineage.record_previous_ancestor(&prev);
if uploaded.ancestor_timeline().is_none() && !uploaded.ancestor_lsn().is_valid() {
// nothing to do
None
} else {
upload_queue.dirty.metadata.reparent(new_parent);
upload_queue.dirty.lineage.record_previous_ancestor(&prev);
self.schedule_index_upload(upload_queue)?;
self.schedule_index_upload(upload_queue)?;
Some(self.schedule_barrier0(upload_queue))
}
self.schedule_barrier0(upload_queue)
};
if let Some(receiver) = receiver {
Self::wait_completion0(receiver).await?;
}
Ok(())
Self::wait_completion0(receiver)
.await
.context("wait completion")
}
/// Schedules uploading a new version of `index_part.json` with the given layers added,
@@ -787,30 +778,26 @@ impl RemoteTimelineClient {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
if upload_queue.clean.0.lineage.detached_previous_ancestor() == Some(adopted) {
None
} else {
upload_queue.dirty.metadata.detach_from_ancestor(&adopted);
upload_queue.dirty.lineage.record_detaching(&adopted);
upload_queue.dirty.metadata.detach_from_ancestor(&adopted);
upload_queue.dirty.lineage.record_detaching(&adopted);
for layer in layers {
let prev = upload_queue
.dirty
.layer_metadata
.insert(layer.layer_desc().layer_name(), layer.metadata());
assert!(prev.is_none(), "copied layer existed already {layer}");
}
self.schedule_index_upload(upload_queue)?;
Some(self.schedule_barrier0(upload_queue))
for layer in layers {
upload_queue
.dirty
.layer_metadata
.insert(layer.layer_desc().layer_name(), layer.metadata());
}
self.schedule_index_upload(upload_queue)?;
let barrier = self.schedule_barrier0(upload_queue);
self.launch_queued_tasks(upload_queue);
barrier
};
if let Some(barrier) = barrier {
Self::wait_completion0(barrier).await?;
}
Ok(())
Self::wait_completion0(barrier)
.await
.context("wait completion")
}
/// Adds a gc blocking reason for this timeline if one does not exist already.
@@ -886,7 +873,12 @@ impl RemoteTimelineClient {
let upload_queue = guard.initialized_mut()?;
if let index::GcBlockingReason::DetachAncestor = reason {
if !upload_queue.clean.0.lineage.is_detached_from_ancestor() {
if !upload_queue
.clean
.0
.lineage
.is_detached_from_original_ancestor()
{
drop(guard);
panic!("cannot complete timeline_ancestor_detach while not detached");
}
@@ -993,10 +985,7 @@ impl RemoteTimelineClient {
///
/// The files will be leaked in remote storage unless [`Self::schedule_deletion_of_unlinked`]
/// is invoked on them.
pub(crate) fn schedule_gc_update(
self: &Arc<Self>,
gc_layers: &[Layer],
) -> Result<(), NotInitialized> {
pub(crate) fn schedule_gc_update(self: &Arc<Self>, gc_layers: &[Layer]) -> anyhow::Result<()> {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;

View File

@@ -216,47 +216,26 @@ fn is_false(b: &bool) -> bool {
impl Lineage {
const REMEMBER_AT_MOST: usize = 100;
pub(crate) fn record_previous_ancestor(&mut self, old_ancestor: &TimelineId) -> bool {
pub(crate) fn record_previous_ancestor(&mut self, old_ancestor: &TimelineId) {
if self.reparenting_history.last() == Some(old_ancestor) {
// do not re-record it
false
} else {
#[cfg(feature = "testing")]
{
let existing = self
.reparenting_history
.iter()
.position(|x| x == old_ancestor);
assert_eq!(
existing, None,
"we cannot reparent onto and off and onto the same timeline twice"
);
}
let drop_oldest = self.reparenting_history.len() + 1 >= Self::REMEMBER_AT_MOST;
self.reparenting_history_truncated |= drop_oldest;
if drop_oldest {
self.reparenting_history.remove(0);
}
self.reparenting_history.push(*old_ancestor);
true
return;
}
let drop_oldest = self.reparenting_history.len() + 1 >= Self::REMEMBER_AT_MOST;
self.reparenting_history_truncated |= drop_oldest;
if drop_oldest {
self.reparenting_history.remove(0);
}
self.reparenting_history.push(*old_ancestor);
}
/// Returns true if anything changed.
pub(crate) fn record_detaching(&mut self, branchpoint: &(TimelineId, Lsn)) -> bool {
if let Some((id, lsn, _)) = self.original_ancestor {
assert_eq!(
&(id, lsn),
branchpoint,
"detaching attempt has to be for the same ancestor we are already detached from"
);
false
} else {
self.original_ancestor =
Some((branchpoint.0, branchpoint.1, chrono::Utc::now().naive_utc()));
true
}
pub(crate) fn record_detaching(&mut self, branchpoint: &(TimelineId, Lsn)) {
assert!(self.original_ancestor.is_none());
self.original_ancestor =
Some((branchpoint.0, branchpoint.1, chrono::Utc::now().naive_utc()));
}
/// The queried lsn is most likely the basebackup lsn, and this answers question "is it allowed
@@ -268,16 +247,10 @@ impl Lineage {
.is_some_and(|(_, ancestor_lsn, _)| ancestor_lsn == lsn)
}
/// Returns true if the timeline originally had an ancestor, and no longer has one.
pub(crate) fn is_detached_from_ancestor(&self) -> bool {
pub(crate) fn is_detached_from_original_ancestor(&self) -> bool {
self.original_ancestor.is_some()
}
/// Returns original ancestor timeline id and lsn that this timeline has been detached from.
pub(crate) fn detached_previous_ancestor(&self) -> Option<(TimelineId, Lsn)> {
self.original_ancestor.map(|(id, lsn, _)| (id, lsn))
}
pub(crate) fn is_reparented(&self) -> bool {
!self.reparenting_history.is_empty()
}

View File

@@ -369,6 +369,9 @@ impl ImageLayerInner {
self.lsn
}
/// Returns nested result following Result<Result<_, OpErr>, Critical>:
/// - inner has the success or transient failure
/// - outer has the permanent failure
pub(super) async fn load(
path: &Utf8Path,
lsn: Lsn,

View File

@@ -15,7 +15,6 @@ use crate::tenant::PageReconstructError;
use crate::{l0_flush, page_cache, walrecord};
use anyhow::{anyhow, Result};
use camino::Utf8PathBuf;
use pageserver_api::key::CompactKey;
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::InMemoryLayerInfo;
use pageserver_api::shard::TenantShardId;
@@ -79,7 +78,7 @@ pub struct InMemoryLayerInner {
/// All versions of all pages in the layer are kept here. Indexed
/// by block number and LSN. The value is an offset into the
/// ephemeral file where the page version is stored.
index: BTreeMap<CompactKey, VecMap<Lsn, u64>>,
index: BTreeMap<Key, VecMap<Lsn, u64>>,
/// The values are stored in a serialized format in this file.
/// Each serialized Value is preceded by a 'u32' length field.
@@ -313,12 +312,8 @@ impl InMemoryLayer {
let reader = inner.file.block_cursor();
for range in keyspace.ranges.iter() {
for (key, vec_map) in inner
.index
.range(range.start.to_compact()..range.end.to_compact())
{
let key = Key::from_compact(*key);
let lsn_range = match reconstruct_state.get_cached_lsn(&key) {
for (key, vec_map) in inner.index.range(range.start..range.end) {
let lsn_range = match reconstruct_state.get_cached_lsn(key) {
Some(cached_lsn) => (cached_lsn + 1)..end_lsn,
None => self.start_lsn..end_lsn,
};
@@ -329,18 +324,20 @@ impl InMemoryLayer {
// TODO: this uses the page cache => https://github.com/neondatabase/neon/issues/8183
let buf = reader.read_blob(*pos, &ctx).await;
if let Err(e) = buf {
reconstruct_state.on_key_error(key, PageReconstructError::from(anyhow!(e)));
reconstruct_state
.on_key_error(*key, PageReconstructError::from(anyhow!(e)));
break;
}
let value = Value::des(&buf.unwrap());
if let Err(e) = value {
reconstruct_state.on_key_error(key, PageReconstructError::from(anyhow!(e)));
reconstruct_state
.on_key_error(*key, PageReconstructError::from(anyhow!(e)));
break;
}
let key_situation =
reconstruct_state.update_key(&key, *entry_lsn, value.unwrap());
reconstruct_state.update_key(key, *entry_lsn, value.unwrap());
if key_situation == ValueReconstructSituation::Complete {
break;
}
@@ -420,7 +417,7 @@ impl InMemoryLayer {
/// Adds the page version to the in-memory tree
pub async fn put_value(
&self,
key: CompactKey,
key: Key,
lsn: Lsn,
buf: &[u8],
ctx: &RequestContext,
@@ -433,7 +430,7 @@ impl InMemoryLayer {
async fn put_value_locked(
&self,
locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>,
key: CompactKey,
key: Key,
lsn: Lsn,
buf: &[u8],
ctx: &RequestContext,
@@ -542,8 +539,6 @@ impl InMemoryLayer {
let end_lsn = *self.end_lsn.get().unwrap();
let key_count = if let Some(key_range) = key_range {
let key_range = key_range.start.to_compact()..key_range.end.to_compact();
inner
.index
.iter()
@@ -583,7 +578,7 @@ impl InMemoryLayer {
let will_init = Value::des(&buf)?.will_init();
let res;
(buf, res) = delta_layer_writer
.put_value_bytes(Key::from_compact(*key), *lsn, buf, will_init, &ctx)
.put_value_bytes(*key, *lsn, buf, will_init, &ctx)
.await;
res?;
}
@@ -622,7 +617,7 @@ impl InMemoryLayer {
let will_init = Value::des(&buf)?.will_init();
let res;
(buf, res) = delta_layer_writer
.put_value_bytes(Key::from_compact(*key), *lsn, buf, will_init, ctx)
.put_value_bytes(*key, *lsn, buf, will_init, ctx)
.await;
res?;
}

View File

@@ -1848,8 +1848,8 @@ impl ResidentLayer {
/// Read all the keys in this layer which match the ShardIdentity, and write them all to
/// the provided writer. Return the number of keys written.
#[tracing::instrument(level = tracing::Level::DEBUG, skip_all, fields(layer=%self))]
pub(crate) async fn filter(
&self,
pub(crate) async fn filter<'a>(
&'a self,
shard_identity: &ShardIdentity,
writer: &mut ImageLayerWriter,
ctx: &RequestContext,

View File

@@ -211,11 +211,6 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
} else {
// Run compaction
match tenant.compaction_iteration(&cancel, &ctx).await {
Ok(has_pending_task) => {
error_run_count = 0;
// schedule the next compaction immediately in case there is a pending compaction task
if has_pending_task { Duration::ZERO } else { period }
}
Err(e) => {
let wait_duration = backoff::exponential_backoff_duration_seconds(
error_run_count + 1,
@@ -232,6 +227,11 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
);
wait_duration
}
Ok(has_pending_task) => {
error_run_count = 0;
// schedule the next compaction immediately in case there is a pending compaction task
if has_pending_task { Duration::from_secs(0) } else { period }
}
}
};
@@ -265,8 +265,7 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
count_throttled,
sum_throttled_usecs,
allowed_rps=%format_args!("{allowed_rps:.0}"),
"shard was throttled in the last n_seconds"
);
"shard was throttled in the last n_seconds")
});
// Sleep
@@ -366,13 +365,14 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
if first {
first = false;
let delays = async {
delay_by_lease_length(tenant.get_lsn_lease_length(), &cancel).await?;
random_init_delay(period, &cancel).await?;
Ok::<_, Cancelled>(())
};
if delay_by_lease_length(tenant.get_lsn_lease_length(), &cancel)
.await
.is_err()
{
break;
}
if delays.await.is_err() {
if random_init_delay(period, &cancel).await.is_err() {
break;
}
}
@@ -424,6 +424,7 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc);
// Sleep
if tokio::time::timeout(sleep_duration, cancel.cancelled())
.await
.is_ok()

View File

@@ -802,6 +802,40 @@ impl From<GetReadyAncestorError> for PageReconstructError {
}
}
#[derive(
Eq,
PartialEq,
Debug,
Copy,
Clone,
strum_macros::EnumString,
strum_macros::Display,
serde_with::DeserializeFromStr,
serde_with::SerializeDisplay,
)]
#[strum(serialize_all = "kebab-case")]
pub enum GetVectoredImpl {
Sequential,
Vectored,
}
#[derive(
Eq,
PartialEq,
Debug,
Copy,
Clone,
strum_macros::EnumString,
strum_macros::Display,
serde_with::DeserializeFromStr,
serde_with::SerializeDisplay,
)]
#[strum(serialize_all = "kebab-case")]
pub enum GetImpl {
Legacy,
Vectored,
}
pub(crate) enum WaitLsnWaiter<'a> {
Timeline(&'a Timeline),
Tenant,
@@ -961,10 +995,11 @@ impl Timeline {
}
trace!(
"get vectored request for {:?}@{} from task kind {:?}",
"get vectored request for {:?}@{} from task kind {:?} will use {} implementation",
keyspace,
lsn,
ctx.task_kind(),
self.conf.get_vectored_impl
);
let start = crate::metrics::GET_VECTORED_LATENCY
@@ -3917,10 +3952,6 @@ impl Timeline {
.get_vectored(key_request_accum.consume_keyspace(), lsn, ctx)
.await?;
if self.cancel.is_cancelled() {
return Err(CreateImageLayersError::Cancelled);
}
for (img_key, img) in results {
let img = match img {
Ok(img) => img,
@@ -4028,9 +4059,6 @@ impl Timeline {
next_start_key: img_range.end,
});
}
if self.cancel.is_cancelled() {
return Err(CreateImageLayersError::Cancelled);
}
let mut wrote_any_image = false;
for (k, v) in data {
if v.is_empty() {
@@ -4145,10 +4173,6 @@ impl Timeline {
let check_for_image_layers = self.should_check_if_image_layers_required(lsn);
for partition in partitioning.parts.iter() {
if self.cancel.is_cancelled() {
return Err(CreateImageLayersError::Cancelled);
}
let img_range = start..partition.ranges.last().unwrap().end;
let compact_metadata = partition.overlaps(&Key::metadata_key_range());
if compact_metadata {
@@ -4328,34 +4352,18 @@ impl Timeline {
detach_ancestor::prepare(self, tenant, options, ctx).await
}
/// Second step of detach from ancestor; detaches the `self` from it's current ancestor and
/// reparents any reparentable children of previous ancestor.
/// Completes the ancestor detach. This method is to be called while holding the
/// TenantManager's tenant slot, so during this method we cannot be deleted nor can any
/// timeline be deleted. After this method returns successfully, tenant must be reloaded.
///
/// This method is to be called while holding the TenantManager's tenant slot, so during this
/// method we cannot be deleted nor can any timeline be deleted. After this method returns
/// successfully, tenant must be reloaded.
///
/// Final step will be to [`Self::complete_detaching_timeline_ancestor`] after optionally
/// resetting the tenant.
pub(crate) async fn detach_from_ancestor_and_reparent(
/// Pageserver receiving a SIGKILL during this operation is not supported (yet).
pub(crate) async fn complete_detaching_timeline_ancestor(
self: &Arc<Timeline>,
tenant: &crate::tenant::Tenant,
prepared: detach_ancestor::PreparedTimelineDetach,
ctx: &RequestContext,
) -> Result<detach_ancestor::DetachingAndReparenting, anyhow::Error> {
detach_ancestor::detach_and_reparent(self, tenant, prepared, ctx).await
}
/// Final step which unblocks the GC.
///
/// The tenant must've been reset if ancestry was modified previously (in tenant manager).
pub(crate) async fn complete_detaching_timeline_ancestor(
self: &Arc<Timeline>,
tenant: &crate::tenant::Tenant,
attempt: detach_ancestor::Attempt,
ctx: &RequestContext,
) -> Result<(), detach_ancestor::Error> {
detach_ancestor::complete(self, tenant, attempt, ctx).await
) -> Result<HashSet<TimelineId>, anyhow::Error> {
detach_ancestor::complete(self, tenant, prepared, ctx).await
}
/// Switch aux file policy and schedule upload to the index part.
@@ -4413,24 +4421,22 @@ impl From<super::upload_queue::NotInitialized> for CompactionError {
}
}
impl From<super::storage_layer::layer::DownloadError> for CompactionError {
fn from(e: super::storage_layer::layer::DownloadError) -> Self {
impl CompactionError {
/// We cannot do compaction because we could not download a layer that is input to the compaction.
pub(crate) fn input_layer_download_failed(
e: super::storage_layer::layer::DownloadError,
) -> Self {
match e {
super::storage_layer::layer::DownloadError::TimelineShutdown
| super::storage_layer::layer::DownloadError::DownloadCancelled => {
CompactionError::ShuttingDown
}
super::storage_layer::layer::DownloadError::ContextAndConfigReallyDeniesDownloads
| super::storage_layer::layer::DownloadError::DownloadRequired
| super::storage_layer::layer::DownloadError::NotFile(_)
| super::storage_layer::layer::DownloadError::DownloadFailed
| super::storage_layer::layer::DownloadError::PreStatFailed(_) => {
CompactionError::Other(anyhow::anyhow!(e))
}
super::storage_layer::layer::DownloadError::TimelineShutdown |
/* TODO DownloadCancelled correct here? */
super::storage_layer::layer::DownloadError::DownloadCancelled => CompactionError::ShuttingDown,
super::storage_layer::layer::DownloadError::ContextAndConfigReallyDeniesDownloads |
super::storage_layer::layer::DownloadError::DownloadRequired |
super::storage_layer::layer::DownloadError::NotFile(_) |
super::storage_layer::layer::DownloadError::DownloadFailed |
super::storage_layer::layer::DownloadError::PreStatFailed(_)=>CompactionError::Other(anyhow::anyhow!(e)),
#[cfg(test)]
super::storage_layer::layer::DownloadError::Failpoint(_) => {
CompactionError::Other(anyhow::anyhow!(e))
}
super::storage_layer::layer::DownloadError::Failpoint(_) => CompactionError::Other(anyhow::anyhow!(e)),
}
}
}
@@ -4984,7 +4990,15 @@ impl Timeline {
result.layers_removed = gc_layers.len() as u64;
self.remote_client.schedule_gc_update(&gc_layers)?;
self.remote_client
.schedule_gc_update(&gc_layers)
.map_err(|e| {
if self.cancel.is_cancelled() {
GcError::TimelineCancelled
} else {
GcError::Remote(e)
}
})?;
guard.open_mut()?.finish_gc_timeline(&gc_layers);
@@ -5545,7 +5559,7 @@ impl<'a> TimelineWriter<'a> {
let action = self.get_open_layer_action(lsn, buf_size);
let layer = self.handle_open_layer_action(lsn, action, ctx).await?;
let res = layer.put_value(key.to_compact(), lsn, &buf, ctx).await;
let res = layer.put_value(key, lsn, &buf, ctx).await;
if res.is_ok() {
// Update the current size only when the entire write was ok.

View File

@@ -489,7 +489,10 @@ impl Timeline {
// - We do not run concurrently with other kinds of compaction, so the only layer map writes we race with are:
// - GC, which at worst witnesses us "undelete" a layer that they just deleted.
// - ingestion, which only inserts layers, therefore cannot collide with us.
let resident = layer.download_and_keep_resident().await?;
let resident = layer
.download_and_keep_resident()
.await
.map_err(CompactionError::input_layer_download_failed)?;
let keys_written = resident
.filter(&self.shard_identity, &mut image_layer_writer, ctx)
@@ -690,14 +693,23 @@ impl Timeline {
let mut fully_compacted = true;
deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?);
deltas_to_compact.push(
first_level0_delta
.download_and_keep_resident()
.await
.map_err(CompactionError::input_layer_download_failed)?,
);
for l in level0_deltas_iter {
let lsn_range = &l.layer_desc().lsn_range;
if lsn_range.start != prev_lsn_end {
break;
}
deltas_to_compact.push(l.download_and_keep_resident().await?);
deltas_to_compact.push(
l.download_and_keep_resident()
.await
.map_err(CompactionError::input_layer_download_failed)?,
);
deltas_to_compact_bytes += l.metadata().file_size;
prev_lsn_end = lsn_range.end;
@@ -748,9 +760,6 @@ impl Timeline {
let all_keys = {
let mut all_keys = Vec::new();
for l in deltas_to_compact.iter() {
if self.cancel.is_cancelled() {
return Err(CompactionError::ShuttingDown);
}
all_keys.extend(l.load_keys(ctx).await.map_err(CompactionError::Other)?);
}
// The current stdlib sorting implementation is designed in a way where it is
@@ -833,11 +842,6 @@ impl Timeline {
};
stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now();
drop_rlock(guard);
if self.cancel.is_cancelled() {
return Err(CompactionError::ShuttingDown);
}
stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now();
// This iterator walks through all key-value pairs from all the layers
@@ -1133,10 +1137,6 @@ impl Timeline {
if !self.shard_identity.is_key_disposable(&key) {
if writer.is_none() {
if self.cancel.is_cancelled() {
// to be somewhat responsive to cancellation, check for each new layer
return Err(CompactionError::ShuttingDown);
}
// Create writer if not initialized yet
writer = Some(
DeltaLayerWriter::new(

View File

@@ -5,16 +5,12 @@ use crate::{
context::{DownloadBehavior, RequestContext},
task_mgr::TaskKind,
tenant::{
mgr::GetActiveTenantError,
remote_timeline_client::index::GcBlockingReason::DetachAncestor,
storage_layer::{AsLayerDesc as _, DeltaLayerWriter, Layer, ResidentLayer},
Tenant,
},
virtual_file::{MaybeFatalIo, VirtualFile},
};
use anyhow::Context;
use pageserver_api::models::detach_ancestor::AncestorDetached;
use tokio::sync::Semaphore;
use tokio_util::sync::CancellationToken;
use tracing::Instrument;
use utils::{completion, generation::Generation, http::error::ApiError, id::TimelineId, lsn::Lsn};
@@ -42,12 +38,6 @@ pub(crate) enum Error {
#[error("remote copying layer failed")]
CopyFailed(#[source] anyhow::Error),
#[error("wait for tenant to activate after restarting")]
WaitToActivate(#[source] GetActiveTenantError),
#[error("detached timeline was not found after restart")]
DetachedNotFoundAfterRestart,
#[error("unexpected error")]
Unexpected(#[source] anyhow::Error),
@@ -65,10 +55,6 @@ impl From<Error> for ApiError {
Error::OtherTimelineDetachOngoing(_) => {
ApiError::ResourceUnavailable("other timeline detach is already ongoing".into())
}
e @ Error::WaitToActivate(_) => {
let s = utils::error::report_compact_sources(&e).to_string();
ApiError::ResourceUnavailable(s.into())
}
// All of these contain shutdown errors, in fact, it's the most common
e @ Error::FlushAncestor(_)
| e @ Error::RewrittenDeltaDownloadFailed(_)
@@ -77,7 +63,6 @@ impl From<Error> for ApiError {
| e @ Error::CopyFailed(_)
| e @ Error::Unexpected(_)
| e @ Error::Failpoint(_) => ApiError::InternalServerError(e.into()),
Error::DetachedNotFoundAfterRestart => ApiError::NotFound(value.into()),
}
}
}
@@ -111,25 +96,8 @@ impl From<FlushLayerError> for Error {
}
}
impl From<GetActiveTenantError> for Error {
fn from(value: GetActiveTenantError) -> Self {
use pageserver_api::models::TenantState;
use GetActiveTenantError::*;
match value {
Cancelled | WillNotBecomeActive(TenantState::Stopping { .. }) | SwitchedTenant => {
Error::ShuttingDown
}
WaitForActiveTimeout { .. } | NotFound(_) | Broken(_) | WillNotBecomeActive(_) => {
// NotFound seems out-of-place
Error::WaitToActivate(value)
}
}
}
}
pub(crate) enum Progress {
Prepared(Attempt, PreparedTimelineDetach),
Prepared(completion::Completion, PreparedTimelineDetach),
Done(AncestorDetached),
}
@@ -153,26 +121,6 @@ impl Default for Options {
}
}
/// Represents an across tenant reset exclusive single attempt to detach ancestor.
#[derive(Debug)]
pub(crate) struct Attempt {
pub(crate) timeline_id: TimelineId,
_guard: completion::Completion,
gate_entered: Option<utils::sync::gate::GateGuard>,
}
impl Attempt {
pub(crate) fn before_reset_tenant(&mut self) {
let taken = self.gate_entered.take();
assert!(taken.is_some());
}
pub(crate) fn new_barrier(&self) -> completion::Barrier {
self._guard.barrier()
}
}
/// See [`Timeline::prepare_to_detach_from_ancestor`]
pub(super) async fn prepare(
detached: &Arc<Timeline>,
@@ -187,33 +135,15 @@ pub(super) async fn prepare(
.as_ref()
.map(|tl| (tl.clone(), detached.ancestor_lsn))
else {
let still_in_progress = {
{
let accessor = detached.remote_client.initialized_upload_queue()?;
// we are safe to inspect the latest uploaded, because we can only witness this after
// restart is complete and ancestor is no more.
let latest = accessor.latest_uploaded_index_part();
if latest.lineage.detached_previous_ancestor().is_none() {
if !latest.lineage.is_detached_from_original_ancestor() {
return Err(NoAncestor);
};
latest
.gc_blocking
.as_ref()
.is_some_and(|b| b.blocked_by(DetachAncestor))
};
if still_in_progress {
// gc is still blocked, we can still reparent and complete.
// we are safe to reparent remaining, because they were locked in in the beginning.
let attempt = continue_with_blocked_gc(detached, tenant).await?;
// because the ancestor of detached is already set to none, we have published all
// of the layers, so we are still "prepared."
return Ok(Progress::Prepared(
attempt,
PreparedTimelineDetach { layers: Vec::new() },
));
}
}
let reparented_timelines = reparented_direct_children(detached, tenant)?;
@@ -234,7 +164,22 @@ pub(super) async fn prepare(
return Err(TooManyAncestors);
}
let attempt = start_new_attempt(detached, tenant).await?;
// before we acquire the gate, we must mark the ancestor as having a detach operation
// ongoing, which will block other concurrent detach operations so we don't get into awkward
// situations where there would be two branches trying to reparent earlier branches.
let (guard, barrier) = completion::channel();
{
let mut guard = tenant.ongoing_timeline_detach.lock().unwrap();
if let Some((tl, other)) = guard.as_ref() {
if !other.is_ready() {
return Err(OtherTimelineDetachOngoing(*tl));
}
}
*guard = Some((detached.timeline_id, barrier));
}
let _gate_entered = detached.gate.enter().map_err(|_| ShuttingDown)?;
utils::pausable_failpoint!("timeline-detach-ancestor::before_starting_after_locking_pausable");
@@ -300,8 +245,7 @@ pub(super) async fn prepare(
};
// TODO: layers are already sorted by something: use that to determine how much of remote
// copies are already done -- gc is blocked, but a compaction could had happened on ancestor,
// which is something to keep in mind if copy skipping is implemented.
// copies are already done.
tracing::info!(filtered=%filtered_layers, to_rewrite = straddling_branchpoint.len(), historic=%rest_of_historic.len(), "collected layers");
// TODO: copying and lsn prefix copying could be done at the same time with a single fsync after
@@ -315,33 +259,29 @@ pub(super) async fn prepare(
let mut wrote_any = false;
let limiter = Arc::new(Semaphore::new(options.rewrite_concurrency.get()));
let limiter = Arc::new(tokio::sync::Semaphore::new(
options.rewrite_concurrency.get(),
));
for layer in straddling_branchpoint {
let limiter = limiter.clone();
let timeline = detached.clone();
let ctx = ctx.detached_child(TaskKind::DetachAncestor, DownloadBehavior::Download);
let span = tracing::info_span!("upload_rewritten_layer", %layer);
tasks.spawn(
async move {
let _permit = limiter.acquire().await;
let copied =
upload_rewritten_layer(end_lsn, &layer, &timeline, &timeline.cancel, &ctx)
.await?;
if let Some(copied) = copied.as_ref() {
tracing::info!(%copied, "rewrote and uploaded");
}
Ok(copied)
}
.instrument(span),
);
tasks.spawn(async move {
let _permit = limiter.acquire().await;
let copied =
upload_rewritten_layer(end_lsn, &layer, &timeline, &timeline.cancel, &ctx)
.await?;
Ok(copied)
});
}
while let Some(res) = tasks.join_next().await {
match res {
Ok(Ok(Some(copied))) => {
wrote_any = true;
tracing::info!(layer=%copied, "rewrote and uploaded");
new_layers.push(copied);
}
Ok(Ok(None)) => {}
@@ -368,7 +308,7 @@ pub(super) async fn prepare(
}
let mut tasks = tokio::task::JoinSet::new();
let limiter = Arc::new(Semaphore::new(options.copy_concurrency.get()));
let limiter = Arc::new(tokio::sync::Semaphore::new(options.copy_concurrency.get()));
for adopted in rest_of_historic {
let limiter = limiter.clone();
@@ -402,56 +342,7 @@ pub(super) async fn prepare(
let prepared = PreparedTimelineDetach { layers: new_layers };
Ok(Progress::Prepared(attempt, prepared))
}
async fn start_new_attempt(detached: &Timeline, tenant: &Tenant) -> Result<Attempt, Error> {
let attempt = obtain_exclusive_attempt(detached, tenant)?;
// insert the block in the index_part.json, if not already there.
let _dont_care = tenant
.gc_block
.insert(
detached,
crate::tenant::remote_timeline_client::index::GcBlockingReason::DetachAncestor,
)
.await
// FIXME: better error
.map_err(Error::Unexpected)?;
Ok(attempt)
}
async fn continue_with_blocked_gc(detached: &Timeline, tenant: &Tenant) -> Result<Attempt, Error> {
// FIXME: it would be nice to confirm that there is an in-memory version, since we've just
// verified there is a persistent one?
obtain_exclusive_attempt(detached, tenant)
}
fn obtain_exclusive_attempt(detached: &Timeline, tenant: &Tenant) -> Result<Attempt, Error> {
use Error::{OtherTimelineDetachOngoing, ShuttingDown};
// ensure we are the only active attempt for this tenant
let (guard, barrier) = completion::channel();
{
let mut guard = tenant.ongoing_timeline_detach.lock().unwrap();
if let Some((tl, other)) = guard.as_ref() {
if !other.is_ready() {
return Err(OtherTimelineDetachOngoing(*tl));
}
// FIXME: no test enters here
}
*guard = Some((detached.timeline_id, barrier));
}
// ensure the gate is still open
let _gate_entered = detached.gate.enter().map_err(|_| ShuttingDown)?;
Ok(Attempt {
timeline_id: detached.timeline_id,
_guard: guard,
gate_entered: Some(_gate_entered),
})
Ok(Progress::Prepared(guard, prepared))
}
fn reparented_direct_children(
@@ -657,207 +548,96 @@ async fn remote_copy(
.map_err(CopyFailed)
}
pub(crate) enum DetachingAndReparenting {
/// All of the following timeline ids were reparented and the timeline ancestor detach must be
/// marked as completed.
Reparented(HashSet<TimelineId>),
/// Some of the reparentings failed. The timeline ancestor detach must **not** be marked as
/// completed.
///
/// Nested `must_reset_tenant` is set to true when any restart requiring changes were made.
SomeReparentingFailed { must_reset_tenant: bool },
/// Detaching and reparentings were completed in a previous attempt. Timeline ancestor detach
/// must be marked as completed.
AlreadyDone(HashSet<TimelineId>),
}
impl DetachingAndReparenting {
pub(crate) fn reset_tenant_required(&self) -> bool {
use DetachingAndReparenting::*;
match self {
Reparented(_) => true,
SomeReparentingFailed { must_reset_tenant } => *must_reset_tenant,
AlreadyDone(_) => false,
}
}
pub(crate) fn completed(self) -> Option<HashSet<TimelineId>> {
use DetachingAndReparenting::*;
match self {
Reparented(x) | AlreadyDone(x) => Some(x),
SomeReparentingFailed { .. } => None,
}
}
}
/// See [`Timeline::detach_from_ancestor_and_reparent`].
pub(super) async fn detach_and_reparent(
/// See [`Timeline::complete_detaching_timeline_ancestor`].
pub(super) async fn complete(
detached: &Arc<Timeline>,
tenant: &Tenant,
prepared: PreparedTimelineDetach,
_ctx: &RequestContext,
) -> Result<DetachingAndReparenting, anyhow::Error> {
) -> Result<HashSet<TimelineId>, anyhow::Error> {
let PreparedTimelineDetach { layers } = prepared;
#[derive(Debug)]
enum Ancestor {
NotDetached(Arc<Timeline>, Lsn),
Detached(Arc<Timeline>, Lsn),
}
let (recorded_branchpoint, still_ongoing) = {
let access = detached.remote_client.initialized_upload_queue()?;
let latest = access.latest_uploaded_index_part();
(
latest.lineage.detached_previous_ancestor(),
latest
.gc_blocking
.as_ref()
.is_some_and(|b| b.blocked_by(DetachAncestor)),
)
};
assert!(
still_ongoing,
"cannot (detach? reparent)? complete if the operation is not still ongoing"
);
let ancestor = match (detached.ancestor_timeline.as_ref(), recorded_branchpoint) {
(Some(ancestor), None) => {
assert!(
!layers.is_empty(),
"there should always be at least one layer to inherit"
);
Ancestor::NotDetached(ancestor.clone(), detached.ancestor_lsn)
}
(Some(_), Some(_)) => {
panic!(
"it should be impossible to get to here without having gone through the tenant reset; if the tenant was reset, then the ancestor_timeline would be None"
);
}
(None, Some((ancestor_id, ancestor_lsn))) => {
// it has been either:
// - detached but still exists => we can try reparenting
// - detached and deleted
//
// either way, we must complete
assert!(
layers.is_empty(),
"no layers should had been copied as detach is done"
);
let existing = tenant.timelines.lock().unwrap().get(&ancestor_id).cloned();
if let Some(ancestor) = existing {
Ancestor::Detached(ancestor, ancestor_lsn)
} else {
let direct_children = reparented_direct_children(detached, tenant)?;
return Ok(DetachingAndReparenting::AlreadyDone(direct_children));
}
}
(None, None) => {
// TODO: make sure there are no `?` before tenant_reset from after a questionmark from
// here.
panic!(
"bug: detach_and_reparent called on a timeline which has not been detached or which has no live ancestor"
);
}
};
let ancestor = detached
.ancestor_timeline
.as_ref()
.expect("must still have a ancestor");
let ancestor_lsn = detached.get_ancestor_lsn();
// publish the prepared layers before we reparent any of the timelines, so that on restart
// reparented timelines find layers. also do the actual detaching.
//
// if we crash after this operation, a retry will allow reparenting the remaining timelines as
// gc is blocked.
let (ancestor, ancestor_lsn, was_detached) = match ancestor {
Ancestor::NotDetached(ancestor, ancestor_lsn) => {
// this has to complete before any reparentings because otherwise they would not have
// layers on the new parent.
detached
.remote_client
.schedule_adding_existing_layers_to_index_detach_and_wait(
&layers,
(ancestor.timeline_id, ancestor_lsn),
)
.await
.context("publish layers and detach ancestor")?;
tracing::info!(
ancestor=%ancestor.timeline_id,
%ancestor_lsn,
inherited_layers=%layers.len(),
"detached from ancestor"
);
(ancestor, ancestor_lsn, true)
}
Ancestor::Detached(ancestor, ancestor_lsn) => (ancestor, ancestor_lsn, false),
};
// if we crash after this operation, we will at least come up having detached a timeline, but
// we cannot go back and reparent the timelines which would have been reparented in normal
// execution.
//
// this is not perfect, but it avoids us a retry happening after a compaction or gc on restart
// which could give us a completely wrong layer combination.
detached
.remote_client
.schedule_adding_existing_layers_to_index_detach_and_wait(
&layers,
(ancestor.timeline_id, ancestor_lsn),
)
.await?;
let mut tasks = tokio::task::JoinSet::new();
// Returns a single permit semaphore which will be used to make one reparenting succeed,
// others will fail as if those timelines had been stopped for whatever reason.
#[cfg(feature = "testing")]
let failpoint_sem = || -> Option<Arc<Semaphore>> {
fail::fail_point!("timeline-detach-ancestor::allow_one_reparented", |_| Some(
Arc::new(Semaphore::new(1))
));
None
}();
// because we are now keeping the slot in progress, it is unlikely that there will be any
// timeline deletions during this time. if we raced one, then we'll just ignore it.
{
let g = tenant.timelines.lock().unwrap();
reparentable_timelines(g.values(), detached, &ancestor, ancestor_lsn)
.cloned()
.for_each(|timeline| {
// important in this scope: we are holding the Tenant::timelines lock
let span = tracing::info_span!("reparent", reparented=%timeline.timeline_id);
let new_parent = detached.timeline_id;
#[cfg(feature = "testing")]
let failpoint_sem = failpoint_sem.clone();
tenant
.timelines
.lock()
.unwrap()
.values()
.filter_map(|tl| {
if Arc::ptr_eq(tl, detached) {
return None;
}
tasks.spawn(
async move {
let res = async {
#[cfg(feature = "testing")]
if let Some(failpoint_sem) = failpoint_sem {
let _permit = failpoint_sem.acquire().await.map_err(|_| {
anyhow::anyhow!(
"failpoint: timeline-detach-ancestor::allow_one_reparented",
)
})?;
failpoint_sem.close();
}
if !tl.is_active() {
return None;
}
timeline
.remote_client
.schedule_reparenting_and_wait(&new_parent)
.await
}
let tl_ancestor = tl.ancestor_timeline.as_ref()?;
let is_same = Arc::ptr_eq(ancestor, tl_ancestor);
let is_earlier = tl.get_ancestor_lsn() <= ancestor_lsn;
let is_deleting = tl
.delete_progress
.try_lock()
.map(|flow| !flow.is_not_started())
.unwrap_or(true);
if is_same && is_earlier && !is_deleting {
Some(tl.clone())
} else {
None
}
})
.for_each(|timeline| {
// important in this scope: we are holding the Tenant::timelines lock
let span = tracing::info_span!("reparent", reparented=%timeline.timeline_id);
let new_parent = detached.timeline_id;
tasks.spawn(
async move {
let res = timeline
.remote_client
.schedule_reparenting_and_wait(&new_parent)
.await;
match res {
Ok(()) => {
tracing::info!("reparented");
Some(timeline)
}
Err(e) => {
// with the use of tenant slot, raced timeline deletion is the most
// likely reason.
tracing::warn!("reparenting failed: {e:#}");
None
}
match res {
Ok(()) => Some(timeline),
Err(e) => {
// with the use of tenant slot, we no longer expect these.
tracing::warn!("reparenting failed: {e:#}");
None
}
}
.instrument(span),
);
});
}
}
.instrument(span),
);
});
let reparenting_candidates = tasks.len();
let mut reparented = HashSet::with_capacity(tasks.len());
@@ -865,103 +645,33 @@ pub(super) async fn detach_and_reparent(
while let Some(res) = tasks.join_next().await {
match res {
Ok(Some(timeline)) => {
tracing::info!(reparented=%timeline.timeline_id, "reparenting done");
assert!(
reparented.insert(timeline.timeline_id),
"duplicate reparenting? timeline_id={}",
timeline.timeline_id
);
}
Ok(None) => {
// let's just ignore this for now. one or all reparented timelines could have
// started deletion, and that is fine.
}
Err(je) if je.is_cancelled() => unreachable!("not used"),
// just ignore failures now, we can retry
Ok(None) => {}
Err(je) if je.is_panic() => {}
Err(je) if je.is_panic() => {
// ignore; it's better to continue with a single reparenting failing (or even
// all of them) in order to get to the goal state.
//
// these timelines will never be reparentable, but they can be always detached as
// separate tree roots.
}
Err(je) => tracing::error!("unexpected join error: {je:?}"),
}
}
let reparented_all = reparenting_candidates == reparented.len();
if reparented_all {
Ok(DetachingAndReparenting::Reparented(reparented))
} else {
tracing::info!(
reparented = reparented.len(),
candidates = reparenting_candidates,
"failed to reparent all candidates; they can be retried after the tenant_reset",
);
let must_reset_tenant = !reparented.is_empty() || was_detached;
Ok(DetachingAndReparenting::SomeReparentingFailed { must_reset_tenant })
}
}
pub(super) async fn complete(
detached: &Arc<Timeline>,
tenant: &Tenant,
mut attempt: Attempt,
_ctx: &RequestContext,
) -> Result<(), Error> {
assert_eq!(detached.timeline_id, attempt.timeline_id);
if attempt.gate_entered.is_none() {
let entered = detached.gate.enter().map_err(|_| Error::ShuttingDown)?;
attempt.gate_entered = Some(entered);
} else {
// Some(gate_entered) means the tenant was not restarted, as a restart was not required
if reparenting_candidates != reparented.len() {
tracing::info!("failed to reparent some candidates");
}
assert!(detached.ancestor_timeline.is_none());
// this should be a 503 at least...?
fail::fail_point!(
"timeline-detach-ancestor::complete_before_uploading",
|_| Err(Error::Failpoint(
"timeline-detach-ancestor::complete_before_uploading"
))
);
tenant
.gc_block
.remove(
detached,
crate::tenant::remote_timeline_client::index::GcBlockingReason::DetachAncestor,
)
.await
// FIXME: better error
.map_err(Error::Unexpected)?;
Ok(())
}
/// Query against a locked `Tenant::timelines`.
fn reparentable_timelines<'a, I>(
timelines: I,
detached: &'a Arc<Timeline>,
ancestor: &'a Arc<Timeline>,
ancestor_lsn: Lsn,
) -> impl Iterator<Item = &'a Arc<Timeline>> + 'a
where
I: Iterator<Item = &'a Arc<Timeline>> + 'a,
{
timelines.filter_map(move |tl| {
if Arc::ptr_eq(tl, detached) {
return None;
}
let tl_ancestor = tl.ancestor_timeline.as_ref()?;
let is_same = Arc::ptr_eq(ancestor, tl_ancestor);
let is_earlier = tl.get_ancestor_lsn() <= ancestor_lsn;
let is_deleting = tl
.delete_progress
.try_lock()
.map(|flow| !flow.is_not_started())
.unwrap_or(true);
if is_same && is_earlier && !is_deleting {
Some(tl)
} else {
None
}
})
Ok(reparented)
}


@@ -335,9 +335,6 @@ pub(super) async fn handle_walreceiver_connection(
filtered_records += 1;
}
// FIXME: this cannot be made pausable_failpoint without fixing the
// failpoint library; in tests, the added amount of debugging will cause us
// to timeout the tests.
fail_point!("walreceiver-after-ingest");
last_rec_lsn = lsn;


@@ -5,17 +5,12 @@
use anyhow::Context;
use std::path::Path;
use utils::serde_percent::Percent;
use pageserver_api::models::PageserverUtilization;
use crate::{config::PageServerConf, tenant::mgr::TenantManager};
pub(crate) fn regenerate(tenants_path: &Path) -> anyhow::Result<PageserverUtilization> {
// TODO: currently the http api ratelimits this to 1Hz at most, which is probably good enough
pub(crate) fn regenerate(
conf: &PageServerConf,
tenants_path: &Path,
tenant_manager: &TenantManager,
) -> anyhow::Result<PageserverUtilization> {
let statvfs = nix::sys::statvfs::statvfs(tenants_path)
.map_err(std::io::Error::from)
.context("statvfs tenants directory")?;
@@ -39,31 +34,16 @@ pub(crate) fn regenerate(
let captured_at = std::time::SystemTime::now();
// Calculate aggregate utilization from tenants on this pageserver
let (disk_wanted_bytes, shard_count) = tenant_manager.calculate_utilization()?;
// Fetch the fraction of disk space which may be used
let disk_usable_pct = match conf.disk_usage_based_eviction.clone() {
Some(e) => e.max_usage_pct,
None => Percent::new(100).unwrap(),
};
// Express a static value for how many shards we may schedule on one node
const MAX_SHARDS: u32 = 20000;
let mut doc = PageserverUtilization {
let doc = PageserverUtilization {
disk_usage_bytes: used,
free_space_bytes: free,
disk_wanted_bytes,
disk_usable_pct,
shard_count,
max_shard_count: MAX_SHARDS,
utilization_score: 0,
// lower is better; start with a constant
//
// note that u64::MAX will be output as i64::MAX as u64, but that should not matter
utilization_score: u64::MAX,
captured_at: utils::serde_system_time::SystemTime(captured_at),
};
doc.refresh_score();
// TODO: make utilization_score into a metric
Ok(doc)
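As an aside, a minimal sketch of how the `used`/`free` byte counts consumed above can be derived from the statvfs result; the accessor names come from the `nix` crate, and the exact arithmetic is an assumption for illustration, not the pageserver's code:

fn disk_usage(tenants_path: &std::path::Path) -> anyhow::Result<(u64, u64)> {
    use anyhow::Context;
    let stat = nix::sys::statvfs::statvfs(tenants_path)
        .map_err(std::io::Error::from)
        .context("statvfs tenants directory")?;
    let block = stat.fragment_size() as u64;
    let total = stat.blocks() as u64 * block;
    let free = stat.blocks_available() as u64 * block;
    Ok((total - free, free)) // (used, free) in bytes
}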


@@ -107,10 +107,8 @@ enum ProcessOnceCell {
}
struct Process {
process: process::WalRedoProcess,
/// This field is last in this struct so the guard gets dropped _after_ [`Self::process`].
/// (Reminder: dropping [`Self::process`] synchronously sends SIGKILL and then `wait()`s for it to exit).
_launched_processes_guard: utils::sync::gate::GateGuard,
process: process::WalRedoProcess,
}
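The field-ordering comment above leans on the fact that Rust drops struct fields in declaration order; a tiny self-contained illustration (type and field names made up):

struct Loud(&'static str);

impl Drop for Loud {
    fn drop(&mut self) {
        println!("dropping {}", self.0);
    }
}

struct Pair {
    process: Loud, // declared first, dropped first
    guard: Loud,   // declared last, dropped last
}

fn main() {
    let _pair = Pair {
        process: Loud("walredo process"),
        guard: Loud("gate guard"),
    };
    // prints "dropping walredo process" and then "dropping gate guard"
}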
impl std::ops::Deref for Process {
@@ -329,23 +327,20 @@ impl PostgresRedoManager {
},
Err(permit) => {
let start = Instant::now();
// acquire guard before spawning process, so that we don't spawn new processes
// if the gate is already closed.
let _launched_processes_guard = match self.launched_processes.enter() {
let proc = Arc::new(Process {
_launched_processes_guard: match self.launched_processes.enter() {
Ok(guard) => guard,
Err(GateError::GateClosed) => unreachable!(
"shutdown sets the once cell to `ManagerShutDown` state before closing the gate"
),
};
let proc = Arc::new(Process {
process: process::WalRedoProcess::launch(
self.conf,
self.tenant_shard_id,
pg_version,
)
.context("launch walredo process")?,
_launched_processes_guard,
});
},
process: process::WalRedoProcess::launch(
self.conf,
self.tenant_shard_id,
pg_version,
)
.context("launch walredo process")?,
});
let duration = start.elapsed();
WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
info!(


@@ -32,7 +32,6 @@
#include "utils/builtins.h"
#include "utils/pg_lsn.h"
#include "utils/guc.h"
#include "utils/guc_tables.h"
#include "utils/wait_event.h"
#include "extension_server.h"
@@ -69,10 +68,10 @@ InitLogicalReplicationMonitor(void)
DefineCustomIntVariable(
"neon.logical_replication_max_snap_files",
"Maximum allowed logical replication .snap files. When exceeded, slots are dropped until the limit is met. -1 disables the limit.",
"Maximum allowed logical replication .snap files",
NULL,
&logical_replication_max_snap_files,
300, -1, INT_MAX,
300, 0, INT_MAX,
PGC_SIGHUP,
0,
NULL, NULL, NULL);
@@ -585,40 +584,6 @@ RestoreRunningXactsFromClog(CheckPoint *checkpoint, TransactionId **xids, int *n
return false;
}
/*
* pgbouncer is able to track GUCs reported by Postgres.
* But most parameters cannot be tracked this way. The only parameters that can be tracked are ones
* that Postgres reports to the client. Unfortunately `search_path` is not reported by Postgres:
* https://www.postgresql.org/message-id/flat/CAGECzQQ6xFcgrg%2Be0p9mCumtK362TiA6vTiiZKoYbS8OXggwuQ%40mail.gmail.com#be4bfd7a9cf1f0633bdb2d1790a0a1be
* This code sets the GUC_REPORT flag for `search_path`, making it possible to include it in
* pgbouncer's `track_extra_parameters` list.
*
* This code is inspired by how the Citus extension does this, see
* https://github.com/citusdata/citus/blob/2a263fe69a707d16ef24378f7650742386b0968f/src/backend/distributed/shared_library_init.c#L2694
*/
static void
ReportSearchPath(void)
{
#if PG_VERSION_NUM >= 160000
int nGucs = 0;
struct config_generic **gucs = get_guc_variables(&nGucs);
#else
struct config_generic **gucs = get_guc_variables();
int nGucs = GetNumConfigOptions();
#endif
for (int i = 0; i < nGucs; i++)
{
struct config_generic *guc = (struct config_generic *) gucs[i];
if (strcmp(guc->name, "search_path") == 0)
{
guc->flags |= GUC_REPORT;
}
}
}
void
_PG_init(void)
{
@@ -634,7 +599,6 @@ _PG_init(void)
pg_init_walproposer();
WalSender_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
LogicalFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
SlotFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
InitLogicalReplicationMonitor();
@@ -662,8 +626,6 @@ _PG_init(void)
* extension was loaded will be removed.
*/
EmitWarningsOnPlaceholders("neon");
ReportSearchPath();
}
PG_FUNCTION_INFO_V1(pg_cluster_size);


@@ -512,7 +512,7 @@ replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRe
}
/*
* Start walproposer streaming replication
* Start walsender streaming replication
*/
static void
walprop_pg_start_streaming(WalProposer *wp, XLogRecPtr startpos)


@@ -20,7 +20,6 @@
#include "utils/guc.h"
#include "postmaster/interrupt.h"
#include "neon.h"
#include "neon_walreader.h"
#include "walproposer.h"
@@ -182,13 +181,6 @@ NeonWALReadSegmentClose(XLogReaderState *xlogreader)
void
NeonOnDemandXLogReaderRoutines(XLogReaderRoutine *xlr)
{
/*
* If safekeepers are not configured, assume we don't need neon_walreader,
* i.e. running neon fork locally.
*/
if (wal_acceptors_list[0] == '\0')
return;
if (!wal_reader)
{
XLogRecPtr epochStartLsn = pg_atomic_read_u64(&GetWalpropShmemState()->propEpochStartLsn);


@@ -186,7 +186,7 @@ static void
fix_infomask_from_infobits(uint8 infobits, uint16 *infomask, uint16 *infomask2)
{
*infomask &= ~(HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY |
HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_EXCL_LOCK | HEAP_COMBOCID);
HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_EXCL_LOCK);
*infomask2 &= ~HEAP_KEYS_UPDATED;
if (infobits & XLHL_XMAX_IS_MULTI)
@@ -195,8 +195,6 @@ fix_infomask_from_infobits(uint8 infobits, uint16 *infomask, uint16 *infomask2)
*infomask |= HEAP_XMAX_LOCK_ONLY;
if (infobits & XLHL_XMAX_EXCL_LOCK)
*infomask |= HEAP_XMAX_EXCL_LOCK;
if (infobits & XLHL_COMBOCID)
*infomask |= HEAP_COMBOCID;
/* note HEAP_XMAX_SHR_LOCK isn't considered here */
if (infobits & XLHL_XMAX_KEYSHR_LOCK)
*infomask |= HEAP_XMAX_KEYSHR_LOCK;
@@ -286,7 +284,7 @@ redo_neon_heap_insert(XLogReaderState *record)
htup->t_infomask = xlhdr.t_infomask;
htup->t_hoff = xlhdr.t_hoff;
HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record));
htup->t_choice.t_heap.t_field3.t_cid = xlhdr.t_cid;
HeapTupleHeaderSetCmin(htup, xlhdr.t_cid);
htup->t_ctid = target_tid;
if (PageAddItem(page, (Item) htup, newlen, xlrec->offnum,
@@ -375,7 +373,7 @@ redo_neon_heap_delete(XLogReaderState *record)
HeapTupleHeaderSetXmax(htup, xlrec->xmax);
else
HeapTupleHeaderSetXmin(htup, InvalidTransactionId);
htup->t_choice.t_heap.t_field3.t_cid = xlrec->t_cid;
HeapTupleHeaderSetCmax(htup, xlrec->t_cid, false);
/* Mark the page as a candidate for pruning */
PageSetPrunable(page, XLogRecGetXid(record));
@@ -492,7 +490,7 @@ redo_neon_heap_update(XLogReaderState *record, bool hot_update)
fix_infomask_from_infobits(xlrec->old_infobits_set, &htup->t_infomask,
&htup->t_infomask2);
HeapTupleHeaderSetXmax(htup, xlrec->old_xmax);
htup->t_choice.t_heap.t_field3.t_cid = xlrec->t_cid;
HeapTupleHeaderSetCmax(htup, xlrec->t_cid, false);
/* Set forward chain link in t_ctid */
htup->t_ctid = newtid;
@@ -625,7 +623,7 @@ redo_neon_heap_update(XLogReaderState *record, bool hot_update)
htup->t_hoff = xlhdr.t_hoff;
HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record));
htup->t_choice.t_heap.t_field3.t_cid = xlhdr.t_cid;
HeapTupleHeaderSetCmin(htup, xlhdr.t_cid);
HeapTupleHeaderSetXmax(htup, xlrec->new_xmax);
/* Make sure there is no forward chain link in t_ctid */
htup->t_ctid = newtid;
@@ -730,7 +728,7 @@ redo_neon_heap_lock(XLogReaderState *record)
offnum);
}
HeapTupleHeaderSetXmax(htup, xlrec->xmax);
htup->t_choice.t_heap.t_field3.t_cid = xlrec->t_cid;
HeapTupleHeaderSetCmax(htup, xlrec->t_cid, false);
PageSetLSN(page, lsn);
MarkBufferDirty(buffer);
}
@@ -842,7 +840,7 @@ redo_neon_heap_multi_insert(XLogReaderState *record)
htup->t_infomask = xlhdr->t_infomask;
htup->t_hoff = xlhdr->t_hoff;
HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record));
htup->t_choice.t_heap.t_field3.t_cid = xlrec->t_cid;
HeapTupleHeaderSetCmin(htup, xlrec->t_cid);
ItemPointerSetBlockNumber(&htup->t_ctid, blkno);
ItemPointerSetOffsetNumber(&htup->t_ctid, offnum);


@@ -1,5 +1,5 @@
[package]
name = "proxy"
name = "proxy-core"
version = "0.1.0"
edition.workspace = true
license.workspace = true
@@ -9,8 +9,11 @@ default = []
testing = []
[dependencies]
proxy-sasl = { version = "0.1", path = "../sasl" }
ahash.workspace = true
anyhow.workspace = true
arc-swap.workspace = true
async-compression.workspace = true
async-trait.workspace = true
atomic-take.workspace = true
@@ -30,7 +33,6 @@ dashmap.workspace = true
env_logger.workspace = true
framed-websockets.workspace = true
futures.workspace = true
git-version.workspace = true
hashbrown.workspace = true
hashlink.workspace = true
hex.workspace = true
@@ -51,17 +53,15 @@ md5.workspace = true
measured = { workspace = true, features = ["lasso"] }
metrics.workspace = true
once_cell.workspace = true
opentelemetry.workspace = true
parking_lot.workspace = true
parquet.workspace = true
parquet_derive.workspace = true
pin-project-lite.workspace = true
postgres_backend.workspace = true
pq_proto.workspace = true
prometheus.workspace = true
rand.workspace = true
regex.workspace = true
remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
remote_storage = { version = "0.1", path = "../../libs/remote_storage/" }
reqwest.workspace = true
reqwest-middleware = { workspace = true, features = ["json"] }
reqwest-retry.workspace = true
@@ -73,14 +73,13 @@ rustls.workspace = true
scopeguard.workspace = true
serde.workspace = true
serde_json.workspace = true
sha2 = { workspace = true, features = ["asm"] }
sha2 = { workspace = true, features = ["asm", "oid"] }
smol_str.workspace = true
smallvec.workspace = true
socket2.workspace = true
subtle.workspace = true
task-local-extensions.workspace = true
thiserror.workspace = true
tikv-jemallocator.workspace = true
tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] }
tokio-postgres.workspace = true
tokio-postgres-rustls.workspace = true
@@ -103,6 +102,14 @@ x509-parser.workspace = true
postgres-protocol.workspace = true
redis.workspace = true
# jwt stuff
jose-jwa = "0.1.2"
jose-jwk = { version = "0.1.2", features = ["p256", "p384", "rsa"] }
signature = "2"
ecdsa = "0.16"
p256 = "0.13"
rsa = "0.9"
workspace_hack.workspace = true
[dev-dependencies]


@@ -38,7 +38,7 @@ pub enum AuthErrorImpl {
/// SASL protocol errors (includes [SCRAM](crate::scram)).
#[error(transparent)]
Sasl(#[from] crate::sasl::Error),
Sasl(#[from] proxy_sasl::sasl::Error),
#[error("Unsupported authentication method: {0}")]
BadAuthMethod(Box<str>),
@@ -148,3 +148,28 @@ impl ReportableError for AuthError {
}
}
}
impl UserFacingError for proxy_sasl::sasl::Error {
fn to_string_client(&self) -> String {
match self {
proxy_sasl::sasl::Error::ChannelBindingFailed(m) => m.to_string(),
proxy_sasl::sasl::Error::ChannelBindingBadMethod(m) => {
format!("unsupported channel binding method {m}")
}
_ => "authentication protocol violation".to_string(),
}
}
}
impl ReportableError for proxy_sasl::sasl::Error {
fn get_error_kind(&self) -> crate::error::ErrorKind {
match self {
proxy_sasl::sasl::Error::ChannelBindingFailed(_) => crate::error::ErrorKind::User,
proxy_sasl::sasl::Error::ChannelBindingBadMethod(_) => crate::error::ErrorKind::User,
proxy_sasl::sasl::Error::BadClientMessage(_) => crate::error::ErrorKind::User,
proxy_sasl::sasl::Error::MissingBinding => crate::error::ErrorKind::Service,
proxy_sasl::sasl::Error::Base64(_) => crate::error::ErrorKind::ControlPlane,
proxy_sasl::sasl::Error::Io(_) => crate::error::ErrorKind::ClientDisconnect,
}
}
}


@@ -1,5 +1,6 @@
mod classic;
mod hacks;
pub mod jwt;
mod link;
use std::net::IpAddr;
@@ -8,6 +9,7 @@ use std::time::Duration;
use ipnet::{Ipv4Net, Ipv6Net};
pub use link::LinkAuthError;
use proxy_sasl::scram;
use tokio::io::{AsyncRead, AsyncWrite};
use tokio_postgres::config::AuthKeys;
use tracing::{info, warn};
@@ -35,7 +37,7 @@ use crate::{
},
stream, url,
};
use crate::{scram, EndpointCacheKey, EndpointId, RoleName};
use crate::{EndpointCacheKey, EndpointId, RoleName};
/// Alternative to [`std::borrow::Cow`] but doesn't need `T: ToOwned` as we don't need that functionality
pub enum MaybeOwned<'a, T> {
@@ -370,8 +372,8 @@ async fn authenticate_with_secret(
let auth_outcome =
validate_password_and_exchange(&config.thread_pool, ep, &password, secret).await?;
let keys = match auth_outcome {
crate::sasl::Outcome::Success(key) => key,
crate::sasl::Outcome::Failure(reason) => {
proxy_sasl::sasl::Outcome::Success(key) => key,
proxy_sasl::sasl::Outcome::Failure(reason) => {
info!("auth backend failed with an error: {reason}");
return Err(auth::AuthError::auth_failed(&*info.user));
}
@@ -557,9 +559,9 @@ mod tests {
context::RequestMonitoring,
proxy::NeonOptions,
rate_limiter::{EndpointRateLimiter, RateBucketInfo},
scram::{threadpool::ThreadPool, ServerSecret},
stream::{PqStream, Stream},
};
use proxy_sasl::scram::{threadpool::ThreadPool, ServerSecret};
use super::{auth_quirks, AuthRateLimiter};
@@ -668,7 +670,11 @@ mod tests {
let ctx = RequestMonitoring::test();
let api = Auth {
ips: vec![],
secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
secret: AuthSecret::Scram(
ServerSecret::build_test_secret("my-secret-password")
.await
.unwrap(),
),
};
let user_info = ComputeUserInfoMaybeEndpoint {
@@ -745,7 +751,11 @@ mod tests {
let ctx = RequestMonitoring::test();
let api = Auth {
ips: vec![],
secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
secret: AuthSecret::Scram(
ServerSecret::build_test_secret("my-secret-password")
.await
.unwrap(),
),
};
let user_info = ComputeUserInfoMaybeEndpoint {
@@ -797,7 +807,11 @@ mod tests {
let ctx = RequestMonitoring::test();
let api = Auth {
ips: vec![],
secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
secret: AuthSecret::Scram(
ServerSecret::build_test_secret("my-secret-password")
.await
.unwrap(),
),
};
let user_info = ComputeUserInfoMaybeEndpoint {


@@ -5,9 +5,9 @@ use crate::{
config::AuthenticationConfig,
console::AuthSecret,
context::RequestMonitoring,
sasl,
stream::{PqStream, Stream},
};
use proxy_sasl::sasl;
use tokio::io::{AsyncRead, AsyncWrite};
use tracing::{info, warn};


@@ -7,9 +7,9 @@ use crate::{
console::AuthSecret,
context::RequestMonitoring,
intern::EndpointIdInt,
sasl,
stream::{self, Stream},
};
use proxy_sasl::sasl;
use tokio::io::{AsyncRead, AsyncWrite};
use tracing::{info, warn};


@@ -0,0 +1,554 @@
use std::{future::Future, sync::Arc, time::Duration};
use anyhow::{bail, ensure, Context};
use arc_swap::ArcSwapOption;
use dashmap::DashMap;
use jose_jwk::crypto::KeyInfo;
use signature::Verifier;
use tokio::time::Instant;
use crate::{http::parse_json_body_with_limit, intern::EndpointIdInt};
// TODO(conrad): make these configurable.
const MIN_RENEW: Duration = Duration::from_secs(30);
const AUTO_RENEW: Duration = Duration::from_secs(300);
const MAX_RENEW: Duration = Duration::from_secs(3600);
const MAX_JWK_BODY_SIZE: usize = 64 * 1024;
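Taken together, these constants drive the renewal policy implemented further down; the following is a summary sketch of the decision they encode (derived from `get_or_update_jwk_cache` and the `kid` lookup loop below, not code from this diff):

enum RenewAction {
    /// cache is older than MAX_RENEW: refetch before validating anything
    BlockAndRefetch,
    /// cache is older than AUTO_RENEW: serve cached keys, refetch in a background task
    RefetchInBackground,
    /// otherwise use the cache; an unknown `kid` may still force one inline refetch,
    /// but only if the cache is older than MIN_RENEW
    UseCached,
}

fn renew_action(cache_age: Duration) -> RenewAction {
    if cache_age > MAX_RENEW {
        RenewAction::BlockAndRefetch
    } else if cache_age > AUTO_RENEW {
        RenewAction::RefetchInBackground
    } else {
        RenewAction::UseCached
    }
}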
/// How to get the JWT auth rules
pub trait FetchAuthRules: Clone + Send + Sync + 'static {
fn fetch_auth_rules(&self) -> impl Future<Output = anyhow::Result<AuthRules>> + Send;
}
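Because the trait returns `impl Future` instead of using `async_trait`, an implementor can still write a plain `async fn`, as the test fixture at the bottom of this file does; a hypothetical static implementation:

#[derive(Clone)]
struct StaticAuthRules {
    // assumed fixed JWKS URL, for illustration only
    jwks_url: url::Url,
}

impl FetchAuthRules for StaticAuthRules {
    async fn fetch_auth_rules(&self) -> anyhow::Result<AuthRules> {
        Ok(AuthRules {
            jwks_urls: vec![self.jwks_url.clone()],
        })
    }
}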
#[derive(Clone)]
struct FetchAuthRulesFromCplane {
#[allow(dead_code)]
endpoint: EndpointIdInt,
}
impl FetchAuthRules for FetchAuthRulesFromCplane {
async fn fetch_auth_rules(&self) -> anyhow::Result<AuthRules> {
Err(anyhow::anyhow!("not yet implemented"))
}
}
pub struct AuthRules {
jwks_urls: Vec<url::Url>,
}
#[derive(Default)]
pub struct JwkCache {
client: reqwest::Client,
map: DashMap<EndpointIdInt, Arc<JwkCacheEntryLock>>,
}
pub struct JwkCacheEntryLock {
cached: ArcSwapOption<JwkCacheEntry>,
lookup: tokio::sync::Semaphore,
}
impl Default for JwkCacheEntryLock {
fn default() -> Self {
JwkCacheEntryLock {
cached: ArcSwapOption::empty(),
lookup: tokio::sync::Semaphore::new(1),
}
}
}
pub struct JwkCacheEntry {
/// Should refetch at least every hour to verify when old keys have been removed.
/// Should refetch when new key IDs are seen, but only every 5 minutes or so
last_retrieved: Instant,
/// cplane will return multiple JWKs urls that we need to scrape.
key_sets: ahash::HashMap<url::Url, jose_jwk::JwkSet>,
}
impl JwkCacheEntryLock {
async fn acquire_permit<'a>(self: &'a Arc<Self>) -> JwkRenewalPermit<'a> {
JwkRenewalPermit::acquire_permit(self).await
}
fn try_acquire_permit<'a>(self: &'a Arc<Self>) -> Option<JwkRenewalPermit<'a>> {
JwkRenewalPermit::try_acquire_permit(self)
}
async fn renew_jwks<F: FetchAuthRules>(
&self,
_permit: JwkRenewalPermit<'_>,
client: &reqwest::Client,
auth_rules: &F,
) -> anyhow::Result<Arc<JwkCacheEntry>> {
// double check that no one beat us to updating the cache.
let now = Instant::now();
let guard = self.cached.load_full();
if let Some(cached) = guard {
let last_update = now.duration_since(cached.last_retrieved);
if last_update < Duration::from_secs(300) {
return Ok(cached);
}
}
let rules = auth_rules.fetch_auth_rules().await?;
let mut key_sets = ahash::HashMap::with_capacity_and_hasher(
rules.jwks_urls.len(),
ahash::RandomState::new(),
);
// TODO(conrad): run concurrently
for url in rules.jwks_urls {
let req = client.get(url.clone());
// TODO(conrad): eventually switch to using reqwest_middleware/`new_client_with_timeout`.
match req.send().await.and_then(|r| r.error_for_status()) {
// todo: should we re-insert JWKs if we want to keep this JWKs URL?
// I expect these failures would be quite sparse.
Err(e) => tracing::warn!(?url, error=?e, "could not fetch JWKs"),
Ok(r) => {
let resp: http::Response<reqwest::Body> = r.into();
match parse_json_body_with_limit::<jose_jwk::JwkSet>(
resp.into_body(),
MAX_JWK_BODY_SIZE,
)
.await
{
Err(e) => tracing::warn!(?url, error=?e, "could not decode JWKs"),
Ok(jwks) => {
key_sets.insert(url, jwks);
}
}
}
}
}
let entry = Arc::new(JwkCacheEntry {
last_retrieved: now,
key_sets,
});
self.cached.swap(Some(Arc::clone(&entry)));
Ok(entry)
}
async fn get_or_update_jwk_cache<F: FetchAuthRules>(
self: &Arc<Self>,
client: &reqwest::Client,
fetch: &F,
) -> Result<Arc<JwkCacheEntry>, anyhow::Error> {
let now = Instant::now();
let guard = self.cached.load_full();
// if we have no cached JWKs, try and get some
let Some(cached) = guard else {
let permit = self.acquire_permit().await;
return self.renew_jwks(permit, client, fetch).await;
};
let last_update = now.duration_since(cached.last_retrieved);
// check if the cached JWKs need updating.
if last_update > MAX_RENEW {
let permit = self.acquire_permit().await;
// it's been too long since we checked the keys. wait for them to update.
return self.renew_jwks(permit, client, fetch).await;
}
// every 5 minutes we should spawn a job to eagerly update the token.
if last_update > AUTO_RENEW {
if let Some(permit) = self.try_acquire_permit() {
tracing::debug!("JWKs should be renewed. Renewal permit acquired");
let permit = permit.into_owned();
let entry = self.clone();
let client = client.clone();
let fetch = fetch.clone();
tokio::spawn(async move {
if let Err(e) = entry.renew_jwks(permit, &client, &fetch).await {
tracing::warn!(error=?e, "could not fetch JWKs in background job");
}
});
} else {
tracing::debug!("JWKs should be renewed. Renewal permit already taken, skipping");
}
}
Ok(cached)
}
async fn check_jwt<F: FetchAuthRules>(
self: &Arc<Self>,
jwt: String,
client: &reqwest::Client,
fetch: &F,
) -> Result<(), anyhow::Error> {
// JWT compact form is defined to be
// <B64(Header)> || . || <B64(Payload)> || . || <B64(Signature)>
// where Signature = alg(<B64(Header)> || . || <B64(Payload)>);
let (header_payload, signature) = jwt
.rsplit_once(".")
.context("not a valid compact JWT encoding")?;
let (header, _payload) = header_payload
.split_once(".")
.context("not a valid compact JWT encoding")?;
let header = base64::decode_config(header, base64::URL_SAFE_NO_PAD)
.context("not a valid compact JWT encoding")?;
let header = serde_json::from_slice::<JWTHeader>(&header)
.context("not a valid compact JWT encoding")?;
ensure!(header.typ == "JWT");
let kid = header.kid.context("missing key id")?;
let mut guard = self.get_or_update_jwk_cache(client, fetch).await?;
// get the key from the JWKs if possible. If not, wait for the keys to update.
let jwk = loop {
let jwk = guard
.key_sets
.values()
.flat_map(|jwks| &jwks.keys)
.find(|jwk| jwk.prm.kid.as_deref() == Some(kid));
match jwk {
Some(jwk) => break jwk,
None if guard.last_retrieved.elapsed() > MIN_RENEW => {
let permit = self.acquire_permit().await;
guard = self.renew_jwks(permit, client, fetch).await?;
}
_ => {
bail!("jwk not found");
}
}
};
ensure!(
jwk.is_supported(&header.alg),
"signature algorithm not supported"
);
let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD)
.context("not a valid compact JWT encoding")?;
match &jwk.key {
jose_jwk::Key::Ec(key) => {
verify_ec_signature(header_payload.as_bytes(), &sig, key)?;
}
jose_jwk::Key::Rsa(key) => {
verify_rsa_signature(header_payload.as_bytes(), &sig, key, &jwk.prm.alg)?;
}
key => bail!("unsupported key type {key:?}"),
};
// TODO(conrad): verify iss, exp, nbf, etc...
Ok(())
}
}
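The compact-form splitting performed at the top of `check_jwt` can be read in isolation as follows (sketch; the helper name is made up):

fn split_compact_jwt(jwt: &str) -> anyhow::Result<(&str, &str, &str)> {
    use anyhow::Context;
    // <B64(Header)> || . || <B64(Payload)> || . || <B64(Signature)>
    let (header_payload, signature) = jwt
        .rsplit_once('.')
        .context("not a valid compact JWT encoding")?;
    let (header, payload) = header_payload
        .split_once('.')
        .context("not a valid compact JWT encoding")?;
    Ok((header, payload, signature))
}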
impl JwkCache {
pub async fn check_jwt(
&self,
endpoint: EndpointIdInt,
jwt: String,
) -> Result<(), anyhow::Error> {
// try with just a read lock first
let entry = self.map.get(&endpoint).as_deref().map(Arc::clone);
let entry = match entry {
Some(entry) => entry,
None => {
// acquire a write lock after to insert.
let entry = self.map.entry(endpoint).or_default();
Arc::clone(&*entry)
}
};
let fetch = FetchAuthRulesFromCplane { endpoint };
entry.check_jwt(jwt, &self.client, &fetch).await
}
}
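The read-then-insert dance above, shown in isolation; same dashmap calls, hypothetical helper name:

fn entry_for(
    map: &DashMap<EndpointIdInt, Arc<JwkCacheEntryLock>>,
    key: EndpointIdInt,
) -> Arc<JwkCacheEntryLock> {
    // fast path: only a shared (read) lock on the shard
    if let Some(entry) = map.get(&key) {
        return Arc::clone(&*entry);
    }
    // slow path: exclusive (write) lock, inserting a default entry if needed
    Arc::clone(&*map.entry(key).or_default())
}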
fn verify_ec_signature(data: &[u8], sig: &[u8], key: &jose_jwk::Ec) -> anyhow::Result<()> {
use ecdsa::Signature;
use signature::Verifier;
match key.crv {
jose_jwk::EcCurves::P256 => {
let pk =
p256::PublicKey::try_from(key).map_err(|_| anyhow::anyhow!("invalid P256 key"))?;
let key = p256::ecdsa::VerifyingKey::from(&pk);
let sig = Signature::from_slice(sig)?;
key.verify(data, &sig)?;
}
key => bail!("unsupported ec key type {key:?}"),
}
Ok(())
}
fn verify_rsa_signature(
data: &[u8],
sig: &[u8],
key: &jose_jwk::Rsa,
alg: &Option<jose_jwa::Algorithm>,
) -> anyhow::Result<()> {
use jose_jwa::{Algorithm, Signing};
use rsa::{
pkcs1v15::{Signature, VerifyingKey},
RsaPublicKey,
};
let key = RsaPublicKey::try_from(key).map_err(|_| anyhow::anyhow!("invalid RSA key"))?;
match alg {
Some(Algorithm::Signing(Signing::Rs256)) => {
let key = VerifyingKey::<sha2::Sha256>::new(key);
let sig = Signature::try_from(sig)?;
key.verify(data, &sig)?;
}
_ => bail!("invalid RSA signing algorithm"),
};
Ok(())
}
/// <https://datatracker.ietf.org/doc/html/rfc7515#section-4.1>
#[derive(serde::Deserialize, serde::Serialize)]
struct JWTHeader<'a> {
/// must be "JWT"
typ: &'a str,
/// must be a supported alg
alg: jose_jwa::Algorithm,
/// key id, must be provided for our usecase
kid: Option<&'a str>,
}
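A small round-trip showing the shape this header accepts; the values are made up and it relies only on the serde derives above:

fn jwt_header_roundtrip() -> anyhow::Result<()> {
    let header = JWTHeader {
        typ: "JWT",
        alg: jose_jwa::Algorithm::Signing(jose_jwa::Signing::Es256),
        kid: Some("key-1"),
    };
    let json = serde_json::to_string(&header)?;
    let parsed: JWTHeader = serde_json::from_str(&json)?;
    anyhow::ensure!(parsed.typ == "JWT" && parsed.kid == Some("key-1"));
    Ok(())
}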
struct JwkRenewalPermit<'a> {
inner: Option<JwkRenewalPermitInner<'a>>,
}
enum JwkRenewalPermitInner<'a> {
Owned(Arc<JwkCacheEntryLock>),
Borrowed(&'a Arc<JwkCacheEntryLock>),
}
impl JwkRenewalPermit<'_> {
fn into_owned(mut self) -> JwkRenewalPermit<'static> {
JwkRenewalPermit {
inner: self.inner.take().map(JwkRenewalPermitInner::into_owned),
}
}
async fn acquire_permit(from: &Arc<JwkCacheEntryLock>) -> JwkRenewalPermit {
match from.lookup.acquire().await {
Ok(permit) => {
permit.forget();
JwkRenewalPermit {
inner: Some(JwkRenewalPermitInner::Borrowed(from)),
}
}
Err(_) => panic!("semaphore should not be closed"),
}
}
fn try_acquire_permit(from: &Arc<JwkCacheEntryLock>) -> Option<JwkRenewalPermit> {
match from.lookup.try_acquire() {
Ok(permit) => {
permit.forget();
Some(JwkRenewalPermit {
inner: Some(JwkRenewalPermitInner::Borrowed(from)),
})
}
Err(tokio::sync::TryAcquireError::NoPermits) => None,
Err(tokio::sync::TryAcquireError::Closed) => panic!("semaphore should not be closed"),
}
}
}
impl JwkRenewalPermitInner<'_> {
fn into_owned(self) -> JwkRenewalPermitInner<'static> {
match self {
JwkRenewalPermitInner::Owned(p) => JwkRenewalPermitInner::Owned(p),
JwkRenewalPermitInner::Borrowed(p) => JwkRenewalPermitInner::Owned(Arc::clone(p)),
}
}
}
impl Drop for JwkRenewalPermit<'_> {
fn drop(&mut self) {
let entry = match &self.inner {
None => return,
Some(JwkRenewalPermitInner::Owned(p)) => p,
Some(JwkRenewalPermitInner::Borrowed(p)) => *p,
};
entry.lookup.add_permits(1);
}
}
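Underneath, the permit handling is tokio's forget/add_permits pattern; a minimal self-contained version of what `acquire_permit` and the `Drop` impl do between them:

async fn manual_permit(sem: &tokio::sync::Semaphore) {
    let permit = sem.acquire().await.expect("semaphore is never closed here");
    // detach the permit from its guard so it is not returned automatically
    permit.forget();
    // ... do the guarded work (here: renewing the JWKs) ...
    // hand the permit back explicitly, as the Drop impl above does
    sem.add_permits(1);
}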
#[cfg(test)]
mod tests {
use super::*;
use std::{future::IntoFuture, net::SocketAddr, time::SystemTime};
use base64::URL_SAFE_NO_PAD;
use bytes::Bytes;
use http::Response;
use http_body_util::Full;
use hyper1::service::service_fn;
use hyper_util::rt::TokioIo;
use rand::rngs::OsRng;
use signature::Signer;
use tokio::net::TcpListener;
fn new_ec_jwk(kid: String) -> (p256::SecretKey, jose_jwk::Jwk) {
let sk = p256::SecretKey::random(&mut OsRng);
let pk = sk.public_key().into();
let jwk = jose_jwk::Jwk {
key: jose_jwk::Key::Ec(pk),
prm: jose_jwk::Parameters {
kid: Some(kid),
alg: Some(jose_jwa::Algorithm::Signing(jose_jwa::Signing::Es256)),
..Default::default()
},
};
(sk, jwk)
}
fn new_rsa_jwk(kid: String) -> (rsa::RsaPrivateKey, jose_jwk::Jwk) {
let sk = rsa::RsaPrivateKey::new(&mut OsRng, 2048).unwrap();
let pk = sk.to_public_key().into();
let jwk = jose_jwk::Jwk {
key: jose_jwk::Key::Rsa(pk),
prm: jose_jwk::Parameters {
kid: Some(kid),
alg: Some(jose_jwa::Algorithm::Signing(jose_jwa::Signing::Rs256)),
..Default::default()
},
};
(sk, jwk)
}
fn build_jwt_payload(kid: String, sig: jose_jwa::Signing) -> String {
let header = JWTHeader {
typ: "JWT",
alg: jose_jwa::Algorithm::Signing(sig),
kid: Some(&kid),
};
let body = typed_json::json! {{
"exp": SystemTime::now().duration_since(SystemTime::UNIX_EPOCH).unwrap().as_secs() + 3600,
}};
let header =
base64::encode_config(serde_json::to_string(&header).unwrap(), URL_SAFE_NO_PAD);
let body = base64::encode_config(body.to_string(), URL_SAFE_NO_PAD);
format!("{header}.{body}")
}
fn new_ec_jwt(kid: String, key: p256::SecretKey) -> String {
use p256::ecdsa::{Signature, SigningKey};
let payload = build_jwt_payload(kid, jose_jwa::Signing::Es256);
let sig: Signature = SigningKey::from(key).sign(payload.as_bytes());
let sig = base64::encode_config(sig.to_bytes(), URL_SAFE_NO_PAD);
format!("{payload}.{sig}")
}
fn new_rsa_jwt(kid: String, key: rsa::RsaPrivateKey) -> String {
use rsa::pkcs1v15::SigningKey;
use rsa::signature::SignatureEncoding;
let payload = build_jwt_payload(kid, jose_jwa::Signing::Rs256);
let sig = SigningKey::<sha2::Sha256>::new(key).sign(payload.as_bytes());
let sig = base64::encode_config(sig.to_bytes(), URL_SAFE_NO_PAD);
format!("{payload}.{sig}")
}
#[tokio::test]
async fn renew() {
let (rs1, jwk1) = new_rsa_jwk("1".into());
let (rs2, jwk2) = new_rsa_jwk("2".into());
let (ec1, jwk3) = new_ec_jwk("3".into());
let (ec2, jwk4) = new_ec_jwk("4".into());
let jwt1 = new_rsa_jwt("1".into(), rs1);
let jwt2 = new_rsa_jwt("2".into(), rs2);
let jwt3 = new_ec_jwt("3".into(), ec1);
let jwt4 = new_ec_jwt("4".into(), ec2);
let foo_jwks = jose_jwk::JwkSet {
keys: vec![jwk1, jwk3],
};
let bar_jwks = jose_jwk::JwkSet {
keys: vec![jwk2, jwk4],
};
let service = service_fn(move |req| {
let foo_jwks = foo_jwks.clone();
let bar_jwks = bar_jwks.clone();
async move {
let jwks = match req.uri().path() {
"/foo" => &foo_jwks,
"/bar" => &bar_jwks,
_ => {
return Response::builder()
.status(404)
.body(Full::new(Bytes::new()));
}
};
let body = serde_json::to_vec(jwks).unwrap();
Response::builder()
.status(200)
.body(Full::new(Bytes::from(body)))
}
});
let listener = TcpListener::bind("0.0.0.0:0").await.unwrap();
let server = hyper1::server::conn::http1::Builder::new();
let addr = listener.local_addr().unwrap();
tokio::spawn(async move {
loop {
let (s, _) = listener.accept().await.unwrap();
let serve = server.serve_connection(TokioIo::new(s), service.clone());
tokio::spawn(serve.into_future());
}
});
let client = reqwest::Client::new();
#[derive(Clone)]
struct Fetch(SocketAddr);
impl FetchAuthRules for Fetch {
async fn fetch_auth_rules(&self) -> anyhow::Result<AuthRules> {
Ok(AuthRules {
jwks_urls: vec![
format!("http://{}/foo", self.0).parse().unwrap(),
format!("http://{}/bar", self.0).parse().unwrap(),
],
})
}
}
let jwk_cache = Arc::new(JwkCacheEntryLock::default());
jwk_cache
.check_jwt(jwt1, &client, &Fetch(addr))
.await
.unwrap();
jwk_cache
.check_jwt(jwt2, &client, &Fetch(addr))
.await
.unwrap();
jwk_cache
.check_jwt(jwt3, &client, &Fetch(addr))
.await
.unwrap();
jwk_cache
.check_jwt(jwt4, &client, &Fetch(addr))
.await
.unwrap();
}
}


@@ -2,16 +2,17 @@
use super::{backend::ComputeCredentialKeys, AuthErrorImpl, PasswordHackPayload};
use crate::{
config::TlsServerEndPoint,
console::AuthSecret,
context::RequestMonitoring,
intern::EndpointIdInt,
sasl,
scram::{self, threadpool::ThreadPool},
stream::{PqStream, Stream},
};
use postgres_protocol::authentication::sasl::{SCRAM_SHA_256, SCRAM_SHA_256_PLUS};
use pq_proto::{BeAuthenticationSaslMessage, BeMessage, BeMessage as Be};
use proxy_sasl::{
sasl,
scram::{self, threadpool::ThreadPool, TlsServerEndPoint},
};
use std::{io, sync::Arc};
use tokio::io::{AsyncRead, AsyncWrite};
use tracing::info;
@@ -56,7 +57,7 @@ impl AuthMethod for PasswordHack {
/// Use clear-text password auth called `password` in docs
/// <https://www.postgresql.org/docs/current/auth-password.html>
pub struct CleartextPassword {
pub pool: Arc<ThreadPool>,
pub pool: Arc<ThreadPool<EndpointIdInt>>,
pub endpoint: EndpointIdInt,
pub secret: AuthSecret,
}
@@ -174,7 +175,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
}
info!("client chooses {}", sasl.method);
let outcome = sasl::SaslStream::new(self.stream, sasl.message)
let outcome = sasl::SaslStream::new(&mut self.stream.framed, sasl.message)
.authenticate(scram::Exchange::new(
secret,
rand::random,
@@ -191,7 +192,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
}
pub(crate) async fn validate_password_and_exchange(
pool: &ThreadPool,
pool: &ThreadPool<EndpointIdInt>,
endpoint: EndpointIdInt,
password: &[u8],
secret: AuthSecret,
@@ -206,7 +207,8 @@ pub(crate) async fn validate_password_and_exchange(
}
// perform scram authentication as both client and server to validate the keys
AuthSecret::Scram(scram_secret) => {
let outcome = crate::scram::exchange(pool, endpoint, &scram_secret, password).await?;
let outcome =
proxy_sasl::scram::exchange(pool, endpoint, &scram_secret, password).await?;
let client_key = match outcome {
sasl::Outcome::Success(client_key) => client_key,


@@ -371,7 +371,8 @@ impl Cache for ProjectInfoCacheImpl {
#[cfg(test)]
mod tests {
use super::*;
use crate::{scram::ServerSecret, ProjectId};
use crate::ProjectId;
use proxy_sasl::scram::ServerSecret;
#[tokio::test]
async fn test_project_info_cache_settings() {


@@ -1,27 +1,26 @@
use crate::{
auth::{self, backend::AuthRateLimiter},
console::locks::ApiLocks,
intern::EndpointIdInt,
rate_limiter::{RateBucketInfo, RateLimitAlgorithm, RateLimiterConfig},
scram::threadpool::ThreadPool,
serverless::{cancel_set::CancelSet, GlobalConnPoolOptions},
Host,
};
use anyhow::{bail, ensure, Context, Ok};
use itertools::Itertools;
use proxy_sasl::scram::{threadpool::ThreadPool, TlsServerEndPoint};
use remote_storage::RemoteStorageConfig;
use rustls::{
crypto::ring::sign,
pki_types::{CertificateDer, PrivateKeyDer},
};
use sha2::{Digest, Sha256};
use std::{
collections::{HashMap, HashSet},
str::FromStr,
sync::Arc,
time::Duration,
};
use tracing::{error, info};
use x509_parser::oid_registry;
pub struct ProxyConfig {
pub tls_config: Option<TlsConfig>,
@@ -58,7 +57,7 @@ pub struct HttpConfig {
}
pub struct AuthenticationConfig {
pub thread_pool: Arc<ThreadPool>,
pub thread_pool: Arc<ThreadPool<EndpointIdInt>>,
pub scram_protocol_timeout: tokio::time::Duration,
pub rate_limiter_enabled: bool,
pub rate_limiter: AuthRateLimiter,
@@ -126,66 +125,6 @@ pub fn configure_tls(
})
}
/// Channel binding parameter
///
/// <https://www.rfc-editor.org/rfc/rfc5929#section-4>
/// Description: The hash of the TLS server's certificate as it
/// appears, octet for octet, in the server's Certificate message. Note
/// that the Certificate message contains a certificate_list, in which
/// the first element is the server's certificate.
///
/// The hash function is to be selected as follows:
///
/// * if the certificate's signatureAlgorithm uses a single hash
/// function, and that hash function is either MD5 or SHA-1, then use SHA-256;
///
/// * if the certificate's signatureAlgorithm uses a single hash
/// function and that hash function is neither MD5 nor SHA-1, then use
/// the hash function associated with the certificate's
/// signatureAlgorithm;
///
/// * if the certificate's signatureAlgorithm uses no hash functions or
/// uses multiple hash functions, then this channel binding type's
/// channel bindings are undefined at this time (updates to this channel
/// binding type may occur to address this issue if it ever arises).
#[derive(Debug, Clone, Copy)]
pub enum TlsServerEndPoint {
Sha256([u8; 32]),
Undefined,
}
impl TlsServerEndPoint {
pub fn new(cert: &CertificateDer) -> anyhow::Result<Self> {
let sha256_oids = [
// I'm explicitly not adding MD5 or SHA1 here... They're bad.
oid_registry::OID_SIG_ECDSA_WITH_SHA256,
oid_registry::OID_PKCS1_SHA256WITHRSA,
];
let pem = x509_parser::parse_x509_certificate(cert)
.context("Failed to parse PEM object from cerficiate")?
.1;
info!(subject = %pem.subject, "parsing TLS certificate");
let reg = oid_registry::OidRegistry::default().with_all_crypto();
let oid = pem.signature_algorithm.oid();
let alg = reg.get(oid);
if sha256_oids.contains(oid) {
let tls_server_end_point: [u8; 32] = Sha256::new().chain_update(cert).finalize().into();
info!(subject = %pem.subject, signature_algorithm = alg.map(|a| a.description()), tls_server_end_point = %base64::encode(tls_server_end_point), "determined channel binding");
Ok(Self::Sha256(tls_server_end_point))
} else {
error!(subject = %pem.subject, signature_algorithm = alg.map(|a| a.description()), "unknown channel binding");
Ok(Self::Undefined)
}
}
pub fn supported(&self) -> bool {
!matches!(self, TlsServerEndPoint::Undefined)
}
}
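For the SHA-256 branch above, the channel-binding value is simply a hash of the raw certificate bytes; a sketch using the same `sha2` calls (hypothetical helper name):

use sha2::{Digest, Sha256};

fn tls_server_end_point_sha256(cert_der: &[u8]) -> [u8; 32] {
    // hash of the server certificate exactly as it appears on the wire (RFC 5929, section 4)
    Sha256::new().chain_update(cert_der).finalize().into()
}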
#[derive(Default, Debug)]
pub struct CertResolver {
certs: HashMap<String, (Arc<rustls::sign::CertifiedKey>, TlsServerEndPoint)>,

View File

@@ -16,9 +16,10 @@ use crate::{
intern::ProjectIdInt,
metrics::ApiLockMetrics,
rate_limiter::{DynamicLimiter, Outcome, RateLimiterConfig, Token},
scram, EndpointCacheKey,
EndpointCacheKey,
};
use dashmap::DashMap;
use proxy_sasl::scram;
use std::{hash::Hash, sync::Arc, time::Duration};
use tokio::time::Instant;
use tracing::info;
@@ -469,15 +470,15 @@ impl<K: Hash + Eq + Clone> ApiLocks<K> {
timeout: Duration,
epoch: std::time::Duration,
metrics: &'static ApiLockMetrics,
) -> prometheus::Result<Self> {
Ok(Self {
) -> Self {
Self {
name,
node_locks: DashMap::with_shard_amount(shards),
config,
timeout,
epoch,
metrics,
})
}
}
pub async fn get_permit(&self, key: &K) -> Result<WakeComputePermit, ApiLockError> {


@@ -5,7 +5,7 @@ use super::{
AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo,
};
use crate::context::RequestMonitoring;
use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl};
use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, url::ApiUrl};
use crate::{auth::IpPattern, cache::Cached};
use crate::{
console::{
@@ -15,6 +15,7 @@ use crate::{
BranchId, EndpointId, ProjectId,
};
use futures::TryFutureExt;
use proxy_sasl::scram;
use std::{str::FromStr, sync::Arc};
use thiserror::Error;
use tokio_postgres::{config::SslMode, Client};


@@ -13,10 +13,11 @@ use crate::{
http,
metrics::{CacheOutcome, Metrics},
rate_limiter::WakeComputeRateLimiter,
scram, EndpointCacheKey,
EndpointCacheKey,
};
use crate::{cache::Cached, context::RequestMonitoring};
use futures::TryFutureExt;
use proxy_sasl::scram;
use std::{sync::Arc, time::Duration};
use tokio::time::Instant;
use tokio_postgres::config::SslMode;


@@ -6,6 +6,12 @@ pub mod health_server;
use std::time::Duration;
use anyhow::bail;
use bytes::Bytes;
use http_body_util::BodyExt;
use hyper1::body::Body;
use serde::de::DeserializeOwned;
pub use reqwest::{Request, Response, StatusCode};
pub use reqwest_middleware::{ClientWithMiddleware, Error};
pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
@@ -96,6 +102,33 @@ impl Endpoint {
}
}
pub async fn parse_json_body_with_limit<D: DeserializeOwned>(
mut b: impl Body<Data = Bytes, Error = reqwest::Error> + Unpin,
limit: usize,
) -> anyhow::Result<D> {
// We could use `b.limited().collect().await.to_bytes()` here
// but this ends up being slightly more efficient as far as I can tell.
// check the lower bound of the size hint.
// in reqwest, this value is influenced by the Content-Length header.
let lower_bound = match usize::try_from(b.size_hint().lower()) {
Ok(bound) if bound <= limit => bound,
_ => bail!("content length exceeds limit"),
};
let mut bytes = Vec::with_capacity(lower_bound);
while let Some(frame) = b.frame().await.transpose()? {
if let Ok(data) = frame.into_data() {
if bytes.len() + data.len() > limit {
bail!("content length exceeds limit")
}
bytes.extend_from_slice(&data);
}
}
Ok(serde_json::from_slice::<D>(&bytes)?)
}
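A hypothetical call site, mirroring how the JWKs cache uses this helper; the conversion into `http::Response<reqwest::Body>` is the same one used there:

async fn fetch_json_limited<D: DeserializeOwned>(
    client: &reqwest::Client,
    url: url::Url,
    limit: usize,
) -> anyhow::Result<D> {
    let resp: http::Response<reqwest::Body> =
        client.get(url).send().await?.error_for_status()?.into();
    parse_json_body_with_limit(resp.into_body(), limit).await
}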
#[cfg(test)]
mod tests {
use super::*;


@@ -21,13 +21,13 @@ pub mod intern;
pub mod jemalloc;
pub mod logging;
pub mod metrics;
pub mod parse;
// pub mod parse;
pub mod protocol2;
pub mod proxy;
pub mod rate_limiter;
pub mod redis;
pub mod sasl;
pub mod scram;
// pub mod sasl;
// pub mod scram;
pub mod serverless;
pub mod stream;
pub mod url;


@@ -2,13 +2,14 @@ use std::sync::{Arc, OnceLock};
use lasso::ThreadedRodeo;
use measured::{
label::{FixedCardinalitySet, LabelGroupSet, LabelName, LabelSet, LabelValue, StaticLabelSet},
label::StaticLabelSet,
metric::{histogram::Thresholds, name::MetricName},
Counter, CounterVec, FixedCardinalityLabel, Gauge, GaugeVec, Histogram, HistogramVec,
LabelGroup, MetricGroup,
Counter, CounterVec, FixedCardinalityLabel, Gauge, Histogram, HistogramVec, LabelGroup,
MetricGroup,
};
use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec};
use proxy_sasl::scram::threadpool::ThreadPoolMetrics;
use tokio::time::{self, Instant};
use crate::console::messages::ColdStartInfo;
@@ -546,78 +547,3 @@ pub enum RedisEventsCount {
PasswordUpdate,
AllowedIpsUpdate,
}
pub struct ThreadPoolWorkers(usize);
pub struct ThreadPoolWorkerId(pub usize);
impl LabelValue for ThreadPoolWorkerId {
fn visit<V: measured::label::LabelVisitor>(&self, v: V) -> V::Output {
v.write_int(self.0 as i64)
}
}
impl LabelGroup for ThreadPoolWorkerId {
fn visit_values(&self, v: &mut impl measured::label::LabelGroupVisitor) {
v.write_value(LabelName::from_str("worker"), self);
}
}
impl LabelGroupSet for ThreadPoolWorkers {
type Group<'a> = ThreadPoolWorkerId;
fn cardinality(&self) -> Option<usize> {
Some(self.0)
}
fn encode_dense(&self, value: Self::Unique) -> Option<usize> {
Some(value)
}
fn decode_dense(&self, value: usize) -> Self::Group<'_> {
ThreadPoolWorkerId(value)
}
type Unique = usize;
fn encode(&self, value: Self::Group<'_>) -> Option<Self::Unique> {
Some(value.0)
}
fn decode(&self, value: &Self::Unique) -> Self::Group<'_> {
ThreadPoolWorkerId(*value)
}
}
impl LabelSet for ThreadPoolWorkers {
type Value<'a> = ThreadPoolWorkerId;
fn dynamic_cardinality(&self) -> Option<usize> {
Some(self.0)
}
fn encode(&self, value: Self::Value<'_>) -> Option<usize> {
(value.0 < self.0).then_some(value.0)
}
fn decode(&self, value: usize) -> Self::Value<'_> {
ThreadPoolWorkerId(value)
}
}
impl FixedCardinalitySet for ThreadPoolWorkers {
fn cardinality(&self) -> usize {
self.0
}
}
#[derive(MetricGroup)]
#[metric(new(workers: usize))]
pub struct ThreadPoolMetrics {
pub injector_queue_depth: Gauge,
#[metric(init = GaugeVec::with_label_set(ThreadPoolWorkers(workers)))]
pub worker_queue_depth: GaugeVec<ThreadPoolWorkers>,
#[metric(init = CounterVec::with_label_set(ThreadPoolWorkers(workers)))]
pub worker_task_turns_total: CounterVec<ThreadPoolWorkers>,
#[metric(init = CounterVec::with_label_set(ThreadPoolWorkers(workers)))]
pub worker_task_skips_total: CounterVec<ThreadPoolWorkers>,
}


@@ -16,9 +16,10 @@ use crate::console::messages::{ConsoleError, Details, MetricsAuxInfo, Status};
use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBackend};
use crate::console::{self, CachedNodeInfo, NodeInfo};
use crate::error::ErrorKind;
use crate::{http, sasl, scram, BranchId, EndpointId, ProjectId};
use crate::{http, BranchId, EndpointId, ProjectId};
use anyhow::{bail, Context};
use async_trait::async_trait;
use proxy_sasl::{sasl, scram};
use retry::{retry_after, ShouldRetryWakeCompute};
use rstest::rstest;
use rustls::pki_types;
@@ -137,7 +138,7 @@ struct Scram(scram::ServerSecret);
impl Scram {
async fn new(password: &str) -> anyhow::Result<Self> {
let secret = scram::ServerSecret::build(password)
let secret = scram::ServerSecret::build_test_secret(password)
.await
.context("failed to generate scram secret")?;
Ok(Scram(secret))


@@ -79,11 +79,11 @@ impl PoolingBackend {
)
.await?;
let res = match auth_outcome {
crate::sasl::Outcome::Success(key) => {
proxy_sasl::sasl::Outcome::Success(key) => {
info!("user successfully authenticated");
Ok(key)
}
crate::sasl::Outcome::Failure(reason) => {
proxy_sasl::sasl::Outcome::Failure(reason) => {
info!("auth backend failed with an error: {reason}");
Err(AuthError::auth_failed(&*conn_info.user_info.user))
}


@@ -34,7 +34,6 @@ use tracing::error;
use tracing::info;
use typed_json::json;
use url::Url;
use urlencoding;
use utils::http::error::ApiError;
use crate::auth::backend::ComputeUserInfo;
@@ -169,8 +168,7 @@ fn get_conn_info(
.path_segments()
.ok_or(ConnInfoError::MissingDbName)?;
let dbname: DbName =
urlencoding::decode(url_path.next().ok_or(ConnInfoError::InvalidDbName)?)?.into();
let dbname: DbName = url_path.next().ok_or(ConnInfoError::InvalidDbName)?.into();
ctx.set_dbname(dbname.clone());
let username = RoleName::from(urlencoding::decode(connection_url.username())?);

Some files were not shown because too many files have changed in this diff.