Merge pull request #7041 from neondatabase/rc/proxy/2024-03-07

Proxy release 2024-03-07
pageserver: don't validate vectored get on shut-down (#7039 )
2026-05-14 03:30:36 +00:00 · 2024-03-08 08:19:16 +00:00 · 2024-03-07 12:37:52 +00:00 · 2024-03-07 12:36:47 +00:00 · 2024-03-06 12:20:44 -05:00 · 2024-03-06 17:09:54 +00:00
96 changed files with 3430 additions and 1427 deletions
--- a/.github/ISSUE_TEMPLATE/epic-template.md
+++ b/.github/ISSUE_TEMPLATE/epic-template.md
@@ -16,9 +16,9 @@ assignees: ''

 ## Implementation ideas

-
+## Tasks
 ```[tasklist]
-### Tasks
+- [ ] Example Task
 ```


--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -49,6 +49,76 @@ concurrency:
  cancel-in-progress: true

 jobs:
+  bench:
+    env:
+      TEST_PG_BENCH_DURATIONS_MATRIX: "300"
+      TEST_PG_BENCH_SCALES_MATRIX: "10,100"
+      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
+      DEFAULT_PG_VERSION: 14
+      TEST_OUTPUT: /tmp/test_output
+      BUILD_TYPE: remote
+      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
+      PLATFORM: "neon-staging"
+
+    runs-on: [ self-hosted, us-east-2, x64 ]
+    container:
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
+      options: --init
+
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Download Neon artifact
+      uses: ./.github/actions/download
+      with:
+        name: neon-${{ runner.os }}-release-artifact
+        path: /tmp/neon/
+        prefix: latest
+
+    - name: Create Neon Project
+      id: create-neon-project
+      uses: ./.github/actions/neon-project-create
+      with:
+        region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
+        postgres_version: ${{ env.DEFAULT_PG_VERSION }}
+        api_key: ${{ secrets.NEON_STAGING_API_KEY }}
+
+    - name: Run benchmark
+      uses: ./.github/actions/run-python-test-set
+      with:
+        build_type: ${{ env.BUILD_TYPE }}
+        test_selection: performance
+        run_in_parallel: false
+        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
+        # Set --sparse-ordering option of pytest-order plugin
+        # to ensure tests are running in order of appears in the file.
+        # It's important for test_perf_pgbench.py::test_pgbench_remote_* tests
+        extra_params: -m remote_cluster --sparse-ordering --timeout 5400 --ignore test_runner/performance/test_perf_olap.py
+      env:
+        BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
+        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
+        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
+
+    - name: Delete Neon Project
+      if: ${{ always() }}
+      uses: ./.github/actions/neon-project-delete
+      with:
+        project_id: ${{ steps.create-neon-project.outputs.project_id }}
+        api_key: ${{ secrets.NEON_STAGING_API_KEY }}
+
+    - name: Create Allure report
+      if: ${{ !cancelled() }}
+      uses: ./.github/actions/allure-report-generate
+
+    - name: Post to a Slack channel
+      if: ${{ github.event.schedule && failure() }}
+      uses: slackapi/slack-github-action@v1
+      with:
+        channel-id: "C033QLM5P7D" # dev-staging-stream
+        slack-message: "Periodic perf testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+      env:
+        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
+
  generate-matrices:
    # Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday)
    #
@@ -65,31 +135,72 @@ jobs:
      pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }}
      olap-compare-matrix: ${{ steps.olap-compare-matrix.outputs.matrix }}
      tpch-compare-matrix: ${{ steps.tpch-compare-matrix.outputs.matrix }}
-      pgbench-compare-big-db-matrix: ${{ steps.pgbench-compare-big-db-matrix.outputs.matrix }}

    steps:
-    - name: Generate matrix for pgbench benchmark with big databases
-      id: pgbench-compare-big-db-matrix
+    - name: Generate matrix for pgbench benchmark
+      id: pgbench-compare-matrix
      run: |
-        # There's also `neonvm-captest-reuse` platform, but we don't want to use it
-        # because DevRel team used it for some demos, they might need it again
+        matrix='{
+          "platform": [
+            "neon-captest-new",
+            "neon-captest-reuse",
+            "neonvm-captest-new"
+          ],
+          "db_size": [ "10gb" ],
+          "include": [{ "platform": "neon-captest-freetier",   "db_size": "3gb"  },
+                      { "platform": "neon-captest-new",        "db_size": "50gb" },
+                      { "platform": "neonvm-captest-freetier", "db_size": "3gb"  },
+                      { "platform": "neonvm-captest-new",      "db_size": "50gb" }]
+        }'

+        if [ "$(date +%A)" = "Saturday" ]; then
+          matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
+                                                   { "platform": "rds-aurora",   "db_size": "50gb"}]')
+        fi
+
+        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
+
+    - name: Generate matrix for OLAP benchmarks
+      id: olap-compare-matrix
+      run: |
+        matrix='{
+          "platform": [
+            "neon-captest-reuse"
+          ]
+        }'
+
+        if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
+          matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" },
+                                                   { "platform": "rds-aurora"   }]')
+        fi
+
+        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
+
+    - name: Generate matrix for TPC-H benchmarks
+      id: tpch-compare-matrix
+      run: |
        matrix='{
          "platform": [
            "neon-captest-reuse"
          ],
-          "db_size": [ "1tb" ]
+          "scale": [
+            "10"
+          ]
        }'

-        echo "matrix=$(echo $matrix | jq --compact-output '.')" >> $GITHUB_OUTPUT
+        if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
+          matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" },
+                                                    { "platform": "rds-aurora",   "scale": "10" }]')
+        fi

-  pgbench-compare-big-db:
-    if: ${{ !cancelled() }}
+        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
+
+  pgbench-compare:
    needs: [ generate-matrices ]

    strategy:
      fail-fast: false
-      matrix: ${{fromJson(needs.generate-matrices.outputs.pgbench-compare-big-db-matrix)}}
+      matrix: ${{fromJson(needs.generate-matrices.outputs.pgbench-compare-matrix)}}

    env:
      TEST_PG_BENCH_DURATIONS_MATRIX: "60m"
@@ -106,6 +217,350 @@ jobs:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
      options: --init

+    # Increase timeout to 8h, default timeout is 6h
+    timeout-minutes: 480
+
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Download Neon artifact
+      uses: ./.github/actions/download
+      with:
+        name: neon-${{ runner.os }}-release-artifact
+        path: /tmp/neon/
+        prefix: latest
+
+    - name: Add Postgres binaries to PATH
+      run: |
+        ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
+        echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
+
+    - name: Create Neon Project
+      if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier"]'), matrix.platform)
+      id: create-neon-project
+      uses: ./.github/actions/neon-project-create
+      with:
+        region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
+        postgres_version: ${{ env.DEFAULT_PG_VERSION }}
+        api_key: ${{ secrets.NEON_STAGING_API_KEY }}
+        compute_units: ${{ (matrix.platform == 'neon-captest-freetier' && '[0.25, 0.25]') || '[1, 1]' }}
+        provisioner: ${{ (contains(matrix.platform, 'neonvm-') && 'k8s-neonvm') || 'k8s-pod' }}
+
+    - name: Set up Connection String
+      id: set-up-connstr
+      run: |
+        case "${PLATFORM}" in
+          neon-captest-reuse)
+            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
+            ;;
+          neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier)
+            CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
+            ;;
+          rds-aurora)
+            CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_CONNSTR }}
+            ;;
+          rds-postgres)
+            CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }}
+            ;;
+          *)
+            echo >&2 "Unknown PLATFORM=${PLATFORM}"
+            exit 1
+            ;;
+        esac
+
+        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
+
+        QUERY="SELECT version();"
+        if [[ "${PLATFORM}" = "neon"* ]]; then
+          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
+        fi
+        psql ${CONNSTR} -c "${QUERY}"
+
+    - name: Benchmark init
+      uses: ./.github/actions/run-python-test-set
+      with:
+        build_type: ${{ env.BUILD_TYPE }}
+        test_selection: performance
+        run_in_parallel: false
+        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
+        extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_init
+      env:
+        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
+        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
+        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
+
+    - name: Benchmark simple-update
+      uses: ./.github/actions/run-python-test-set
+      with:
+        build_type: ${{ env.BUILD_TYPE }}
+        test_selection: performance
+        run_in_parallel: false
+        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
+        extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_simple_update
+      env:
+        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
+        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
+        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
+
+    - name: Benchmark select-only
+      uses: ./.github/actions/run-python-test-set
+      with:
+        build_type: ${{ env.BUILD_TYPE }}
+        test_selection: performance
+        run_in_parallel: false
+        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
+        extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_select_only
+      env:
+        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
+        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
+        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
+
+    - name: Delete Neon Project
+      if: ${{ steps.create-neon-project.outputs.project_id && always() }}
+      uses: ./.github/actions/neon-project-delete
+      with:
+        project_id: ${{ steps.create-neon-project.outputs.project_id }}
+        api_key: ${{ secrets.NEON_STAGING_API_KEY }}
+
+    - name: Create Allure report
+      if: ${{ !cancelled() }}
+      uses: ./.github/actions/allure-report-generate
+
+    - name: Post to a Slack channel
+      if: ${{ github.event.schedule && failure() }}
+      uses: slackapi/slack-github-action@v1
+      with:
+        channel-id: "C033QLM5P7D" # dev-staging-stream
+        slack-message: "Periodic perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+      env:
+        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
+
+  clickbench-compare:
+    # ClichBench DB for rds-aurora and rds-Postgres deployed to the same clusters
+    # we use for performance testing in pgbench-compare.
+    # Run this job only when pgbench-compare is finished to avoid the intersection.
+    # We might change it after https://github.com/neondatabase/neon/issues/2900.
+    #
+    # *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows
+    # *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB
+    if: ${{ !cancelled() }}
+    needs: [ generate-matrices, pgbench-compare ]
+
+    strategy:
+      fail-fast: false
+      matrix: ${{ fromJson(needs.generate-matrices.outputs.olap-compare-matrix) }}
+
+    env:
+      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
+      DEFAULT_PG_VERSION: 14
+      TEST_OUTPUT: /tmp/test_output
+      TEST_OLAP_COLLECT_EXPLAIN: ${{ github.event.inputs.collect_olap_explain }}
+      TEST_OLAP_COLLECT_PG_STAT_STATEMENTS: ${{ github.event.inputs.collect_pg_stat_statements }}
+      BUILD_TYPE: remote
+      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
+      PLATFORM: ${{ matrix.platform }}
+
+    runs-on: [ self-hosted, us-east-2, x64 ]
+    container:
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
+      options: --init
+
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Download Neon artifact
+      uses: ./.github/actions/download
+      with:
+        name: neon-${{ runner.os }}-release-artifact
+        path: /tmp/neon/
+        prefix: latest
+
+    - name: Add Postgres binaries to PATH
+      run: |
+        ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
+        echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
+
+    - name: Set up Connection String
+      id: set-up-connstr
+      run: |
+        case "${PLATFORM}" in
+          neon-captest-reuse)
+            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR }}
+            ;;
+          rds-aurora)
+            CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_CLICKBENCH_10M_CONNSTR }}
+            ;;
+          rds-postgres)
+            CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CLICKBENCH_10M_CONNSTR }}
+            ;;
+          *)
+            echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
+            exit 1
+            ;;
+        esac
+
+        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
+
+        QUERY="SELECT version();"
+        if [[ "${PLATFORM}" = "neon"* ]]; then
+          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
+        fi
+        psql ${CONNSTR} -c "${QUERY}"
+
+    - name: ClickBench benchmark
+      uses: ./.github/actions/run-python-test-set
+      with:
+        build_type: ${{ env.BUILD_TYPE }}
+        test_selection: performance/test_perf_olap.py
+        run_in_parallel: false
+        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
+        extra_params: -m remote_cluster --timeout 21600 -k test_clickbench
+      env:
+        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
+        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
+        TEST_OLAP_COLLECT_EXPLAIN: ${{ github.event.inputs.collect_olap_explain || 'false' }}
+        TEST_OLAP_COLLECT_PG_STAT_STATEMENTS: ${{ github.event.inputs.collect_pg_stat_statements || 'false' }}
+        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
+        TEST_OLAP_SCALE: 10
+
+    - name: Create Allure report
+      if: ${{ !cancelled() }}
+      uses: ./.github/actions/allure-report-generate
+
+    - name: Post to a Slack channel
+      if: ${{ github.event.schedule && failure() }}
+      uses: slackapi/slack-github-action@v1
+      with:
+        channel-id: "C033QLM5P7D" # dev-staging-stream
+        slack-message: "Periodic OLAP perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+      env:
+        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
+
+  tpch-compare:
+    # TCP-H DB for rds-aurora and rds-Postgres deployed to the same clusters
+    # we use for performance testing in pgbench-compare & clickbench-compare.
+    # Run this job only when clickbench-compare is finished to avoid the intersection.
+    # We might change it after https://github.com/neondatabase/neon/issues/2900.
+    #
+    # *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB)
+    if: ${{ !cancelled() }}
+    needs: [ generate-matrices, clickbench-compare ]
+
+    strategy:
+      fail-fast: false
+      matrix: ${{ fromJson(needs.generate-matrices.outputs.tpch-compare-matrix) }}
+
+    env:
+      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
+      DEFAULT_PG_VERSION: 14
+      TEST_OUTPUT: /tmp/test_output
+      BUILD_TYPE: remote
+      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
+      PLATFORM: ${{ matrix.platform }}
+      TEST_OLAP_SCALE: ${{ matrix.scale }}
+
+    runs-on: [ self-hosted, us-east-2, x64 ]
+    container:
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
+      options: --init
+
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Download Neon artifact
+      uses: ./.github/actions/download
+      with:
+        name: neon-${{ runner.os }}-release-artifact
+        path: /tmp/neon/
+        prefix: latest
+
+    - name: Add Postgres binaries to PATH
+      run: |
+        ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
+        echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
+
+    - name: Get Connstring Secret Name
+      run: |
+        case "${PLATFORM}" in
+          neon-captest-reuse)
+            ENV_PLATFORM=CAPTEST_TPCH
+            ;;
+          rds-aurora)
+            ENV_PLATFORM=RDS_AURORA_TPCH
+            ;;
+          rds-postgres)
+            ENV_PLATFORM=RDS_AURORA_TPCH
+            ;;
+          *)
+            echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
+            exit 1
+            ;;
+        esac
+
+        CONNSTR_SECRET_NAME="BENCHMARK_${ENV_PLATFORM}_S${TEST_OLAP_SCALE}_CONNSTR"
+        echo "CONNSTR_SECRET_NAME=${CONNSTR_SECRET_NAME}" >> $GITHUB_ENV
+
+    - name: Set up Connection String
+      id: set-up-connstr
+      run: |
+        CONNSTR=${{ secrets[env.CONNSTR_SECRET_NAME] }}
+
+        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
+
+        QUERY="SELECT version();"
+        if [[ "${PLATFORM}" = "neon"* ]]; then
+          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
+        fi
+        psql ${CONNSTR} -c "${QUERY}"
+
+    - name: Run TPC-H benchmark
+      uses: ./.github/actions/run-python-test-set
+      with:
+        build_type: ${{ env.BUILD_TYPE }}
+        test_selection: performance/test_perf_olap.py
+        run_in_parallel: false
+        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
+        extra_params: -m remote_cluster --timeout 21600 -k test_tpch
+      env:
+        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
+        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
+        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
+        TEST_OLAP_SCALE: ${{ matrix.scale }}
+
+    - name: Create Allure report
+      if: ${{ !cancelled() }}
+      uses: ./.github/actions/allure-report-generate
+
+    - name: Post to a Slack channel
+      if: ${{ github.event.schedule && failure() }}
+      uses: slackapi/slack-github-action@v1
+      with:
+        channel-id: "C033QLM5P7D" # dev-staging-stream
+        slack-message: "Periodic TPC-H perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+      env:
+        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
+
+  user-examples-compare:
+    if: ${{ !cancelled() }}
+    needs: [ generate-matrices, tpch-compare ]
+
+    strategy:
+      fail-fast: false
+      matrix: ${{ fromJson(needs.generate-matrices.outputs.olap-compare-matrix) }}
+
+    env:
+      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
+      DEFAULT_PG_VERSION: 14
+      TEST_OUTPUT: /tmp/test_output
+      BUILD_TYPE: remote
+      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
+      PLATFORM: ${{ matrix.platform }}
+
+    runs-on: [ self-hosted, us-east-2, x64 ]
+    container:
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
+      options: --init
+
    steps:
    - uses: actions/checkout@v4

@@ -126,46 +581,40 @@ jobs:
      run: |
        case "${PLATFORM}" in
          neon-captest-reuse)
-            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_PGBENCH_1TB_CONNSTR }}
+            CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_CAPTEST_CONNSTR }}
            ;;
-          neonvm-captest-reuse)
-            CONNSTR=${{ secrets.BENCHMARK_NEONVM_PGBENCH_1TB_CONNSTR }}
+          rds-aurora)
+            CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_RDS_AURORA_CONNSTR }}
+            ;;
+          rds-postgres)
+            CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_RDS_POSTGRES_CONNSTR }}
            ;;
          *)
-            echo >&2 "Unknown PLATFORM=${PLATFORM}"
+            echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
            exit 1
            ;;
        esac

        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-        psql ${CONNSTR} -c "SELECT version();"
+        QUERY="SELECT version();"
+        if [[ "${PLATFORM}" = "neon"* ]]; then
+          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
+        fi
+        psql ${CONNSTR} -c "${QUERY}"

-    - name: Benchmark simple-update
+    - name: Run user examples
      uses: ./.github/actions/run-python-test-set
      with:
        build_type: ${{ env.BUILD_TYPE }}
-        test_selection: performance
+        test_selection: performance/test_perf_olap.py
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
-        extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_simple_update
+        extra_params: -m remote_cluster --timeout 21600 -k test_user_examples
      env:
-        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
-
-    - name: Benchmark select-only
-      uses: ./.github/actions/run-python-test-set
-      with:
-        build_type: ${{ env.BUILD_TYPE }}
-        test_selection: performance
-        run_in_parallel: false
-        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
-        extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_select_only
-      env:
        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
-        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
-        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"

    - name: Create Allure report
      if: ${{ !cancelled() }}
@@ -176,6 +625,6 @@ jobs:
      uses: slackapi/slack-github-action@v1
      with:
        channel-id: "C033QLM5P7D" # dev-staging-stream
-        slack-message: "Periodic perf testing ${{ matrix.platform }} (${{ matrix.db_size }}): ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+        slack-message: "Periodic User example perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
      env:
        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2959,9 +2959,9 @@ dependencies = [

 [[package]]
 name = "mio"
-version = "0.8.10"
+version = "0.8.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8f3d0b296e374a4e6f3c7b0a1f5a51d748a0d34c85e7dc48fc3fa9a87657fe09"
+checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c"
 dependencies = [
 "libc",
 "log",
@@ -4216,7 +4216,6 @@ dependencies = [
 "thiserror",
 "tikv-jemalloc-ctl",
 "tikv-jemallocator",
- "tls-listener",
 "tokio",
 "tokio-postgres",
 "tokio-postgres-rustls",
@@ -5794,25 +5793,11 @@ version = "0.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"

-[[package]]
-name = "tls-listener"
-version = "0.7.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "81294c017957a1a69794f506723519255879e15a870507faf45dfed288b763dd"
-dependencies = [
- "futures-util",
- "hyper",
- "pin-project-lite",
- "thiserror",
- "tokio",
- "tokio-rustls",
-]
-
 [[package]]
 name = "tokio"
-version = "1.34.0"
+version = "1.36.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d0c014766411e834f7af5b8f4cf46257aab4036ca95e9d2c144a10f59ad6f5b9"
+checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931"
 dependencies = [
 "backtrace",
 "bytes",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -156,7 +156,6 @@ test-context = "0.1"
 thiserror = "1.0"
 tikv-jemallocator = "0.5"
 tikv-jemalloc-ctl = "0.5"
-tls-listener = { version = "0.7", features = ["rustls", "hyper-h1"] }
 tokio = { version = "1.17", features = ["macros"] }
 tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
 tokio-io-timeout = "1.2.0"
--- a/clippy.toml
+++ b/clippy.toml
@@ -3,3 +3,10 @@ disallowed-methods = [
    # Allow this for now, to deny it later once we stop using Handle::block_on completely
    # "tokio::runtime::Handle::block_on",
 ]
+
+disallowed-macros = [
+    # use std::pin::pin
+    "futures::pin_mut",
+    # cannot disallow this, because clippy finds used from tokio macros
+    #"tokio::pin",
+]
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -17,6 +17,7 @@ use chrono::{DateTime, Utc};
 use futures::future::join_all;
 use futures::stream::FuturesUnordered;
 use futures::StreamExt;
+use postgres::error::SqlState;
 use postgres::{Client, NoTls};
 use tracing::{debug, error, info, instrument, warn};
 use utils::id::{TenantId, TimelineId};
@@ -763,6 +764,26 @@ impl ComputeNode {
        Ok((pg, logs_handle))
    }

+    /// Do post configuration of the already started Postgres. This function spawns a background thread to
+    /// configure the database after applying the compute spec. Currently, it upgrades the neon extension
+    /// version. In the future, it may upgrade all 3rd-party extensions.
+    #[instrument(skip_all)]
+    pub fn post_apply_config(&self) -> Result<()> {
+        let connstr = self.connstr.clone();
+        thread::spawn(move || {
+            let func = || {
+                let mut client = Client::connect(connstr.as_str(), NoTls)?;
+                handle_neon_extension_upgrade(&mut client)
+                    .context("handle_neon_extension_upgrade")?;
+                Ok::<_, anyhow::Error>(())
+            };
+            if let Err(err) = func() {
+                error!("error while post_apply_config: {err:#}");
+            }
+        });
+        Ok(())
+    }
+
    /// Do initial configuration of the already started Postgres.
    #[instrument(skip_all)]
    pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> {
@@ -774,27 +795,34 @@ impl ComputeNode {
        // but we can create a new one and grant it all privileges.
        let connstr = self.connstr.clone();
        let mut client = match Client::connect(connstr.as_str(), NoTls) {
-            Err(e) => {
-                info!(
-                    "cannot connect to postgres: {}, retrying with `zenith_admin` username",
-                    e
-                );
-                let mut zenith_admin_connstr = connstr.clone();
+            Err(e) => match e.code() {
+                Some(&SqlState::INVALID_PASSWORD)
+                | Some(&SqlState::INVALID_AUTHORIZATION_SPECIFICATION) => {
+                    // connect with zenith_admin if cloud_admin could not authenticate
+                    info!(
+                        "cannot connect to postgres: {}, retrying with `zenith_admin` username",
+                        e
+                    );
+                    let mut zenith_admin_connstr = connstr.clone();

-                zenith_admin_connstr
-                    .set_username("zenith_admin")
-                    .map_err(|_| anyhow::anyhow!("invalid connstr"))?;
+                    zenith_admin_connstr
+                        .set_username("zenith_admin")
+                        .map_err(|_| anyhow::anyhow!("invalid connstr"))?;

-                let mut client = Client::connect(zenith_admin_connstr.as_str(), NoTls)?;
-                // Disable forwarding so that users don't get a cloud_admin role
-                client.simple_query("SET neon.forward_ddl = false")?;
-                client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
-                client.simple_query("GRANT zenith_admin TO cloud_admin")?;
-                drop(client);
+                    let mut client =
+                        Client::connect(zenith_admin_connstr.as_str(), NoTls)
+                            .context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?;
+                    // Disable forwarding so that users don't get a cloud_admin role
+                    client.simple_query("SET neon.forward_ddl = false")?;
+                    client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
+                    client.simple_query("GRANT zenith_admin TO cloud_admin")?;
+                    drop(client);

-                // reconnect with connstring with expected name
-                Client::connect(connstr.as_str(), NoTls)?
-            }
+                    // reconnect with connstring with expected name
+                    Client::connect(connstr.as_str(), NoTls)?
+                }
+                _ => return Err(e.into()),
+            },
            Ok(client) => client,
        };

@@ -990,18 +1018,21 @@ impl ComputeNode {
        let pg_process = self.start_postgres(pspec.storage_auth_token.clone())?;

        let config_time = Utc::now();
-        if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
-            let pgdata_path = Path::new(&self.pgdata);
-            // temporarily reset max_cluster_size in config
-            // to avoid the possibility of hitting the limit, while we are applying config:
-            // creating new extensions, roles, etc...
-            config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?;
-            self.pg_reload_conf()?;
+        if pspec.spec.mode == ComputeMode::Primary {
+            if !pspec.spec.skip_pg_catalog_updates {
+                let pgdata_path = Path::new(&self.pgdata);
+                // temporarily reset max_cluster_size in config
+                // to avoid the possibility of hitting the limit, while we are applying config:
+                // creating new extensions, roles, etc...
+                config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?;
+                self.pg_reload_conf()?;

-            self.apply_config(&compute_state)?;
+                self.apply_config(&compute_state)?;

-            config::compute_ctl_temp_override_remove(pgdata_path)?;
-            self.pg_reload_conf()?;
+                config::compute_ctl_temp_override_remove(pgdata_path)?;
+                self.pg_reload_conf()?;
+            }
+            self.post_apply_config()?;
        }

        let startup_end_time = Utc::now();
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -744,7 +744,17 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
    // - extension was just installed
    // - extension was already installed and is up to date
    let query = "ALTER EXTENSION neon UPDATE";
-    info!("update neon extension schema with query: {}", query);
+    info!("update neon extension version with query: {}", query);
+    client.simple_query(query)?;
+
+    Ok(())
+}
+
+#[instrument(skip_all)]
+pub fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> {
+    info!("handle neon extension upgrade");
+    let query = "ALTER EXTENSION neon UPDATE";
+    info!("update neon extension version with query: {}", query);
    client.simple_query(query)?;

    Ok(())
--- a/control_plane/attachment_service/src/compute_hook.rs
+++ b/control_plane/attachment_service/src/compute_hook.rs
@@ -3,7 +3,7 @@ use std::{collections::HashMap, time::Duration};
 use control_plane::endpoint::{ComputeControlPlane, EndpointStatus};
 use control_plane::local_env::LocalEnv;
 use hyper::{Method, StatusCode};
-use pageserver_api::shard::{ShardIndex, ShardNumber, TenantShardId};
+use pageserver_api::shard::{ShardCount, ShardNumber, ShardStripeSize, TenantShardId};
 use postgres_connection::parse_host_port;
 use serde::{Deserialize, Serialize};
 use tokio_util::sync::CancellationToken;
@@ -19,8 +19,66 @@ const SLOWDOWN_DELAY: Duration = Duration::from_secs(5);

 pub(crate) const API_CONCURRENCY: usize = 32;

-pub(super) struct ComputeHookTenant {
-    shards: Vec<(ShardIndex, NodeId)>,
+struct ShardedComputeHookTenant {
+    stripe_size: ShardStripeSize,
+    shard_count: ShardCount,
+    shards: Vec<(ShardNumber, NodeId)>,
+}
+
+enum ComputeHookTenant {
+    Unsharded(NodeId),
+    Sharded(ShardedComputeHookTenant),
+}
+
+impl ComputeHookTenant {
+    /// Construct with at least one shard's information
+    fn new(tenant_shard_id: TenantShardId, stripe_size: ShardStripeSize, node_id: NodeId) -> Self {
+        if tenant_shard_id.shard_count.count() > 1 {
+            Self::Sharded(ShardedComputeHookTenant {
+                shards: vec![(tenant_shard_id.shard_number, node_id)],
+                stripe_size,
+                shard_count: tenant_shard_id.shard_count,
+            })
+        } else {
+            Self::Unsharded(node_id)
+        }
+    }
+
+    /// Set one shard's location.  If stripe size or shard count have changed, Self is reset
+    /// and drops existing content.
+    fn update(
+        &mut self,
+        tenant_shard_id: TenantShardId,
+        stripe_size: ShardStripeSize,
+        node_id: NodeId,
+    ) {
+        match self {
+            Self::Unsharded(existing_node_id) if tenant_shard_id.shard_count.count() == 1 => {
+                *existing_node_id = node_id
+            }
+            Self::Sharded(sharded_tenant)
+                if sharded_tenant.stripe_size == stripe_size
+                    && sharded_tenant.shard_count == tenant_shard_id.shard_count =>
+            {
+                if let Some(existing) = sharded_tenant
+                    .shards
+                    .iter()
+                    .position(|s| s.0 == tenant_shard_id.shard_number)
+                {
+                    sharded_tenant.shards.get_mut(existing).unwrap().1 = node_id;
+                } else {
+                    sharded_tenant
+                        .shards
+                        .push((tenant_shard_id.shard_number, node_id));
+                    sharded_tenant.shards.sort_by_key(|s| s.0)
+                }
+            }
+            _ => {
+                // Shard count changed: reset struct.
+                *self = Self::new(tenant_shard_id, stripe_size, node_id);
+            }
+        }
+    }
 }

 #[derive(Serialize, Deserialize, Debug)]
@@ -33,6 +91,7 @@ struct ComputeHookNotifyRequestShard {
 #[derive(Serialize, Deserialize, Debug)]
 struct ComputeHookNotifyRequest {
    tenant_id: TenantId,
+    stripe_size: Option<ShardStripeSize>,
    shards: Vec<ComputeHookNotifyRequestShard>,
 }

@@ -63,42 +122,43 @@ pub(crate) enum NotifyError {
 }

 impl ComputeHookTenant {
-    async fn maybe_reconfigure(&mut self, tenant_id: TenantId) -> Option<ComputeHookNotifyRequest> {
-        // Find the highest shard count and drop any shards that aren't
-        // for that shard count.
-        let shard_count = self.shards.iter().map(|(k, _v)| k.shard_count).max();
-        let Some(shard_count) = shard_count else {
-            // No shards, nothing to do.
-            tracing::info!("ComputeHookTenant::maybe_reconfigure: no shards");
-            return None;
-        };
-
-        self.shards.retain(|(k, _v)| k.shard_count == shard_count);
-        self.shards
-            .sort_by_key(|(shard, _node_id)| shard.shard_number);
-
-        if self.shards.len() == shard_count.count() as usize || shard_count.is_unsharded() {
-            // We have pageservers for all the shards: emit a configuration update
-            return Some(ComputeHookNotifyRequest {
+    fn maybe_reconfigure(&self, tenant_id: TenantId) -> Option<ComputeHookNotifyRequest> {
+        match self {
+            Self::Unsharded(node_id) => Some(ComputeHookNotifyRequest {
                tenant_id,
-                shards: self
-                    .shards
-                    .iter()
-                    .map(|(shard, node_id)| ComputeHookNotifyRequestShard {
-                        shard_number: shard.shard_number,
-                        node_id: *node_id,
-                    })
-                    .collect(),
-            });
-        } else {
-            tracing::info!(
-                "ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})",
-                self.shards.len(),
-                shard_count.count()
-            );
-        }
+                shards: vec![ComputeHookNotifyRequestShard {
+                    shard_number: ShardNumber(0),
+                    node_id: *node_id,
+                }],
+                stripe_size: None,
+            }),
+            Self::Sharded(sharded_tenant)
+                if sharded_tenant.shards.len() == sharded_tenant.shard_count.count() as usize =>
+            {
+                Some(ComputeHookNotifyRequest {
+                    tenant_id,
+                    shards: sharded_tenant
+                        .shards
+                        .iter()
+                        .map(|(shard_number, node_id)| ComputeHookNotifyRequestShard {
+                            shard_number: *shard_number,
+                            node_id: *node_id,
+                        })
+                        .collect(),
+                    stripe_size: Some(sharded_tenant.stripe_size),
+                })
+            }
+            Self::Sharded(sharded_tenant) => {
+                // Sharded tenant doesn't yet have information for all its shards

-        None
+                tracing::info!(
+                    "ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})",
+                    sharded_tenant.shards.len(),
+                    sharded_tenant.shard_count.count()
+                );
+                None
+            }
+        }
    }
 }

@@ -139,7 +199,11 @@ impl ComputeHook {
        };
        let cplane =
            ComputeControlPlane::load(env.clone()).expect("Error loading compute control plane");
-        let ComputeHookNotifyRequest { tenant_id, shards } = reconfigure_request;
+        let ComputeHookNotifyRequest {
+            tenant_id,
+            shards,
+            stripe_size,
+        } = reconfigure_request;

        let compute_pageservers = shards
            .into_iter()
@@ -156,7 +220,9 @@ impl ComputeHook {
        for (endpoint_name, endpoint) in &cplane.endpoints {
            if endpoint.tenant_id == tenant_id && endpoint.status() == EndpointStatus::Running {
                tracing::info!("Reconfiguring endpoint {}", endpoint_name,);
-                endpoint.reconfigure(compute_pageservers.clone()).await?;
+                endpoint
+                    .reconfigure(compute_pageservers.clone(), stripe_size)
+                    .await?;
            }
        }

@@ -271,30 +337,26 @@ impl ComputeHook {
        &self,
        tenant_shard_id: TenantShardId,
        node_id: NodeId,
+        stripe_size: ShardStripeSize,
        cancel: &CancellationToken,
    ) -> Result<(), NotifyError> {
        let mut locked = self.state.lock().await;
-        let entry = locked
-            .entry(tenant_shard_id.tenant_id)
-            .or_insert_with(|| ComputeHookTenant { shards: Vec::new() });

-        let shard_index = ShardIndex {
-            shard_count: tenant_shard_id.shard_count,
-            shard_number: tenant_shard_id.shard_number,
+        use std::collections::hash_map::Entry;
+        let tenant = match locked.entry(tenant_shard_id.tenant_id) {
+            Entry::Vacant(e) => e.insert(ComputeHookTenant::new(
+                tenant_shard_id,
+                stripe_size,
+                node_id,
+            )),
+            Entry::Occupied(e) => {
+                let tenant = e.into_mut();
+                tenant.update(tenant_shard_id, stripe_size, node_id);
+                tenant
+            }
        };

-        let mut set = false;
-        for (existing_shard, existing_node) in &mut entry.shards {
-            if *existing_shard == shard_index {
-                *existing_node = node_id;
-                set = true;
-            }
-        }
-        if !set {
-            entry.shards.push((shard_index, node_id));
-        }
-
-        let reconfigure_request = entry.maybe_reconfigure(tenant_shard_id.tenant_id).await;
+        let reconfigure_request = tenant.maybe_reconfigure(tenant_shard_id.tenant_id);
        let Some(reconfigure_request) = reconfigure_request else {
            // The tenant doesn't yet have pageservers for all its shards: we won't notify anything
            // until it does.
@@ -316,3 +378,85 @@ impl ComputeHook {
        }
    }
 }
+
+#[cfg(test)]
+pub(crate) mod tests {
+    use pageserver_api::shard::{ShardCount, ShardNumber};
+    use utils::id::TenantId;
+
+    use super::*;
+
+    #[test]
+    fn tenant_updates() -> anyhow::Result<()> {
+        let tenant_id = TenantId::generate();
+        let mut tenant_state = ComputeHookTenant::new(
+            TenantShardId {
+                tenant_id,
+                shard_count: ShardCount::new(0),
+                shard_number: ShardNumber(0),
+            },
+            ShardStripeSize(12345),
+            NodeId(1),
+        );
+
+        // An unsharded tenant is always ready to emit a notification
+        assert!(tenant_state.maybe_reconfigure(tenant_id).is_some());
+        assert_eq!(
+            tenant_state
+                .maybe_reconfigure(tenant_id)
+                .unwrap()
+                .shards
+                .len(),
+            1
+        );
+        assert!(tenant_state
+            .maybe_reconfigure(tenant_id)
+            .unwrap()
+            .stripe_size
+            .is_none());
+
+        // Writing the first shard of a multi-sharded situation (i.e. in a split)
+        // resets the tenant state and puts it in an non-notifying state (need to
+        // see all shards)
+        tenant_state.update(
+            TenantShardId {
+                tenant_id,
+                shard_count: ShardCount::new(2),
+                shard_number: ShardNumber(1),
+            },
+            ShardStripeSize(32768),
+            NodeId(1),
+        );
+        assert!(tenant_state.maybe_reconfigure(tenant_id).is_none());
+
+        // Writing the second shard makes it ready to notify
+        tenant_state.update(
+            TenantShardId {
+                tenant_id,
+                shard_count: ShardCount::new(2),
+                shard_number: ShardNumber(0),
+            },
+            ShardStripeSize(32768),
+            NodeId(1),
+        );
+
+        assert!(tenant_state.maybe_reconfigure(tenant_id).is_some());
+        assert_eq!(
+            tenant_state
+                .maybe_reconfigure(tenant_id)
+                .unwrap()
+                .shards
+                .len(),
+            2
+        );
+        assert_eq!(
+            tenant_state
+                .maybe_reconfigure(tenant_id)
+                .unwrap()
+                .stripe_size,
+            Some(ShardStripeSize(32768))
+        );
+
+        Ok(())
+    }
+}
--- a/control_plane/attachment_service/src/reconciler.rs
+++ b/control_plane/attachment_service/src/reconciler.rs
@@ -104,6 +104,7 @@ impl Reconciler {
        node_id: NodeId,
        config: LocationConfig,
        flush_ms: Option<Duration>,
+        lazy: bool,
    ) -> anyhow::Result<()> {
        let node = self
            .pageservers
@@ -118,7 +119,7 @@ impl Reconciler {
        let client =
            mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref());
        client
-            .location_config(self.tenant_shard_id, config.clone(), flush_ms)
+            .location_config(self.tenant_shard_id, config.clone(), flush_ms, lazy)
            .await?;
        tracing::info!("location_config({}) complete: {:?}", node_id, config);

@@ -315,8 +316,13 @@ impl Reconciler {
            self.generation,
            None,
        );
-        self.location_config(origin_ps_id, stale_conf, Some(Duration::from_secs(10)))
-            .await?;
+        self.location_config(
+            origin_ps_id,
+            stale_conf,
+            Some(Duration::from_secs(10)),
+            false,
+        )
+        .await?;

        let baseline_lsns = Some(self.get_lsns(self.tenant_shard_id, &origin_ps_id).await?);

@@ -350,7 +356,8 @@ impl Reconciler {
        );

        tracing::info!("🔁 Attaching to pageserver {}", dest_ps_id);
-        self.location_config(dest_ps_id, dest_conf, None).await?;
+        self.location_config(dest_ps_id, dest_conf, None, false)
+            .await?;

        if let Some(baseline) = baseline_lsns {
            tracing::info!("🕑 Waiting for LSN to catch up...");
@@ -382,7 +389,7 @@ impl Reconciler {
            None,
            Some(LocationConfigSecondary { warm: true }),
        );
-        self.location_config(origin_ps_id, origin_secondary_conf.clone(), None)
+        self.location_config(origin_ps_id, origin_secondary_conf.clone(), None, false)
            .await?;
        // TODO: we should also be setting the ObservedState on earlier API calls, in case we fail
        // partway through.  In fact, all location conf API calls should be in a wrapper that sets
@@ -405,7 +412,7 @@ impl Reconciler {
            self.generation,
            None,
        );
-        self.location_config(dest_ps_id, dest_final_conf.clone(), None)
+        self.location_config(dest_ps_id, dest_final_conf.clone(), None, false)
            .await?;
        self.observed.locations.insert(
            dest_ps_id,
@@ -491,7 +498,10 @@ impl Reconciler {
                        wanted_conf.generation = generation.into();
                    }
                    tracing::info!(%node_id, "Observed configuration requires update.");
-                    self.location_config(node_id, wanted_conf, None).await?;
+                    // Use lazy=true, because we may run many of Self concurrently, and do not want to
+                    // overload the pageserver with logical size calculations.
+                    self.location_config(node_id, wanted_conf, None, true)
+                        .await?;
                    self.compute_notify().await?;
                }
            }
@@ -543,7 +553,7 @@ impl Reconciler {
            if self.cancel.is_cancelled() {
                return Err(ReconcileError::Cancel);
            }
-            self.location_config(node_id, conf, None).await?;
+            self.location_config(node_id, conf, None, false).await?;
        }

        Ok(())
@@ -555,7 +565,12 @@ impl Reconciler {
        if let Some(node_id) = self.intent.attached {
            let result = self
                .compute_hook
-                .notify(self.tenant_shard_id, node_id, &self.cancel)
+                .notify(
+                    self.tenant_shard_id,
+                    node_id,
+                    self.shard.stripe_size,
+                    &self.cancel,
+                )
                .await;
            if let Err(e) = &result {
                // It is up to the caller whether they want to drop out on this error, but they don't have to:
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -283,7 +283,11 @@ impl Service {
                    // emit a compute notification for this. In the case where our observed state does not
                    // yet match our intent, we will eventually reconcile, and that will emit a compute notification.
                    if let Some(attached_at) = tenant_state.stably_attached() {
-                        compute_notifications.push((*tenant_shard_id, attached_at));
+                        compute_notifications.push((
+                            *tenant_shard_id,
+                            attached_at,
+                            tenant_state.shard.stripe_size,
+                        ));
                    }
                }
            }
@@ -468,6 +472,7 @@ impl Service {
                        tenant_conf: models::TenantConfig::default(),
                    },
                    None,
+                    false,
                )
                .await
            {
@@ -492,7 +497,7 @@ impl Service {
    /// Returns a set of any shards for which notifications where not acked within the deadline.
    async fn compute_notify_many(
        &self,
-        notifications: Vec<(TenantShardId, NodeId)>,
+        notifications: Vec<(TenantShardId, NodeId, ShardStripeSize)>,
        deadline: Instant,
    ) -> HashSet<TenantShardId> {
        let compute_hook = self.inner.read().unwrap().compute_hook.clone();
@@ -503,11 +508,14 @@ impl Service {
        // Construct an async stream of futures to invoke the compute notify function: we do this
        // in order to subsequently use .buffered() on the stream to execute with bounded parallelism.
        let mut stream = futures::stream::iter(notifications.into_iter())
-            .map(|(tenant_shard_id, node_id)| {
+            .map(|(tenant_shard_id, node_id, stripe_size)| {
                let compute_hook = compute_hook.clone();
                let cancel = self.cancel.clone();
                async move {
-                    if let Err(e) = compute_hook.notify(tenant_shard_id, node_id, &cancel).await {
+                    if let Err(e) = compute_hook
+                        .notify(tenant_shard_id, node_id, stripe_size, &cancel)
+                        .await
+                    {
                        tracing::error!(
                            %tenant_shard_id,
                            %node_id,
@@ -1151,9 +1159,12 @@ impl Service {

        let (waiters, response_shards) = {
            let mut locked = self.inner.write().unwrap();
-            let (_nodes, tenants, scheduler) = locked.parts_mut();
+            let result_tx = locked.result_tx.clone();
+            let compute_hook = locked.compute_hook.clone();
+            let (nodes, tenants, scheduler) = locked.parts_mut();

            let mut response_shards = Vec::new();
+            let mut schcedule_error = None;

            for tenant_shard_id in create_ids {
                tracing::info!("Creating shard {tenant_shard_id}...");
@@ -1190,23 +1201,20 @@ impl Service {
                        continue;
                    }
                    Entry::Vacant(entry) => {
-                        let mut state = TenantState::new(
+                        let state = entry.insert(TenantState::new(
                            tenant_shard_id,
                            ShardIdentity::from_params(
                                tenant_shard_id.shard_number,
                                &create_req.shard_parameters,
                            ),
                            placement_policy.clone(),
-                        );
+                        ));

                        state.generation = initial_generation;
                        state.config = create_req.config.clone();
-
-                        state.schedule(scheduler).map_err(|e| {
-                            ApiError::Conflict(format!(
-                                "Failed to schedule shard {tenant_shard_id}: {e}"
-                            ))
-                        })?;
+                        if let Err(e) = state.schedule(scheduler) {
+                            schcedule_error = Some(e);
+                        }

                        // Only include shards in result if we are attaching: the purpose
                        // of the response is to tell the caller where the shards are attached.
@@ -1220,24 +1228,27 @@ impl Service {
                                generation: generation.into().unwrap(),
                            });
                        }
-                        entry.insert(state)
                    }
                };
            }

-            // Take a snapshot of pageservers
-            let pageservers = locked.nodes.clone();
+            // If we failed to schedule shards, then they are still created in the controller,
+            // but we return an error to the requester to avoid a silent failure when someone
+            // tries to e.g. create a tenant whose placement policy requires more nodes than
+            // are present in the system.  We do this here rather than in the above loop, to
+            // avoid situations where we only create a subset of shards in the tenant.
+            if let Some(e) = schcedule_error {
+                return Err(ApiError::Conflict(format!(
+                    "Failed to schedule shard(s): {e}"
+                )));
+            }

-            let result_tx = locked.result_tx.clone();
-            let compute_hook = locked.compute_hook.clone();
-
-            let waiters = locked
-                .tenants
+            let waiters = tenants
                .range_mut(TenantShardId::tenant_range(tenant_id))
                .filter_map(|(_shard_id, shard)| {
                    shard.maybe_reconcile(
                        result_tx.clone(),
-                        &pageservers,
+                        nodes,
                        &compute_hook,
                        &self.config,
                        &self.persistence,
@@ -1395,7 +1406,10 @@ impl Service {
        // First check if this is a creation or an update
        let create_or_update = self.tenant_location_config_prepare(tenant_id, req);

-        let mut result = TenantLocationConfigResponse { shards: Vec::new() };
+        let mut result = TenantLocationConfigResponse {
+            shards: Vec::new(),
+            stripe_size: None,
+        };
        let waiters = match create_or_update {
            TenantCreateOrUpdate::Create((create_req, placement_policy)) => {
                let (create_resp, waiters) =
@@ -1451,6 +1465,11 @@ impl Service {
                            continue;
                        };

+                        // Update stripe size
+                        if result.stripe_size.is_none() && shard.shard.count.count() > 1 {
+                            result.stripe_size = Some(shard.shard.stripe_size);
+                        }
+
                        shard.policy = placement_policy;
                        shard.config = tenant_config;
                        if let Some(generation) = update_generation {
@@ -2455,7 +2474,7 @@ impl Service {
                    // as at this point in the split process we have succeeded and this part is infallible:
                    // we will never need to do any special recovery from this state.

-                    child_locations.push((child, pageserver));
+                    child_locations.push((child, pageserver, child_shard.stripe_size));

                    tenants.insert(child, child_state);
                    response.new_shards.push(child);
@@ -2465,8 +2484,11 @@ impl Service {

        // Send compute notifications for all the new shards
        let mut failed_notifications = Vec::new();
-        for (child_id, child_ps) in child_locations {
-            if let Err(e) = compute_hook.notify(child_id, child_ps, &self.cancel).await {
+        for (child_id, child_ps, stripe_size) in child_locations {
+            if let Err(e) = compute_hook
+                .notify(child_id, child_ps, stripe_size, &self.cancel)
+                .await
+            {
                tracing::warn!("Failed to update compute of {}->{} during split, proceeding anyway to complete split ({e})",
                        child_id, child_ps);
                failed_notifications.push(child_id);
@@ -2497,6 +2519,19 @@ impl Service {
            let compute_hook = locked.compute_hook.clone();
            let (nodes, tenants, scheduler) = locked.parts_mut();

+            let Some(node) = nodes.get(&migrate_req.node_id) else {
+                return Err(ApiError::BadRequest(anyhow::anyhow!(
+                    "Node {} not found",
+                    migrate_req.node_id
+                )));
+            };
+
+            if node.availability != NodeAvailability::Active {
+                // Warn but proceed: the caller may intend to manually adjust the placement of
+                // a shard even if the node is down, e.g. if intervening during an incident.
+                tracing::warn!("Migrating to an unavailable node ({})", node.id);
+            }
+
            let Some(shard) = tenants.get_mut(&tenant_shard_id) else {
                return Err(ApiError::NotFound(
                    anyhow::anyhow!("Tenant shard not found").into(),
@@ -2626,6 +2661,18 @@ impl Service {
                .map(|t| t.to_persistent())
                .collect::<Vec<_>>();

+            // This method can only validate the state of an idle system: if a reconcile is in
+            // progress, fail out early to avoid giving false errors on state that won't match
+            // between database and memory under a ReconcileResult is processed.
+            for t in locked.tenants.values() {
+                if t.reconciler.is_some() {
+                    return Err(ApiError::InternalServerError(anyhow::anyhow!(
+                        "Shard {} reconciliation in progress",
+                        t.tenant_shard_id
+                    )));
+                }
+            }
+
            (expect_nodes, expect_shards)
        };

--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -1024,7 +1024,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
                        })
                        .collect::<Vec<_>>()
                };
-            endpoint.reconfigure(pageservers).await?;
+            endpoint.reconfigure(pageservers, None).await?;
        }
        "stop" => {
            let endpoint_id = sub_args
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -52,6 +52,7 @@ use compute_api::spec::RemoteExtSpec;
 use compute_api::spec::Role;
 use nix::sys::signal::kill;
 use nix::sys::signal::Signal;
+use pageserver_api::shard::ShardStripeSize;
 use serde::{Deserialize, Serialize};
 use url::Host;
 use utils::id::{NodeId, TenantId, TimelineId};
@@ -735,7 +736,11 @@ impl Endpoint {
        }
    }

-    pub async fn reconfigure(&self, mut pageservers: Vec<(Host, u16)>) -> Result<()> {
+    pub async fn reconfigure(
+        &self,
+        mut pageservers: Vec<(Host, u16)>,
+        stripe_size: Option<ShardStripeSize>,
+    ) -> Result<()> {
        let mut spec: ComputeSpec = {
            let spec_path = self.endpoint_path().join("spec.json");
            let file = std::fs::File::open(spec_path)?;
@@ -765,6 +770,9 @@ impl Endpoint {
        let pageserver_connstr = Self::build_pageserver_connstr(&pageservers);
        assert!(!pageserver_connstr.is_empty());
        spec.pageserver_connstring = Some(pageserver_connstr);
+        if stripe_size.is_some() {
+            spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize);
+        }

        let client = reqwest::Client::new();
        let response = client
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -537,10 +537,11 @@ impl PageServerNode {
        tenant_shard_id: TenantShardId,
        config: LocationConfig,
        flush_ms: Option<Duration>,
+        lazy: bool,
    ) -> anyhow::Result<()> {
        Ok(self
            .http_client
-            .location_config(tenant_shard_id, config, flush_ms)
+            .location_config(tenant_shard_id, config, flush_ms, lazy)
            .await?)
    }

@@ -605,7 +606,7 @@ impl PageServerNode {
                eprintln!("connection error: {}", e);
            }
        });
-        tokio::pin!(client);
+        let client = std::pin::pin!(client);

        // Init base reader
        let (start_lsn, base_tarfile_path) = base;
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -435,6 +435,8 @@ pub struct TenantShardLocation {
 #[serde(deny_unknown_fields)]
 pub struct TenantLocationConfigResponse {
    pub shards: Vec<TenantShardLocation>,
+    // If the shards' ShardCount count is >1, stripe_size will be set.
+    pub stripe_size: Option<ShardStripeSize>,
 }

 #[derive(Serialize, Deserialize, Debug)]
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -6,7 +6,6 @@
 #![deny(clippy::undocumented_unsafe_blocks)]
 use anyhow::Context;
 use bytes::Bytes;
-use futures::pin_mut;
 use serde::{Deserialize, Serialize};
 use std::io::ErrorKind;
 use std::net::SocketAddr;
@@ -378,8 +377,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
        &mut self,
        cx: &mut std::task::Context<'_>,
    ) -> Poll<Result<(), std::io::Error>> {
-        let flush_fut = self.flush();
-        pin_mut!(flush_fut);
+        let flush_fut = std::pin::pin!(self.flush());
        flush_fut.poll(cx)
    }

--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -251,21 +251,30 @@ impl Client {
        tenant_shard_id: TenantShardId,
        config: LocationConfig,
        flush_ms: Option<std::time::Duration>,
+        lazy: bool,
    ) -> Result<()> {
        let req_body = TenantLocationConfigRequest {
            tenant_id: tenant_shard_id,
            config,
        };
-        let path = format!(
+
+        let mut path = reqwest::Url::parse(&format!(
            "{}/v1/tenant/{}/location_config",
            self.mgmt_api_endpoint, tenant_shard_id
-        );
-        let path = if let Some(flush_ms) = flush_ms {
-            format!("{}?flush_ms={}", path, flush_ms.as_millis())
-        } else {
-            path
-        };
-        self.request(Method::PUT, &path, &req_body).await?;
+        ))
+        // Should always work: mgmt_api_endpoint is configuration, not user input.
+        .expect("Cannot build URL");
+
+        if lazy {
+            path.query_pairs_mut().append_pair("lazy", "true");
+        }
+
+        if let Some(flush_ms) = flush_ms {
+            path.query_pairs_mut()
+                .append_pair("flush_ms", &format!("{}", flush_ms.as_millis()));
+        }
+
+        self.request(Method::PUT, path, &req_body).await?;
        Ok(())
    }

--- a/pageserver/compaction/src/compact_tiered.rs
+++ b/pageserver/compaction/src/compact_tiered.rs
@@ -63,7 +63,7 @@ pub async fn compact_tiered<E: CompactionJobExecutor>(
        );

        // Identify the range of LSNs that belong to this level. We assume that
-        // each file in this level span an LSN range up to 1.75x target file
+        // each file in this level spans an LSN range up to 1.75x target file
        // size. That should give us enough slop that if we created a slightly
        // oversized L0 layer, e.g. because flushing the in-memory layer was
        // delayed for some reason, we don't consider the oversized layer to
@@ -248,7 +248,6 @@ enum CompactionStrategy {
    CreateImage,
 }

-#[allow(dead_code)] // Todo
 struct CompactionJob<E: CompactionJobExecutor> {
    key_range: Range<E::Key>,
    lsn_range: Range<Lsn>,
@@ -345,7 +344,7 @@ where
    ///
    /// TODO: Currently, this is called exactly once for the level, and we
    /// decide whether to create new image layers to cover the whole level, or
-    /// write a new set of delta. In the future, this should try to partition
+    /// write a new set of deltas. In the future, this should try to partition
    /// the key space, and make the decision separately for each partition.
    async fn divide_job(&mut self, job_id: JobId, ctx: &E::RequestContext) -> anyhow::Result<()> {
        let job = &self.jobs[job_id.0];
@@ -709,18 +708,6 @@ where
    }
 }

-// Sliding window through keyspace and values
-//
-// This is used to decide what layer to write next, from the beginning of the window.
-//
-// Candidates:
-//
-// 1. Create an image layer, snapping to previous images
-// 2. Create a delta layer, snapping to previous images
-// 3. Create an image layer, snapping to
-//
-//
-
 // Take previous partitioning, based on the image layers below.
 //
 // Candidate is at the front:
@@ -739,6 +726,10 @@ struct WindowElement<K> {
    last_key: K,  // inclusive
    accum_size: u64,
 }
+
+// Sliding window through keyspace and values
+//
+// This is used to decide what layer to write next, from the beginning of the window.
 struct Window<K> {
    elems: VecDeque<WindowElement<K>>,

--- a/pageserver/compaction/src/identify_levels.rs
+++ b/pageserver/compaction/src/identify_levels.rs
@@ -1,5 +1,5 @@
-//! An LSM tree consists of multiple levels, each exponential larger than the
-//! previous level. And each level consists of be multiple "tiers". With tiered
+//! An LSM tree consists of multiple levels, each exponentially larger than the
+//! previous level. And each level consists of multiple "tiers". With tiered
 //! compaction, a level is compacted when it has accumulated more than N tiers,
 //! forming one tier on the next level.
 //!
@@ -170,13 +170,6 @@ where
    })
 }

-// helper struct used in depth()
-struct Event<K> {
-    key: K,
-    layer_idx: usize,
-    start: bool,
-}
-
 impl<L> Level<L> {
    /// Count the number of deltas stacked on each other.
    pub fn depth<K>(&self) -> u64
@@ -184,6 +177,11 @@ impl<L> Level<L> {
        K: CompactionKey,
        L: CompactionLayer<K>,
    {
+        struct Event<K> {
+            key: K,
+            layer_idx: usize,
+            start: bool,
+        }
        let mut events: Vec<Event<K>> = Vec::new();
        for (idx, l) in self.layers.iter().enumerate() {
            events.push(Event {
@@ -202,7 +200,7 @@ impl<L> Level<L> {
        // Sweep the key space left to right. Stop at each distinct key, and
        // count the number of deltas on top of the highest image at that key.
        //
-        // This is a little enefficient, as we walk through the active_set on
+        // This is a little inefficient, as we walk through the active_set on
        // every key. We could increment/decrement a counter on each step
        // instead, but that'd require a bit more complex bookkeeping.
        let mut active_set: BTreeSet<(Lsn, bool, usize)> = BTreeSet::new();
@@ -236,6 +234,7 @@ impl<L> Level<L> {
                }
            }
        }
+        debug_assert_eq!(active_set, BTreeSet::new());
        max_depth
    }
 }
--- a/pageserver/compaction/src/interface.rs
+++ b/pageserver/compaction/src/interface.rs
@@ -4,12 +4,12 @@
 //! All the heavy lifting is done by the create_image and create_delta
 //! functions that the implementor provides.
 use async_trait::async_trait;
+use futures::Future;
 use pageserver_api::{key::Key, keyspace::key_range_size};
 use std::ops::Range;
 use utils::lsn::Lsn;

 /// Public interface. This is the main thing that the implementor needs to provide
-#[async_trait]
 pub trait CompactionJobExecutor {
    // Type system.
    //
@@ -17,8 +17,7 @@ pub trait CompactionJobExecutor {
    // compaction doesn't distinguish whether they are stored locally or
    // remotely.
    //
-    // The keyspace is defined by CompactionKey trait.
-    //
+    // The keyspace is defined by the CompactionKey trait.
    type Key: CompactionKey;

    type Layer: CompactionLayer<Self::Key> + Clone;
@@ -35,27 +34,27 @@ pub trait CompactionJobExecutor {
    // ----

    /// Return all layers that overlap the given bounding box.
-    async fn get_layers(
+    fn get_layers(
        &mut self,
        key_range: &Range<Self::Key>,
        lsn_range: &Range<Lsn>,
        ctx: &Self::RequestContext,
-    ) -> anyhow::Result<Vec<Self::Layer>>;
+    ) -> impl Future<Output = anyhow::Result<Vec<Self::Layer>>> + Send;

-    async fn get_keyspace(
+    fn get_keyspace(
        &mut self,
        key_range: &Range<Self::Key>,
        lsn: Lsn,
        ctx: &Self::RequestContext,
-    ) -> anyhow::Result<CompactionKeySpace<Self::Key>>;
+    ) -> impl Future<Output = anyhow::Result<CompactionKeySpace<Self::Key>>> + Send;

    /// NB: This is a pretty expensive operation. In the real pageserver
    /// implementation, it downloads the layer, and keeps it resident
    /// until the DeltaLayer is dropped.
-    async fn downcast_delta_layer(
+    fn downcast_delta_layer(
        &self,
        layer: &Self::Layer,
-    ) -> anyhow::Result<Option<Self::DeltaLayer>>;
+    ) -> impl Future<Output = anyhow::Result<Option<Self::DeltaLayer>>> + Send;

    // ----
    // Functions to execute the plan
@@ -63,33 +62,33 @@ pub trait CompactionJobExecutor {

    /// Create a new image layer, materializing all the values in the key range,
    /// at given 'lsn'.
-    async fn create_image(
+    fn create_image(
        &mut self,
        lsn: Lsn,
        key_range: &Range<Self::Key>,
        ctx: &Self::RequestContext,
-    ) -> anyhow::Result<()>;
+    ) -> impl Future<Output = anyhow::Result<()>> + Send;

    /// Create a new delta layer, containing all the values from 'input_layers'
    /// in the given key and LSN range.
-    async fn create_delta(
+    fn create_delta(
        &mut self,
        lsn_range: &Range<Lsn>,
        key_range: &Range<Self::Key>,
        input_layers: &[Self::DeltaLayer],
        ctx: &Self::RequestContext,
-    ) -> anyhow::Result<()>;
+    ) -> impl Future<Output = anyhow::Result<()>> + Send;

    /// Delete a layer. The compaction implementation will call this only after
    /// all the create_image() or create_delta() calls that deletion of this
    /// layer depends on have finished. But if the implementor has extra lazy
-    /// background tasks, like uploading the index json file to remote storage,
+    /// background tasks, like uploading the index json file to remote storage.
    /// it is the implementation's responsibility to track those.
-    async fn delete_layer(
+    fn delete_layer(
        &mut self,
        layer: &Self::Layer,
        ctx: &Self::RequestContext,
-    ) -> anyhow::Result<()>;
+    ) -> impl Future<Output = anyhow::Result<()>> + Send;
 }

 pub trait CompactionKey: std::cmp::Ord + Clone + Copy + std::fmt::Display {
--- a/pageserver/compaction/src/simulator.rs
+++ b/pageserver/compaction/src/simulator.rs
@@ -429,7 +429,6 @@ impl From<&Arc<MockImageLayer>> for MockLayer {
    }
 }

-#[async_trait]
 impl interface::CompactionJobExecutor for MockTimeline {
    type Key = Key;
    type Layer = MockLayer;
--- a/pageserver/src/context.rs
+++ b/pageserver/src/context.rs
@@ -88,13 +88,16 @@

 use crate::task_mgr::TaskKind;

+pub(crate) mod optional_counter;
+
 // The main structure of this module, see module-level comment.
-#[derive(Clone, Debug)]
+#[derive(Debug)]
 pub struct RequestContext {
    task_kind: TaskKind,
    download_behavior: DownloadBehavior,
    access_stats_behavior: AccessStatsBehavior,
    page_content_kind: PageContentKind,
+    pub micros_spent_throttled: optional_counter::MicroSecondsCounterU32,
 }

 /// The kind of access to the page cache.
@@ -150,6 +153,7 @@ impl RequestContextBuilder {
                download_behavior: DownloadBehavior::Download,
                access_stats_behavior: AccessStatsBehavior::Update,
                page_content_kind: PageContentKind::Unknown,
+                micros_spent_throttled: Default::default(),
            },
        }
    }
@@ -163,6 +167,7 @@ impl RequestContextBuilder {
                download_behavior: original.download_behavior,
                access_stats_behavior: original.access_stats_behavior,
                page_content_kind: original.page_content_kind,
+                micros_spent_throttled: Default::default(),
            },
        }
    }
--- a/pageserver/src/context/optional_counter.rs
+++ b/pageserver/src/context/optional_counter.rs
@@ -0,0 +1,101 @@
+use std::{
+    sync::atomic::{AtomicU32, Ordering},
+    time::Duration,
+};
+
+#[derive(Debug)]
+pub struct CounterU32 {
+    inner: AtomicU32,
+}
+impl Default for CounterU32 {
+    fn default() -> Self {
+        Self {
+            inner: AtomicU32::new(u32::MAX),
+        }
+    }
+}
+impl CounterU32 {
+    pub fn open(&self) -> Result<(), &'static str> {
+        match self
+            .inner
+            .compare_exchange(u32::MAX, 0, Ordering::Relaxed, Ordering::Relaxed)
+        {
+            Ok(_) => Ok(()),
+            Err(_) => Err("open() called on clsoed state"),
+        }
+    }
+    pub fn close(&self) -> Result<u32, &'static str> {
+        match self.inner.swap(u32::MAX, Ordering::Relaxed) {
+            u32::MAX => Err("close() called on closed state"),
+            x => Ok(x),
+        }
+    }
+
+    pub fn add(&self, count: u32) -> Result<(), &'static str> {
+        if count == 0 {
+            return Ok(());
+        }
+        let mut had_err = None;
+        self.inner
+            .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |cur| match cur {
+                u32::MAX => {
+                    had_err = Some("add() called on closed state");
+                    None
+                }
+                x => {
+                    let (new, overflowed) = x.overflowing_add(count);
+                    if new == u32::MAX || overflowed {
+                        had_err = Some("add() overflowed the counter");
+                        None
+                    } else {
+                        Some(new)
+                    }
+                }
+            })
+            .map_err(|_| had_err.expect("we set it whenever the function returns None"))
+            .map(|_| ())
+    }
+}
+
+#[derive(Default, Debug)]
+pub struct MicroSecondsCounterU32 {
+    inner: CounterU32,
+}
+
+impl MicroSecondsCounterU32 {
+    pub fn open(&self) -> Result<(), &'static str> {
+        self.inner.open()
+    }
+    pub fn add(&self, duration: Duration) -> Result<(), &'static str> {
+        match duration.as_micros().try_into() {
+            Ok(x) => self.inner.add(x),
+            Err(_) => Err("add(): duration conversion error"),
+        }
+    }
+    pub fn close_and_checked_sub_from(&self, from: Duration) -> Result<Duration, &'static str> {
+        let val = self.inner.close()?;
+        let val = Duration::from_micros(val as u64);
+        let subbed = match from.checked_sub(val) {
+            Some(v) => v,
+            None => return Err("Duration::checked_sub"),
+        };
+        Ok(subbed)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+
+    use super::*;
+
+    #[test]
+    fn test_basic() {
+        let counter = MicroSecondsCounterU32::default();
+        counter.open().unwrap();
+        counter.add(Duration::from_micros(23)).unwrap();
+        let res = counter
+            .close_and_checked_sub_from(Duration::from_micros(42))
+            .unwrap();
+        assert_eq!(res, Duration::from_micros(42 - 23));
+    }
+}
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -1339,6 +1339,10 @@ components:
          type: array
          items:
            $ref: "#/components/schemas/TenantShardLocation"
+        stripe_size:
+          description: If multiple shards are present, this field contains the sharding stripe size, else it is null.
+          type: integer
+          nullable: true
    TenantShardLocation:
      type: object
      required:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1451,11 +1451,12 @@ async fn put_tenant_location_config_handler(
        tenant::SpawnMode::Eager
    };

-    let attached = state
+    let tenant = state
        .tenant_manager
        .upsert_location(tenant_shard_id, location_conf, flush, spawn_mode, &ctx)
-        .await?
-        .is_some();
+        .await?;
+    let stripe_size = tenant.as_ref().map(|t| t.get_shard_stripe_size());
+    let attached = tenant.is_some();

    if let Some(_flush_ms) = flush {
        match state
@@ -1477,12 +1478,20 @@ async fn put_tenant_location_config_handler(
    // This API returns a vector of pageservers where the tenant is attached: this is
    // primarily for use in the sharding service.  For compatibilty, we also return this
    // when called directly on a pageserver, but the payload is always zero or one shards.
-    let mut response = TenantLocationConfigResponse { shards: Vec::new() };
+    let mut response = TenantLocationConfigResponse {
+        shards: Vec::new(),
+        stripe_size: None,
+    };
    if attached {
        response.shards.push(TenantShardLocation {
            shard_id: tenant_shard_id,
            node_id: state.conf.id,
-        })
+        });
+        if tenant_shard_id.shard_count.count() > 1 {
+            // Stripe size should be set if we are attached
+            debug_assert!(stripe_size.is_some());
+            response.stripe_size = stripe_size;
+        }
    }

    json_response(StatusCode::OK, response)
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -11,6 +11,7 @@ use once_cell::sync::Lazy;
 use pageserver_api::shard::TenantShardId;
 use strum::{EnumCount, IntoEnumIterator, VariantNames};
 use strum_macros::{EnumVariantNames, IntoStaticStr};
+use tracing::warn;
 use utils::id::TimelineId;

 /// Prometheus histogram buckets (in seconds) for operations in the critical
@@ -1005,15 +1006,39 @@ impl GlobalAndPerTimelineHistogram {
    }
 }

-struct GlobalAndPerTimelineHistogramTimer<'a> {
+struct GlobalAndPerTimelineHistogramTimer<'a, 'c> {
    h: &'a GlobalAndPerTimelineHistogram,
+    ctx: &'c RequestContext,
    start: std::time::Instant,
+    op: SmgrQueryType,
 }

-impl<'a> Drop for GlobalAndPerTimelineHistogramTimer<'a> {
+impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> {
    fn drop(&mut self) {
        let elapsed = self.start.elapsed();
-        self.h.observe(elapsed.as_secs_f64());
+        let ex_throttled = self
+            .ctx
+            .micros_spent_throttled
+            .close_and_checked_sub_from(elapsed);
+        let ex_throttled = match ex_throttled {
+            Ok(res) => res,
+            Err(error) => {
+                use utils::rate_limit::RateLimit;
+                static LOGGED: Lazy<Mutex<enum_map::EnumMap<SmgrQueryType, RateLimit>>> =
+                    Lazy::new(|| {
+                        Mutex::new(enum_map::EnumMap::from_array(std::array::from_fn(|_| {
+                            RateLimit::new(Duration::from_secs(10))
+                        })))
+                    });
+                let mut guard = LOGGED.lock().unwrap();
+                let rate_limit = &mut guard[self.op];
+                rate_limit.call(|| {
+                    warn!(op=?self.op, error, "error deducting time spent throttled; this message is logged at a global rate limit");
+                });
+                elapsed
+            }
+        };
+        self.h.observe(ex_throttled.as_secs_f64());
    }
 }

@@ -1025,6 +1050,7 @@ impl<'a> Drop for GlobalAndPerTimelineHistogramTimer<'a> {
    strum_macros::EnumCount,
    strum_macros::EnumIter,
    strum_macros::FromRepr,
+    enum_map::Enum,
 )]
 #[strum(serialize_all = "snake_case")]
 pub enum SmgrQueryType {
@@ -1130,11 +1156,35 @@ impl SmgrQueryTimePerTimeline {
        });
        Self { metrics }
    }
-    pub(crate) fn start_timer(&self, op: SmgrQueryType) -> impl Drop + '_ {
+    pub(crate) fn start_timer<'c: 'a, 'a>(
+        &'a self,
+        op: SmgrQueryType,
+        ctx: &'c RequestContext,
+    ) -> impl Drop + '_ {
        let metric = &self.metrics[op as usize];
+        let start = Instant::now();
+        match ctx.micros_spent_throttled.open() {
+            Ok(()) => (),
+            Err(error) => {
+                use utils::rate_limit::RateLimit;
+                static LOGGED: Lazy<Mutex<enum_map::EnumMap<SmgrQueryType, RateLimit>>> =
+                    Lazy::new(|| {
+                        Mutex::new(enum_map::EnumMap::from_array(std::array::from_fn(|_| {
+                            RateLimit::new(Duration::from_secs(10))
+                        })))
+                    });
+                let mut guard = LOGGED.lock().unwrap();
+                let rate_limit = &mut guard[op];
+                rate_limit.call(|| {
+                    warn!(?op, error, "error opening micros_spent_throttled; this message is logged at a global rate limit");
+                });
+            }
+        }
        GlobalAndPerTimelineHistogramTimer {
            h: metric,
-            start: std::time::Instant::now(),
+            ctx,
+            start,
+            op,
        }
    }
 }
@@ -1145,6 +1195,11 @@ mod smgr_query_time_tests {
    use strum::IntoEnumIterator;
    use utils::id::{TenantId, TimelineId};

+    use crate::{
+        context::{DownloadBehavior, RequestContext},
+        task_mgr::TaskKind,
+    };
+
    // Regression test, we used hard-coded string constants before using an enum.
    #[test]
    fn op_label_name() {
@@ -1193,7 +1248,8 @@ mod smgr_query_time_tests {
            let (pre_global, pre_per_tenant_timeline) = get_counts();
            assert_eq!(pre_per_tenant_timeline, 0);

-            let timer = metrics.start_timer(*op);
+            let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Download);
+            let timer = metrics.start_timer(*op, &ctx);
            drop(timer);

            let (post_global, post_per_tenant_timeline) = get_counts();
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -910,7 +910,7 @@ impl PageServerHandler {
        let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?;
        let _timer = timeline
            .query_metrics
-            .start_timer(metrics::SmgrQueryType::GetRelExists);
+            .start_timer(metrics::SmgrQueryType::GetRelExists, ctx);

        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
        let lsn =
@@ -938,7 +938,7 @@ impl PageServerHandler {

        let _timer = timeline
            .query_metrics
-            .start_timer(metrics::SmgrQueryType::GetRelSize);
+            .start_timer(metrics::SmgrQueryType::GetRelSize, ctx);

        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
        let lsn =
@@ -966,7 +966,7 @@ impl PageServerHandler {

        let _timer = timeline
            .query_metrics
-            .start_timer(metrics::SmgrQueryType::GetDbSize);
+            .start_timer(metrics::SmgrQueryType::GetDbSize, ctx);

        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
        let lsn =
@@ -1144,7 +1144,7 @@ impl PageServerHandler {

        let _timer = timeline
            .query_metrics
-            .start_timer(metrics::SmgrQueryType::GetPageAtLsn);
+            .start_timer(metrics::SmgrQueryType::GetPageAtLsn, ctx);

        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
        let lsn =
@@ -1172,7 +1172,7 @@ impl PageServerHandler {

        let _timer = timeline
            .query_metrics
-            .start_timer(metrics::SmgrQueryType::GetSlruSegment);
+            .start_timer(metrics::SmgrQueryType::GetSlruSegment, ctx);

        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
        let lsn =
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -1677,7 +1677,7 @@ struct RelDirectory {
    rels: HashSet<(Oid, u8)>,
 }

-#[derive(Debug, Serialize, Deserialize, Default)]
+#[derive(Debug, Serialize, Deserialize, Default, PartialEq)]
 pub(crate) struct AuxFilesDirectory {
    pub(crate) files: HashMap<String, Bytes>,
 }
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -22,6 +22,7 @@ use pageserver_api::models;
 use pageserver_api::models::TimelineState;
 use pageserver_api::models::WalRedoManagerStatus;
 use pageserver_api::shard::ShardIdentity;
+use pageserver_api::shard::ShardStripeSize;
 use pageserver_api::shard::TenantShardId;
 use remote_storage::DownloadError;
 use remote_storage::GenericRemoteStorage;
@@ -2086,6 +2087,10 @@ impl Tenant {
        &self.tenant_shard_id
    }

+    pub(crate) fn get_shard_stripe_size(&self) -> ShardStripeSize {
+        self.shard_identity.stripe_size
+    }
+
    pub(crate) fn get_generation(&self) -> Generation {
        self.generation
    }
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -12,7 +12,7 @@
 //! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX
 //!
 use bytes::{BufMut, BytesMut};
-use tokio_epoll_uring::{BoundedBuf, Slice};
+use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice};

 use crate::context::RequestContext;
 use crate::page_cache::PAGE_SZ;
@@ -127,7 +127,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
    /// You need to make sure that the internal buffer is empty, otherwise
    /// data will be written in wrong order.
    #[inline(always)]
-    async fn write_all_unbuffered<B: BoundedBuf>(
+    async fn write_all_unbuffered<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
        &mut self,
        src_buf: B,
    ) -> (B::Buf, Result<(), Error>) {
@@ -162,7 +162,10 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
    }

    /// Internal, possibly buffered, write function
-    async fn write_all<B: BoundedBuf>(&mut self, src_buf: B) -> (B::Buf, Result<(), Error>) {
+    async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
+        &mut self,
+        src_buf: B,
+    ) -> (B::Buf, Result<(), Error>) {
        if !BUFFERED {
            assert!(self.buf.is_empty());
            return self.write_all_unbuffered(src_buf).await;
@@ -210,7 +213,10 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {

    /// Write a blob of data. Returns the offset that it was written to,
    /// which can be used to retrieve the data later.
-    pub async fn write_blob<B: BoundedBuf>(&mut self, srcbuf: B) -> (B::Buf, Result<u64, Error>) {
+    pub async fn write_blob<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
+        &mut self,
+        srcbuf: B,
+    ) -> (B::Buf, Result<u64, Error>) {
        let offset = self.offset;

        let len = srcbuf.bytes_init();
--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -18,10 +18,19 @@
 //! - An Iterator interface would be more convenient for the callers than the
 //!   'visit' function
 //!
+use async_stream::try_stream;
 use byteorder::{ReadBytesExt, BE};
 use bytes::{BufMut, Bytes, BytesMut};
 use either::Either;
-use std::{cmp::Ordering, io, result};
+use futures::Stream;
+use hex;
+use std::{
+    cmp::Ordering,
+    io,
+    iter::Rev,
+    ops::{Range, RangeInclusive},
+    result,
+};
 use thiserror::Error;
 use tracing::error;

@@ -250,6 +259,90 @@ where
        Ok(result)
    }

+    /// Return a stream which yields all key, value pairs from the index
+    /// starting from the first key greater or equal to `start_key`.
+    ///
+    /// Note that this is a copy of [`Self::visit`].
+    /// TODO: Once the sequential read path is removed this will become
+    /// the only index traversal method.
+    pub fn get_stream_from<'a>(
+        &'a self,
+        start_key: &'a [u8; L],
+        ctx: &'a RequestContext,
+    ) -> impl Stream<Item = std::result::Result<(Vec<u8>, u64), DiskBtreeError>> + 'a {
+        try_stream! {
+            let mut stack = Vec::new();
+            stack.push((self.root_blk, None));
+            let block_cursor = self.reader.block_cursor();
+            while let Some((node_blknum, opt_iter)) = stack.pop() {
+                // Locate the node.
+                let node_buf = block_cursor
+                    .read_blk(self.start_blk + node_blknum, ctx)
+                    .await?;
+
+                let node = OnDiskNode::deparse(node_buf.as_ref())?;
+                let prefix_len = node.prefix_len as usize;
+                let suffix_len = node.suffix_len as usize;
+
+                assert!(node.num_children > 0);
+
+                let mut keybuf = Vec::new();
+                keybuf.extend(node.prefix);
+                keybuf.resize(prefix_len + suffix_len, 0);
+
+                let mut iter: Either<Range<usize>, Rev<RangeInclusive<usize>>> = if let Some(iter) = opt_iter {
+                    iter
+                } else {
+                    // Locate the first match
+                    let idx = match node.binary_search(start_key, keybuf.as_mut_slice()) {
+                        Ok(idx) => idx,
+                        Err(idx) => {
+                            if node.level == 0 {
+                                // Imagine that the node contains the following keys:
+                                //
+                                // 1
+                                // 3  <-- idx
+                                // 5
+                                //
+                                // If the search key is '2' and there is exact match,
+                                // the binary search would return the index of key
+                                // '3'. That's cool, '3' is the first key to return.
+                                idx
+                            } else {
+                                // This is an internal page, so each key represents a lower
+                                // bound for what's in the child page. If there is no exact
+                                // match, we have to return the *previous* entry.
+                                //
+                                // 1  <-- return this
+                                // 3  <-- idx
+                                // 5
+                                idx.saturating_sub(1)
+                            }
+                        }
+                    };
+                    Either::Left(idx..node.num_children.into())
+                };
+
+                // idx points to the first match now. Keep going from there
+                while let Some(idx) = iter.next() {
+                    let key_off = idx * suffix_len;
+                    let suffix = &node.keys[key_off..key_off + suffix_len];
+                    keybuf[prefix_len..].copy_from_slice(suffix);
+                    let value = node.value(idx);
+                    #[allow(clippy::collapsible_if)]
+                    if node.level == 0 {
+                        // leaf
+                        yield (keybuf.clone(), value.to_u64());
+                    } else {
+                        stack.push((node_blknum, Some(iter)));
+                        stack.push((value.to_blknum(), None));
+                        break;
+                    }
+                }
+            }
+        }
+    }
+
    ///
    /// Scan the tree, starting from 'search_key', in the given direction. 'visitor'
    /// will be called for every key >= 'search_key' (or <= 'search_key', if scanning
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -14,14 +14,14 @@ use tokio::io::{AsyncSeekExt, AsyncWriteExt};
 use tokio_util::io::StreamReader;
 use tokio_util::sync::CancellationToken;
 use tracing::warn;
-use utils::{backoff, crashsafe};
+use utils::backoff;

 use crate::config::PageServerConf;
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
 use crate::tenant::storage_layer::LayerFileName;
 use crate::tenant::Generation;
-use crate::virtual_file::on_fatal_io_error;
+use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile};
 use crate::TEMP_FILE_SUFFIX;
 use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode};
 use utils::crashsafe::path_with_suffix_extension;
@@ -50,9 +50,8 @@ pub async fn download_layer_file<'a>(
 ) -> Result<u64, DownloadError> {
    debug_assert_current_span_has_tenant_and_timeline_id();

-    let local_path = conf
-        .timeline_path(&tenant_shard_id, &timeline_id)
-        .join(layer_file_name.file_name());
+    let timeline_path = conf.timeline_path(&tenant_shard_id, &timeline_id);
+    let local_path = timeline_path.join(layer_file_name.file_name());

    let remote_path = remote_layer_path(
        &tenant_shard_id.tenant_id,
@@ -149,10 +148,21 @@ pub async fn download_layer_file<'a>(
        .with_context(|| format!("rename download layer file to {local_path}"))
        .map_err(DownloadError::Other)?;

-    crashsafe::fsync_async(&local_path)
-        .await
-        .with_context(|| format!("fsync layer file {local_path}"))
-        .map_err(DownloadError::Other)?;
+    // We use fatal_err() below because the after the rename above,
+    // the in-memory state of the filesystem already has the layer file in its final place,
+    // and subsequent pageserver code could think it's durable while it really isn't.
+    let work = async move {
+        let timeline_dir = VirtualFile::open(&timeline_path)
+            .await
+            .fatal_err("VirtualFile::open for timeline dir fsync");
+        timeline_dir
+            .sync_all()
+            .await
+            .fatal_err("VirtualFile::sync_all timeline dir");
+    };
+    crate::virtual_file::io_engine::get()
+        .spawn_blocking_and_block_on_if_std(work)
+        .await;

    tracing::debug!("download complete: {local_path}");

--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -46,6 +46,7 @@ use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
 use anyhow::{anyhow, bail, ensure, Context, Result};
 use bytes::BytesMut;
 use camino::{Utf8Path, Utf8PathBuf};
+use futures::StreamExt;
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::LayerAccessKind;
 use pageserver_api::shard::TenantShardId;
@@ -847,10 +848,33 @@ impl DeltaLayerInner {
        reconstruct_state: &mut ValuesReconstructState,
        ctx: &RequestContext,
    ) -> Result<(), GetVectoredError> {
-        let reads = self
-            .plan_reads(keyspace, lsn_range, reconstruct_state, ctx)
-            .await
-            .map_err(GetVectoredError::Other)?;
+        let block_reader = FileBlockReader::new(&self.file, self.file_id);
+        let index_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
+            self.index_start_blk,
+            self.index_root_blk,
+            block_reader,
+        );
+
+        let planner = VectoredReadPlanner::new(
+            self.max_vectored_read_bytes
+                .expect("Layer is loaded with max vectored bytes config")
+                .0
+                .into(),
+        );
+
+        let data_end_offset = self.index_start_blk as u64 * PAGE_SZ as u64;
+
+        let reads = Self::plan_reads(
+            keyspace,
+            lsn_range,
+            data_end_offset,
+            index_reader,
+            planner,
+            reconstruct_state,
+            ctx,
+        )
+        .await
+        .map_err(GetVectoredError::Other)?;

        self.do_reads_and_update_state(reads, reconstruct_state)
            .await;
@@ -858,73 +882,64 @@ impl DeltaLayerInner {
        Ok(())
    }

-    async fn plan_reads(
-        &self,
+    async fn plan_reads<Reader>(
        keyspace: KeySpace,
        lsn_range: Range<Lsn>,
+        data_end_offset: u64,
+        index_reader: DiskBtreeReader<Reader, DELTA_KEY_SIZE>,
+        mut planner: VectoredReadPlanner,
        reconstruct_state: &mut ValuesReconstructState,
        ctx: &RequestContext,
-    ) -> anyhow::Result<Vec<VectoredRead>> {
-        let mut planner = VectoredReadPlanner::new(
-            self.max_vectored_read_bytes
-                .expect("Layer is loaded with max vectored bytes config")
-                .0
-                .into(),
-        );
-
-        let block_reader = FileBlockReader::new(&self.file, self.file_id);
-        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
-            self.index_start_blk,
-            self.index_root_blk,
-            block_reader,
-        );
+    ) -> anyhow::Result<Vec<VectoredRead>>
+    where
+        Reader: BlockReader,
+    {
+        let ctx = RequestContextBuilder::extend(ctx)
+            .page_content_kind(PageContentKind::DeltaLayerBtreeNode)
+            .build();

        for range in keyspace.ranges.iter() {
            let mut range_end_handled = false;

            let start_key = DeltaKey::from_key_lsn(&range.start, lsn_range.start);
-            tree_reader
-                .visit(
-                    &start_key.0,
-                    VisitDirection::Forwards,
-                    |raw_key, value| {
-                        let key = Key::from_slice(&raw_key[..KEY_SIZE]);
-                        let lsn = DeltaKey::extract_lsn_from_buf(raw_key);
-                        let blob_ref = BlobRef(value);
+            let index_stream = index_reader.get_stream_from(&start_key.0, &ctx);
+            let mut index_stream = std::pin::pin!(index_stream);

-                        assert!(key >= range.start && lsn >= lsn_range.start);
+            while let Some(index_entry) = index_stream.next().await {
+                let (raw_key, value) = index_entry?;
+                let key = Key::from_slice(&raw_key[..KEY_SIZE]);
+                let lsn = DeltaKey::extract_lsn_from_buf(&raw_key);
+                let blob_ref = BlobRef(value);

-                        let cached_lsn = reconstruct_state.get_cached_lsn(&key);
-                        let flag = {
-                            if cached_lsn >= Some(lsn) {
-                                BlobFlag::Ignore
-                            } else if blob_ref.will_init() {
-                                BlobFlag::Replaces
-                            } else {
-                                BlobFlag::None
-                            }
-                        };
+                // Lsns are not monotonically increasing across keys, so we don't assert on them.
+                assert!(key >= range.start);

-                        if key >= range.end || (key.next() == range.end && lsn >= lsn_range.end) {
-                            planner.handle_range_end(blob_ref.pos());
-                            range_end_handled = true;
-                            false
-                        } else {
-                            planner.handle(key, lsn, blob_ref.pos(), flag);
-                            true
-                        }
-                    },
-                    &RequestContextBuilder::extend(ctx)
-                        .page_content_kind(PageContentKind::DeltaLayerBtreeNode)
-                        .build(),
-                )
-                .await
-                .map_err(|err| anyhow!(err))?;
+                let outside_lsn_range = !lsn_range.contains(&lsn);
+                let below_cached_lsn = reconstruct_state.get_cached_lsn(&key) >= Some(lsn);
+
+                let flag = {
+                    if outside_lsn_range || below_cached_lsn {
+                        BlobFlag::Ignore
+                    } else if blob_ref.will_init() {
+                        BlobFlag::ReplaceAll
+                    } else {
+                        // Usual path: add blob to the read
+                        BlobFlag::None
+                    }
+                };
+
+                if key >= range.end || (key.next() == range.end && lsn >= lsn_range.end) {
+                    planner.handle_range_end(blob_ref.pos());
+                    range_end_handled = true;
+                    break;
+                } else {
+                    planner.handle(key, lsn, blob_ref.pos(), flag);
+                }
+            }

            if !range_end_handled {
-                let payload_end = self.index_start_blk as u64 * PAGE_SZ as u64;
-                tracing::info!("Handling range end fallback at {}", payload_end);
-                planner.handle_range_end(payload_end);
+                tracing::info!("Handling range end fallback at {}", data_end_offset);
+                planner.handle_range_end(data_end_offset);
            }
        }

@@ -1190,3 +1205,131 @@ impl<'a> pageserver_compaction::interface::CompactionDeltaEntry<'a, Key> for Del
        self.size
    }
 }
+
+#[cfg(test)]
+mod test {
+    use std::collections::BTreeMap;
+
+    use super::*;
+    use crate::{
+        context::DownloadBehavior, task_mgr::TaskKind, tenant::disk_btree::tests::TestDisk,
+    };
+
+    /// Construct an index for a fictional delta layer and and then
+    /// traverse in order to plan vectored reads for a query. Finally,
+    /// verify that the traversal fed the right index key and value
+    /// pairs into the planner.
+    #[tokio::test]
+    async fn test_delta_layer_index_traversal() {
+        let base_key = Key {
+            field1: 0,
+            field2: 1663,
+            field3: 12972,
+            field4: 16396,
+            field5: 0,
+            field6: 246080,
+        };
+
+        // Populate the index with some entries
+        let entries: BTreeMap<Key, Vec<Lsn>> = BTreeMap::from([
+            (base_key, vec![Lsn(1), Lsn(5), Lsn(25), Lsn(26), Lsn(28)]),
+            (base_key.add(1), vec![Lsn(2), Lsn(5), Lsn(10), Lsn(50)]),
+            (base_key.add(2), vec![Lsn(2), Lsn(5), Lsn(10), Lsn(50)]),
+            (base_key.add(5), vec![Lsn(10), Lsn(15), Lsn(16), Lsn(20)]),
+        ]);
+
+        let mut disk = TestDisk::default();
+        let mut writer = DiskBtreeBuilder::<_, DELTA_KEY_SIZE>::new(&mut disk);
+
+        let mut disk_offset = 0;
+        for (key, lsns) in &entries {
+            for lsn in lsns {
+                let index_key = DeltaKey::from_key_lsn(key, *lsn);
+                let blob_ref = BlobRef::new(disk_offset, false);
+                writer
+                    .append(&index_key.0, blob_ref.0)
+                    .expect("In memory disk append should never fail");
+
+                disk_offset += 1;
+            }
+        }
+
+        // Prepare all the arguments for the call into `plan_reads` below
+        let (root_offset, _writer) = writer
+            .finish()
+            .expect("In memory disk finish should never fail");
+        let reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(0, root_offset, disk);
+        let planner = VectoredReadPlanner::new(100);
+        let mut reconstruct_state = ValuesReconstructState::new();
+        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+
+        let keyspace = KeySpace {
+            ranges: vec![
+                base_key..base_key.add(3),
+                base_key.add(3)..base_key.add(100),
+            ],
+        };
+        let lsn_range = Lsn(2)..Lsn(40);
+
+        // Plan and validate
+        let vectored_reads = DeltaLayerInner::plan_reads(
+            keyspace.clone(),
+            lsn_range.clone(),
+            disk_offset,
+            reader,
+            planner,
+            &mut reconstruct_state,
+            &ctx,
+        )
+        .await
+        .expect("Read planning should not fail");
+
+        validate(keyspace, lsn_range, vectored_reads, entries);
+    }
+
+    fn validate(
+        keyspace: KeySpace,
+        lsn_range: Range<Lsn>,
+        vectored_reads: Vec<VectoredRead>,
+        index_entries: BTreeMap<Key, Vec<Lsn>>,
+    ) {
+        #[derive(Debug, PartialEq, Eq)]
+        struct BlobSpec {
+            key: Key,
+            lsn: Lsn,
+            at: u64,
+        }
+
+        let mut planned_blobs = Vec::new();
+        for read in vectored_reads {
+            for (at, meta) in read.blobs_at.as_slice() {
+                planned_blobs.push(BlobSpec {
+                    key: meta.key,
+                    lsn: meta.lsn,
+                    at: *at,
+                });
+            }
+        }
+
+        let mut expected_blobs = Vec::new();
+        let mut disk_offset = 0;
+        for (key, lsns) in index_entries {
+            for lsn in lsns {
+                let key_included = keyspace.ranges.iter().any(|range| range.contains(&key));
+                let lsn_included = lsn_range.contains(&lsn);
+
+                if key_included && lsn_included {
+                    expected_blobs.push(BlobSpec {
+                        key,
+                        lsn,
+                        at: disk_offset,
+                    });
+                }
+
+                disk_offset += 1;
+            }
+        }
+
+        assert_eq!(planned_blobs, expected_blobs);
+    }
+}
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -43,6 +43,7 @@ use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
 use anyhow::{anyhow, bail, ensure, Context, Result};
 use bytes::{Bytes, BytesMut};
 use camino::{Utf8Path, Utf8PathBuf};
+use hex;
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::LayerAccessKind;
 use pageserver_api::shard::TenantShardId;
@@ -54,6 +55,7 @@ use std::ops::Range;
 use std::os::unix::prelude::FileExt;
 use std::sync::Arc;
 use tokio::sync::OnceCell;
+use tokio_stream::StreamExt;
 use tracing::*;

 use utils::{
@@ -488,35 +490,33 @@ impl ImageLayerInner {
        let tree_reader =
            DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader);

+        let ctx = RequestContextBuilder::extend(ctx)
+            .page_content_kind(PageContentKind::ImageLayerBtreeNode)
+            .build();
+
        for range in keyspace.ranges.iter() {
            let mut range_end_handled = false;

            let mut search_key: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
            range.start.write_to_byte_slice(&mut search_key);

-            tree_reader
-                .visit(
-                    &search_key,
-                    VisitDirection::Forwards,
-                    |raw_key, offset| {
-                        let key = Key::from_slice(&raw_key[..KEY_SIZE]);
-                        assert!(key >= range.start);
+            let index_stream = tree_reader.get_stream_from(&search_key, &ctx);
+            let mut index_stream = std::pin::pin!(index_stream);

-                        if key >= range.end {
-                            planner.handle_range_end(offset);
-                            range_end_handled = true;
-                            false
-                        } else {
-                            planner.handle(key, self.lsn, offset, BlobFlag::None);
-                            true
-                        }
-                    },
-                    &RequestContextBuilder::extend(ctx)
-                        .page_content_kind(PageContentKind::ImageLayerBtreeNode)
-                        .build(),
-                )
-                .await
-                .map_err(|err| GetVectoredError::Other(anyhow!(err)))?;
+            while let Some(index_entry) = index_stream.next().await {
+                let (raw_key, offset) = index_entry?;
+
+                let key = Key::from_slice(&raw_key[..KEY_SIZE]);
+                assert!(key >= range.start);
+
+                if key >= range.end {
+                    planner.handle_range_end(offset);
+                    range_end_handled = true;
+                    break;
+                } else {
+                    planner.handle(key, self.lsn, offset, BlobFlag::None);
+                }
+            }

            if !range_end_handled {
                let payload_end = self.index_start_blk as u64 * PAGE_SZ as u64;
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -195,6 +195,7 @@ impl Layer {
        let downloaded = resident.expect("just initialized");

        // if the rename works, the path is as expected
+        // TODO: sync system call
        std::fs::rename(temp_path, owner.local_path())
            .with_context(|| format!("rename temporary file as correct path for {owner}"))?;

@@ -975,7 +976,7 @@ impl LayerInner {
                }

                self.consecutive_failures.store(0, Ordering::Relaxed);
-                tracing::info!("on-demand download successful");
+                tracing::info!(size=%self.desc.file_size, "on-demand download successful");

                Ok(permit)
            }
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -217,7 +217,7 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                }
                let allowed_rps = tenant.timeline_get_throttle.steady_rps();
                let delta = now - prev;
-                warn!(
+                info!(
                    n_seconds=%format_args!("{:.3}",
                    delta.as_secs_f64()),
                    count_accounted,
--- a/pageserver/src/tenant/throttle.rs
+++ b/pageserver/src/tenant/throttle.rs
@@ -2,14 +2,14 @@ use std::{
    str::FromStr,
    sync::{
        atomic::{AtomicU64, Ordering},
-        Arc,
+        Arc, Mutex,
    },
    time::{Duration, Instant},
 };

 use arc_swap::ArcSwap;
 use enumset::EnumSet;
-use tracing::error;
+use tracing::{error, warn};

 use crate::{context::RequestContext, task_mgr::TaskKind};

@@ -157,6 +157,19 @@ where
                .fetch_add(wait_time.as_micros() as u64, Ordering::Relaxed);
            let observation = Observation { wait_time };
            self.metric.observe_throttling(&observation);
+            match ctx.micros_spent_throttled.add(wait_time) {
+                Ok(res) => res,
+                Err(error) => {
+                    use once_cell::sync::Lazy;
+                    use utils::rate_limit::RateLimit;
+                    static WARN_RATE_LIMIT: Lazy<Mutex<RateLimit>> =
+                        Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
+                    let mut guard = WARN_RATE_LIMIT.lock().unwrap();
+                    guard.call(move || {
+                        warn!(error, "error adding time spent throttled; this message is logged at a global rate limit");
+                    });
+                }
+            }
        }
    }
 }
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -14,9 +14,9 @@ use camino::Utf8Path;
 use enumset::EnumSet;
 use fail::fail_point;
 use futures::stream::StreamExt;
-use itertools::Itertools;
 use once_cell::sync::Lazy;
 use pageserver_api::{
+    key::AUX_FILES_KEY,
    keyspace::KeySpaceAccum,
    models::{
        CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest,
@@ -34,7 +34,7 @@ use std::sync::{Arc, Mutex, RwLock, Weak};
 use std::time::{Duration, Instant, SystemTime};
 use std::{
    array,
-    collections::{BTreeMap, BinaryHeap, HashMap, HashSet},
+    collections::{BTreeMap, HashMap, HashSet},
    sync::atomic::AtomicU64,
 };
 use std::{
@@ -56,7 +56,7 @@ use crate::tenant::{
    metadata::TimelineMetadata,
 };
 use crate::{
-    context::{AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder},
+    context::{DownloadBehavior, RequestContext},
    disk_usage_eviction_task::DiskUsageEvictionInfo,
    pgdatadir_mapping::CollectKeySpaceError,
 };
@@ -863,8 +863,6 @@ impl Timeline {
        fn errors_match(lhs: &GetVectoredError, rhs: &GetVectoredError) -> bool {
            use GetVectoredError::*;
            match (lhs, rhs) {
-                (Cancelled, Cancelled) => true,
-                (_, Cancelled) => true,
                (Oversized(l), Oversized(r)) => l == r,
                (InvalidLsn(l), InvalidLsn(r)) => l == r,
                (MissingKey(l), MissingKey(r)) => l == r,
@@ -875,6 +873,8 @@ impl Timeline {
        }

        match (&sequential_res, vectored_res) {
+            (Err(GetVectoredError::Cancelled), _) => {},
+            (_, Err(GetVectoredError::Cancelled)) => {},
            (Err(seq_err), Ok(_)) => {
                panic!(concat!("Sequential get failed with {}, but vectored get did not",
                               " - keyspace={:?} lsn={}"),
@@ -891,8 +891,7 @@ impl Timeline {
                    assert_eq!(seq_key, vec_key);
                    match (seq_res, vec_res) {
                        (Ok(seq_blob), Ok(vec_blob)) => {
-                            assert_eq!(seq_blob, vec_blob,
-                                       "Image mismatch for key {seq_key} - keyspace={keyspace:?} lsn={lsn}");
+                            Self::validate_key_equivalence(seq_key, &keyspace, lsn, seq_blob, vec_blob);
                        },
                        (Err(err), Ok(_)) => {
                            panic!(
@@ -911,6 +910,43 @@ impl Timeline {
        }
    }

+    fn validate_key_equivalence(
+        key: &Key,
+        keyspace: &KeySpace,
+        lsn: Lsn,
+        seq: &Bytes,
+        vec: &Bytes,
+    ) {
+        use utils::bin_ser::BeSer;
+
+        if *key == AUX_FILES_KEY {
+            // The value reconstruct of AUX_FILES_KEY from records is not deterministic
+            // since it uses a hash map under the hood. Hence, deserialise both results
+            // before comparing.
+            let seq_aux_dir_res = AuxFilesDirectory::des(seq);
+            let vec_aux_dir_res = AuxFilesDirectory::des(vec);
+            match (&seq_aux_dir_res, &vec_aux_dir_res) {
+                (Ok(seq_aux_dir), Ok(vec_aux_dir)) => {
+                    assert_eq!(
+                        seq_aux_dir, vec_aux_dir,
+                        "Mismatch for key {} - keyspace={:?} lsn={}",
+                        key, keyspace, lsn
+                    );
+                }
+                (Err(_), Err(_)) => {}
+                _ => {
+                    panic!("Mismatch for {key}: {seq_aux_dir_res:?} != {vec_aux_dir_res:?}");
+                }
+            }
+        } else {
+            // All other keys should reconstruct deterministically, so we simply compare the blobs.
+            assert_eq!(
+                seq, vec,
+                "Image mismatch for key {key} - keyspace={keyspace:?} lsn={lsn}"
+            );
+        }
+    }
+
    /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
    pub(crate) fn get_last_record_lsn(&self) -> Lsn {
        self.last_record_lsn.load().last
@@ -1109,118 +1145,6 @@ impl Timeline {
        }
    }

-    /// TODO: cancellation
-    async fn compact_legacy(
-        self: &Arc<Self>,
-        _cancel: &CancellationToken,
-        flags: EnumSet<CompactFlags>,
-        ctx: &RequestContext,
-    ) -> Result<(), CompactionError> {
-        // High level strategy for compaction / image creation:
-        //
-        // 1. First, calculate the desired "partitioning" of the
-        // currently in-use key space. The goal is to partition the
-        // key space into roughly fixed-size chunks, but also take into
-        // account any existing image layers, and try to align the
-        // chunk boundaries with the existing image layers to avoid
-        // too much churn. Also try to align chunk boundaries with
-        // relation boundaries.  In principle, we don't know about
-        // relation boundaries here, we just deal with key-value
-        // pairs, and the code in pgdatadir_mapping.rs knows how to
-        // map relations into key-value pairs. But in practice we know
-        // that 'field6' is the block number, and the fields 1-5
-        // identify a relation. This is just an optimization,
-        // though.
-        //
-        // 2. Once we know the partitioning, for each partition,
-        // decide if it's time to create a new image layer. The
-        // criteria is: there has been too much "churn" since the last
-        // image layer? The "churn" is fuzzy concept, it's a
-        // combination of too many delta files, or too much WAL in
-        // total in the delta file. Or perhaps: if creating an image
-        // file would allow to delete some older files.
-        //
-        // 3. After that, we compact all level0 delta files if there
-        // are too many of them.  While compacting, we also garbage
-        // collect any page versions that are no longer needed because
-        // of the new image layers we created in step 2.
-        //
-        // TODO: This high level strategy hasn't been implemented yet.
-        // Below are functions compact_level0() and create_image_layers()
-        // but they are a bit ad hoc and don't quite work like it's explained
-        // above. Rewrite it.
-
-        // Is the timeline being deleted?
-        if self.is_stopping() {
-            trace!("Dropping out of compaction on timeline shutdown");
-            return Err(CompactionError::ShuttingDown);
-        }
-
-        let target_file_size = self.get_checkpoint_distance();
-
-        // Define partitioning schema if needed
-
-        // FIXME: the match should only cover repartitioning, not the next steps
-        match self
-            .repartition(
-                self.get_last_record_lsn(),
-                self.get_compaction_target_size(),
-                flags,
-                ctx,
-            )
-            .await
-        {
-            Ok((partitioning, lsn)) => {
-                // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them
-                let image_ctx = RequestContextBuilder::extend(ctx)
-                    .access_stats_behavior(AccessStatsBehavior::Skip)
-                    .build();
-
-                // 2. Compact
-                let timer = self.metrics.compact_time_histo.start_timer();
-                self.compact_level0(target_file_size, ctx).await?;
-                timer.stop_and_record();
-
-                // 3. Create new image layers for partitions that have been modified
-                // "enough".
-                let layers = self
-                    .create_image_layers(
-                        &partitioning,
-                        lsn,
-                        flags.contains(CompactFlags::ForceImageLayerCreation),
-                        &image_ctx,
-                    )
-                    .await
-                    .map_err(anyhow::Error::from)?;
-                if let Some(remote_client) = &self.remote_client {
-                    for layer in layers {
-                        remote_client.schedule_layer_file_upload(layer)?;
-                    }
-                }
-
-                if let Some(remote_client) = &self.remote_client {
-                    // should any new image layer been created, not uploading index_part will
-                    // result in a mismatch between remote_physical_size and layermap calculated
-                    // size, which will fail some tests, but should not be an issue otherwise.
-                    remote_client.schedule_index_upload_for_file_changes()?;
-                }
-            }
-            Err(err) => {
-                // no partitioning? This is normal, if the timeline was just created
-                // as an empty timeline. Also in unit tests, when we use the timeline
-                // as a simple key-value store, ignoring the datadir layout. Log the
-                // error but continue.
-                //
-                // Suppress error when it's due to cancellation
-                if !self.cancel.is_cancelled() {
-                    error!("could not compact, repartitioning keyspace failed: {err:?}");
-                }
-            }
-        };
-
-        Ok(())
-    }
-
    /// Mutate the timeline with a [`TimelineWriter`].
    pub(crate) async fn writer(&self) -> TimelineWriter<'_> {
        TimelineWriter {
@@ -3410,44 +3334,48 @@ impl Timeline {
        frozen_layer: &Arc<InMemoryLayer>,
        ctx: &RequestContext,
    ) -> anyhow::Result<ResidentLayer> {
-        let span = tracing::info_span!("blocking");
-        let new_delta: ResidentLayer = tokio::task::spawn_blocking({
-            let self_clone = Arc::clone(self);
-            let frozen_layer = Arc::clone(frozen_layer);
-            let ctx = ctx.attached_child();
-            move || {
-                Handle::current().block_on(
-                    async move {
-                        let new_delta = frozen_layer.write_to_disk(&self_clone, &ctx).await?;
-                        // The write_to_disk() above calls writer.finish() which already did the fsync of the inodes.
-                        // We just need to fsync the directory in which these inodes are linked,
-                        // which we know to be the timeline directory.
-                        //
-                        // We use fatal_err() below because the after write_to_disk returns with success,
-                        // the in-memory state of the filesystem already has the layer file in its final place,
-                        // and subsequent pageserver code could think it's durable while it really isn't.
-                        let timeline_dir =
-                            VirtualFile::open(&self_clone.conf.timeline_path(
-                                &self_clone.tenant_shard_id,
-                                &self_clone.timeline_id,
-                            ))
-                            .await
-                            .fatal_err("VirtualFile::open for timeline dir fsync");
-                        timeline_dir
-                            .sync_all()
-                            .await
-                            .fatal_err("VirtualFile::sync_all timeline dir");
-                        anyhow::Ok(new_delta)
-                    }
-                    .instrument(span),
-                )
+        let self_clone = Arc::clone(self);
+        let frozen_layer = Arc::clone(frozen_layer);
+        let ctx = ctx.attached_child();
+        let work = async move {
+            let new_delta = frozen_layer.write_to_disk(&self_clone, &ctx).await?;
+            // The write_to_disk() above calls writer.finish() which already did the fsync of the inodes.
+            // We just need to fsync the directory in which these inodes are linked,
+            // which we know to be the timeline directory.
+            //
+            // We use fatal_err() below because the after write_to_disk returns with success,
+            // the in-memory state of the filesystem already has the layer file in its final place,
+            // and subsequent pageserver code could think it's durable while it really isn't.
+            let timeline_dir = VirtualFile::open(
+                &self_clone
+                    .conf
+                    .timeline_path(&self_clone.tenant_shard_id, &self_clone.timeline_id),
+            )
+            .await
+            .fatal_err("VirtualFile::open for timeline dir fsync");
+            timeline_dir
+                .sync_all()
+                .await
+                .fatal_err("VirtualFile::sync_all timeline dir");
+            anyhow::Ok(new_delta)
+        };
+        // Before tokio-epoll-uring, we ran write_to_disk & the sync_all inside spawn_blocking.
+        // Preserve that behavior to maintain the same behavior for `virtual_file_io_engine=std-fs`.
+        use crate::virtual_file::io_engine::IoEngine;
+        match crate::virtual_file::io_engine::get() {
+            IoEngine::NotSet => panic!("io engine not set"),
+            IoEngine::StdFs => {
+                let span = tracing::info_span!("blocking");
+                tokio::task::spawn_blocking({
+                    move || Handle::current().block_on(work.instrument(span))
+                })
+                .await
+                .context("spawn_blocking")
+                .and_then(|x| x)
            }
-        })
-        .await
-        .context("spawn_blocking")
-        .and_then(|x| x)?;
-
-        Ok(new_delta)
+            #[cfg(target_os = "linux")]
+            IoEngine::TokioEpollUring => work.await,
+        }
    }

    async fn repartition(
@@ -3725,12 +3653,6 @@ impl Timeline {
    }
 }

-#[derive(Default)]
-struct CompactLevel0Phase1Result {
-    new_layers: Vec<ResidentLayer>,
-    deltas_to_compact: Vec<Layer>,
-}
-
 /// Top-level failure to compact.
 #[derive(Debug, thiserror::Error)]
 pub(crate) enum CompactionError {
@@ -3784,577 +3706,7 @@ impl DurationRecorder {
    }
 }

-#[derive(Default)]
-struct CompactLevel0Phase1StatsBuilder {
-    version: Option<u64>,
-    tenant_id: Option<TenantShardId>,
-    timeline_id: Option<TimelineId>,
-    read_lock_acquisition_micros: DurationRecorder,
-    read_lock_held_spawn_blocking_startup_micros: DurationRecorder,
-    read_lock_held_key_sort_micros: DurationRecorder,
-    read_lock_held_prerequisites_micros: DurationRecorder,
-    read_lock_held_compute_holes_micros: DurationRecorder,
-    read_lock_drop_micros: DurationRecorder,
-    write_layer_files_micros: DurationRecorder,
-    level0_deltas_count: Option<usize>,
-    new_deltas_count: Option<usize>,
-    new_deltas_size: Option<u64>,
-}
-
-#[derive(serde::Serialize)]
-struct CompactLevel0Phase1Stats {
-    version: u64,
-    tenant_id: TenantShardId,
-    timeline_id: TimelineId,
-    read_lock_acquisition_micros: RecordedDuration,
-    read_lock_held_spawn_blocking_startup_micros: RecordedDuration,
-    read_lock_held_key_sort_micros: RecordedDuration,
-    read_lock_held_prerequisites_micros: RecordedDuration,
-    read_lock_held_compute_holes_micros: RecordedDuration,
-    read_lock_drop_micros: RecordedDuration,
-    write_layer_files_micros: RecordedDuration,
-    level0_deltas_count: usize,
-    new_deltas_count: usize,
-    new_deltas_size: u64,
-}
-
-impl TryFrom<CompactLevel0Phase1StatsBuilder> for CompactLevel0Phase1Stats {
-    type Error = anyhow::Error;
-
-    fn try_from(value: CompactLevel0Phase1StatsBuilder) -> Result<Self, Self::Error> {
-        Ok(Self {
-            version: value.version.ok_or_else(|| anyhow!("version not set"))?,
-            tenant_id: value
-                .tenant_id
-                .ok_or_else(|| anyhow!("tenant_id not set"))?,
-            timeline_id: value
-                .timeline_id
-                .ok_or_else(|| anyhow!("timeline_id not set"))?,
-            read_lock_acquisition_micros: value
-                .read_lock_acquisition_micros
-                .into_recorded()
-                .ok_or_else(|| anyhow!("read_lock_acquisition_micros not set"))?,
-            read_lock_held_spawn_blocking_startup_micros: value
-                .read_lock_held_spawn_blocking_startup_micros
-                .into_recorded()
-                .ok_or_else(|| anyhow!("read_lock_held_spawn_blocking_startup_micros not set"))?,
-            read_lock_held_key_sort_micros: value
-                .read_lock_held_key_sort_micros
-                .into_recorded()
-                .ok_or_else(|| anyhow!("read_lock_held_key_sort_micros not set"))?,
-            read_lock_held_prerequisites_micros: value
-                .read_lock_held_prerequisites_micros
-                .into_recorded()
-                .ok_or_else(|| anyhow!("read_lock_held_prerequisites_micros not set"))?,
-            read_lock_held_compute_holes_micros: value
-                .read_lock_held_compute_holes_micros
-                .into_recorded()
-                .ok_or_else(|| anyhow!("read_lock_held_compute_holes_micros not set"))?,
-            read_lock_drop_micros: value
-                .read_lock_drop_micros
-                .into_recorded()
-                .ok_or_else(|| anyhow!("read_lock_drop_micros not set"))?,
-            write_layer_files_micros: value
-                .write_layer_files_micros
-                .into_recorded()
-                .ok_or_else(|| anyhow!("write_layer_files_micros not set"))?,
-            level0_deltas_count: value
-                .level0_deltas_count
-                .ok_or_else(|| anyhow!("level0_deltas_count not set"))?,
-            new_deltas_count: value
-                .new_deltas_count
-                .ok_or_else(|| anyhow!("new_deltas_count not set"))?,
-            new_deltas_size: value
-                .new_deltas_size
-                .ok_or_else(|| anyhow!("new_deltas_size not set"))?,
-        })
-    }
-}
-
 impl Timeline {
-    /// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment.
-    async fn compact_level0_phase1(
-        self: &Arc<Self>,
-        guard: tokio::sync::OwnedRwLockReadGuard<LayerManager>,
-        mut stats: CompactLevel0Phase1StatsBuilder,
-        target_file_size: u64,
-        ctx: &RequestContext,
-    ) -> Result<CompactLevel0Phase1Result, CompactionError> {
-        stats.read_lock_held_spawn_blocking_startup_micros =
-            stats.read_lock_acquisition_micros.till_now(); // set by caller
-        let layers = guard.layer_map();
-        let level0_deltas = layers.get_level0_deltas()?;
-        let mut level0_deltas = level0_deltas
-            .into_iter()
-            .map(|x| guard.get_from_desc(&x))
-            .collect_vec();
-        stats.level0_deltas_count = Some(level0_deltas.len());
-        // Only compact if enough layers have accumulated.
-        let threshold = self.get_compaction_threshold();
-        if level0_deltas.is_empty() || level0_deltas.len() < threshold {
-            debug!(
-                level0_deltas = level0_deltas.len(),
-                threshold, "too few deltas to compact"
-            );
-            return Ok(CompactLevel0Phase1Result::default());
-        }
-
-        // This failpoint is used together with `test_duplicate_layers` integration test.
-        // It returns the compaction result exactly the same layers as input to compaction.
-        // We want to ensure that this will not cause any problem when updating the layer map
-        // after the compaction is finished.
-        //
-        // Currently, there are two rare edge cases that will cause duplicated layers being
-        // inserted.
-        // 1. The compaction job is inturrupted / did not finish successfully. Assume we have file 1, 2, 3, 4, which
-        //    is compacted to 5, but the page server is shut down, next time we start page server we will get a layer
-        //    map containing 1, 2, 3, 4, and 5, whereas 5 has the same content as 4. If we trigger L0 compation at this
-        //    point again, it is likely that we will get a file 6 which has the same content and the key range as 5,
-        //    and this causes an overwrite. This is acceptable because the content is the same, and we should do a
-        //    layer replace instead of the normal remove / upload process.
-        // 2. The input workload pattern creates exactly n files that are sorted, non-overlapping and is of target file
-        //    size length. Compaction will likely create the same set of n files afterwards.
-        //
-        // This failpoint is a superset of both of the cases.
-        if cfg!(feature = "testing") {
-            let active = (|| {
-                ::fail::fail_point!("compact-level0-phase1-return-same", |_| true);
-                false
-            })();
-
-            if active {
-                let mut new_layers = Vec::with_capacity(level0_deltas.len());
-                for delta in &level0_deltas {
-                    // we are just faking these layers as being produced again for this failpoint
-                    new_layers.push(
-                        delta
-                            .download_and_keep_resident()
-                            .await
-                            .context("download layer for failpoint")?,
-                    );
-                }
-                tracing::info!("compact-level0-phase1-return-same"); // so that we can check if we hit the failpoint
-                return Ok(CompactLevel0Phase1Result {
-                    new_layers,
-                    deltas_to_compact: level0_deltas,
-                });
-            }
-        }
-
-        // Gather the files to compact in this iteration.
-        //
-        // Start with the oldest Level 0 delta file, and collect any other
-        // level 0 files that form a contiguous sequence, such that the end
-        // LSN of previous file matches the start LSN of the next file.
-        //
-        // Note that if the files don't form such a sequence, we might
-        // "compact" just a single file. That's a bit pointless, but it allows
-        // us to get rid of the level 0 file, and compact the other files on
-        // the next iteration. This could probably made smarter, but such
-        // "gaps" in the sequence of level 0 files should only happen in case
-        // of a crash, partial download from cloud storage, or something like
-        // that, so it's not a big deal in practice.
-        level0_deltas.sort_by_key(|l| l.layer_desc().lsn_range.start);
-        let mut level0_deltas_iter = level0_deltas.iter();
-
-        let first_level0_delta = level0_deltas_iter.next().unwrap();
-        let mut prev_lsn_end = first_level0_delta.layer_desc().lsn_range.end;
-        let mut deltas_to_compact = Vec::with_capacity(level0_deltas.len());
-
-        deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?);
-        for l in level0_deltas_iter {
-            let lsn_range = &l.layer_desc().lsn_range;
-
-            if lsn_range.start != prev_lsn_end {
-                break;
-            }
-            deltas_to_compact.push(l.download_and_keep_resident().await?);
-            prev_lsn_end = lsn_range.end;
-        }
-        let lsn_range = Range {
-            start: deltas_to_compact
-                .first()
-                .unwrap()
-                .layer_desc()
-                .lsn_range
-                .start,
-            end: deltas_to_compact.last().unwrap().layer_desc().lsn_range.end,
-        };
-
-        info!(
-            "Starting Level0 compaction in LSN range {}-{} for {} layers ({} deltas in total)",
-            lsn_range.start,
-            lsn_range.end,
-            deltas_to_compact.len(),
-            level0_deltas.len()
-        );
-
-        for l in deltas_to_compact.iter() {
-            info!("compact includes {l}");
-        }
-
-        // We don't need the original list of layers anymore. Drop it so that
-        // we don't accidentally use it later in the function.
-        drop(level0_deltas);
-
-        stats.read_lock_held_prerequisites_micros = stats
-            .read_lock_held_spawn_blocking_startup_micros
-            .till_now();
-
-        // Determine N largest holes where N is number of compacted layers.
-        let max_holes = deltas_to_compact.len();
-        let last_record_lsn = self.get_last_record_lsn();
-        let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128;
-        let min_hole_coverage_size = 3; // TODO: something more flexible?
-
-        // min-heap (reserve space for one more element added before eviction)
-        let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
-        let mut prev: Option<Key> = None;
-
-        let mut all_keys = Vec::new();
-
-        for l in deltas_to_compact.iter() {
-            all_keys.extend(l.load_keys(ctx).await?);
-        }
-
-        // FIXME: should spawn_blocking the rest of this function
-
-        // The current stdlib sorting implementation is designed in a way where it is
-        // particularly fast where the slice is made up of sorted sub-ranges.
-        all_keys.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn));
-
-        stats.read_lock_held_key_sort_micros = stats.read_lock_held_prerequisites_micros.till_now();
-
-        for &DeltaEntry { key: next_key, .. } in all_keys.iter() {
-            if let Some(prev_key) = prev {
-                // just first fast filter
-                if next_key.to_i128() - prev_key.to_i128() >= min_hole_range {
-                    let key_range = prev_key..next_key;
-                    // Measuring hole by just subtraction of i128 representation of key range boundaries
-                    // has not so much sense, because largest holes will corresponds field1/field2 changes.
-                    // But we are mostly interested to eliminate holes which cause generation of excessive image layers.
-                    // That is why it is better to measure size of hole as number of covering image layers.
-                    let coverage_size = layers.image_coverage(&key_range, last_record_lsn).len();
-                    if coverage_size >= min_hole_coverage_size {
-                        heap.push(Hole {
-                            key_range,
-                            coverage_size,
-                        });
-                        if heap.len() > max_holes {
-                            heap.pop(); // remove smallest hole
-                        }
-                    }
-                }
-            }
-            prev = Some(next_key.next());
-        }
-        stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now();
-        drop_rlock(guard);
-        stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now();
-        let mut holes = heap.into_vec();
-        holes.sort_unstable_by_key(|hole| hole.key_range.start);
-        let mut next_hole = 0; // index of next hole in holes vector
-
-        // This iterator walks through all key-value pairs from all the layers
-        // we're compacting, in key, LSN order.
-        let all_values_iter = all_keys.iter();
-
-        // This iterator walks through all keys and is needed to calculate size used by each key
-        let mut all_keys_iter = all_keys
-            .iter()
-            .map(|DeltaEntry { key, lsn, size, .. }| (*key, *lsn, *size))
-            .coalesce(|mut prev, cur| {
-                // Coalesce keys that belong to the same key pair.
-                // This ensures that compaction doesn't put them
-                // into different layer files.
-                // Still limit this by the target file size,
-                // so that we keep the size of the files in
-                // check.
-                if prev.0 == cur.0 && prev.2 < target_file_size {
-                    prev.2 += cur.2;
-                    Ok(prev)
-                } else {
-                    Err((prev, cur))
-                }
-            });
-
-        // Merge the contents of all the input delta layers into a new set
-        // of delta layers, based on the current partitioning.
-        //
-        // We split the new delta layers on the key dimension. We iterate through the key space, and for each key, check if including the next key to the current output layer we're building would cause the layer to become too large. If so, dump the current output layer and start new one.
-        // It's possible that there is a single key with so many page versions that storing all of them in a single layer file
-        // would be too large. In that case, we also split on the LSN dimension.
-        //
-        // LSN
-        //  ^
-        //  |
-        //  | +-----------+            +--+--+--+--+
-        //  | |           |            |  |  |  |  |
-        //  | +-----------+            |  |  |  |  |
-        //  | |           |            |  |  |  |  |
-        //  | +-----------+     ==>    |  |  |  |  |
-        //  | |           |            |  |  |  |  |
-        //  | +-----------+            |  |  |  |  |
-        //  | |           |            |  |  |  |  |
-        //  | +-----------+            +--+--+--+--+
-        //  |
-        //  +--------------> key
-        //
-        //
-        // If one key (X) has a lot of page versions:
-        //
-        // LSN
-        //  ^
-        //  |                                 (X)
-        //  | +-----------+            +--+--+--+--+
-        //  | |           |            |  |  |  |  |
-        //  | +-----------+            |  |  +--+  |
-        //  | |           |            |  |  |  |  |
-        //  | +-----------+     ==>    |  |  |  |  |
-        //  | |           |            |  |  +--+  |
-        //  | +-----------+            |  |  |  |  |
-        //  | |           |            |  |  |  |  |
-        //  | +-----------+            +--+--+--+--+
-        //  |
-        //  +--------------> key
-        // TODO: this actually divides the layers into fixed-size chunks, not
-        // based on the partitioning.
-        //
-        // TODO: we should also opportunistically materialize and
-        // garbage collect what we can.
-        let mut new_layers = Vec::new();
-        let mut prev_key: Option<Key> = None;
-        let mut writer: Option<DeltaLayerWriter> = None;
-        let mut key_values_total_size = 0u64;
-        let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key
-        let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key
-
-        for &DeltaEntry {
-            key, lsn, ref val, ..
-        } in all_values_iter
-        {
-            let value = val.load(ctx).await?;
-            let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
-            // We need to check key boundaries once we reach next key or end of layer with the same key
-            if !same_key || lsn == dup_end_lsn {
-                let mut next_key_size = 0u64;
-                let is_dup_layer = dup_end_lsn.is_valid();
-                dup_start_lsn = Lsn::INVALID;
-                if !same_key {
-                    dup_end_lsn = Lsn::INVALID;
-                }
-                // Determine size occupied by this key. We stop at next key or when size becomes larger than target_file_size
-                for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() {
-                    next_key_size = next_size;
-                    if key != next_key {
-                        if dup_end_lsn.is_valid() {
-                            // We are writting segment with duplicates:
-                            // place all remaining values of this key in separate segment
-                            dup_start_lsn = dup_end_lsn; // new segments starts where old stops
-                            dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range
-                        }
-                        break;
-                    }
-                    key_values_total_size += next_size;
-                    // Check if it is time to split segment: if total keys size is larger than target file size.
-                    // We need to avoid generation of empty segments if next_size > target_file_size.
-                    if key_values_total_size > target_file_size && lsn != next_lsn {
-                        // Split key between multiple layers: such layer can contain only single key
-                        dup_start_lsn = if dup_end_lsn.is_valid() {
-                            dup_end_lsn // new segment with duplicates starts where old one stops
-                        } else {
-                            lsn // start with the first LSN for this key
-                        };
-                        dup_end_lsn = next_lsn; // upper LSN boundary is exclusive
-                        break;
-                    }
-                }
-                // handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set.
-                if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() {
-                    dup_start_lsn = dup_end_lsn;
-                    dup_end_lsn = lsn_range.end;
-                }
-                if writer.is_some() {
-                    let written_size = writer.as_mut().unwrap().size();
-                    let contains_hole =
-                        next_hole < holes.len() && key >= holes[next_hole].key_range.end;
-                    // check if key cause layer overflow or contains hole...
-                    if is_dup_layer
-                        || dup_end_lsn.is_valid()
-                        || written_size + key_values_total_size > target_file_size
-                        || contains_hole
-                    {
-                        // ... if so, flush previous layer and prepare to write new one
-                        new_layers.push(
-                            writer
-                                .take()
-                                .unwrap()
-                                .finish(prev_key.unwrap().next(), self)
-                                .await?,
-                        );
-                        writer = None;
-
-                        if contains_hole {
-                            // skip hole
-                            next_hole += 1;
-                        }
-                    }
-                }
-                // Remember size of key value because at next iteration we will access next item
-                key_values_total_size = next_key_size;
-            }
-            fail_point!("delta-layer-writer-fail-before-finish", |_| {
-                Err(CompactionError::Other(anyhow::anyhow!(
-                    "failpoint delta-layer-writer-fail-before-finish"
-                )))
-            });
-
-            if !self.shard_identity.is_key_disposable(&key) {
-                if writer.is_none() {
-                    // Create writer if not initiaized yet
-                    writer = Some(
-                        DeltaLayerWriter::new(
-                            self.conf,
-                            self.timeline_id,
-                            self.tenant_shard_id,
-                            key,
-                            if dup_end_lsn.is_valid() {
-                                // this is a layer containing slice of values of the same key
-                                debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn);
-                                dup_start_lsn..dup_end_lsn
-                            } else {
-                                debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
-                                lsn_range.clone()
-                            },
-                        )
-                        .await?,
-                    );
-                }
-
-                writer.as_mut().unwrap().put_value(key, lsn, value).await?;
-            } else {
-                debug!(
-                    "Dropping key {} during compaction (it belongs on shard {:?})",
-                    key,
-                    self.shard_identity.get_shard_number(&key)
-                );
-            }
-
-            if !new_layers.is_empty() {
-                fail_point!("after-timeline-compacted-first-L1");
-            }
-
-            prev_key = Some(key);
-        }
-        if let Some(writer) = writer {
-            new_layers.push(writer.finish(prev_key.unwrap().next(), self).await?);
-        }
-
-        // Sync layers
-        if !new_layers.is_empty() {
-            // Print a warning if the created layer is larger than double the target size
-            // Add two pages for potential overhead. This should in theory be already
-            // accounted for in the target calculation, but for very small targets,
-            // we still might easily hit the limit otherwise.
-            let warn_limit = target_file_size * 2 + page_cache::PAGE_SZ as u64 * 2;
-            for layer in new_layers.iter() {
-                if layer.layer_desc().file_size > warn_limit {
-                    warn!(
-                        %layer,
-                        "created delta file of size {} larger than double of target of {target_file_size}", layer.layer_desc().file_size
-                    );
-                }
-            }
-
-            // The writer.finish() above already did the fsync of the inodes.
-            // We just need to fsync the directory in which these inodes are linked,
-            // which we know to be the timeline directory.
-            //
-            // We use fatal_err() below because the after writer.finish() returns with success,
-            // the in-memory state of the filesystem already has the layer file in its final place,
-            // and subsequent pageserver code could think it's durable while it really isn't.
-            let timeline_dir = VirtualFile::open(
-                &self
-                    .conf
-                    .timeline_path(&self.tenant_shard_id, &self.timeline_id),
-            )
-            .await
-            .fatal_err("VirtualFile::open for timeline dir fsync");
-            timeline_dir
-                .sync_all()
-                .await
-                .fatal_err("VirtualFile::sync_all timeline dir");
-        }
-
-        stats.write_layer_files_micros = stats.read_lock_drop_micros.till_now();
-        stats.new_deltas_count = Some(new_layers.len());
-        stats.new_deltas_size = Some(new_layers.iter().map(|l| l.layer_desc().file_size).sum());
-
-        match TryInto::<CompactLevel0Phase1Stats>::try_into(stats)
-            .and_then(|stats| serde_json::to_string(&stats).context("serde_json::to_string"))
-        {
-            Ok(stats_json) => {
-                info!(
-                    stats_json = stats_json.as_str(),
-                    "compact_level0_phase1 stats available"
-                )
-            }
-            Err(e) => {
-                warn!("compact_level0_phase1 stats failed to serialize: {:#}", e);
-            }
-        }
-
-        Ok(CompactLevel0Phase1Result {
-            new_layers,
-            deltas_to_compact: deltas_to_compact
-                .into_iter()
-                .map(|x| x.drop_eviction_guard())
-                .collect::<Vec<_>>(),
-        })
-    }
-
-    ///
-    /// Collect a bunch of Level 0 layer files, and compact and reshuffle them as
-    /// as Level 1 files.
-    ///
-    async fn compact_level0(
-        self: &Arc<Self>,
-        target_file_size: u64,
-        ctx: &RequestContext,
-    ) -> Result<(), CompactionError> {
-        let CompactLevel0Phase1Result {
-            new_layers,
-            deltas_to_compact,
-        } = {
-            let phase1_span = info_span!("compact_level0_phase1");
-            let ctx = ctx.attached_child();
-            let mut stats = CompactLevel0Phase1StatsBuilder {
-                version: Some(2),
-                tenant_id: Some(self.tenant_shard_id),
-                timeline_id: Some(self.timeline_id),
-                ..Default::default()
-            };
-
-            let begin = tokio::time::Instant::now();
-            let phase1_layers_locked = Arc::clone(&self.layers).read_owned().await;
-            let now = tokio::time::Instant::now();
-            stats.read_lock_acquisition_micros =
-                DurationRecorder::Recorded(RecordedDuration(now - begin), now);
-            self.compact_level0_phase1(phase1_layers_locked, stats, target_file_size, &ctx)
-                .instrument(phase1_span)
-                .await?
-        };
-
-        if new_layers.is_empty() && deltas_to_compact.is_empty() {
-            // nothing to do
-            return Ok(());
-        }
-
-        self.finish_compact_batch(&new_layers, &Vec::new(), &deltas_to_compact)
-            .await?;
-        Ok(())
-    }
-
    async fn finish_compact_batch(
        self: &Arc<Self>,
        new_deltas: &[ResidentLayer],
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -4,24 +4,32 @@
 //!
 //! The old legacy algorithm is implemented directly in `timeline.rs`.

+use std::collections::BinaryHeap;
 use std::ops::{Deref, Range};
 use std::sync::Arc;

-use super::Timeline;
+use super::layer_manager::LayerManager;
+use super::{CompactFlags, DurationRecorder, RecordedDuration, Timeline};

+use anyhow::{anyhow, Context};
 use async_trait::async_trait;
+use enumset::EnumSet;
 use fail::fail_point;
+use itertools::Itertools;
+use pageserver_api::shard::TenantShardId;
 use tokio_util::sync::CancellationToken;
-use tracing::{debug, trace, warn};
+use tracing::{debug, info, info_span, trace, warn, Instrument};
+use utils::id::TimelineId;

-use crate::context::RequestContext;
+use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
 use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc};
-use crate::tenant::timeline::{is_rel_fsm_block_key, is_rel_vm_block_key};
+use crate::tenant::timeline::{drop_rlock, is_rel_fsm_block_key, is_rel_vm_block_key, Hole};
 use crate::tenant::timeline::{DeltaLayerWriter, ImageLayerWriter};
 use crate::tenant::timeline::{Layer, ResidentLayer};
 use crate::tenant::DeltaLayer;
 use crate::tenant::PageReconstructError;
-use crate::ZERO_PAGE;
+use crate::virtual_file::{MaybeFatalIo, VirtualFile};
+use crate::{page_cache, ZERO_PAGE};

 use crate::keyspace::KeySpace;
 use crate::repository::Key;
@@ -33,6 +41,694 @@ use pageserver_compaction::interface::*;

 use super::CompactionError;

+impl Timeline {
+    /// TODO: cancellation
+    pub(crate) async fn compact_legacy(
+        self: &Arc<Self>,
+        _cancel: &CancellationToken,
+        flags: EnumSet<CompactFlags>,
+        ctx: &RequestContext,
+    ) -> Result<(), CompactionError> {
+        // High level strategy for compaction / image creation:
+        //
+        // 1. First, calculate the desired "partitioning" of the
+        // currently in-use key space. The goal is to partition the
+        // key space into roughly fixed-size chunks, but also take into
+        // account any existing image layers, and try to align the
+        // chunk boundaries with the existing image layers to avoid
+        // too much churn. Also try to align chunk boundaries with
+        // relation boundaries.  In principle, we don't know about
+        // relation boundaries here, we just deal with key-value
+        // pairs, and the code in pgdatadir_mapping.rs knows how to
+        // map relations into key-value pairs. But in practice we know
+        // that 'field6' is the block number, and the fields 1-5
+        // identify a relation. This is just an optimization,
+        // though.
+        //
+        // 2. Once we know the partitioning, for each partition,
+        // decide if it's time to create a new image layer. The
+        // criteria is: there has been too much "churn" since the last
+        // image layer? The "churn" is fuzzy concept, it's a
+        // combination of too many delta files, or too much WAL in
+        // total in the delta file. Or perhaps: if creating an image
+        // file would allow to delete some older files.
+        //
+        // 3. After that, we compact all level0 delta files if there
+        // are too many of them.  While compacting, we also garbage
+        // collect any page versions that are no longer needed because
+        // of the new image layers we created in step 2.
+        //
+        // TODO: This high level strategy hasn't been implemented yet.
+        // Below are functions compact_level0() and create_image_layers()
+        // but they are a bit ad hoc and don't quite work like it's explained
+        // above. Rewrite it.
+
+        // Is the timeline being deleted?
+        if self.is_stopping() {
+            trace!("Dropping out of compaction on timeline shutdown");
+            return Err(CompactionError::ShuttingDown);
+        }
+
+        let target_file_size = self.get_checkpoint_distance();
+
+        // Define partitioning schema if needed
+
+        // FIXME: the match should only cover repartitioning, not the next steps
+        match self
+            .repartition(
+                self.get_last_record_lsn(),
+                self.get_compaction_target_size(),
+                flags,
+                ctx,
+            )
+            .await
+        {
+            Ok((partitioning, lsn)) => {
+                // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them
+                let image_ctx = RequestContextBuilder::extend(ctx)
+                    .access_stats_behavior(AccessStatsBehavior::Skip)
+                    .build();
+
+                // 2. Compact
+                let timer = self.metrics.compact_time_histo.start_timer();
+                self.compact_level0(target_file_size, ctx).await?;
+                timer.stop_and_record();
+
+                // 3. Create new image layers for partitions that have been modified
+                // "enough".
+                let layers = self
+                    .create_image_layers(
+                        &partitioning,
+                        lsn,
+                        flags.contains(CompactFlags::ForceImageLayerCreation),
+                        &image_ctx,
+                    )
+                    .await
+                    .map_err(anyhow::Error::from)?;
+                if let Some(remote_client) = &self.remote_client {
+                    for layer in layers {
+                        remote_client.schedule_layer_file_upload(layer)?;
+                    }
+                }
+
+                if let Some(remote_client) = &self.remote_client {
+                    // should any new image layer been created, not uploading index_part will
+                    // result in a mismatch between remote_physical_size and layermap calculated
+                    // size, which will fail some tests, but should not be an issue otherwise.
+                    remote_client.schedule_index_upload_for_file_changes()?;
+                }
+            }
+            Err(err) => {
+                // no partitioning? This is normal, if the timeline was just created
+                // as an empty timeline. Also in unit tests, when we use the timeline
+                // as a simple key-value store, ignoring the datadir layout. Log the
+                // error but continue.
+                //
+                // Suppress error when it's due to cancellation
+                if !self.cancel.is_cancelled() {
+                    tracing::error!("could not compact, repartitioning keyspace failed: {err:?}");
+                }
+            }
+        };
+
+        Ok(())
+    }
+
+    /// Collect a bunch of Level 0 layer files, and compact and reshuffle them as
+    /// as Level 1 files.
+    async fn compact_level0(
+        self: &Arc<Self>,
+        target_file_size: u64,
+        ctx: &RequestContext,
+    ) -> Result<(), CompactionError> {
+        let CompactLevel0Phase1Result {
+            new_layers,
+            deltas_to_compact,
+        } = {
+            let phase1_span = info_span!("compact_level0_phase1");
+            let ctx = ctx.attached_child();
+            let mut stats = CompactLevel0Phase1StatsBuilder {
+                version: Some(2),
+                tenant_id: Some(self.tenant_shard_id),
+                timeline_id: Some(self.timeline_id),
+                ..Default::default()
+            };
+
+            let begin = tokio::time::Instant::now();
+            let phase1_layers_locked = Arc::clone(&self.layers).read_owned().await;
+            let now = tokio::time::Instant::now();
+            stats.read_lock_acquisition_micros =
+                DurationRecorder::Recorded(RecordedDuration(now - begin), now);
+            self.compact_level0_phase1(phase1_layers_locked, stats, target_file_size, &ctx)
+                .instrument(phase1_span)
+                .await?
+        };
+
+        if new_layers.is_empty() && deltas_to_compact.is_empty() {
+            // nothing to do
+            return Ok(());
+        }
+
+        self.finish_compact_batch(&new_layers, &Vec::new(), &deltas_to_compact)
+            .await?;
+        Ok(())
+    }
+
+    /// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment.
+    async fn compact_level0_phase1(
+        self: &Arc<Self>,
+        guard: tokio::sync::OwnedRwLockReadGuard<LayerManager>,
+        mut stats: CompactLevel0Phase1StatsBuilder,
+        target_file_size: u64,
+        ctx: &RequestContext,
+    ) -> Result<CompactLevel0Phase1Result, CompactionError> {
+        stats.read_lock_held_spawn_blocking_startup_micros =
+            stats.read_lock_acquisition_micros.till_now(); // set by caller
+        let layers = guard.layer_map();
+        let level0_deltas = layers.get_level0_deltas()?;
+        let mut level0_deltas = level0_deltas
+            .into_iter()
+            .map(|x| guard.get_from_desc(&x))
+            .collect_vec();
+        stats.level0_deltas_count = Some(level0_deltas.len());
+        // Only compact if enough layers have accumulated.
+        let threshold = self.get_compaction_threshold();
+        if level0_deltas.is_empty() || level0_deltas.len() < threshold {
+            debug!(
+                level0_deltas = level0_deltas.len(),
+                threshold, "too few deltas to compact"
+            );
+            return Ok(CompactLevel0Phase1Result::default());
+        }
+
+        // This failpoint is used together with `test_duplicate_layers` integration test.
+        // It returns the compaction result exactly the same layers as input to compaction.
+        // We want to ensure that this will not cause any problem when updating the layer map
+        // after the compaction is finished.
+        //
+        // Currently, there are two rare edge cases that will cause duplicated layers being
+        // inserted.
+        // 1. The compaction job is inturrupted / did not finish successfully. Assume we have file 1, 2, 3, 4, which
+        //    is compacted to 5, but the page server is shut down, next time we start page server we will get a layer
+        //    map containing 1, 2, 3, 4, and 5, whereas 5 has the same content as 4. If we trigger L0 compation at this
+        //    point again, it is likely that we will get a file 6 which has the same content and the key range as 5,
+        //    and this causes an overwrite. This is acceptable because the content is the same, and we should do a
+        //    layer replace instead of the normal remove / upload process.
+        // 2. The input workload pattern creates exactly n files that are sorted, non-overlapping and is of target file
+        //    size length. Compaction will likely create the same set of n files afterwards.
+        //
+        // This failpoint is a superset of both of the cases.
+        if cfg!(feature = "testing") {
+            let active = (|| {
+                ::fail::fail_point!("compact-level0-phase1-return-same", |_| true);
+                false
+            })();
+
+            if active {
+                let mut new_layers = Vec::with_capacity(level0_deltas.len());
+                for delta in &level0_deltas {
+                    // we are just faking these layers as being produced again for this failpoint
+                    new_layers.push(
+                        delta
+                            .download_and_keep_resident()
+                            .await
+                            .context("download layer for failpoint")?,
+                    );
+                }
+                tracing::info!("compact-level0-phase1-return-same"); // so that we can check if we hit the failpoint
+                return Ok(CompactLevel0Phase1Result {
+                    new_layers,
+                    deltas_to_compact: level0_deltas,
+                });
+            }
+        }
+
+        // Gather the files to compact in this iteration.
+        //
+        // Start with the oldest Level 0 delta file, and collect any other
+        // level 0 files that form a contiguous sequence, such that the end
+        // LSN of previous file matches the start LSN of the next file.
+        //
+        // Note that if the files don't form such a sequence, we might
+        // "compact" just a single file. That's a bit pointless, but it allows
+        // us to get rid of the level 0 file, and compact the other files on
+        // the next iteration. This could probably made smarter, but such
+        // "gaps" in the sequence of level 0 files should only happen in case
+        // of a crash, partial download from cloud storage, or something like
+        // that, so it's not a big deal in practice.
+        level0_deltas.sort_by_key(|l| l.layer_desc().lsn_range.start);
+        let mut level0_deltas_iter = level0_deltas.iter();
+
+        let first_level0_delta = level0_deltas_iter.next().unwrap();
+        let mut prev_lsn_end = first_level0_delta.layer_desc().lsn_range.end;
+        let mut deltas_to_compact = Vec::with_capacity(level0_deltas.len());
+
+        deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?);
+        for l in level0_deltas_iter {
+            let lsn_range = &l.layer_desc().lsn_range;
+
+            if lsn_range.start != prev_lsn_end {
+                break;
+            }
+            deltas_to_compact.push(l.download_and_keep_resident().await?);
+            prev_lsn_end = lsn_range.end;
+        }
+        let lsn_range = Range {
+            start: deltas_to_compact
+                .first()
+                .unwrap()
+                .layer_desc()
+                .lsn_range
+                .start,
+            end: deltas_to_compact.last().unwrap().layer_desc().lsn_range.end,
+        };
+
+        info!(
+            "Starting Level0 compaction in LSN range {}-{} for {} layers ({} deltas in total)",
+            lsn_range.start,
+            lsn_range.end,
+            deltas_to_compact.len(),
+            level0_deltas.len()
+        );
+
+        for l in deltas_to_compact.iter() {
+            info!("compact includes {l}");
+        }
+
+        // We don't need the original list of layers anymore. Drop it so that
+        // we don't accidentally use it later in the function.
+        drop(level0_deltas);
+
+        stats.read_lock_held_prerequisites_micros = stats
+            .read_lock_held_spawn_blocking_startup_micros
+            .till_now();
+
+        // Determine N largest holes where N is number of compacted layers.
+        let max_holes = deltas_to_compact.len();
+        let last_record_lsn = self.get_last_record_lsn();
+        let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128;
+        let min_hole_coverage_size = 3; // TODO: something more flexible?
+
+        // min-heap (reserve space for one more element added before eviction)
+        let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
+        let mut prev: Option<Key> = None;
+
+        let mut all_keys = Vec::new();
+
+        for l in deltas_to_compact.iter() {
+            all_keys.extend(l.load_keys(ctx).await?);
+        }
+
+        // FIXME: should spawn_blocking the rest of this function
+
+        // The current stdlib sorting implementation is designed in a way where it is
+        // particularly fast where the slice is made up of sorted sub-ranges.
+        all_keys.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn));
+
+        stats.read_lock_held_key_sort_micros = stats.read_lock_held_prerequisites_micros.till_now();
+
+        for &DeltaEntry { key: next_key, .. } in all_keys.iter() {
+            if let Some(prev_key) = prev {
+                // just first fast filter
+                if next_key.to_i128() - prev_key.to_i128() >= min_hole_range {
+                    let key_range = prev_key..next_key;
+                    // Measuring hole by just subtraction of i128 representation of key range boundaries
+                    // has not so much sense, because largest holes will corresponds field1/field2 changes.
+                    // But we are mostly interested to eliminate holes which cause generation of excessive image layers.
+                    // That is why it is better to measure size of hole as number of covering image layers.
+                    let coverage_size = layers.image_coverage(&key_range, last_record_lsn).len();
+                    if coverage_size >= min_hole_coverage_size {
+                        heap.push(Hole {
+                            key_range,
+                            coverage_size,
+                        });
+                        if heap.len() > max_holes {
+                            heap.pop(); // remove smallest hole
+                        }
+                    }
+                }
+            }
+            prev = Some(next_key.next());
+        }
+        stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now();
+        drop_rlock(guard);
+        stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now();
+        let mut holes = heap.into_vec();
+        holes.sort_unstable_by_key(|hole| hole.key_range.start);
+        let mut next_hole = 0; // index of next hole in holes vector
+
+        // This iterator walks through all key-value pairs from all the layers
+        // we're compacting, in key, LSN order.
+        let all_values_iter = all_keys.iter();
+
+        // This iterator walks through all keys and is needed to calculate size used by each key
+        let mut all_keys_iter = all_keys
+            .iter()
+            .map(|DeltaEntry { key, lsn, size, .. }| (*key, *lsn, *size))
+            .coalesce(|mut prev, cur| {
+                // Coalesce keys that belong to the same key pair.
+                // This ensures that compaction doesn't put them
+                // into different layer files.
+                // Still limit this by the target file size,
+                // so that we keep the size of the files in
+                // check.
+                if prev.0 == cur.0 && prev.2 < target_file_size {
+                    prev.2 += cur.2;
+                    Ok(prev)
+                } else {
+                    Err((prev, cur))
+                }
+            });
+
+        // Merge the contents of all the input delta layers into a new set
+        // of delta layers, based on the current partitioning.
+        //
+        // We split the new delta layers on the key dimension. We iterate through the key space, and for each key, check if including the next key to the current output layer we're building would cause the layer to become too large. If so, dump the current output layer and start new one.
+        // It's possible that there is a single key with so many page versions that storing all of them in a single layer file
+        // would be too large. In that case, we also split on the LSN dimension.
+        //
+        // LSN
+        //  ^
+        //  |
+        //  | +-----------+            +--+--+--+--+
+        //  | |           |            |  |  |  |  |
+        //  | +-----------+            |  |  |  |  |
+        //  | |           |            |  |  |  |  |
+        //  | +-----------+     ==>    |  |  |  |  |
+        //  | |           |            |  |  |  |  |
+        //  | +-----------+            |  |  |  |  |
+        //  | |           |            |  |  |  |  |
+        //  | +-----------+            +--+--+--+--+
+        //  |
+        //  +--------------> key
+        //
+        //
+        // If one key (X) has a lot of page versions:
+        //
+        // LSN
+        //  ^
+        //  |                                 (X)
+        //  | +-----------+            +--+--+--+--+
+        //  | |           |            |  |  |  |  |
+        //  | +-----------+            |  |  +--+  |
+        //  | |           |            |  |  |  |  |
+        //  | +-----------+     ==>    |  |  |  |  |
+        //  | |           |            |  |  +--+  |
+        //  | +-----------+            |  |  |  |  |
+        //  | |           |            |  |  |  |  |
+        //  | +-----------+            +--+--+--+--+
+        //  |
+        //  +--------------> key
+        // TODO: this actually divides the layers into fixed-size chunks, not
+        // based on the partitioning.
+        //
+        // TODO: we should also opportunistically materialize and
+        // garbage collect what we can.
+        let mut new_layers = Vec::new();
+        let mut prev_key: Option<Key> = None;
+        let mut writer: Option<DeltaLayerWriter> = None;
+        let mut key_values_total_size = 0u64;
+        let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key
+        let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key
+
+        for &DeltaEntry {
+            key, lsn, ref val, ..
+        } in all_values_iter
+        {
+            let value = val.load(ctx).await?;
+            let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
+            // We need to check key boundaries once we reach next key or end of layer with the same key
+            if !same_key || lsn == dup_end_lsn {
+                let mut next_key_size = 0u64;
+                let is_dup_layer = dup_end_lsn.is_valid();
+                dup_start_lsn = Lsn::INVALID;
+                if !same_key {
+                    dup_end_lsn = Lsn::INVALID;
+                }
+                // Determine size occupied by this key. We stop at next key or when size becomes larger than target_file_size
+                for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() {
+                    next_key_size = next_size;
+                    if key != next_key {
+                        if dup_end_lsn.is_valid() {
+                            // We are writting segment with duplicates:
+                            // place all remaining values of this key in separate segment
+                            dup_start_lsn = dup_end_lsn; // new segments starts where old stops
+                            dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range
+                        }
+                        break;
+                    }
+                    key_values_total_size += next_size;
+                    // Check if it is time to split segment: if total keys size is larger than target file size.
+                    // We need to avoid generation of empty segments if next_size > target_file_size.
+                    if key_values_total_size > target_file_size && lsn != next_lsn {
+                        // Split key between multiple layers: such layer can contain only single key
+                        dup_start_lsn = if dup_end_lsn.is_valid() {
+                            dup_end_lsn // new segment with duplicates starts where old one stops
+                        } else {
+                            lsn // start with the first LSN for this key
+                        };
+                        dup_end_lsn = next_lsn; // upper LSN boundary is exclusive
+                        break;
+                    }
+                }
+                // handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set.
+                if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() {
+                    dup_start_lsn = dup_end_lsn;
+                    dup_end_lsn = lsn_range.end;
+                }
+                if writer.is_some() {
+                    let written_size = writer.as_mut().unwrap().size();
+                    let contains_hole =
+                        next_hole < holes.len() && key >= holes[next_hole].key_range.end;
+                    // check if key cause layer overflow or contains hole...
+                    if is_dup_layer
+                        || dup_end_lsn.is_valid()
+                        || written_size + key_values_total_size > target_file_size
+                        || contains_hole
+                    {
+                        // ... if so, flush previous layer and prepare to write new one
+                        new_layers.push(
+                            writer
+                                .take()
+                                .unwrap()
+                                .finish(prev_key.unwrap().next(), self)
+                                .await?,
+                        );
+                        writer = None;
+
+                        if contains_hole {
+                            // skip hole
+                            next_hole += 1;
+                        }
+                    }
+                }
+                // Remember size of key value because at next iteration we will access next item
+                key_values_total_size = next_key_size;
+            }
+            fail_point!("delta-layer-writer-fail-before-finish", |_| {
+                Err(CompactionError::Other(anyhow::anyhow!(
+                    "failpoint delta-layer-writer-fail-before-finish"
+                )))
+            });
+
+            if !self.shard_identity.is_key_disposable(&key) {
+                if writer.is_none() {
+                    // Create writer if not initiaized yet
+                    writer = Some(
+                        DeltaLayerWriter::new(
+                            self.conf,
+                            self.timeline_id,
+                            self.tenant_shard_id,
+                            key,
+                            if dup_end_lsn.is_valid() {
+                                // this is a layer containing slice of values of the same key
+                                debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn);
+                                dup_start_lsn..dup_end_lsn
+                            } else {
+                                debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
+                                lsn_range.clone()
+                            },
+                        )
+                        .await?,
+                    );
+                }
+
+                writer.as_mut().unwrap().put_value(key, lsn, value).await?;
+            } else {
+                debug!(
+                    "Dropping key {} during compaction (it belongs on shard {:?})",
+                    key,
+                    self.shard_identity.get_shard_number(&key)
+                );
+            }
+
+            if !new_layers.is_empty() {
+                fail_point!("after-timeline-compacted-first-L1");
+            }
+
+            prev_key = Some(key);
+        }
+        if let Some(writer) = writer {
+            new_layers.push(writer.finish(prev_key.unwrap().next(), self).await?);
+        }
+
+        // Sync layers
+        if !new_layers.is_empty() {
+            // Print a warning if the created layer is larger than double the target size
+            // Add two pages for potential overhead. This should in theory be already
+            // accounted for in the target calculation, but for very small targets,
+            // we still might easily hit the limit otherwise.
+            let warn_limit = target_file_size * 2 + page_cache::PAGE_SZ as u64 * 2;
+            for layer in new_layers.iter() {
+                if layer.layer_desc().file_size > warn_limit {
+                    warn!(
+                        %layer,
+                        "created delta file of size {} larger than double of target of {target_file_size}", layer.layer_desc().file_size
+                    );
+                }
+            }
+
+            // The writer.finish() above already did the fsync of the inodes.
+            // We just need to fsync the directory in which these inodes are linked,
+            // which we know to be the timeline directory.
+            //
+            // We use fatal_err() below because the after writer.finish() returns with success,
+            // the in-memory state of the filesystem already has the layer file in its final place,
+            // and subsequent pageserver code could think it's durable while it really isn't.
+            let timeline_dir = VirtualFile::open(
+                &self
+                    .conf
+                    .timeline_path(&self.tenant_shard_id, &self.timeline_id),
+            )
+            .await
+            .fatal_err("VirtualFile::open for timeline dir fsync");
+            timeline_dir
+                .sync_all()
+                .await
+                .fatal_err("VirtualFile::sync_all timeline dir");
+        }
+
+        stats.write_layer_files_micros = stats.read_lock_drop_micros.till_now();
+        stats.new_deltas_count = Some(new_layers.len());
+        stats.new_deltas_size = Some(new_layers.iter().map(|l| l.layer_desc().file_size).sum());
+
+        match TryInto::<CompactLevel0Phase1Stats>::try_into(stats)
+            .and_then(|stats| serde_json::to_string(&stats).context("serde_json::to_string"))
+        {
+            Ok(stats_json) => {
+                info!(
+                    stats_json = stats_json.as_str(),
+                    "compact_level0_phase1 stats available"
+                )
+            }
+            Err(e) => {
+                warn!("compact_level0_phase1 stats failed to serialize: {:#}", e);
+            }
+        }
+
+        Ok(CompactLevel0Phase1Result {
+            new_layers,
+            deltas_to_compact: deltas_to_compact
+                .into_iter()
+                .map(|x| x.drop_eviction_guard())
+                .collect::<Vec<_>>(),
+        })
+    }
+}
+
+#[derive(Default)]
+struct CompactLevel0Phase1Result {
+    new_layers: Vec<ResidentLayer>,
+    deltas_to_compact: Vec<Layer>,
+}
+
+#[derive(Default)]
+struct CompactLevel0Phase1StatsBuilder {
+    version: Option<u64>,
+    tenant_id: Option<TenantShardId>,
+    timeline_id: Option<TimelineId>,
+    read_lock_acquisition_micros: DurationRecorder,
+    read_lock_held_spawn_blocking_startup_micros: DurationRecorder,
+    read_lock_held_key_sort_micros: DurationRecorder,
+    read_lock_held_prerequisites_micros: DurationRecorder,
+    read_lock_held_compute_holes_micros: DurationRecorder,
+    read_lock_drop_micros: DurationRecorder,
+    write_layer_files_micros: DurationRecorder,
+    level0_deltas_count: Option<usize>,
+    new_deltas_count: Option<usize>,
+    new_deltas_size: Option<u64>,
+}
+
+#[derive(serde::Serialize)]
+struct CompactLevel0Phase1Stats {
+    version: u64,
+    tenant_id: TenantShardId,
+    timeline_id: TimelineId,
+    read_lock_acquisition_micros: RecordedDuration,
+    read_lock_held_spawn_blocking_startup_micros: RecordedDuration,
+    read_lock_held_key_sort_micros: RecordedDuration,
+    read_lock_held_prerequisites_micros: RecordedDuration,
+    read_lock_held_compute_holes_micros: RecordedDuration,
+    read_lock_drop_micros: RecordedDuration,
+    write_layer_files_micros: RecordedDuration,
+    level0_deltas_count: usize,
+    new_deltas_count: usize,
+    new_deltas_size: u64,
+}
+
+impl TryFrom<CompactLevel0Phase1StatsBuilder> for CompactLevel0Phase1Stats {
+    type Error = anyhow::Error;
+
+    fn try_from(value: CompactLevel0Phase1StatsBuilder) -> Result<Self, Self::Error> {
+        Ok(Self {
+            version: value.version.ok_or_else(|| anyhow!("version not set"))?,
+            tenant_id: value
+                .tenant_id
+                .ok_or_else(|| anyhow!("tenant_id not set"))?,
+            timeline_id: value
+                .timeline_id
+                .ok_or_else(|| anyhow!("timeline_id not set"))?,
+            read_lock_acquisition_micros: value
+                .read_lock_acquisition_micros
+                .into_recorded()
+                .ok_or_else(|| anyhow!("read_lock_acquisition_micros not set"))?,
+            read_lock_held_spawn_blocking_startup_micros: value
+                .read_lock_held_spawn_blocking_startup_micros
+                .into_recorded()
+                .ok_or_else(|| anyhow!("read_lock_held_spawn_blocking_startup_micros not set"))?,
+            read_lock_held_key_sort_micros: value
+                .read_lock_held_key_sort_micros
+                .into_recorded()
+                .ok_or_else(|| anyhow!("read_lock_held_key_sort_micros not set"))?,
+            read_lock_held_prerequisites_micros: value
+                .read_lock_held_prerequisites_micros
+                .into_recorded()
+                .ok_or_else(|| anyhow!("read_lock_held_prerequisites_micros not set"))?,
+            read_lock_held_compute_holes_micros: value
+                .read_lock_held_compute_holes_micros
+                .into_recorded()
+                .ok_or_else(|| anyhow!("read_lock_held_compute_holes_micros not set"))?,
+            read_lock_drop_micros: value
+                .read_lock_drop_micros
+                .into_recorded()
+                .ok_or_else(|| anyhow!("read_lock_drop_micros not set"))?,
+            write_layer_files_micros: value
+                .write_layer_files_micros
+                .into_recorded()
+                .ok_or_else(|| anyhow!("write_layer_files_micros not set"))?,
+            level0_deltas_count: value
+                .level0_deltas_count
+                .ok_or_else(|| anyhow!("level0_deltas_count not set"))?,
+            new_deltas_count: value
+                .new_deltas_count
+                .ok_or_else(|| anyhow!("new_deltas_count not set"))?,
+            new_deltas_size: value
+                .new_deltas_size
+                .ok_or_else(|| anyhow!("new_deltas_size not set"))?,
+        })
+    }
+}
+
 impl Timeline {
    /// Entry point for new tiered compaction algorithm.
    ///
@@ -134,7 +830,6 @@ struct ResidentDeltaLayer(ResidentLayer);
 #[derive(Clone)]
 struct ResidentImageLayer(ResidentLayer);

-#[async_trait]
 impl CompactionJobExecutor for TimelineAdaptor {
    type Key = crate::repository::Key;

--- a/pageserver/src/tenant/vectored_blob_io.rs
+++ b/pageserver/src/tenant/vectored_blob_io.rs
@@ -128,7 +128,7 @@ impl VectoredReadBuilder {
 pub enum BlobFlag {
    None,
    Ignore,
-    Replaces,
+    ReplaceAll,
 }

 /// Planner for vectored blob reads.
@@ -170,7 +170,7 @@ impl VectoredReadPlanner {
    /// incorrect data to the user.
    ///
    /// The `flag` argument has two interesting values:
-    /// * [`BlobFlag::Replaces`]: The blob for this key should replace all existing blobs.
+    /// * [`BlobFlag::ReplaceAll`]: The blob for this key should replace all existing blobs.
    /// This is used for WAL records that `will_init`.
    /// * [`BlobFlag::Ignore`]: This blob should not be included in the read. This happens
    /// if the blob is cached.
@@ -204,7 +204,7 @@ impl VectoredReadPlanner {
                let blobs_for_key = self.blobs.entry(key).or_default();
                blobs_for_key.push((lsn, start_offset, end_offset));
            }
-            BlobFlag::Replaces => {
+            BlobFlag::ReplaceAll => {
                let blobs_for_key = self.blobs.entry(key).or_default();
                blobs_for_key.clear();
                blobs_for_key.push((lsn, start_offset, end_offset));
@@ -411,10 +411,10 @@ mod tests {
        let blob_descriptions = vec![
            (first_key, lsn, 0, BlobFlag::None),    // First in read 1
            (first_key, lsn, 1024, BlobFlag::None), // Last in read 1
-            (second_key, lsn, 2 * 1024, BlobFlag::Replaces),
+            (second_key, lsn, 2 * 1024, BlobFlag::ReplaceAll),
            (second_key, lsn, 3 * 1024, BlobFlag::None),
-            (second_key, lsn, 4 * 1024, BlobFlag::Replaces), // First in read 2
-            (second_key, lsn, 5 * 1024, BlobFlag::None),     // Last in read 2
+            (second_key, lsn, 4 * 1024, BlobFlag::ReplaceAll), // First in read 2
+            (second_key, lsn, 5 * 1024, BlobFlag::None),       // Last in read 2
        ];

        let ranges = [&blob_descriptions[0..2], &blob_descriptions[4..]];
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -17,20 +17,21 @@ use crate::tenant::TENANTS_SEGMENT_NAME;
 use camino::{Utf8Path, Utf8PathBuf};
 use once_cell::sync::OnceCell;
 use pageserver_api::shard::TenantShardId;
-use std::fs::{self, File};
+use std::fs::File;
 use std::io::{Error, ErrorKind, Seek, SeekFrom};
 use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice};

 use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd};
-use std::os::unix::fs::FileExt;
 use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
 use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
 use tokio::time::Instant;

 pub use pageserver_api::models::virtual_file as api;
 pub(crate) mod io_engine;
+mod metadata;
 mod open_options;
 pub(crate) use io_engine::IoEngineKind;
+pub(crate) use metadata::Metadata;
 pub(crate) use open_options::*;

 ///
@@ -435,13 +436,25 @@ impl VirtualFile {

    /// Call File::sync_all() on the underlying File.
    pub async fn sync_all(&self) -> Result<(), Error> {
-        with_file!(self, StorageIoOperation::Fsync, |file_guard| file_guard
-            .with_std_file(|std_file| std_file.sync_all()))
+        with_file!(self, StorageIoOperation::Fsync, |file_guard| {
+            let (_file_guard, res) = io_engine::get().sync_all(file_guard).await;
+            res
+        })
    }

-    pub async fn metadata(&self) -> Result<fs::Metadata, Error> {
-        with_file!(self, StorageIoOperation::Metadata, |file_guard| file_guard
-            .with_std_file(|std_file| std_file.metadata()))
+    /// Call File::sync_data() on the underlying File.
+    pub async fn sync_data(&self) -> Result<(), Error> {
+        with_file!(self, StorageIoOperation::Fsync, |file_guard| {
+            let (_file_guard, res) = io_engine::get().sync_data(file_guard).await;
+            res
+        })
+    }
+
+    pub async fn metadata(&self) -> Result<Metadata, Error> {
+        with_file!(self, StorageIoOperation::Metadata, |file_guard| {
+            let (_file_guard, res) = io_engine::get().metadata(file_guard).await;
+            res
+        })
    }

    /// Helper function internal to `VirtualFile` that looks up the underlying File,
@@ -579,7 +592,7 @@ impl VirtualFile {
    }

    // Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#219-235
-    pub async fn write_all_at<B: BoundedBuf>(
+    pub async fn write_all_at<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
        &self,
        buf: B,
        mut offset: u64,
@@ -590,8 +603,9 @@ impl VirtualFile {
        }
        let mut buf = buf.slice(0..buf_len);
        while !buf.is_empty() {
-            // TODO: push `buf` further down
-            match self.write_at(&buf, offset).await {
+            let res;
+            (buf, res) = self.write_at(buf, offset).await;
+            match res {
                Ok(0) => {
                    return (
                        Slice::into_inner(buf),
@@ -605,7 +619,7 @@ impl VirtualFile {
                    buf = buf.slice(n..);
                    offset += n as u64;
                }
-                Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
+                Err(e) if e.kind() == std::io::ErrorKind::Interrupted => {}
                Err(e) => return (Slice::into_inner(buf), Err(e)),
            }
        }
@@ -616,15 +630,19 @@ impl VirtualFile {
    /// Returns the IoBuf that is underlying the BoundedBuf `buf`.
    /// I.e., the returned value's `bytes_init()` method returns something different than the `bytes_init()` that was passed in.
    /// It's quite brittle and easy to mis-use, so, we return the size in the Ok() variant.
-    pub async fn write_all<B: BoundedBuf>(&mut self, buf: B) -> (B::Buf, Result<usize, Error>) {
+    pub async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
+        &mut self,
+        buf: B,
+    ) -> (B::Buf, Result<usize, Error>) {
        let nbytes = buf.bytes_init();
        if nbytes == 0 {
            return (Slice::into_inner(buf.slice_full()), Ok(0));
        }
        let mut buf = buf.slice(0..nbytes);
        while !buf.is_empty() {
-            // TODO: push `Slice` further down
-            match self.write(&buf).await {
+            let res;
+            (buf, res) = self.write(buf).await;
+            match res {
                Ok(0) => {
                    return (
                        Slice::into_inner(buf),
@@ -644,11 +662,18 @@ impl VirtualFile {
        (Slice::into_inner(buf), Ok(nbytes))
    }

-    async fn write(&mut self, buf: &[u8]) -> Result<usize, std::io::Error> {
+    async fn write<B: IoBuf + Send>(
+        &mut self,
+        buf: Slice<B>,
+    ) -> (Slice<B>, Result<usize, std::io::Error>) {
        let pos = self.pos;
-        let n = self.write_at(buf, pos).await?;
+        let (buf, res) = self.write_at(buf, pos).await;
+        let n = match res {
+            Ok(n) => n,
+            Err(e) => return (buf, Err(e)),
+        };
        self.pos += n as u64;
-        Ok(n)
+        (buf, Ok(n))
    }

    pub(crate) async fn read_at<B>(&self, buf: B, offset: u64) -> (B, Result<usize, Error>)
@@ -676,16 +701,30 @@ impl VirtualFile {
        })
    }

-    async fn write_at(&self, buf: &[u8], offset: u64) -> Result<usize, Error> {
-        let result = with_file!(self, StorageIoOperation::Write, |file_guard| {
-            file_guard.with_std_file(|std_file| std_file.write_at(buf, offset))
-        });
-        if let Ok(size) = result {
-            STORAGE_IO_SIZE
-                .with_label_values(&["write", &self.tenant_id, &self.shard_id, &self.timeline_id])
-                .add(size as i64);
-        }
-        result
+    async fn write_at<B: IoBuf + Send>(
+        &self,
+        buf: Slice<B>,
+        offset: u64,
+    ) -> (Slice<B>, Result<usize, Error>) {
+        let file_guard = match self.lock_file().await {
+            Ok(file_guard) => file_guard,
+            Err(e) => return (buf, Err(e)),
+        };
+        observe_duration!(StorageIoOperation::Write, {
+            let ((_file_guard, buf), result) =
+                io_engine::get().write_at(file_guard, offset, buf).await;
+            if let Ok(size) = result {
+                STORAGE_IO_SIZE
+                    .with_label_values(&[
+                        "write",
+                        &self.tenant_id,
+                        &self.shard_id,
+                        &self.timeline_id,
+                    ])
+                    .add(size as i64);
+            }
+            (buf, result)
+        })
    }
 }

@@ -1083,6 +1122,7 @@ mod tests {
    use rand::Rng;
    use std::future::Future;
    use std::io::Write;
+    use std::os::unix::fs::FileExt;
    use std::sync::Arc;

    enum MaybeVirtualFile {
@@ -1103,7 +1143,11 @@ mod tests {
                MaybeVirtualFile::File(file) => file.read_exact_at(&mut buf, offset).map(|()| buf),
            }
        }
-        async fn write_all_at<B: BoundedBuf>(&self, buf: B, offset: u64) -> Result<(), Error> {
+        async fn write_all_at<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
+            &self,
+            buf: B,
+            offset: u64,
+        ) -> Result<(), Error> {
            match self {
                MaybeVirtualFile::VirtualFile(file) => {
                    let (_buf, res) = file.write_all_at(buf, offset).await;
@@ -1124,7 +1168,10 @@ mod tests {
                MaybeVirtualFile::File(file) => file.seek(pos),
            }
        }
-        async fn write_all<B: BoundedBuf>(&mut self, buf: B) -> Result<(), Error> {
+        async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
+            &mut self,
+            buf: B,
+        ) -> Result<(), Error> {
            match self {
                MaybeVirtualFile::VirtualFile(file) => {
                    let (_buf, res) = file.write_all(buf).await;
--- a/pageserver/src/virtual_file/io_engine.rs
+++ b/pageserver/src/virtual_file/io_engine.rs
@@ -7,6 +7,9 @@
 //!
 //! Then use [`get`] and  [`super::OpenOptions`].

+use tokio_epoll_uring::{IoBuf, Slice};
+use tracing::Instrument;
+
 pub(crate) use super::api::IoEngineKind;
 #[derive(Clone, Copy)]
 #[repr(u8)]
@@ -61,7 +64,8 @@ pub(super) fn init(engine_kind: IoEngineKind) {
    set(engine_kind);
 }

-pub(super) fn get() -> IoEngine {
+/// Longer-term, this API should only be used by [`super::VirtualFile`].
+pub(crate) fn get() -> IoEngine {
    let cur = IoEngine::try_from(IO_ENGINE.load(Ordering::Relaxed)).unwrap();
    if cfg!(test) {
        let env_var_name = "NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE";
@@ -98,7 +102,17 @@ use std::{
    sync::atomic::{AtomicU8, Ordering},
 };

-use super::FileGuard;
+use super::{FileGuard, Metadata};
+
+#[cfg(target_os = "linux")]
+fn epoll_uring_error_to_std(e: tokio_epoll_uring::Error<std::io::Error>) -> std::io::Error {
+    match e {
+        tokio_epoll_uring::Error::Op(e) => e,
+        tokio_epoll_uring::Error::System(system) => {
+            std::io::Error::new(std::io::ErrorKind::Other, system)
+        }
+    }
+}

 impl IoEngine {
    pub(super) async fn read_at<B>(
@@ -133,16 +147,109 @@ impl IoEngine {
            IoEngine::TokioEpollUring => {
                let system = tokio_epoll_uring::thread_local_system().await;
                let (resources, res) = system.read(file_guard, offset, buf).await;
+                (resources, res.map_err(epoll_uring_error_to_std))
+            }
+        }
+    }
+    pub(super) async fn sync_all(&self, file_guard: FileGuard) -> (FileGuard, std::io::Result<()>) {
+        match self {
+            IoEngine::NotSet => panic!("not initialized"),
+            IoEngine::StdFs => {
+                let res = file_guard.with_std_file(|std_file| std_file.sync_all());
+                (file_guard, res)
+            }
+            #[cfg(target_os = "linux")]
+            IoEngine::TokioEpollUring => {
+                let system = tokio_epoll_uring::thread_local_system().await;
+                let (resources, res) = system.fsync(file_guard).await;
+                (resources, res.map_err(epoll_uring_error_to_std))
+            }
+        }
+    }
+    pub(super) async fn sync_data(
+        &self,
+        file_guard: FileGuard,
+    ) -> (FileGuard, std::io::Result<()>) {
+        match self {
+            IoEngine::NotSet => panic!("not initialized"),
+            IoEngine::StdFs => {
+                let res = file_guard.with_std_file(|std_file| std_file.sync_data());
+                (file_guard, res)
+            }
+            #[cfg(target_os = "linux")]
+            IoEngine::TokioEpollUring => {
+                let system = tokio_epoll_uring::thread_local_system().await;
+                let (resources, res) = system.fdatasync(file_guard).await;
+                (resources, res.map_err(epoll_uring_error_to_std))
+            }
+        }
+    }
+    pub(super) async fn metadata(
+        &self,
+        file_guard: FileGuard,
+    ) -> (FileGuard, std::io::Result<Metadata>) {
+        match self {
+            IoEngine::NotSet => panic!("not initialized"),
+            IoEngine::StdFs => {
+                let res =
+                    file_guard.with_std_file(|std_file| std_file.metadata().map(Metadata::from));
+                (file_guard, res)
+            }
+            #[cfg(target_os = "linux")]
+            IoEngine::TokioEpollUring => {
+                let system = tokio_epoll_uring::thread_local_system().await;
+                let (resources, res) = system.statx(file_guard).await;
                (
                    resources,
-                    res.map_err(|e| match e {
-                        tokio_epoll_uring::Error::Op(e) => e,
-                        tokio_epoll_uring::Error::System(system) => {
-                            std::io::Error::new(std::io::ErrorKind::Other, system)
-                        }
-                    }),
+                    res.map_err(epoll_uring_error_to_std).map(Metadata::from),
                )
            }
        }
    }
+    pub(super) async fn write_at<B: IoBuf + Send>(
+        &self,
+        file_guard: FileGuard,
+        offset: u64,
+        buf: Slice<B>,
+    ) -> ((FileGuard, Slice<B>), std::io::Result<usize>) {
+        match self {
+            IoEngine::NotSet => panic!("not initialized"),
+            IoEngine::StdFs => {
+                let result = file_guard.with_std_file(|std_file| std_file.write_at(&buf, offset));
+                ((file_guard, buf), result)
+            }
+            #[cfg(target_os = "linux")]
+            IoEngine::TokioEpollUring => {
+                let system = tokio_epoll_uring::thread_local_system().await;
+                let (resources, res) = system.write(file_guard, offset, buf).await;
+                (resources, res.map_err(epoll_uring_error_to_std))
+            }
+        }
+    }
+
+    /// If we switch a user of [`tokio::fs`] to use [`super::io_engine`],
+    /// they'd start blocking the executor thread if [`IoEngine::StdFs`] is configured
+    /// whereas before the switch to [`super::io_engine`], that wasn't the case.
+    /// This method helps avoid such a regression.
+    ///
+    /// Panics if the `spawn_blocking` fails, see [`tokio::task::JoinError`] for reasons why that can happen.
+    pub(crate) async fn spawn_blocking_and_block_on_if_std<Fut, R>(&self, work: Fut) -> R
+    where
+        Fut: 'static + Send + std::future::Future<Output = R>,
+        R: 'static + Send,
+    {
+        match self {
+            IoEngine::NotSet => panic!("not initialized"),
+            IoEngine::StdFs => {
+                let span = tracing::info_span!("spawn_blocking_block_on_if_std");
+                tokio::task::spawn_blocking({
+                    move || tokio::runtime::Handle::current().block_on(work.instrument(span))
+                })
+                .await
+                .expect("failed to join blocking code most likely it panicked, panicking as well")
+            }
+            #[cfg(target_os = "linux")]
+            IoEngine::TokioEpollUring => work.await,
+        }
+    }
 }
--- a/pageserver/src/virtual_file/metadata.rs
+++ b/pageserver/src/virtual_file/metadata.rs
@@ -0,0 +1,30 @@
+use std::fs;
+
+pub enum Metadata {
+    StdFs(fs::Metadata),
+    #[cfg(target_os = "linux")]
+    TokioEpollUring(Box<tokio_epoll_uring::ops::statx::statx>),
+}
+
+#[cfg(target_os = "linux")]
+impl From<Box<tokio_epoll_uring::ops::statx::statx>> for Metadata {
+    fn from(value: Box<tokio_epoll_uring::ops::statx::statx>) -> Self {
+        Metadata::TokioEpollUring(value)
+    }
+}
+
+impl From<std::fs::Metadata> for Metadata {
+    fn from(value: std::fs::Metadata) -> Self {
+        Metadata::StdFs(value)
+    }
+}
+
+impl Metadata {
+    pub fn len(&self) -> u64 {
+        match self {
+            Metadata::StdFs(metadata) => metadata.len(),
+            #[cfg(target_os = "linux")]
+            Metadata::TokioEpollUring(statx) => statx.stx_size,
+        }
+    }
+}
--- a/pgxn/neon/control_plane_connector.c
+++ b/pgxn/neon/control_plane_connector.c
@@ -35,6 +35,7 @@
 #include "utils/memutils.h"
 #include "utils/jsonb.h"

+#include "control_plane_connector.h"
 #include "neon_utils.h"

 static ProcessUtility_hook_type PreviousProcessUtilityHook = NULL;
@@ -113,6 +114,8 @@ ConstructDeltaMessage()
 	if (RootTable.db_table)
 	{
 		JsonbValue	dbs;
+		HASH_SEQ_STATUS status;
+		DbEntry    *entry;

 		dbs.type = jbvString;
 		dbs.val.string.val = "dbs";
@@ -120,9 +123,6 @@ ConstructDeltaMessage()
 		pushJsonbValue(&state, WJB_KEY, &dbs);
 		pushJsonbValue(&state, WJB_BEGIN_ARRAY, NULL);

-		HASH_SEQ_STATUS status;
-		DbEntry    *entry;
-
 		hash_seq_init(&status, RootTable.db_table);
 		while ((entry = hash_seq_search(&status)) != NULL)
 		{
@@ -168,8 +168,9 @@ ConstructDeltaMessage()
 #else
 				const char *logdetail;
 #endif
+				char	   *encrypted_password;
 				PushKeyValue(&state, "password", (char *) entry->password);
-				char	   *encrypted_password = get_role_password(entry->name, &logdetail);
+				encrypted_password = get_role_password(entry->name, &logdetail);

 				if (encrypted_password)
 				{
@@ -831,7 +832,7 @@ NeonProcessUtility(
 	}
 }

-extern void
+void
 InitControlPlaneConnector()
 {
 	PreviousProcessUtilityHook = ProcessUtility_hook;
--- a/pgxn/neon/control_plane_connector.h
+++ b/pgxn/neon/control_plane_connector.h
@@ -1,6 +1,6 @@
 #ifndef CONTROL_PLANE_CONNECTOR_H
 #define CONTROL_PLANE_CONNECTOR_H

-void		InitControlPlaneConnector();
+void		InitControlPlaneConnector(void);

 #endif
--- a/pgxn/neon/extension_server.c
+++ b/pgxn/neon/extension_server.c
@@ -14,6 +14,7 @@

 #include "utils/guc.h"

+#include "extension_server.h" 
 #include "neon_utils.h"

 static int	extension_server_port = 0;
--- a/pgxn/neon/extension_server.h
+++ b/pgxn/neon/extension_server.h
@@ -0,0 +1,17 @@
+/*-------------------------------------------------------------------------
+ *
+ * extension_server.h
+ *	  Request compute_ctl to download extension files.
+ *
+ * IDENTIFICATION
+ *	 contrib/neon/extension_server.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef EXTENSION_SERVER_H
+#define EXTENSION_SERVER_H
+
+void pg_init_extension_server(void);
+
+#endif							/* EXTENSION_SERVER_H */
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -29,6 +29,7 @@
 #include "utils/guc.h"
 #include "utils/wait_event.h"

+#include "extension_server.h"
 #include "neon.h"
 #include "walproposer.h"
 #include "pagestore_client.h"
--- a/pgxn/neon/neon.h
+++ b/pgxn/neon/neon.h
@@ -25,12 +25,11 @@ extern int	wal_acceptor_connection_timeout;
 extern void pg_init_libpagestore(void);
 extern void pg_init_walproposer(void);

-extern void pg_init_extension_server(void);
-
 extern uint64 BackpressureThrottlingTime(void);
 extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn);

 extern void PGDLLEXPORT WalProposerSync(int argc, char *argv[]);
 extern void PGDLLEXPORT WalProposerMain(Datum main_arg);
+PGDLLEXPORT void LogicalSlotsMonitorMain(Datum main_arg);

 #endif							/* NEON_H */
--- a/pgxn/neon/neon_utils.c
+++ b/pgxn/neon/neon_utils.c
@@ -6,6 +6,7 @@

 #include "postgres.h"

+#include "neon_utils.h"
 #include "lib/stringinfo.h"
 #include "libpq/pqformat.h"

@@ -14,7 +15,7 @@
 *
 * Returns -1 if the character is not a hexadecimal digit.
 */
-int
+static int
 HexDecodeChar(char c)
 {
 	if (c >= '0' && c <= '9')
--- a/pgxn/neon/neon_utils.h
+++ b/pgxn/neon/neon_utils.h
@@ -12,7 +12,7 @@ uint32		pq_getmsgint32_le(StringInfo msg);
 uint64		pq_getmsgint64_le(StringInfo msg);
 void		pq_sendint32_le(StringInfo buf, uint32 i);
 void		pq_sendint64_le(StringInfo buf, uint64 i);
-extern void disable_core_dump();
+void        disable_core_dump(void);

 #ifndef WALPROPOSER_LIB

--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -1460,7 +1460,7 @@ RecvAppendResponses(Safekeeper *sk)
 }

 /* Parse a PageserverFeedback message, or the PageserverFeedback part of an AppendResponse */
-void
+static void
 ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, PageserverFeedback *rf)
 {
 	uint8		nkeys;
@@ -1590,9 +1590,9 @@ GetAcknowledgedByQuorumWALPosition(WalProposer *wp)
 Safekeeper *
 GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn)
 {
-	*donor_lsn = InvalidXLogRecPtr;
 	Safekeeper *donor = NULL;
 	int			i;
+	*donor_lsn = InvalidXLogRecPtr;

 	if (wp->n_votes < wp->quorum)
 	{
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -398,7 +398,7 @@ walprop_pg_get_shmem_state(WalProposer *wp)
 	return walprop_shared;
 }

-void
+static void
 replication_feedback_set(PageserverFeedback *rf)
 {
 	SpinLockAcquire(&walprop_shared->mutex);
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -68,7 +68,6 @@ task-local-extensions.workspace = true
 thiserror.workspace = true
 tikv-jemallocator.workspace = true
 tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] }
-tls-listener.workspace = true
 tokio-postgres.workspace = true
 tokio-rustls.workspace = true
 tokio-util.workspace = true
--- a/proxy/src/auth/backend/link.rs
+++ b/proxy/src/auth/backend/link.rs
@@ -102,6 +102,8 @@ pub(super) async fn authenticate(

    ctx.set_user(db_info.user.into());
    ctx.set_project(db_info.aux.clone());
+    let cold_start_info = db_info.aux.cold_start_info.clone().unwrap_or_default();
+    info!(?cold_start_info, "woken up a compute node");

    // Backwards compatibility. pg_sni_proxy uses "--" in domain names
    // while direct connections do not. Once we migrate to pg_sni_proxy
--- a/proxy/src/console/messages.rs
+++ b/proxy/src/console/messages.rs
@@ -101,9 +101,10 @@ pub struct MetricsAuxInfo {
    pub cold_start_info: Option<ColdStartInfo>,
 }

-#[derive(Debug, Serialize, Deserialize, Clone)]
+#[derive(Debug, Default, Serialize, Deserialize, Clone)]
 #[serde(rename_all = "snake_case")]
 pub enum ColdStartInfo {
+    #[default]
    Unknown = 0,
    Warm = 1,
    PoolHit = 2,
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -259,6 +259,9 @@ impl super::Api for Api {
        }

        let node = self.do_wake_compute(ctx, user_info).await?;
+        ctx.set_project(node.aux.clone());
+        let cold_start_info = node.aux.cold_start_info.clone().unwrap_or_default();
+        info!(?cold_start_info, "woken up a compute node");
        let (_, cached) = self.caches.node_info.insert(key.clone(), node);
        info!(key = &*key, "created a cache entry for compute node info");

--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -4,7 +4,7 @@ use ::metrics::{
    register_int_gauge_vec, Histogram, HistogramVec, HyperLogLogVec, IntCounterPairVec,
    IntCounterVec, IntGauge, IntGaugeVec,
 };
-use metrics::{register_int_counter_pair, IntCounterPair};
+use metrics::{register_int_counter, register_int_counter_pair, IntCounter, IntCounterPair};

 use once_cell::sync::Lazy;
 use tokio::time;
@@ -303,3 +303,20 @@ pub static ENDPOINT_ERRORS_BY_KIND: Lazy<HyperLogLogVec<32>> = Lazy::new(|| {
    )
    .unwrap()
 });
+
+pub static REDIS_BROKEN_MESSAGES: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "proxy_redis_errors_total",
+        "Number of errors by a given classification",
+        &["channel"],
+    )
+    .unwrap()
+});
+
+pub static TLS_HANDSHAKE_FAILURES: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "proxy_tls_handshake_failures",
+        "Number of TLS handshake failures",
+    )
+    .unwrap()
+});
--- a/proxy/src/protocol2.rs
+++ b/proxy/src/protocol2.rs
@@ -1,22 +1,27 @@
 //! Proxy Protocol V2 implementation

 use std::{
-    future::poll_fn,
-    future::Future,
+    future::{poll_fn, Future},
    io,
    net::SocketAddr,
    pin::{pin, Pin},
+    sync::Mutex,
    task::{ready, Context, Poll},
 };

 use bytes::{Buf, BytesMut};
+use hyper::server::accept::Accept;
 use hyper::server::conn::{AddrIncoming, AddrStream};
+use metrics::IntCounterPairGuard;
 use pin_project_lite::pin_project;
-use tls_listener::AsyncAccept;
 use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf};
+use uuid::Uuid;
+
+use crate::{metrics::NUM_CLIENT_CONNECTION_GAUGE, serverless::tls_listener::AsyncAccept};

 pub struct ProxyProtocolAccept {
    pub incoming: AddrIncoming,
+    pub protocol: &'static str,
 }

 pin_project! {
@@ -327,7 +332,7 @@ impl<T: AsyncRead> AsyncRead for WithClientIp<T> {
 }

 impl AsyncAccept for ProxyProtocolAccept {
-    type Connection = WithClientIp<AddrStream>;
+    type Connection = WithConnectionGuard<WithClientIp<AddrStream>>;

    type Error = io::Error;

@@ -336,11 +341,74 @@ impl AsyncAccept for ProxyProtocolAccept {
        cx: &mut Context<'_>,
    ) -> Poll<Option<Result<Self::Connection, Self::Error>>> {
        let conn = ready!(Pin::new(&mut self.incoming).poll_accept(cx)?);
+        tracing::info!(protocol = self.protocol, "accepted new TCP connection");
        let Some(conn) = conn else {
            return Poll::Ready(None);
        };

-        Poll::Ready(Some(Ok(WithClientIp::new(conn))))
+        Poll::Ready(Some(Ok(WithConnectionGuard {
+            inner: WithClientIp::new(conn),
+            connection_id: Uuid::new_v4(),
+            gauge: Mutex::new(Some(
+                NUM_CLIENT_CONNECTION_GAUGE
+                    .with_label_values(&[self.protocol])
+                    .guard(),
+            )),
+        })))
+    }
+}
+
+pin_project! {
+    pub struct WithConnectionGuard<T> {
+        #[pin]
+        pub inner: T,
+        pub connection_id: Uuid,
+        pub gauge: Mutex<Option<IntCounterPairGuard>>,
+    }
+}
+
+impl<T: AsyncWrite> AsyncWrite for WithConnectionGuard<T> {
+    #[inline]
+    fn poll_write(
+        self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+        buf: &[u8],
+    ) -> Poll<Result<usize, io::Error>> {
+        self.project().inner.poll_write(cx, buf)
+    }
+
+    #[inline]
+    fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Result<(), io::Error>> {
+        self.project().inner.poll_flush(cx)
+    }
+
+    #[inline]
+    fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Result<(), io::Error>> {
+        self.project().inner.poll_shutdown(cx)
+    }
+
+    #[inline]
+    fn poll_write_vectored(
+        self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+        bufs: &[io::IoSlice<'_>],
+    ) -> Poll<Result<usize, io::Error>> {
+        self.project().inner.poll_write_vectored(cx, bufs)
+    }
+
+    #[inline]
+    fn is_write_vectored(&self) -> bool {
+        self.inner.is_write_vectored()
+    }
+}
+
+impl<T: AsyncRead> AsyncRead for WithConnectionGuard<T> {
+    fn poll_read(
+        self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+        buf: &mut ReadBuf<'_>,
+    ) -> Poll<io::Result<()>> {
+        self.project().inner.poll_read(cx, buf)
    }
 }

--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -24,6 +24,7 @@ use crate::{
 };
 use futures::TryFutureExt;
 use itertools::Itertools;
+use metrics::IntCounterPairGuard;
 use once_cell::sync::OnceCell;
 use pq_proto::{BeMessage as Be, StartupMessageParams};
 use regex::Regex;
@@ -78,10 +79,16 @@ pub async fn task_main(
    {
        let (socket, peer_addr) = accept_result?;

+        let conn_gauge = NUM_CLIENT_CONNECTION_GAUGE
+            .with_label_values(&["tcp"])
+            .guard();
+
        let session_id = uuid::Uuid::new_v4();
        let cancellation_handler = Arc::clone(&cancellation_handler);
        let endpoint_rate_limiter = endpoint_rate_limiter.clone();

+        tracing::info!(protocol = "tcp", %session_id, "accepted new TCP connection");
+
        connections.spawn(async move {
            let mut socket = WithClientIp::new(socket);
            let mut peer_addr = peer_addr.ip();
@@ -116,6 +123,7 @@ pub async fn task_main(
                socket,
                ClientMode::Tcp,
                endpoint_rate_limiter,
+                conn_gauge,
            )
            .instrument(span.clone())
            .await;
@@ -229,13 +237,11 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
    stream: S,
    mode: ClientMode,
    endpoint_rate_limiter: Arc<EndpointRateLimiter>,
+    conn_gauge: IntCounterPairGuard,
 ) -> Result<Option<ProxyPassthrough<S>>, ClientRequestError> {
    info!("handling interactive connection from client");

    let proto = ctx.protocol;
-    let _client_gauge = NUM_CLIENT_CONNECTION_GAUGE
-        .with_label_values(&[proto])
-        .guard();
    let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE
        .with_label_values(&[proto])
        .guard();
@@ -325,7 +331,7 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
        aux: node.aux.clone(),
        compute: node,
        req: _request_gauge,
-        conn: _client_gauge,
+        conn: conn_gauge,
        cancel: session,
    }))
 }
--- a/proxy/src/redis/notifications.rs
+++ b/proxy/src/redis/notifications.rs
@@ -10,6 +10,7 @@ use crate::{
    cache::project_info::ProjectInfoCache,
    cancellation::{CancelMap, CancellationHandler, NotificationsCancellationHandler},
    intern::{ProjectIdInt, RoleNameInt},
+    metrics::REDIS_BROKEN_MESSAGES,
 };

 const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates";
@@ -115,6 +116,9 @@ impl<
        let msg: Notification = match serde_json::from_str(&payload) {
            Ok(msg) => msg,
            Err(e) => {
+                REDIS_BROKEN_MESSAGES
+                    .with_label_values(&[msg.get_channel_name()])
+                    .inc();
                tracing::error!("broken message: {e}");
                return Ok(());
            }
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -6,6 +6,7 @@ mod backend;
 mod conn_pool;
 mod json;
 mod sql_over_http;
+pub mod tls_listener;
 mod websocket;

 pub use conn_pool::GlobalConnPoolOptions;
@@ -20,8 +21,8 @@ pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
 use tokio_util::task::TaskTracker;

 use crate::context::RequestMonitoring;
-use crate::metrics::NUM_CLIENT_CONNECTION_GAUGE;
-use crate::protocol2::{ProxyProtocolAccept, WithClientIp};
+use crate::metrics::TLS_HANDSHAKE_FAILURES;
+use crate::protocol2::{ProxyProtocolAccept, WithClientIp, WithConnectionGuard};
 use crate::rate_limiter::EndpointRateLimiter;
 use crate::serverless::backend::PoolingBackend;
 use crate::{cancellation::CancellationHandler, config::ProxyConfig};
@@ -98,6 +99,7 @@ pub async fn task_main(
    let _ = addr_incoming.set_nodelay(true);
    let addr_incoming = ProxyProtocolAccept {
        incoming: addr_incoming,
+        protocol: "http",
    };

    let ws_connections = tokio_util::task::task_tracker::TaskTracker::new();
@@ -105,18 +107,34 @@ pub async fn task_main(

    let tls_listener = TlsListener::new(tls_acceptor, addr_incoming).filter(|conn| {
        if let Err(err) = conn {
-            error!("failed to accept TLS connection for websockets: {err:?}");
+            error!(
+                protocol = "http",
+                "failed to accept TLS connection: {err:?}"
+            );
+            TLS_HANDSHAKE_FAILURES.inc();
            ready(false)
        } else {
+            info!(protocol = "http", "accepted new TLS connection");
            ready(true)
        }
    });

    let make_svc = hyper::service::make_service_fn(
-        |stream: &tokio_rustls::server::TlsStream<WithClientIp<AddrStream>>| {
-            let (io, _) = stream.get_ref();
-            let client_addr = io.client_addr();
-            let remote_addr = io.inner.remote_addr();
+        |stream: &tokio_rustls::server::TlsStream<
+            WithConnectionGuard<WithClientIp<AddrStream>>,
+        >| {
+            let (conn, _) = stream.get_ref();
+
+            // this is jank. should dissapear with hyper 1.0 migration.
+            let gauge = conn
+                .gauge
+                .lock()
+                .expect("lock should not be poisoned")
+                .take()
+                .expect("gauge should be set on connection start");
+
+            let client_addr = conn.inner.client_addr();
+            let remote_addr = conn.inner.inner.remote_addr();
            let backend = backend.clone();
            let ws_connections = ws_connections.clone();
            let endpoint_rate_limiter = endpoint_rate_limiter.clone();
@@ -127,8 +145,8 @@ pub async fn task_main(
                    None if config.require_client_ip => bail!("missing required client ip"),
                    None => remote_addr,
                };
-                Ok(MetricService::new(hyper::service::service_fn(
-                    move |req: Request<Body>| {
+                Ok(MetricService::new(
+                    hyper::service::service_fn(move |req: Request<Body>| {
                        let backend = backend.clone();
                        let ws_connections = ws_connections.clone();
                        let endpoint_rate_limiter = endpoint_rate_limiter.clone();
@@ -149,8 +167,9 @@ pub async fn task_main(
                                .map_or_else(|e| e.into_response(), |r| r),
                            )
                        }
-                    },
-                )))
+                    }),
+                    gauge,
+                ))
            }
        },
    );
@@ -172,13 +191,8 @@ struct MetricService<S> {
 }

 impl<S> MetricService<S> {
-    fn new(inner: S) -> MetricService<S> {
-        MetricService {
-            inner,
-            _gauge: NUM_CLIENT_CONNECTION_GAUGE
-                .with_label_values(&["http"])
-                .guard(),
-        }
+    fn new(inner: S, _gauge: IntCounterPairGuard) -> MetricService<S> {
+        MetricService { inner, _gauge }
    }
 }

--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -1,7 +1,6 @@
 use std::sync::Arc;

 use anyhow::bail;
-use futures::pin_mut;
 use futures::StreamExt;
 use hyper::body::HttpBody;
 use hyper::header;
@@ -531,13 +530,12 @@ async fn query_to_json<T: GenericClient>(
 ) -> anyhow::Result<(ReadyForQueryStatus, Value)> {
    info!("executing query");
    let query_params = data.params;
-    let row_stream = client.query_raw_txt(&data.query, query_params).await?;
+    let mut row_stream = std::pin::pin!(client.query_raw_txt(&data.query, query_params).await?);
    info!("finished executing query");

    // Manually drain the stream into a vector to leave row_stream hanging
    // around to get a command tag. Also check that the response is not too
    // big.
-    pin_mut!(row_stream);
    let mut rows: Vec<tokio_postgres::Row> = Vec::new();
    while let Some(row) = row_stream.next().await {
        let row = row?;
--- a/proxy/src/serverless/tls_listener.rs
+++ b/proxy/src/serverless/tls_listener.rs
@@ -0,0 +1,283 @@
+use std::{
+    pin::Pin,
+    task::{Context, Poll},
+    time::Duration,
+};
+
+use futures::{Future, Stream, StreamExt};
+use pin_project_lite::pin_project;
+use thiserror::Error;
+use tokio::{
+    io::{AsyncRead, AsyncWrite},
+    task::JoinSet,
+    time::timeout,
+};
+
+/// Default timeout for the TLS handshake.
+pub const DEFAULT_HANDSHAKE_TIMEOUT: Duration = Duration::from_secs(10);
+
+/// Trait for TLS implementation.
+///
+/// Implementations are provided by the rustls and native-tls features.
+pub trait AsyncTls<C: AsyncRead + AsyncWrite>: Clone {
+    /// The type of the TLS stream created from the underlying stream.
+    type Stream: Send + 'static;
+    /// Error type for completing the TLS handshake
+    type Error: std::error::Error + Send + 'static;
+    /// Type of the Future for the TLS stream that is accepted.
+    type AcceptFuture: Future<Output = Result<Self::Stream, Self::Error>> + Send + 'static;
+
+    /// Accept a TLS connection on an underlying stream
+    fn accept(&self, stream: C) -> Self::AcceptFuture;
+}
+
+/// Asynchronously accept connections.
+pub trait AsyncAccept {
+    /// The type of the connection that is accepted.
+    type Connection: AsyncRead + AsyncWrite;
+    /// The type of error that may be returned.
+    type Error;
+
+    /// Poll to accept the next connection.
+    fn poll_accept(
+        self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+    ) -> Poll<Option<Result<Self::Connection, Self::Error>>>;
+
+    /// Return a new `AsyncAccept` that stops accepting connections after
+    /// `ender` completes.
+    ///
+    /// Useful for graceful shutdown.
+    ///
+    /// See [examples/echo.rs](https://github.com/tmccombs/tls-listener/blob/main/examples/echo.rs)
+    /// for example of how to use.
+    fn until<F: Future>(self, ender: F) -> Until<Self, F>
+    where
+        Self: Sized,
+    {
+        Until {
+            acceptor: self,
+            ender,
+        }
+    }
+}
+
+pin_project! {
+    ///
+    /// Wraps a `Stream` of connections (such as a TCP listener) so that each connection is itself
+    /// encrypted using TLS.
+    ///
+    /// It is similar to:
+    ///
+    /// ```ignore
+    /// tcpListener.and_then(|s| tlsAcceptor.accept(s))
+    /// ```
+    ///
+    /// except that it has the ability to accept multiple transport-level connections
+    /// simultaneously while the TLS handshake is pending for other connections.
+    ///
+    /// By default, if a client fails the TLS handshake, that is treated as an error, and the
+    /// `TlsListener` will return an `Err`. If the `TlsListener` is passed directly to a hyper
+    /// [`Server`][1], then an invalid handshake can cause the server to stop accepting connections.
+    /// See [`http-stream.rs`][2] or [`http-low-level`][3] examples, for examples of how to avoid this.
+    ///
+    /// Note that if the maximum number of pending connections is greater than 1, the resulting
+    /// [`T::Stream`][4] connections may come in a different order than the connections produced by the
+    /// underlying listener.
+    ///
+    /// [1]: https://docs.rs/hyper/latest/hyper/server/struct.Server.html
+    /// [2]: https://github.com/tmccombs/tls-listener/blob/main/examples/http-stream.rs
+    /// [3]: https://github.com/tmccombs/tls-listener/blob/main/examples/http-low-level.rs
+    /// [4]: AsyncTls::Stream
+    ///
+    #[allow(clippy::type_complexity)]
+    pub struct TlsListener<A: AsyncAccept, T: AsyncTls<A::Connection>> {
+        #[pin]
+        listener: A,
+        tls: T,
+        waiting: JoinSet<Result<Result<T::Stream, T::Error>, tokio::time::error::Elapsed>>,
+        timeout: Duration,
+    }
+}
+
+/// Builder for `TlsListener`.
+#[derive(Clone)]
+pub struct Builder<T> {
+    tls: T,
+    handshake_timeout: Duration,
+}
+
+/// Wraps errors from either the listener or the TLS Acceptor
+#[derive(Debug, Error)]
+pub enum Error<LE: std::error::Error, TE: std::error::Error> {
+    /// An error that arose from the listener ([AsyncAccept::Error])
+    #[error("{0}")]
+    ListenerError(#[source] LE),
+    /// An error that occurred during the TLS accept handshake
+    #[error("{0}")]
+    TlsAcceptError(#[source] TE),
+}
+
+impl<A: AsyncAccept, T> TlsListener<A, T>
+where
+    T: AsyncTls<A::Connection>,
+{
+    /// Create a `TlsListener` with default options.
+    pub fn new(tls: T, listener: A) -> Self {
+        builder(tls).listen(listener)
+    }
+}
+
+impl<A, T> TlsListener<A, T>
+where
+    A: AsyncAccept,
+    A::Error: std::error::Error,
+    T: AsyncTls<A::Connection>,
+{
+    /// Accept the next connection
+    ///
+    /// This is essentially an alias to `self.next()` with a more domain-appropriate name.
+    pub async fn accept(&mut self) -> Option<<Self as Stream>::Item>
+    where
+        Self: Unpin,
+    {
+        self.next().await
+    }
+
+    /// Replaces the Tls Acceptor configuration, which will be used for new connections.
+    ///
+    /// This can be used to change the certificate used at runtime.
+    pub fn replace_acceptor(&mut self, acceptor: T) {
+        self.tls = acceptor;
+    }
+
+    /// Replaces the Tls Acceptor configuration from a pinned reference to `Self`.
+    ///
+    /// This is useful if your listener is `!Unpin`.
+    ///
+    /// This can be used to change the certificate used at runtime.
+    pub fn replace_acceptor_pin(self: Pin<&mut Self>, acceptor: T) {
+        *self.project().tls = acceptor;
+    }
+}
+
+impl<A, T> Stream for TlsListener<A, T>
+where
+    A: AsyncAccept,
+    A::Error: std::error::Error,
+    T: AsyncTls<A::Connection>,
+{
+    type Item = Result<T::Stream, Error<A::Error, T::Error>>;
+
+    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        let mut this = self.project();
+
+        loop {
+            match this.listener.as_mut().poll_accept(cx) {
+                Poll::Pending => break,
+                Poll::Ready(Some(Ok(conn))) => {
+                    this.waiting
+                        .spawn(timeout(*this.timeout, this.tls.accept(conn)));
+                }
+                Poll::Ready(Some(Err(e))) => {
+                    return Poll::Ready(Some(Err(Error::ListenerError(e))));
+                }
+                Poll::Ready(None) => return Poll::Ready(None),
+            }
+        }
+
+        loop {
+            return match this.waiting.poll_join_next(cx) {
+                Poll::Ready(Some(Ok(Ok(conn)))) => {
+                    Poll::Ready(Some(conn.map_err(Error::TlsAcceptError)))
+                }
+                // The handshake timed out, try getting another connection from the queue
+                Poll::Ready(Some(Ok(Err(_)))) => continue,
+                // The handshake panicked
+                Poll::Ready(Some(Err(e))) if e.is_panic() => {
+                    std::panic::resume_unwind(e.into_panic())
+                }
+                // The handshake was externally aborted
+                Poll::Ready(Some(Err(_))) => unreachable!("handshake tasks are never aborted"),
+                _ => Poll::Pending,
+            };
+        }
+    }
+}
+
+impl<C: AsyncRead + AsyncWrite + Unpin + Send + 'static> AsyncTls<C> for tokio_rustls::TlsAcceptor {
+    type Stream = tokio_rustls::server::TlsStream<C>;
+    type Error = std::io::Error;
+    type AcceptFuture = tokio_rustls::Accept<C>;
+
+    fn accept(&self, conn: C) -> Self::AcceptFuture {
+        tokio_rustls::TlsAcceptor::accept(self, conn)
+    }
+}
+
+impl<T> Builder<T> {
+    /// Set the timeout for handshakes.
+    ///
+    /// If a timeout takes longer than `timeout`, then the handshake will be
+    /// aborted and the underlying connection will be dropped.
+    ///
+    /// Defaults to `DEFAULT_HANDSHAKE_TIMEOUT`.
+    pub fn handshake_timeout(&mut self, timeout: Duration) -> &mut Self {
+        self.handshake_timeout = timeout;
+        self
+    }
+
+    /// Create a `TlsListener` from the builder
+    ///
+    /// Actually build the `TlsListener`. The `listener` argument should be
+    /// an implementation of the `AsyncAccept` trait that accepts new connections
+    /// that the `TlsListener` will  encrypt using TLS.
+    pub fn listen<A: AsyncAccept>(&self, listener: A) -> TlsListener<A, T>
+    where
+        T: AsyncTls<A::Connection>,
+    {
+        TlsListener {
+            listener,
+            tls: self.tls.clone(),
+            waiting: JoinSet::new(),
+            timeout: self.handshake_timeout,
+        }
+    }
+}
+
+/// Create a new Builder for a TlsListener
+///
+/// `server_config` will be used to configure the TLS sessions.
+pub fn builder<T>(tls: T) -> Builder<T> {
+    Builder {
+        tls,
+        handshake_timeout: DEFAULT_HANDSHAKE_TIMEOUT,
+    }
+}
+
+pin_project! {
+    /// See [`AsyncAccept::until`]
+    pub struct Until<A, E> {
+        #[pin]
+        acceptor: A,
+        #[pin]
+        ender: E,
+    }
+}
+
+impl<A: AsyncAccept, E: Future> AsyncAccept for Until<A, E> {
+    type Connection = A::Connection;
+    type Error = A::Error;
+
+    fn poll_accept(
+        self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+    ) -> Poll<Option<Result<Self::Connection, Self::Error>>> {
+        let this = self.project();
+
+        match this.ender.poll(cx) {
+            Poll::Pending => this.acceptor.poll_accept(cx),
+            Poll::Ready(_) => Poll::Ready(None),
+        }
+    }
+}
--- a/proxy/src/serverless/websocket.rs
+++ b/proxy/src/serverless/websocket.rs
@@ -3,6 +3,7 @@ use crate::{
    config::ProxyConfig,
    context::RequestMonitoring,
    error::{io_error, ReportableError},
+    metrics::NUM_CLIENT_CONNECTION_GAUGE,
    proxy::{handle_client, ClientMode},
    rate_limiter::EndpointRateLimiter,
 };
@@ -138,6 +139,10 @@ pub async fn serve_websocket(
    endpoint_rate_limiter: Arc<EndpointRateLimiter>,
 ) -> anyhow::Result<()> {
    let websocket = websocket.await?;
+    let conn_gauge = NUM_CLIENT_CONNECTION_GAUGE
+        .with_label_values(&["ws"])
+        .guard();
+
    let res = handle_client(
        config,
        &mut ctx,
@@ -145,6 +150,7 @@ pub async fn serve_websocket(
        WebSocketRw::new(websocket),
        ClientMode::Websockets { hostname },
        endpoint_rate_limiter,
+        conn_gauge,
    )
    .await;

--- a/proxy/src/stream.rs
+++ b/proxy/src/stream.rs
@@ -1,5 +1,6 @@
 use crate::config::TlsServerEndPoint;
 use crate::error::{ErrorKind, ReportableError, UserFacingError};
+use crate::metrics::TLS_HANDSHAKE_FAILURES;
 use bytes::BytesMut;

 use pq_proto::framed::{ConnectionError, Framed};
@@ -224,7 +225,10 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Stream<S> {
    /// If possible, upgrade raw stream into a secure TLS-based stream.
    pub async fn upgrade(self, cfg: Arc<ServerConfig>) -> Result<TlsStream<S>, StreamUpgradeError> {
        match self {
-            Stream::Raw { raw } => Ok(tokio_rustls::TlsAcceptor::from(cfg).accept(raw).await?),
+            Stream::Raw { raw } => Ok(tokio_rustls::TlsAcceptor::from(cfg)
+                .accept(raw)
+                .await
+                .inspect_err(|_| TLS_HANDSHAKE_FAILURES.inc())?),
            Stream::Tls { .. } => Err(StreamUpgradeError::AlreadyTls),
        }
    }
--- a/s3_scrubber/src/checks.rs
+++ b/s3_scrubber/src/checks.rs
@@ -11,7 +11,7 @@ use utils::id::TimelineId;
 use crate::cloud_admin_api::BranchData;
 use crate::metadata_stream::stream_listing;
 use crate::{download_object_with_retries, RootTarget, TenantShardTimelineId};
-use futures_util::{pin_mut, StreamExt};
+use futures_util::StreamExt;
 use pageserver::tenant::remote_timeline_client::parse_remote_index_path;
 use pageserver::tenant::storage_layer::LayerFileName;
 use pageserver::tenant::IndexPart;
@@ -285,8 +285,7 @@ pub(crate) async fn list_timeline_blobs(
    let mut index_parts: Vec<ObjectIdentifier> = Vec::new();
    let mut initdb_archive: bool = false;

-    let stream = stream_listing(s3_client, &timeline_dir_target);
-    pin_mut!(stream);
+    let mut stream = std::pin::pin!(stream_listing(s3_client, &timeline_dir_target));
    while let Some(obj) = stream.next().await {
        let obj = obj?;
        let key = obj.key();
--- a/s3_scrubber/src/garbage.rs
+++ b/s3_scrubber/src/garbage.rs
@@ -12,7 +12,7 @@ use aws_sdk_s3::{
    types::{Delete, ObjectIdentifier},
    Client,
 };
-use futures_util::{pin_mut, TryStreamExt};
+use futures_util::TryStreamExt;
 use pageserver_api::shard::TenantShardId;
 use serde::{Deserialize, Serialize};
 use tokio_stream::StreamExt;
@@ -199,12 +199,12 @@ async fn find_garbage_inner(
            }
        }
    });
-    let tenants_checked = tenants_checked.try_buffer_unordered(CONSOLE_CONCURRENCY);
+    let mut tenants_checked =
+        std::pin::pin!(tenants_checked.try_buffer_unordered(CONSOLE_CONCURRENCY));

    // Process the results of Tenant checks.  If a Tenant is garbage, it goes into
    // the `GarbageList`.  Else it goes into `active_tenants` for more detailed timeline
    // checks if they are enabled by the `depth` parameter.
-    pin_mut!(tenants_checked);
    let mut garbage = GarbageList::new(node_kind, bucket_config);
    let mut active_tenants: Vec<TenantShardId> = vec![];
    let mut counter = 0;
@@ -267,10 +267,10 @@ async fn find_garbage_inner(
                .map(|r| (ttid, r))
        }
    });
-    let timelines_checked = timelines_checked.try_buffer_unordered(CONSOLE_CONCURRENCY);
+    let mut timelines_checked =
+        std::pin::pin!(timelines_checked.try_buffer_unordered(CONSOLE_CONCURRENCY));

    // Update the GarbageList with any timelines which appear not to exist.
-    pin_mut!(timelines_checked);
    while let Some(result) = timelines_checked.next().await {
        let (ttid, console_result) = result?;
        if garbage.maybe_append(GarbageEntity::Timeline(ttid), console_result) {
@@ -425,9 +425,9 @@ pub async fn purge_garbage(
            }
        }
    });
-    let get_objects_results = get_objects_results.try_buffer_unordered(S3_CONCURRENCY);
+    let mut get_objects_results =
+        std::pin::pin!(get_objects_results.try_buffer_unordered(S3_CONCURRENCY));

-    pin_mut!(get_objects_results);
    let mut objects_to_delete = Vec::new();
    while let Some(result) = get_objects_results.next().await {
        let mut object_list = result?;
--- a/s3_scrubber/src/scan_metadata.rs
+++ b/s3_scrubber/src/scan_metadata.rs
@@ -7,7 +7,7 @@ use crate::checks::{
 use crate::metadata_stream::{stream_tenant_timelines, stream_tenants};
 use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId};
 use aws_sdk_s3::Client;
-use futures_util::{pin_mut, StreamExt, TryStreamExt};
+use futures_util::{StreamExt, TryStreamExt};
 use histogram::Histogram;
 use pageserver::tenant::remote_timeline_client::remote_layer_path;
 use pageserver::tenant::IndexPart;
@@ -226,7 +226,7 @@ pub async fn scan_metadata(
        Ok((ttid, data))
    }
    let timelines = timelines.map_ok(|ttid| report_on_timeline(&s3_client, &target, ttid));
-    let timelines = timelines.try_buffered(CONCURRENCY);
+    let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY));

    // We must gather all the TenantShardTimelineId->S3TimelineBlobData for each tenant, because different
    // shards in the same tenant might refer to one anothers' keys if a shard split has happened.
@@ -309,7 +309,6 @@ pub async fn scan_metadata(
    // all results for the same tenant will be adjacent.  We accumulate these,
    // and then call `analyze_tenant` to flush, when we see the next tenant ID.
    let mut summary = MetadataSummary::new();
-    pin_mut!(timelines);
    while let Some(i) = timelines.next().await {
        let (ttid, data) = i?;
        summary.update_data(&data);
--- a/safekeeper/src/wal_service.rs
+++ b/safekeeper/src/wal_service.rs
@@ -68,7 +68,7 @@ async fn handle_socket(
    // is not Unpin, and all pgbackend/framed/tokio dependencies require stream
    // to be Unpin. Which is reasonable, as indeed something like TimeoutReader
    // shouldn't be moved.
-    tokio::pin!(socket);
+    let socket = std::pin::pin!(socket);

    let traffic_metrics = TrafficMetrics::new();
    if let Some(current_az) = conf.availability_zone.as_deref() {
--- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
+++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
@@ -1,5 +1,6 @@
 import asyncio
 import json
+import os
 from pathlib import Path
 from typing import Any, Dict, Tuple

@@ -19,6 +20,10 @@ from performance.pageserver.util import (
@pytest.mark.parametrize("n_tenants", [10])
@pytest.mark.parametrize("get_vectored_impl", ["sequential", "vectored"])
@pytest.mark.timeout(1000)
+@pytest.mark.skipif(
+    os.getenv("CI", "false") == "true",
+    reason="The test if flaky on CI: https://github.com/neondatabase/neon/issues/7006",
+)
 def test_basebackup_with_high_slru_count(
    neon_env_builder: NeonEnvBuilder,
    zenbenchmark: NeonBenchmarker,
--- a/test_runner/performance/test_perf_pgbench.py
+++ b/test_runner/performance/test_perf_pgbench.py
@@ -160,8 +160,6 @@ def get_scales_matrix(default: int = 10) -> List[int]:
            scale = get_scale_for_db(int(s.removesuffix("mb")))
        elif s.endswith("gb"):
            scale = get_scale_for_db(int(s.removesuffix("gb")) * 1024)
-        elif s.endswith("tb"):
-            scale = get_scale_for_db(int(s.removesuffix("tb")) * 1024 * 1024)
        else:
            scale = int(s)
        rv.append(scale)
--- a/test_runner/pg_clients/csharp/npgsql/Dockerfile
+++ b/test_runner/pg_clients/csharp/npgsql/Dockerfile
@@ -1,4 +1,4 @@
-FROM mcr.microsoft.com/dotnet/sdk:7.0 AS build
+FROM mcr.microsoft.com/dotnet/sdk:8.0 AS build
 WORKDIR /source

 COPY *.csproj .
@@ -7,7 +7,7 @@ RUN dotnet restore
 COPY . .
 RUN dotnet publish -c release -o /app --no-restore

-FROM mcr.microsoft.com/dotnet/runtime:7.0
+FROM mcr.microsoft.com/dotnet/runtime:8.0
 WORKDIR /app
 COPY --from=build /app .

--- a/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj
+++ b/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj
@@ -2,13 +2,13 @@

  <PropertyGroup>
    <OutputType>Exe</OutputType>
-    <TargetFramework>net7.0</TargetFramework>
+    <TargetFramework>net8.0</TargetFramework>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
  </PropertyGroup>

  <ItemGroup>
-    <PackageReference Include="Npgsql" Version="7.0.4" />
+    <PackageReference Include="Npgsql" Version="8.0.2" />
  </ItemGroup>

 </Project>
--- a/test_runner/pg_clients/java/jdbc/Dockerfile
+++ b/test_runner/pg_clients/java/jdbc/Dockerfile
@@ -1,10 +1,10 @@
-FROM openjdk:20
+FROM openjdk:21
 WORKDIR /source

 COPY . .

 WORKDIR /app
-RUN curl --output postgresql.jar https://jdbc.postgresql.org/download/postgresql-42.6.0.jar && \
+RUN curl --output postgresql.jar https://jdbc.postgresql.org/download/postgresql-42.7.2.jar && \
    javac -d /app /source/Example.java

 CMD ["java", "-cp", "/app/postgresql.jar:.", "Example"]
--- a/test_runner/pg_clients/python/asyncpg/Dockerfile
+++ b/test_runner/pg_clients/python/asyncpg/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.11
+FROM python:3.12
 WORKDIR /source

 COPY . .
--- a/test_runner/pg_clients/python/asyncpg/requirements.txt
+++ b/test_runner/pg_clients/python/asyncpg/requirements.txt
@@ -1 +1 @@
-asyncpg==0.27.0
+asyncpg==0.29.0
--- a/test_runner/pg_clients/python/pg8000/Dockerfile
+++ b/test_runner/pg_clients/python/pg8000/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.11
+FROM python:3.12
 WORKDIR /source

 COPY . .
--- a/test_runner/pg_clients/python/pg8000/requirements.txt
+++ b/test_runner/pg_clients/python/pg8000/requirements.txt
@@ -1,2 +1,2 @@
-pg8000==1.29.8
+pg8000==1.30.5
 scramp>=1.4.3
--- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock
+++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock
@@ -19,9 +19,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"

 [[package]]
 name = "async-trait"
-version = "0.1.74"
+version = "0.1.77"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a66537f1bb974b254c98ed142ff995236e81b9d0fe4db0575f46612cb15eb0f9"
+checksum = "c980ee35e870bd1a4d2c8294d4c04d0499e67bca1e4b5cefcc693c2fa00caea9"
 dependencies = [
 "proc-macro2",
 "quote",
@@ -51,9 +51,9 @@ dependencies = [

 [[package]]
 name = "base64"
-version = "0.21.4"
+version = "0.21.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9ba43ea6f343b788c8764558649e08df62f86c6ef251fdaeb1ffd010a9ae50a2"
+checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567"

 [[package]]
 name = "bitflags"
@@ -63,9 +63,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"

 [[package]]
 name = "bitflags"
-version = "2.4.1"
+version = "2.4.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07"
+checksum = "ed570934406eb16438a4e976b1b4500774099c13b8cb96eec99f620f05090ddf"

 [[package]]
 name = "block-buffer"
@@ -78,9 +78,9 @@ dependencies = [

 [[package]]
 name = "bumpalo"
-version = "3.14.0"
+version = "3.15.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7f30e7476521f6f8af1a1c4c0b8cc94f0bee37d91763d0ca2665f299b6cd8aec"
+checksum = "8ea184aa71bb362a1157c896979544cc23974e08fd265f29ea96b59f0b4a555b"

 [[package]]
 name = "byteorder"
@@ -96,12 +96,9 @@ checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223"

 [[package]]
 name = "cc"
-version = "1.0.83"
+version = "1.0.89"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0"
-dependencies = [
- "libc",
-]
+checksum = "a0ba8f7aaa012f30d5b2861462f6708eccd49c3c39863fe083a308035f63d723"

 [[package]]
 name = "cfg-if"
@@ -111,9 +108,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"

 [[package]]
 name = "core-foundation"
-version = "0.9.3"
+version = "0.9.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "194a7a9e6de53fa55116934067c844d9d749312f75c6f6d0980e8c252f8c2146"
+checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f"
 dependencies = [
 "core-foundation-sys",
 "libc",
@@ -121,15 +118,15 @@ dependencies = [

 [[package]]
 name = "core-foundation-sys"
-version = "0.8.4"
+version = "0.8.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa"
+checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f"

 [[package]]
 name = "cpufeatures"
-version = "0.2.9"
+version = "0.2.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a17b76ff3a4162b0b27f354a0c87015ddad39d35f9c0c36607a3bdd175dde1f1"
+checksum = "53fe5e26ff1b7aef8bca9c6080520cfb8d9333c7568e1829cef191a9723e5504"
 dependencies = [
 "libc",
 ]
@@ -157,12 +154,12 @@ dependencies = [

 [[package]]
 name = "errno"
-version = "0.3.5"
+version = "0.3.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ac3e13f66a2f95e32a39eaa81f6b95d42878ca0e1db0c7543723dfe12557e860"
+checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245"
 dependencies = [
 "libc",
- "windows-sys",
+ "windows-sys 0.52.0",
 ]

 [[package]]
@@ -200,9 +197,9 @@ checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b"

 [[package]]
 name = "futures"
-version = "0.3.28"
+version = "0.3.30"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "23342abe12aba583913b2e62f22225ff9c950774065e4bfb61a19cd9770fec40"
+checksum = "645c6916888f6cb6350d2550b80fb63e734897a8498abe35cfb732b6487804b0"
 dependencies = [
 "futures-channel",
 "futures-core",
@@ -215,9 +212,9 @@ dependencies = [

 [[package]]
 name = "futures-channel"
-version = "0.3.28"
+version = "0.3.30"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "955518d47e09b25bbebc7a18df10b81f0c766eaf4c4f1cccef2fca5f2a4fb5f2"
+checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78"
 dependencies = [
 "futures-core",
 "futures-sink",
@@ -225,15 +222,15 @@ dependencies = [

 [[package]]
 name = "futures-core"
-version = "0.3.28"
+version = "0.3.30"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c"
+checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d"

 [[package]]
 name = "futures-executor"
-version = "0.3.28"
+version = "0.3.30"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ccecee823288125bd88b4d7f565c9e58e41858e47ab72e8ea2d64e93624386e0"
+checksum = "a576fc72ae164fca6b9db127eaa9a9dda0d61316034f33a0a0d4eda41f02b01d"
 dependencies = [
 "futures-core",
 "futures-task",
@@ -242,15 +239,15 @@ dependencies = [

 [[package]]
 name = "futures-io"
-version = "0.3.28"
+version = "0.3.30"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4fff74096e71ed47f8e023204cfd0aa1289cd54ae5430a9523be060cdb849964"
+checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1"

 [[package]]
 name = "futures-macro"
-version = "0.3.28"
+version = "0.3.30"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72"
+checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac"
 dependencies = [
 "proc-macro2",
 "quote",
@@ -259,21 +256,21 @@ dependencies = [

 [[package]]
 name = "futures-sink"
-version = "0.3.28"
+version = "0.3.30"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f43be4fe21a13b9781a69afa4985b0f6ee0e1afab2c6f454a8cf30e2b2237b6e"
+checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5"

 [[package]]
 name = "futures-task"
-version = "0.3.28"
+version = "0.3.30"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "76d3d132be6c0e6aa1534069c705a74a5997a356c0dc2f86a47765e5617c5b65"
+checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004"

 [[package]]
 name = "futures-util"
-version = "0.3.28"
+version = "0.3.30"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "26b01e40b772d54cf6c6d721c1d1abd0647a0106a12ecaa1c186273392a69533"
+checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48"
 dependencies = [
 "futures-channel",
 "futures-core",
@@ -299,9 +296,9 @@ dependencies = [

 [[package]]
 name = "getrandom"
-version = "0.2.10"
+version = "0.2.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427"
+checksum = "190092ea657667030ac6a35e305e62fc4dd69fd98ac98631e5d3a2b1575a12b5"
 dependencies = [
 "cfg-if",
 "libc",
@@ -310,9 +307,9 @@ dependencies = [

 [[package]]
 name = "gimli"
-version = "0.28.0"
+version = "0.28.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6fb8d784f27acf97159b40fc4db5ecd8aa23b9ad5ef69cdd136d3bc80665f0c0"
+checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253"

 [[package]]
 name = "hmac"
@@ -325,9 +322,9 @@ dependencies = [

 [[package]]
 name = "js-sys"
-version = "0.3.64"
+version = "0.3.69"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c5f195fe497f702db0f318b07fdd68edb16955aed830df8363d837542f8f935a"
+checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d"
 dependencies = [
 "wasm-bindgen",
 ]
@@ -340,15 +337,15 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"

 [[package]]
 name = "libc"
-version = "0.2.149"
+version = "0.2.153"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b"
+checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd"

 [[package]]
 name = "linux-raw-sys"
-version = "0.4.10"
+version = "0.4.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "da2479e8c062e40bf0066ffa0bc823de0a9368974af99c9f6df941d2c231e03f"
+checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c"

 [[package]]
 name = "lock_api"
@@ -362,9 +359,9 @@ dependencies = [

 [[package]]
 name = "log"
-version = "0.4.20"
+version = "0.4.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f"
+checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c"

 [[package]]
 name = "md-5"
@@ -378,28 +375,28 @@ dependencies = [

 [[package]]
 name = "memchr"
-version = "2.6.4"
+version = "2.7.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167"
+checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149"

 [[package]]
 name = "miniz_oxide"
-version = "0.7.1"
+version = "0.7.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7"
+checksum = "9d811f3e15f28568be3407c8e7fdb6514c1cda3cb30683f15b6a1a1dc4ea14a7"
 dependencies = [
 "adler",
 ]

 [[package]]
 name = "mio"
-version = "0.8.8"
+version = "0.8.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "927a765cd3fc26206e66b296465fa9d3e5ab003e651c1b3c060e7956d96b19d2"
+checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c"
 dependencies = [
 "libc",
 "wasi",
- "windows-sys",
+ "windows-sys 0.48.0",
 ]

 [[package]]
@@ -422,26 +419,26 @@ dependencies = [

 [[package]]
 name = "object"
-version = "0.32.1"
+version = "0.32.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9cf5f9dd3933bd50a9e1f149ec995f39ae2c496d31fd772c1fd45ebc27e902b0"
+checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441"
 dependencies = [
 "memchr",
 ]

 [[package]]
 name = "once_cell"
-version = "1.18.0"
+version = "1.19.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d"
+checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"

 [[package]]
 name = "openssl"
-version = "0.10.60"
+version = "0.10.64"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "79a4c6c3a2b158f7f8f2a2fc5a969fa3a068df6fc9dbb4a43845436e3af7c800"
+checksum = "95a0481286a310808298130d22dd1fef0fa571e05a8f44ec801801e84b216b1f"
 dependencies = [
- "bitflags 2.4.1",
+ "bitflags 2.4.2",
 "cfg-if",
 "foreign-types",
 "libc",
@@ -469,9 +466,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"

 [[package]]
 name = "openssl-sys"
-version = "0.9.96"
+version = "0.9.101"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3812c071ba60da8b5677cc12bcb1d42989a65553772897a7e0355545a819838f"
+checksum = "dda2b0f344e78efc2facf7d195d098df0dd72151b26ab98da807afc26c198dff"
 dependencies = [
 "cc",
 "libc",
@@ -497,16 +494,16 @@ checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e"
 dependencies = [
 "cfg-if",
 "libc",
- "redox_syscall 0.4.1",
+ "redox_syscall",
 "smallvec",
- "windows-targets",
+ "windows-targets 0.48.5",
 ]

 [[package]]
 name = "percent-encoding"
-version = "2.3.0"
+version = "2.3.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9b2a4787296e9989611394c33f193f676704af1686e70b8f8033ab5ba9a35a94"
+checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"

 [[package]]
 name = "phf"
@@ -540,9 +537,9 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"

 [[package]]
 name = "pkg-config"
-version = "0.3.27"
+version = "0.3.30"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964"
+checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec"

 [[package]]
 name = "postgres-native-tls"
@@ -594,18 +591,18 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"

 [[package]]
 name = "proc-macro2"
-version = "1.0.69"
+version = "1.0.78"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "134c189feb4956b20f6f547d2cf727d4c0fe06722b20a0eec87ed445a97f92da"
+checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae"
 dependencies = [
 "unicode-ident",
 ]

 [[package]]
 name = "quote"
-version = "1.0.33"
+version = "1.0.35"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae"
+checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef"
 dependencies = [
 "proc-macro2",
 ]
@@ -640,15 +637,6 @@ dependencies = [
 "getrandom",
 ]

-[[package]]
-name = "redox_syscall"
-version = "0.3.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29"
-dependencies = [
- "bitflags 1.3.2",
-]
-
 [[package]]
 name = "redox_syscall"
 version = "0.4.1"
@@ -676,24 +664,24 @@ checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76"

 [[package]]
 name = "rustix"
-version = "0.38.19"
+version = "0.38.31"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "745ecfa778e66b2b63c88a61cb36e0eea109e803b0b86bf9879fbc77c70e86ed"
+checksum = "6ea3e1a662af26cd7a3ba09c0297a31af215563ecf42817c98df621387f4e949"
 dependencies = [
- "bitflags 2.4.1",
+ "bitflags 2.4.2",
 "errno",
 "libc",
 "linux-raw-sys",
- "windows-sys",
+ "windows-sys 0.52.0",
 ]

 [[package]]
 name = "schannel"
-version = "0.1.22"
+version = "0.1.23"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0c3733bf4cf7ea0880754e19cb5a462007c4a8c1914bff372ccc95b464f1df88"
+checksum = "fbc91545643bcf3a0bbb6569265615222618bdf33ce4ffbbd13c4bbd4c093534"
 dependencies = [
- "windows-sys",
+ "windows-sys 0.52.0",
 ]

 [[package]]
@@ -753,18 +741,18 @@ dependencies = [

 [[package]]
 name = "smallvec"
-version = "1.11.1"
+version = "1.13.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "942b4a808e05215192e39f4ab80813e599068285906cc91aa64f923db842bd5a"
+checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7"

 [[package]]
 name = "socket2"
-version = "0.5.4"
+version = "0.5.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4031e820eb552adee9295814c0ced9e5cf38ddf1e8b7d566d6de8e2538ea989e"
+checksum = "05ffd9c0a93b7543e062e759284fcf5f5e3b098501104bfbdde4d404db792871"
 dependencies = [
 "libc",
- "windows-sys",
+ "windows-sys 0.52.0",
 ]

 [[package]]
@@ -786,9 +774,9 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc"

 [[package]]
 name = "syn"
-version = "2.0.38"
+version = "2.0.52"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e96b79aaa137db8f61e26363a0c9b47d8b4ec75da28b7d1d614c2303e232408b"
+checksum = "b699d15b36d1f02c3e7c69f8ffef53de37aefae075d8488d4ba1a7788d574a07"
 dependencies = [
 "proc-macro2",
 "quote",
@@ -797,15 +785,14 @@ dependencies = [

 [[package]]
 name = "tempfile"
-version = "3.8.0"
+version = "3.10.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cb94d2f3cc536af71caac6b6fcebf65860b347e7ce0cc9ebe8f70d3e521054ef"
+checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1"
 dependencies = [
 "cfg-if",
 "fastrand",
- "redox_syscall 0.3.5",
 "rustix",
- "windows-sys",
+ "windows-sys 0.52.0",
 ]

 [[package]]
@@ -825,9 +812,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"

 [[package]]
 name = "tokio"
-version = "1.33.0"
+version = "1.36.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4f38200e3ef7995e5ef13baec2f432a6da0aa9ac495b2c0e8f3b7eec2c92d653"
+checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931"
 dependencies = [
 "backtrace",
 "bytes",
@@ -836,14 +823,14 @@ dependencies = [
 "pin-project-lite",
 "socket2",
 "tokio-macros",
- "windows-sys",
+ "windows-sys 0.48.0",
 ]

 [[package]]
 name = "tokio-macros"
-version = "2.1.0"
+version = "2.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e"
+checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b"
 dependencies = [
 "proc-macro2",
 "quote",
@@ -888,9 +875,9 @@ dependencies = [

 [[package]]
 name = "tokio-util"
-version = "0.7.9"
+version = "0.7.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1d68074620f57a0b21594d9735eb2e98ab38b17f80d3fcb189fca266771ca60d"
+checksum = "5419f34732d9eb6ee4c3578b7989078579b7f039cbbb9ca2c4da015749371e15"
 dependencies = [
 "bytes",
 "futures-core",
@@ -927,9 +914,9 @@ checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825"

 [[package]]
 name = "unicode-bidi"
-version = "0.3.13"
+version = "0.3.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460"
+checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75"

 [[package]]
 name = "unicode-ident"
@@ -939,9 +926,9 @@ checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"

 [[package]]
 name = "unicode-normalization"
-version = "0.1.22"
+version = "0.1.23"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921"
+checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5"
 dependencies = [
 "tinyvec",
 ]
@@ -965,10 +952,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"

 [[package]]
-name = "wasm-bindgen"
-version = "0.2.87"
+name = "wasite"
+version = "0.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7706a72ab36d8cb1f80ffbf0e071533974a60d0a308d01a5d0375bf60499a342"
+checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b"
+
+[[package]]
+name = "wasm-bindgen"
+version = "0.2.92"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8"
 dependencies = [
 "cfg-if",
 "wasm-bindgen-macro",
@@ -976,9 +969,9 @@ dependencies = [

 [[package]]
 name = "wasm-bindgen-backend"
-version = "0.2.87"
+version = "0.2.92"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5ef2b6d3c510e9625e5fe6f509ab07d66a760f0885d858736483c32ed7809abd"
+checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da"
 dependencies = [
 "bumpalo",
 "log",
@@ -991,9 +984,9 @@ dependencies = [

 [[package]]
 name = "wasm-bindgen-macro"
-version = "0.2.87"
+version = "0.2.92"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dee495e55982a3bd48105a7b947fd2a9b4a8ae3010041b9e0faab3f9cd028f1d"
+checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726"
 dependencies = [
 "quote",
 "wasm-bindgen-macro-support",
@@ -1001,9 +994,9 @@ dependencies = [

 [[package]]
 name = "wasm-bindgen-macro-support"
-version = "0.2.87"
+version = "0.2.92"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b"
+checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7"
 dependencies = [
 "proc-macro2",
 "quote",
@@ -1014,15 +1007,15 @@ dependencies = [

 [[package]]
 name = "wasm-bindgen-shared"
-version = "0.2.87"
+version = "0.2.92"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ca6ad05a4870b2bf5fe995117d3728437bd27d7cd5f06f13c17443ef369775a1"
+checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96"

 [[package]]
 name = "web-sys"
-version = "0.3.64"
+version = "0.3.69"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9b85cbef8c220a6abc02aefd892dfc0fc23afb1c6a426316ec33253a3877249b"
+checksum = "77afa9a11836342370f4817622a2f0f418b134426d91a82dfb48f532d2ec13ef"
 dependencies = [
 "js-sys",
 "wasm-bindgen",
@@ -1030,11 +1023,12 @@ dependencies = [

 [[package]]
 name = "whoami"
-version = "1.4.1"
+version = "1.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "22fc3756b8a9133049b26c7f61ab35416c130e8c09b660f5b3958b446f52cc50"
+checksum = "0fec781d48b41f8163426ed18e8fc2864c12937df9ce54c88ede7bd47270893e"
 dependencies = [
- "wasm-bindgen",
+ "redox_syscall",
+ "wasite",
 "web-sys",
 ]

@@ -1044,7 +1038,16 @@ version = "0.48.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9"
 dependencies = [
- "windows-targets",
+ "windows-targets 0.48.5",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
+dependencies = [
+ "windows-targets 0.52.4",
 ]

 [[package]]
@@ -1053,13 +1056,28 @@ version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c"
 dependencies = [
- "windows_aarch64_gnullvm",
- "windows_aarch64_msvc",
- "windows_i686_gnu",
- "windows_i686_msvc",
- "windows_x86_64_gnu",
- "windows_x86_64_gnullvm",
- "windows_x86_64_msvc",
+ "windows_aarch64_gnullvm 0.48.5",
+ "windows_aarch64_msvc 0.48.5",
+ "windows_i686_gnu 0.48.5",
+ "windows_i686_msvc 0.48.5",
+ "windows_x86_64_gnu 0.48.5",
+ "windows_x86_64_gnullvm 0.48.5",
+ "windows_x86_64_msvc 0.48.5",
+]
+
+[[package]]
+name = "windows-targets"
+version = "0.52.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b"
+dependencies = [
+ "windows_aarch64_gnullvm 0.52.4",
+ "windows_aarch64_msvc 0.52.4",
+ "windows_i686_gnu 0.52.4",
+ "windows_i686_msvc 0.52.4",
+ "windows_x86_64_gnu 0.52.4",
+ "windows_x86_64_gnullvm 0.52.4",
+ "windows_x86_64_msvc 0.52.4",
 ]

 [[package]]
@@ -1068,38 +1086,80 @@ version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8"

+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.52.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9"
+
 [[package]]
 name = "windows_aarch64_msvc"
 version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc"

+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.52.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675"
+
 [[package]]
 name = "windows_i686_gnu"
 version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e"

+[[package]]
+name = "windows_i686_gnu"
+version = "0.52.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3"
+
 [[package]]
 name = "windows_i686_msvc"
 version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406"

+[[package]]
+name = "windows_i686_msvc"
+version = "0.52.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02"
+
 [[package]]
 name = "windows_x86_64_gnu"
 version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e"

+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.52.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03"
+
 [[package]]
 name = "windows_x86_64_gnullvm"
 version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc"

+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.52.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177"
+
 [[package]]
 name = "windows_x86_64_msvc"
 version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538"
+
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.52.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8"
--- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml
+++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml
@@ -9,7 +9,7 @@ publish = false
 [dependencies]
 native-tls = "0.2.11"
 postgres-native-tls = "0.5.0"
-tokio = { version = "1.33", features=["rt", "macros"] }
+tokio = { version = "1.36", features=["rt", "macros"] }
 tokio-postgres = "0.7.10"


--- a/test_runner/pg_clients/rust/tokio-postgres/Dockerfile
+++ b/test_runner/pg_clients/rust/tokio-postgres/Dockerfile
@@ -1,4 +1,4 @@
-FROM rust:1.73
+FROM rust:1.76
 WORKDIR /source

 COPY . .
--- a/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile
+++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile
@@ -1,11 +1,11 @@
-FROM swift:5.8 AS build
+FROM swift:5.9 AS build
 RUN apt-get -q update && apt-get -q install -y libssl-dev
 WORKDIR /source

 COPY . .
 RUN swift build --configuration release

-FROM swift:5.8
+FROM swift:5.9
 WORKDIR /app
 COPY --from=build /source/.build/release .
 CMD ["/app/PostgresClientKitExample"]
--- a/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile
+++ b/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile
@@ -1,10 +1,10 @@
-FROM swift:5.8 AS build
+FROM swift:5.9 AS build
 WORKDIR /source

 COPY . .
 RUN swift build --configuration release

-FROM swift:5.8
+FROM swift:5.9
 WORKDIR /app
 COPY --from=build /source/.build/release .
 CMD ["/app/PostgresNIOExample"]
--- a/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved
+++ b/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved
@@ -5,8 +5,8 @@
      "kind" : "remoteSourceControl",
      "location" : "https://github.com/vapor/postgres-nio.git",
      "state" : {
-        "revision" : "061a0836d7c1887e04a975d1d2eaa2ef5fd7dfab",
-        "version" : "1.16.0"
+        "revision" : "69ccfdf4c80144d845e3b439961b7ec6cd7ae33f",
+        "version" : "1.20.2"
      }
    },
    {
@@ -14,8 +14,8 @@
      "kind" : "remoteSourceControl",
      "location" : "https://github.com/apple/swift-atomics.git",
      "state" : {
-        "revision" : "6c89474e62719ddcc1e9614989fff2f68208fe10",
-        "version" : "1.1.0"
+        "revision" : "cd142fd2f64be2100422d658e7411e39489da985",
+        "version" : "1.2.0"
      }
    },
    {
@@ -41,8 +41,8 @@
      "kind" : "remoteSourceControl",
      "location" : "https://github.com/apple/swift-log.git",
      "state" : {
-        "revision" : "32e8d724467f8fe623624570367e3d50c5638e46",
-        "version" : "1.5.2"
+        "revision" : "e97a6fcb1ab07462881ac165fdbb37f067e205d5",
+        "version" : "1.5.4"
      }
    },
    {
@@ -50,8 +50,8 @@
      "kind" : "remoteSourceControl",
      "location" : "https://github.com/apple/swift-metrics.git",
      "state" : {
-        "revision" : "9b39d811a83cf18b79d7d5513b06f8b290198b10",
-        "version" : "2.3.3"
+        "revision" : "971ba26378ab69c43737ee7ba967a896cb74c0d1",
+        "version" : "2.4.1"
      }
    },
    {
@@ -59,8 +59,8 @@
      "kind" : "remoteSourceControl",
      "location" : "https://github.com/apple/swift-nio.git",
      "state" : {
-        "revision" : "6213ba7a06febe8fef60563a4a7d26a4085783cf",
-        "version" : "2.54.0"
+        "revision" : "635b2589494c97e48c62514bc8b37ced762e0a62",
+        "version" : "2.63.0"
      }
    },
    {
@@ -68,8 +68,8 @@
      "kind" : "remoteSourceControl",
      "location" : "https://github.com/apple/swift-nio-ssl.git",
      "state" : {
-        "revision" : "e866a626e105042a6a72a870c88b4c531ba05f83",
-        "version" : "2.24.0"
+        "revision" : "7c381eb6083542b124a6c18fae742f55001dc2b5",
+        "version" : "2.26.0"
      }
    },
    {
@@ -77,8 +77,17 @@
      "kind" : "remoteSourceControl",
      "location" : "https://github.com/apple/swift-nio-transport-services.git",
      "state" : {
-        "revision" : "41f4098903878418537020075a4d8a6e20a0b182",
-        "version" : "1.17.0"
+        "revision" : "6cbe0ed2b394f21ab0d46b9f0c50c6be964968ce",
+        "version" : "1.20.1"
+      }
+    },
+    {
+      "identity" : "swift-system",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-system.git",
+      "state" : {
+        "revision" : "025bcb1165deab2e20d4eaba79967ce73013f496",
+        "version" : "1.2.1"
      }
    }
  ],
--- a/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift
+++ b/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift
@@ -1,10 +1,10 @@
-// swift-tools-version:5.8
+// swift-tools-version:5.9
 import PackageDescription

 let package = Package(
    name: "PostgresNIOExample",
    dependencies: [
-        .package(url: "https://github.com/vapor/postgres-nio.git", from: "1.16.0")
+        .package(url: "https://github.com/vapor/postgres-nio.git", from: "1.20.2")
    ],
    targets: [
        .executableTarget(
--- a/test_runner/pg_clients/typescript/postgresql-client/Dockerfile
+++ b/test_runner/pg_clients/typescript/postgresql-client/Dockerfile
@@ -1,4 +1,4 @@
-FROM node:20
+FROM node:21
 WORKDIR /source

 COPY . .
--- a/test_runner/pg_clients/typescript/postgresql-client/package-lock.json
+++ b/test_runner/pg_clients/typescript/postgresql-client/package-lock.json
@@ -5,24 +5,24 @@
  "packages": {
    "": {
      "dependencies": {
-        "postgresql-client": "2.5.9"
+        "postgresql-client": "2.10.5"
      }
    },
    "node_modules/doublylinked": {
-      "version": "2.5.2",
-      "resolved": "https://registry.npmjs.org/doublylinked/-/doublylinked-2.5.2.tgz",
-      "integrity": "sha512-TDh0XfQWWDrfvGdAN0hLNIdkTXlw04nVCO5B/37ie4dV0yw1iT9ZrZ6tD+q/0SwXxeI/u6TF9Mxgd7s5/XYV6A==",
+      "version": "2.5.4",
+      "resolved": "https://registry.npmjs.org/doublylinked/-/doublylinked-2.5.4.tgz",
+      "integrity": "sha512-jBCKDnFkEHJRjQvYEl5N9VngRV8ypHgw6a52OK4VN57eV2r2rYvgOx9uABdY78INNoW7S6auULp+KBVm/jfYqw==",
      "engines": {
        "node": ">= 10.0"
      }
    },
    "node_modules/lightning-pool": {
-      "version": "4.2.1",
-      "resolved": "https://registry.npmjs.org/lightning-pool/-/lightning-pool-4.2.1.tgz",
-      "integrity": "sha512-/pUIoGD3nzTH/wI4TYiJM3cLPeUOzGMTfFeBRuxaOAnwL0LZfwvqn5YFqsfyF98M0C3UXxWgfTz+Lu6okkno+g==",
+      "version": "4.2.2",
+      "resolved": "https://registry.npmjs.org/lightning-pool/-/lightning-pool-4.2.2.tgz",
+      "integrity": "sha512-KW0Df0IbjNLxy5wAsdErTKYtHGwefLRQseHNksEctyaL7gtRwJT0nqLa2uiRdNYDwKSnZtqOjSjUNtfxmfH1qw==",
      "dependencies": {
-        "doublylinked": "^2.5.2",
-        "putil-promisify": "^1.8.6"
+        "doublylinked": "^2.5.3",
+        "putil-promisify": "^1.10.1"
      }
    },
    "node_modules/obuf": {
@@ -42,16 +42,16 @@
      }
    },
    "node_modules/postgresql-client": {
-      "version": "2.5.9",
-      "resolved": "https://registry.npmjs.org/postgresql-client/-/postgresql-client-2.5.9.tgz",
-      "integrity": "sha512-s+kgTN6TfWLzehEyxw4Im4odnxVRCbZ0DEJzWS6SLowPAmB2m1/DOiOvZC0+ZVoi5AfbGE6SBqFxKguSyVAXZg==",
+      "version": "2.10.5",
+      "resolved": "https://registry.npmjs.org/postgresql-client/-/postgresql-client-2.10.5.tgz",
+      "integrity": "sha512-R3EC16pUdbgrzk1J2MQLj7jY2TepWurJHoK90nOeLZj1XTpL/+wL1VCneTmclRVKDuKVjFHr+FASV47KrLpAbw==",
      "dependencies": {
-        "doublylinked": "^2.5.2",
-        "lightning-pool": "^4.2.1",
+        "doublylinked": "^2.5.4",
+        "lightning-pool": "^4.2.2",
        "postgres-bytea": "^3.0.0",
-        "power-tasks": "^1.7.0",
-        "putil-merge": "^3.10.3",
-        "putil-promisify": "^1.10.0",
+        "power-tasks": "^1.7.3",
+        "putil-merge": "^3.12.1",
+        "putil-promisify": "^1.10.1",
        "putil-varhelpers": "^1.6.5"
      },
      "engines": {
@@ -60,30 +60,29 @@
      }
    },
    "node_modules/power-tasks": {
-      "version": "1.7.0",
-      "resolved": "https://registry.npmjs.org/power-tasks/-/power-tasks-1.7.0.tgz",
-      "integrity": "sha512-rndZXCDxhuIDjPUJJvQwBDHaYagCkjvbPF/NA+omh/Ef4rAI9KtnvdA0k98dyiGpn1zXOpc6c2c0JWzg/xAhJg==",
+      "version": "1.7.3",
+      "resolved": "https://registry.npmjs.org/power-tasks/-/power-tasks-1.7.3.tgz",
+      "integrity": "sha512-EnkjLfaX4PxFYHbUWyWzlE4I8SgctaW9jx4qQXrVRoELlqBXrxIMtuhHzRwsHv2qs1tO7efOcZa6/wDCdCjRfA==",
      "dependencies": {
-        "doublylinked": "^2.5.2",
-        "strict-typed-events": "^2.3.1"
+        "doublylinked": "^2.5.4",
+        "strict-typed-events": "^2.3.3"
      },
      "engines": {
-        "node": ">=14.0",
-        "npm": ">=7.0.0"
+        "node": ">=16.0"
      }
    },
    "node_modules/putil-merge": {
-      "version": "3.10.3",
-      "resolved": "https://registry.npmjs.org/putil-merge/-/putil-merge-3.10.3.tgz",
-      "integrity": "sha512-B18CYi0/SmBYl9+fgowYWkgzJM/8XcLSeafHrFrGzwySQuOzLW0sOGx0CdFVp9zqaxgLctexUdGoSPpm6CPM6A==",
+      "version": "3.12.1",
+      "resolved": "https://registry.npmjs.org/putil-merge/-/putil-merge-3.12.1.tgz",
+      "integrity": "sha512-4clPyRkJPrd5zl98AP7I3JamyXbx0ixe2CnfvGwoTyWSr7Kslcv8weoKjfU4BMBifkWIRL54l4OrNe97pYcDwQ==",
      "engines": {
        "node": ">= 10.0"
      }
    },
    "node_modules/putil-promisify": {
-      "version": "1.10.0",
-      "resolved": "https://registry.npmjs.org/putil-promisify/-/putil-promisify-1.10.0.tgz",
-      "integrity": "sha512-zYPoAoMxmf8pC+I75kRkYkVMwU4ZbZl82aTGema175bmhQ06BEJuuOlzOy1buQK9G+hCyQ+BFpzMTKAJhD8rZw==",
+      "version": "1.10.1",
+      "resolved": "https://registry.npmjs.org/putil-promisify/-/putil-promisify-1.10.1.tgz",
+      "integrity": "sha512-1jm0egJNrj5eBDRj15Cg08RNHDV91OVEHeeYjAFRcs663PXxFokndxcJAGbaO6CSErCTp8eTgC8vuOF+fvXIAA==",
      "engines": {
        "node": ">= 14.0"
      }
@@ -97,21 +96,21 @@
      }
    },
    "node_modules/strict-typed-events": {
-      "version": "2.3.1",
-      "resolved": "https://registry.npmjs.org/strict-typed-events/-/strict-typed-events-2.3.1.tgz",
-      "integrity": "sha512-Z1h8KpVbrVg34Vwy/VwTD/tS9tFebH2h1Kvw4xnPkKpkISMwUpnqwU44rMfkKMpXbFCybIgDt7ARoCGTzURZhQ==",
+      "version": "2.3.3",
+      "resolved": "https://registry.npmjs.org/strict-typed-events/-/strict-typed-events-2.3.3.tgz",
+      "integrity": "sha512-Vc8/N5giCVpO2n5BCskqDD9ns7RkdEq0pFd4yQk1ROULusJDbjORNvbtyEPxxK7Xqn9/NdW8XHLxv/PvUTgFsA==",
      "dependencies": {
-        "putil-promisify": "^1.8.5",
-        "ts-gems": "^2.2.0"
+        "putil-promisify": "^1.10.1",
+        "ts-gems": "^3.1.0"
      },
      "engines": {
        "node": ">=16.0"
      }
    },
    "node_modules/ts-gems": {
-      "version": "2.4.0",
-      "resolved": "https://registry.npmjs.org/ts-gems/-/ts-gems-2.4.0.tgz",
-      "integrity": "sha512-SdugYAXoWvbqrxLodIObzxhEKacDxh5LfAJIiIkiH7q5thvuuCzdmkdTVQYf7uEDrEpPhfx4tokDMamdO3be9A=="
+      "version": "3.1.1",
+      "resolved": "https://registry.npmjs.org/ts-gems/-/ts-gems-3.1.1.tgz",
+      "integrity": "sha512-Li1Z44FnxN06c1lBwFepb932jPYT+4eOvOmoiC30lOTkvOJOERr9xZFg3UA9y19OYO9CrW3ZSqNL66DUSuwFTw=="
    }
  }
 }
--- a/test_runner/pg_clients/typescript/postgresql-client/package.json
+++ b/test_runner/pg_clients/typescript/postgresql-client/package.json
@@ -1,6 +1,6 @@
 {
  "type": "module",
  "dependencies": {
-    "postgresql-client": "2.5.9"
+    "postgresql-client": "2.10.5"
  }
 }
--- a/test_runner/pg_clients/typescript/serverless-driver/Dockerfile
+++ b/test_runner/pg_clients/typescript/serverless-driver/Dockerfile
@@ -1,4 +1,4 @@
-FROM node:20
+FROM node:21
 WORKDIR /source

 COPY . .
--- a/test_runner/pg_clients/typescript/serverless-driver/package-lock.json
+++ b/test_runner/pg_clients/typescript/serverless-driver/package-lock.json
@@ -5,14 +5,14 @@
  "packages": {
    "": {
      "dependencies": {
-        "@neondatabase/serverless": "0.4.18",
-        "ws": "8.13.0"
+        "@neondatabase/serverless": "0.9.0",
+        "ws": "8.16.0"
      }
    },
    "node_modules/@neondatabase/serverless": {
-      "version": "0.4.18",
-      "resolved": "https://registry.npmjs.org/@neondatabase/serverless/-/serverless-0.4.18.tgz",
-      "integrity": "sha512-2TZnIyRGC/+0fjZ8TKCzaSTPUD94PM7NBGuantGZbUrbWyqBwGnUoRtdZAQ95qBKVHqORLVfymlv2NE+HQMFeA==",
+      "version": "0.9.0",
+      "resolved": "https://registry.npmjs.org/@neondatabase/serverless/-/serverless-0.9.0.tgz",
+      "integrity": "sha512-mmJnUAzlzvxNSZuuhI6kgJjH+JgFdBMYUWxihtq/nj0Tjt+Y5UU3W+SvRFoucnd5NObYkuLYQzk+zV5DGFKGJg==",
      "dependencies": {
        "@types/pg": "8.6.6"
      }
@@ -96,9 +96,9 @@
      }
    },
    "node_modules/ws": {
-      "version": "8.13.0",
-      "resolved": "https://registry.npmjs.org/ws/-/ws-8.13.0.tgz",
-      "integrity": "sha512-x9vcZYTrFPC7aSIbj7sRCYo7L/Xb8Iy+pW0ng0wt2vCJv7M9HOMy0UoN3rr+IFC7hb7vXoqS+P9ktyLLLhO+LA==",
+      "version": "8.16.0",
+      "resolved": "https://registry.npmjs.org/ws/-/ws-8.16.0.tgz",
+      "integrity": "sha512-HS0c//TP7Ina87TfiPUz1rQzMhHrl/SG2guqRcTOIUYD2q8uhUdNHZYJUaQ8aTGPzCh+c6oawMKW35nFl1dxyQ==",
      "engines": {
        "node": ">=10.0.0"
      },
--- a/test_runner/pg_clients/typescript/serverless-driver/package.json
+++ b/test_runner/pg_clients/typescript/serverless-driver/package.json
@@ -1,7 +1,7 @@
 {
  "type": "module",
  "dependencies": {
-    "@neondatabase/serverless": "0.4.18",
-    "ws": "8.13.0"
+    "@neondatabase/serverless": "0.9.0",
+    "ws": "8.16.0"
  }
 }
--- a/test_runner/regress/test_neon_superuser.py
+++ b/test_runner/regress/test_neon_superuser.py
@@ -1,12 +1,9 @@
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnv
-from fixtures.pg_version import PgVersion, skip_on_postgres
+from fixtures.pg_version import PgVersion
 from fixtures.utils import wait_until


-@skip_on_postgres(
-    PgVersion.V15, reason="skip on pg15 due to https://github.com/neondatabase/neon/issues/6969"
-)
 def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion):
    env = neon_simple_env
    env.neon_cli.create_branch("test_neon_superuser_publisher", "empty")
@@ -97,3 +94,6 @@ def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion):
        assert cur.fetchall()[0][0] != "<insufficient privilege>"
        cur.execute("RESET ROLE")
        cur.execute("DROP ROLE not_a_superuser")
+        query = "DROP SUBSCRIPTION sub CASCADE"
+        log.info(f"Dropping subscription: {query}")
+        cur.execute(query)
--- a/test_runner/regress/test_pageserver_getpage_throttle.py
+++ b/test_runner/regress/test_pageserver_getpage_throttle.py
@@ -0,0 +1,118 @@
+import json
+import uuid
+
+from anyio import Path
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnvBuilder, PgBin
+from fixtures.pg_version import PgVersion
+from fixtures.types import TenantId, TimelineId
+from fixtures.utils import wait_until
+
+
+def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
+    env = neon_env_builder.init_start()
+
+    env.pageserver.tenant_detach(env.initial_tenant)
+
+    env.pageserver.allowed_errors.append(
+        # https://github.com/neondatabase/neon/issues/6925
+        r".*query handler for.*pagestream.*failed: unexpected message: CopyFail during COPY.*"
+    )
+
+    tenant_id = TenantId.generate()
+    timeline_id = TimelineId.generate()
+
+    rate_limit_rps = 100
+    compaction_period = 5
+    env.pageserver.tenant_create(
+        tenant_id,
+        conf={
+            "compaction_period": f"{compaction_period}s",
+            "timeline_get_throttle": {
+                "task_kinds": ["PageRequestHandler"],
+                "initial": 0,
+                "refill_interval": "100ms",
+                "refill_amount": int(rate_limit_rps / 10),
+                "max": int(rate_limit_rps / 10),
+                "fair": True,
+            },
+        },
+    )
+
+    ps_http = env.pageserver.http_client()
+
+    ps_http.timeline_create(PgVersion.V16, tenant_id, timeline_id)
+
+    def run_pagebench_at_max_speed_and_get_total_requests_completed(duration_secs: int):
+        cmd = [
+            str(env.neon_binpath / "pagebench"),
+            "get-page-latest-lsn",
+            "--mgmt-api-endpoint",
+            ps_http.base_url,
+            "--page-service-connstring",
+            env.pageserver.connstr(password=None),
+            "--runtime",
+            f"{duration_secs}s",
+            f"{tenant_id}/{timeline_id}",
+        ]
+
+        basepath = pg_bin.run_capture(cmd, with_command_header=False)
+        results_path = Path(basepath + ".stdout")
+        log.info(f"Benchmark results at: {results_path}")
+
+        with open(results_path, "r") as f:
+            results = json.load(f)
+        log.info(f"Results:\n{json.dumps(results, sort_keys=True, indent=2)}")
+        return int(results["total"]["request_count"])
+
+    log.info("warmup / make sure metrics are present")
+    run_pagebench_at_max_speed_and_get_total_requests_completed(2)
+    metrics_query = {
+        "tenant_id": str(tenant_id),
+        "timeline_id": str(timeline_id),
+        "smgr_query_type": "get_page_at_lsn",
+    }
+    metric_name = "pageserver_smgr_query_seconds_sum"
+    smgr_query_seconds_pre = ps_http.get_metric_value(metric_name, metrics_query)
+    assert smgr_query_seconds_pre is not None
+
+    marker = uuid.uuid4().hex
+    ps_http.post_tracing_event("info", marker)
+    _, marker_offset = wait_until(
+        10, 0.5, lambda: env.pageserver.assert_log_contains(marker, offset=None)
+    )
+
+    log.info("run pagebench")
+    duration_secs = 10
+    actual_ncompleted = run_pagebench_at_max_speed_and_get_total_requests_completed(duration_secs)
+
+    log.info("validate the client is capped at the configured rps limit")
+    expect_ncompleted = duration_secs * rate_limit_rps
+    delta_abs = abs(expect_ncompleted - actual_ncompleted)
+    threshold = 0.05 * expect_ncompleted
+    assert (
+        threshold / rate_limit_rps < 0.1 * duration_secs
+    ), "test self-test: unrealistic expecations regarding precision in this test"
+    assert (
+        delta_abs < 0.05 * expect_ncompleted
+    ), "the throttling deviates more than 5percent from the expectation"
+
+    log.info("validate that we logged the throttling")
+
+    wait_until(
+        10,
+        compaction_period / 10,
+        lambda: env.pageserver.assert_log_contains(
+            f".*{tenant_id}.*shard was throttled in the last n_seconds.*",
+            offset=marker_offset,
+        ),
+    )
+
+    log.info("validate that the metric doesn't include throttle wait time")
+    smgr_query_seconds_post = ps_http.get_metric_value(metric_name, metrics_query)
+    assert smgr_query_seconds_post is not None
+    actual_smgr_query_seconds = smgr_query_seconds_post - smgr_query_seconds_pre
+
+    assert (
+        duration_secs >= 10 * actual_smgr_query_seconds
+    ), "smgr metrics should not include throttle wait time"
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -329,14 +329,15 @@ def test_remote_storage_upload_queue_retries(
    churn_while_failpoints_active_thread.start()

    # wait for churn thread's data to get stuck in the upload queue
-    wait_until(10, 0.5, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="upload"), 0))
-    wait_until(10, 0.5, lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 2))
-    wait_until(10, 0.5, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="delete"), 0))
+    # Exponential back-off in upload queue, so, gracious timeouts.
+
+    wait_until(30, 1, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="upload"), 0))
+    wait_until(30, 1, lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 2))
+    wait_until(30, 1, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="delete"), 0))

    # unblock churn operations
    configure_storage_sync_failpoints("off")

-    # ... and wait for them to finish. Exponential back-off in upload queue, so, gracious timeouts.
    wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0))
    wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0))
    wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0))
--- a/test_runner/regress/test_sharding_service.py
+++ b/test_runner/regress/test_sharding_service.py
@@ -1,7 +1,7 @@
 import time
 from collections import defaultdict
 from datetime import datetime, timezone
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Union

 import pytest
 from fixtures.log_helper import log
@@ -443,10 +443,12 @@ def test_sharding_service_compute_hook(

    # Initial notification from tenant creation
    assert len(notifications) == 1
-    expect = {
+    expect: Dict[str, Union[List[Dict[str, int]], str, None, int]] = {
        "tenant_id": str(env.initial_tenant),
+        "stripe_size": None,
        "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}],
    }
+    assert notifications[0] == expect

    env.attachment_service.node_configure(env.pageservers[0].id, {"availability": "Offline"})

@@ -460,6 +462,7 @@ def test_sharding_service_compute_hook(
    log.info(f"notifications: {notifications}")
    expect = {
        "tenant_id": str(env.initial_tenant),
+        "stripe_size": None,
        "shards": [{"node_id": int(env.pageservers[1].id), "shard_number": 0}],
    }

@@ -475,10 +478,27 @@ def test_sharding_service_compute_hook(

    def received_restart_notification():
        assert len(notifications) == 3
-        assert notifications[1] == expect
+        assert notifications[2] == expect

    wait_until(10, 1, received_restart_notification)

+    # Splitting a tenant should cause its stripe size to become visible in the compute notification
+    env.attachment_service.tenant_shard_split(env.initial_tenant, shard_count=2)
+    expect = {
+        "tenant_id": str(env.initial_tenant),
+        "stripe_size": 32768,
+        "shards": [
+            {"node_id": int(env.pageservers[1].id), "shard_number": 0},
+            {"node_id": int(env.pageservers[1].id), "shard_number": 1},
+        ],
+    }
+
+    def received_split_notification():
+        assert len(notifications) == 4
+        assert notifications[3] == expect
+
+    wait_until(10, 1, received_split_notification)
+
    env.attachment_service.consistency_check()


--- a/vm-image-spec.yaml
+++ b/vm-image-spec.yaml
@@ -176,7 +176,7 @@ build: |
      # actually build the thing...
      && make install

-  FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.0 AS postgres-exporter
+  FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.1 AS postgres-exporter

  FROM burningalchemist/sql_exporter:0.13 AS sql-exporter

@@ -193,7 +193,7 @@ build: |
          pkg-config

  # Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc)
-  ENV PGBOUNCER_TAG pgbouncer_1_22_0
+  ENV PGBOUNCER_TAG pgbouncer_1_22_1
  RUN set -e \
      && git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/pgbouncer/pgbouncer.git pgbouncer \
      && cd pgbouncer \