pageserver: add DeletionNotify to wait for deletion queue execution

tests: make Workload more determinstic (#10741 )
## Problem Previously, Workload was reconfiguring the compute before each run of writes, which was meant to be a no-op when nothing changed, but was actually writing extra data due to an issue being fixed in https://github.com/neondatabase/neon/pull/10696. The row counts in tests were too low in some cases, these tests were only working because of those extra writes that shouldn't have been happening, and moreover were relying on checkpoints happening. ## Summary of changes - Only reconfigure compute if the attached pageserver actually changed. If pageserver is set to None, that means controller is managing everything, so never reconfigure compute. - Update tests that wrote too few rows. --------- Co-authored-by: Alexey Kondratov <kondratov.aleksey@gmail.com>
2026-02-05 03:30:36 +00:00 · 2025-02-12 18:20:39 +01:00 · 2025-02-12 12:35:29 +00:00 · 2025-02-12 11:43:23 +00:00 · 2025-02-12 10:52:26 +00:00 · 2025-02-11 23:43:58 +00:00
228 changed files with 9668 additions and 5052 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -24,3 +24,4 @@
 !storage_controller/
 !vendor/postgres-*/
 !workspace_hack/
+!build_tools/patches
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -27,3 +27,4 @@ config-variables:
  - SLACK_ON_CALL_QA_STAGING_STREAM
  - DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN
  - SLACK_ON_CALL_STORAGE_STAGING_STREAM
+  - SLACK_CICD_CHANNEL_ID
--- a/.github/actions/neon-project-create/action.yml
+++ b/.github/actions/neon-project-create/action.yml
@@ -41,7 +41,10 @@ inputs:
    description: 'Path to directory containing libpq library - it is caller responsibility to provision the libpq library'
    required: false
    default: '/tmp/neon/pg_install/v16/lib'
-  
+  project_settings:
+    description: 'A JSON object with project settings'
+    required: false
+    default: '{}'

 outputs:
  dsn:
@@ -73,7 +76,7 @@ runs:
              \"provisioner\": \"k8s-neonvm\",
              \"autoscaling_limit_min_cu\": ${MIN_CU},
              \"autoscaling_limit_max_cu\": ${MAX_CU},
-              \"settings\": { }
+              \"settings\": ${PROJECT_SETTINGS}
            }
          }")

@@ -92,12 +95,12 @@ runs:
        if [ "${SHARD_SPLIT_PROJECT}" = "true" ]; then
          # determine tenant ID
          TENANT_ID=`${PSQL} ${dsn} -t -A -c "SHOW neon.tenant_id"`
-          
+
          echo "Splitting project ${project_id} with tenant_id ${TENANT_ID} into $((SHARD_COUNT)) shards with stripe size $((STRIPE_SIZE))"

          echo "Sending PUT request to https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/shard_split"
          echo "with body {\"new_shard_count\": $((SHARD_COUNT)), \"new_stripe_size\": $((STRIPE_SIZE))}"
-          
+
          # we need an ADMIN API KEY to invoke storage controller API for shard splitting (bash -u above checks that the variable is set)
          curl -X PUT \
            "https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/shard_split" \
@@ -118,3 +121,4 @@ runs:
        STRIPE_SIZE: ${{ inputs.stripe_size }}
        PSQL: ${{ inputs.psql_path }}
        LD_LIBRARY_PATH: ${{ inputs.libpq_lib_path }}
+        PROJECT_SETTINGS: ${{ inputs.project_settings }}
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -121,6 +121,8 @@ runs:
        export DEFAULT_PG_VERSION=${PG_VERSION#v}
        export LD_LIBRARY_PATH=${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/lib
        export BENCHMARK_CONNSTR=${BENCHMARK_CONNSTR:-}
+        export ASAN_OPTIONS=detect_leaks=0:detect_stack_use_after_return=0:abort_on_error=1:strict_string_checks=1:check_initialization_order=1:strict_init_order=1
+        export UBSAN_OPTIONS=abort_on_error=1:print_stacktrace=1

        if [ "${BUILD_TYPE}" = "remote" ]; then
          export REMOTE_ENV=1
--- a/.github/file-filters.yaml
+++ b/.github/file-filters.yaml
@@ -1,4 +1,5 @@
 rust_code: ['**/*.rs', '**/Cargo.toml', '**/Cargo.lock']
+rust_dependencies: ['**/Cargo.lock']

 v14: ['vendor/postgres-v14/**', 'Makefile', 'pgxn/**']
 v15: ['vendor/postgres-v15/**', 'Makefile', 'pgxn/**']
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -23,6 +23,11 @@ on:
        description: 'a json object of postgres versions and lfc states to run regression tests on'
        required: true
        type: string
+      sanitizers:
+        description: 'enabled or disabled'
+        required: false
+        default: 'disabled'
+        type: string

 defaults:
  run:
@@ -87,6 +92,7 @@ jobs:
      - name: Set env variables
        env:
          ARCH: ${{ inputs.arch }}
+          SANITIZERS: ${{ inputs.sanitizers }}
        run: |
          CARGO_FEATURES="--features testing"
          if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then
@@ -99,8 +105,14 @@ jobs:
            cov_prefix=""
            CARGO_FLAGS="--locked --release"
          fi
+          if [[ $SANITIZERS == 'enabled' ]]; then
+            make_vars="WITH_SANITIZERS=yes"
+          else
+            make_vars=""
+          fi
          {
            echo "cov_prefix=${cov_prefix}"
+            echo "make_vars=${make_vars}"
            echo "CARGO_FEATURES=${CARGO_FEATURES}"
            echo "CARGO_FLAGS=${CARGO_FLAGS}"
            echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo"
@@ -136,37 +148,39 @@ jobs:

      - name: Build postgres v14
        if: steps.cache_pg_14.outputs.cache-hit != 'true'
-        run: mold -run make postgres-v14 -j$(nproc)
+        run: mold -run make ${make_vars} postgres-v14 -j$(nproc)

      - name: Build postgres v15
        if: steps.cache_pg_15.outputs.cache-hit != 'true'
-        run: mold -run make postgres-v15 -j$(nproc)
+        run: mold -run make ${make_vars} postgres-v15 -j$(nproc)

      - name: Build postgres v16
        if: steps.cache_pg_16.outputs.cache-hit != 'true'
-        run: mold -run make postgres-v16 -j$(nproc)
+        run: mold -run make ${make_vars} postgres-v16 -j$(nproc)

      - name: Build postgres v17
        if: steps.cache_pg_17.outputs.cache-hit != 'true'
-        run: mold -run make postgres-v17 -j$(nproc)
+        run: mold -run make ${make_vars} postgres-v17 -j$(nproc)

      - name: Build neon extensions
-        run: mold -run make neon-pg-ext -j$(nproc)
+        run: mold -run make ${make_vars} neon-pg-ext -j$(nproc)

      - name: Build walproposer-lib
-        run: mold -run make walproposer-lib -j$(nproc)
+        run: mold -run make ${make_vars} walproposer-lib -j$(nproc)

      - name: Run cargo build
+        env:
+          WITH_TESTS: ${{ inputs.sanitizers != 'enabled' && '--tests' || '' }}
        run: |
-          PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
-          export PQ_LIB_DIR
-          ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
+          export ASAN_OPTIONS=detect_leaks=0
+          ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins ${WITH_TESTS}

      # Do install *before* running rust tests because they might recompile the
      # binaries with different features/flags.
      - name: Install rust binaries
        env:
          ARCH: ${{ inputs.arch }}
+          SANITIZERS: ${{ inputs.sanitizers }}
        run: |
          # Install target binaries
          mkdir -p /tmp/neon/bin/
@@ -181,7 +195,7 @@ jobs:
          done

          # Install test executables and write list of all binaries (for code coverage)
-          if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then
+          if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' && $SANITIZERS != 'enabled' ]]; then
            # Keep bloated coverage data files away from the rest of the artifact
            mkdir -p /tmp/coverage/

@@ -214,11 +228,10 @@ jobs:
          role-duration-seconds: 18000 # 5 hours

      - name: Run rust tests
+        if: ${{ inputs.sanitizers != 'enabled' }}
        env:
          NEXTEST_RETRIES: 3
        run: |
-          PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
-          export PQ_LIB_DIR
          LD_LIBRARY_PATH=$(pwd)/pg_install/v17/lib
          export LD_LIBRARY_PATH

@@ -271,6 +284,27 @@ jobs:
          path: /tmp/neon
          aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

+      - name: Check diesel schema
+        if: inputs.build-type == 'release' && inputs.arch == 'x64'
+        env:
+          DATABASE_URL: postgresql://localhost:1235/storage_controller
+          POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
+        run: |
+          export ASAN_OPTIONS=detect_leaks=0
+          /tmp/neon/bin/neon_local init
+          /tmp/neon/bin/neon_local storage_controller start
+
+          diesel print-schema > storage_controller/src/schema.rs
+
+          if [ -n "$(git diff storage_controller/src/schema.rs)" ]; then
+            echo >&2 "Uncommitted changes in diesel schema"
+
+            git diff .
+            exit 1
+          fi
+
+          /tmp/neon/bin/neon_local storage_controller stop
+
      # XXX: keep this after the binaries.list is formed, so the coverage can properly work later
      - name: Merge and upload coverage data
        if: inputs.build-type == 'debug'
@@ -303,7 +337,7 @@ jobs:
      - name: Pytest regression tests
        continue-on-error: ${{ matrix.lfc_state == 'with-lfc' && inputs.build-type == 'debug' }}
        uses: ./.github/actions/run-python-test-set
-        timeout-minutes: 60
+        timeout-minutes: ${{ inputs.sanitizers != 'enabled' && 60 || 180 }}
        with:
          build_type: ${{ inputs.build-type }}
          test_selection: regress
@@ -321,6 +355,7 @@ jobs:
          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
          PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task
          USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }}
+          SANITIZERS: ${{ inputs.sanitizers }}

      # Temporary disable this step until we figure out why it's so flaky
      # Ref https://github.com/neondatabase/neon/issues/4540
--- a/.github/workflows/_check-codestyle-rust.yml
+++ b/.github/workflows/_check-codestyle-rust.yml
@@ -16,6 +16,9 @@ defaults:
  run:
    shell: bash -euxo pipefail {0}

+# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
+permissions: {}
+
 jobs:
  check-codestyle-rust:
    strategy:
@@ -84,8 +87,3 @@ jobs:
        run: |
          cargo hakari generate --diff  # workspace-hack Cargo.toml is up-to-date
          cargo hakari manage-deps --dry-run  # all workspace crates depend on workspace-hack
-
-      # https://github.com/EmbarkStudios/cargo-deny
-      - name: Check rust licenses/bans/advisories/sources
-        if: ${{ !cancelled() }}
-        run: cargo deny check --hide-inclusion-graph
--- a/.github/workflows/approved-for-ci-run.yml
+++ b/.github/workflows/approved-for-ci-run.yml
@@ -67,9 +67,9 @@ jobs:

      - uses: actions/checkout@v4
        with:
-          ref: main
+          ref: ${{ github.event.pull_request.head.sha }}
          token: ${{ secrets.CI_ACCESS_TOKEN }}
-      
+
      - name: Look for existing PR
        id: get-pr
        env:
@@ -77,7 +77,7 @@ jobs:
        run: |
          ALREADY_CREATED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${BRANCH} --base main --json number --jq '.[].number')"
          echo "ALREADY_CREATED=${ALREADY_CREATED}" >> ${GITHUB_OUTPUT}
-      
+
      - name: Get changed labels
        id: get-labels
        if: steps.get-pr.outputs.ALREADY_CREATED != ''
@@ -94,8 +94,6 @@ jobs:
          echo "LABELS_TO_ADD=${LABELS_TO_ADD}" >> ${GITHUB_OUTPUT}
          echo "LABELS_TO_REMOVE=${LABELS_TO_REMOVE}" >> ${GITHUB_OUTPUT}

-      - run: gh pr checkout "${PR_NUMBER}"
-
      - run: git checkout -b "${BRANCH}"

      - run: git push --force origin "${BRANCH}"
@@ -103,7 +101,7 @@ jobs:

      - name: Create a Pull Request for CI run (if required)
        if: steps.get-pr.outputs.ALREADY_CREATED == ''
-        env: 
+        env:
          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
        run: |
          cat << EOF > body.md
@@ -140,7 +138,7 @@ jobs:

      - run: git push --force origin "${BRANCH}"
        if: steps.get-pr.outputs.ALREADY_CREATED != ''
-             
+
  cleanup:
    # Close PRs and delete branchs if the original PR is closed.

--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -340,7 +340,7 @@ jobs:
          ],
          "pg_version" : [
            16,17
-          ],
+          ]
        }'

        if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
@@ -810,7 +810,7 @@ jobs:
    # We might change it after https://github.com/neondatabase/neon/issues/2900.
    #
    # *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB)
-    if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
+    # if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
    permissions:
      contents: write
      statuses: write
@@ -929,7 +929,7 @@ jobs:
        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

  user-examples-compare:
-    if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
+    # if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
    permissions:
      contents: write
      statuses: write
--- a/.github/workflows/build-macos.yml
+++ b/.github/workflows/build-macos.yml
@@ -235,7 +235,7 @@ jobs:
          echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV

      - name: Run cargo build (only for v17)
-        run: PQ_LIB_DIR=$(pwd)/pg_install/v17/lib cargo build --all --release -j$(sysctl -n hw.ncpu)
+        run: cargo build --all --release -j$(sysctl -n hw.ncpu)

      - name: Check that no warnings are produced (only for v17)
        run: ./run_clippy.sh
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -45,6 +45,26 @@ jobs:
            run cancel-previous-in-concurrency-group.yml \
              --field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}"

+  files-changed:
+    needs: [ check-permissions ]
+    runs-on: [ self-hosted, small ]
+    timeout-minutes: 3
+    outputs:
+      check-rust-dependencies: ${{ steps.files-changed.outputs.rust_dependencies }}
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+
+      - name: Check for file changes
+        uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36  # v3.0.2
+        id: files-changed
+        with:
+          token: ${{ secrets.GITHUB_TOKEN }}
+          filters: .github/file-filters.yaml
+
  tag:
    needs: [ check-permissions ]
    runs-on: [ self-hosted, small ]
@@ -170,6 +190,14 @@ jobs:
      archs: '["x64", "arm64"]'
    secrets: inherit

+  check-dependencies-rust:
+    needs: [ files-changed, build-build-tools-image ]
+    if: ${{ needs.files-changed.outputs.check-rust-dependencies == 'true' }}
+    uses: ./.github/workflows/cargo-deny.yml
+    with:
+      build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
+    secrets: inherit
+
  build-and-test-locally:
    needs: [ tag, build-build-tools-image ]
    strategy:
@@ -654,7 +682,7 @@ jobs:
          push: true
          pull: true
          file: compute/compute-node.Dockerfile
-          target: neon-pg-ext-test
+          target: extension-tests
          cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }}
          tags: |
            neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }}
@@ -1332,6 +1360,8 @@ jobs:
      - build-and-test-locally
      - check-codestyle-python
      - check-codestyle-rust
+      - check-dependencies-rust
+      - files-changed
      - promote-images-dev
      - test-images
      - trigger-custom-extensions-build-and-wait
@@ -1344,4 +1374,11 @@ jobs:
        if: |
          contains(needs.*.result, 'failure')
          || contains(needs.*.result, 'cancelled')
-          || contains(needs.*.result, 'skipped')
+          || (needs.check-dependencies-rust.result == 'skipped' && needs.files-changed.outputs.check-rust-dependencies == 'true')
+          || needs.build-and-test-locally.result == 'skipped'
+          || needs.check-codestyle-python.result == 'skipped'
+          || needs.check-codestyle-rust.result == 'skipped'
+          || needs.files-changed.result == 'skipped'
+          || needs.promote-images-dev.result == 'skipped'
+          || needs.test-images.result == 'skipped'
+          || needs.trigger-custom-extensions-build-and-wait.result == 'skipped'
--- a/.github/workflows/build_and_test_with_sanitizers.yml
+++ b/.github/workflows/build_and_test_with_sanitizers.yml
@@ -0,0 +1,134 @@
+name: Build and Test with Sanitizers
+
+on:
+  schedule:
+    # * is a special character in YAML so you have to quote this string
+    #          ┌───────────── minute (0 - 59)
+    #          │ ┌───────────── hour (0 - 23)
+    #          │ │ ┌───────────── day of the month (1 - 31)
+    #          │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
+    #          │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
+    - cron:   '0 1 * * *' # run once a day, timezone is utc
+  workflow_dispatch:
+
+defaults:
+  run:
+    shell: bash -euxo pipefail {0}
+
+concurrency:
+  # Allow only one workflow per any non-`main` branch.
+  group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
+  cancel-in-progress: true
+
+env:
+  RUST_BACKTRACE: 1
+  COPT: '-Werror'
+
+jobs:
+  tag:
+    runs-on: [ self-hosted, small ]
+    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
+    outputs:
+      build-tag: ${{steps.build-tag.outputs.tag}}
+
+    steps:
+      # Need `fetch-depth: 0` to count the number of commits in the branch
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Get build tag
+        run: |
+          echo run:$GITHUB_RUN_ID
+          echo ref:$GITHUB_REF_NAME
+          echo rev:$(git rev-list --count HEAD)
+          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
+            echo "tag=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
+          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
+            echo "tag=release-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
+          elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
+            echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
+          elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then
+            echo "tag=release-compute-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
+          else
+            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release', 'release-proxy', 'release-compute'"
+            echo "tag=$GITHUB_RUN_ID" >> $GITHUB_OUTPUT
+          fi
+        shell: bash
+        id: build-tag
+
+  build-build-tools-image:
+    uses: ./.github/workflows/build-build-tools-image.yml
+    secrets: inherit
+
+  build-and-test-locally:
+    needs: [ tag, build-build-tools-image ]
+    strategy:
+      fail-fast: false
+      matrix:
+        arch: [ x64, arm64 ]
+        build-type: [ release ]
+    uses: ./.github/workflows/_build-and-test-locally.yml
+    with:
+      arch: ${{ matrix.arch }}
+      build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
+      build-tag: ${{ needs.tag.outputs.build-tag }}
+      build-type: ${{ matrix.build-type }}
+      test-cfg: '[{"pg_version":"v17"}]'
+      sanitizers: enabled
+    secrets: inherit
+
+
+  create-test-report:
+    needs: [ build-and-test-locally, build-build-tools-image ]
+    if: ${{ !cancelled() }}
+    permissions:
+      id-token: write # aws-actions/configure-aws-credentials
+      statuses: write
+      contents: write
+      pull-requests: write
+    outputs:
+      report-url: ${{ steps.create-allure-report.outputs.report-url }}
+
+    runs-on: [ self-hosted, small ]
+    container:
+      image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+      options: --init
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Create Allure report
+        if: ${{ !cancelled() }}
+        id: create-allure-report
+        uses: ./.github/actions/allure-report-generate
+        with:
+          store-test-results-into-db: true
+          aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        env:
+          REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
+
+      - uses: actions/github-script@v7
+        if: ${{ !cancelled() }}
+        with:
+          # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
+          retries: 5
+          script: |
+            const report = {
+              reportUrl:     "${{ steps.create-allure-report.outputs.report-url }}",
+              reportJsonUrl: "${{ steps.create-allure-report.outputs.report-json-url }}",
+            }
+
+            const coverage = {}
+
+            const script = require("./scripts/comment-test-report.js")
+            await script({
+              github,
+              context,
+              fetch,
+              report,
+              coverage,
+            })
--- a/.github/workflows/cargo-deny.yml
+++ b/.github/workflows/cargo-deny.yml
@@ -0,0 +1,57 @@
+name: cargo deny checks
+
+on:
+  workflow_call:
+    inputs:
+      build-tools-image:
+        required: false
+        type: string
+  schedule:
+    - cron: '0 0 * * *'
+
+jobs:
+  cargo-deny:
+    strategy:
+      matrix:
+        ref: >-
+          ${{
+            fromJSON(
+              github.event_name == 'schedule'
+                && '["main","release","release-proxy","release-compute"]'
+                || format('["{0}"]', github.sha)
+            )
+          }}
+
+    runs-on: [self-hosted, small]
+
+    container:
+      image: ${{ inputs.build-tools-image || 'neondatabase/build-tools:pinned' }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+      options: --init
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ matrix.ref }}
+
+      - name: Check rust licenses/bans/advisories/sources
+        env:
+          CARGO_DENY_TARGET: >-
+            ${{ github.event_name == 'schedule' && 'advisories' || 'all' }}
+        run: cargo deny check --hide-inclusion-graph $CARGO_DENY_TARGET
+
+      - name: Post to a Slack channel
+        if: ${{ github.event_name == 'schedule' && failure() }}
+        uses: slackapi/slack-github-action@v2
+        with:
+          method: chat.postMessage
+          token: ${{ secrets.SLACK_BOT_TOKEN }}
+          payload: |
+            channel: ${{ vars.SLACK_CICD_CHANNEL_ID }}
+            text: |
+              Periodic cargo-deny on ${{ matrix.ref }}: ${{ job.status }}
+              <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
+              Pinging @oncall-devprod.
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -114,7 +114,7 @@ jobs:
        run: make walproposer-lib -j$(nproc)

      - name: Produce the build stats
-        run: PQ_LIB_DIR=$(pwd)/pg_install/v17/lib cargo build --all --release --timings -j$(nproc)
+        run: cargo build --all --release --timings -j$(nproc)

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
--- a/.github/workflows/pg-clients.yml
+++ b/.github/workflows/pg-clients.yml
@@ -12,8 +12,8 @@ on:
  pull_request:
    paths:
      - '.github/workflows/pg-clients.yml'
-      - 'test_runner/pg_clients/**'
-      - 'test_runner/logical_repl/**'
+      - 'test_runner/pg_clients/**/*.py'
+      - 'test_runner/logical_repl/**/*.py'
      - 'poetry.lock'
  workflow_dispatch:

@@ -104,6 +104,8 @@ jobs:
        with:
          api_key: ${{ secrets.NEON_STAGING_API_KEY }}
          postgres_version: ${{ env.DEFAULT_PG_VERSION }}
+          project_settings: >-
+            {"enable_logical_replication": true}

      - name: Run tests
        uses: ./.github/actions/run-python-test-set
--- a/.github/workflows/pre-merge-checks.yml
+++ b/.github/workflows/pre-merge-checks.yml
@@ -59,7 +59,10 @@ jobs:
          echo "${RUST_CHANGED_FILES}"

  build-build-tools-image:
-    if: needs.get-changed-files.outputs.python-changed == 'true'
+    if: |
+      false
+      || needs.get-changed-files.outputs.python-changed == 'true'
+      || needs.get-changed-files.outputs.rust-changed == 'true'
    needs: [ get-changed-files ]
    uses: ./.github/workflows/build-build-tools-image.yml
    with:
@@ -92,7 +95,8 @@ jobs:
  # - conclusion
  # - neon-cloud-e2e
  conclusion:
-    if: always()
+    # Do not run job on Pull Requests as it interferes with the `conclusion` job from the `build_and_test` workflow
+    if: always() && github.event_name == 'merge_group'
    permissions:
      statuses: write # for `github.repos.createCommitStatus(...)`
      contents: write
@@ -124,6 +128,8 @@ jobs:
      - name: Fail the job if any of the dependencies do not succeed or skipped
        run: exit 1
        if: |
-          (contains(needs.check-codestyle-python.result, 'skipped') && needs.get-changed-files.outputs.python-changed == 'true')
+          false
+          || (needs.check-codestyle-python.result == 'skipped' && needs.get-changed-files.outputs.python-changed == 'true')
+          || (needs.check-codestyle-rust.result   == 'skipped' && needs.get-changed-files.outputs.rust-changed   == 'true')
          || contains(needs.*.result, 'failure')
          || contains(needs.*.result, 'cancelled')
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -206,6 +206,16 @@ dependencies = [
 "syn 2.0.90",
 ]

+[[package]]
+name = "assert-json-diff"
+version = "2.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "47e4f2b81832e72834d7518d8487a0396a28cc408186a2e8854c0f98011faf12"
+dependencies = [
+ "serde",
+ "serde_json",
+]
+
 [[package]]
 name = "async-channel"
 version = "1.9.0"
@@ -776,7 +786,7 @@ dependencies = [
 [[package]]
 name = "azure_core"
 version = "0.21.0"
-source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#66e77bdd87bf87e773acf3b0c84b532c1124367d"
+source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#985db729824be324ed11527e45de722250028d9e"
 dependencies = [
 "async-trait",
 "base64 0.22.1",
@@ -805,7 +815,7 @@ dependencies = [
 [[package]]
 name = "azure_identity"
 version = "0.21.0"
-source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#66e77bdd87bf87e773acf3b0c84b532c1124367d"
+source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#985db729824be324ed11527e45de722250028d9e"
 dependencies = [
 "async-lock",
 "async-trait",
@@ -824,7 +834,7 @@ dependencies = [
 [[package]]
 name = "azure_storage"
 version = "0.21.0"
-source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#66e77bdd87bf87e773acf3b0c84b532c1124367d"
+source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#985db729824be324ed11527e45de722250028d9e"
 dependencies = [
 "RustyXML",
 "async-lock",
@@ -842,7 +852,7 @@ dependencies = [
 [[package]]
 name = "azure_storage_blobs"
 version = "0.21.0"
-source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#66e77bdd87bf87e773acf3b0c84b532c1124367d"
+source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#985db729824be324ed11527e45de722250028d9e"
 dependencies = [
 "RustyXML",
 "azure_core",
@@ -862,7 +872,7 @@ dependencies = [
 [[package]]
 name = "azure_svc_blobstorage"
 version = "0.21.0"
-source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#66e77bdd87bf87e773acf3b0c84b532c1124367d"
+source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#985db729824be324ed11527e45de722250028d9e"
 dependencies = [
 "azure_core",
 "bytes",
@@ -941,6 +951,18 @@ version = "1.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b"

+[[package]]
+name = "bb8"
+version = "0.8.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d89aabfae550a5c44b43ab941844ffcd2e993cb6900b342debf59e9ea74acdb8"
+dependencies = [
+ "async-trait",
+ "futures-util",
+ "parking_lot 0.12.1",
+ "tokio",
+]
+
 [[package]]
 name = "bcder"
 version = "0.7.4"
@@ -966,7 +988,7 @@ version = "0.70.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f49d8fed880d473ea71efb9bf597651e77201bdd4893efe54c9e5d65ae04ce6f"
 dependencies = [
- "bitflags 2.4.1",
+ "bitflags 2.8.0",
 "cexpr",
 "clang-sys",
 "itertools 0.12.1",
@@ -994,9 +1016,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"

 [[package]]
 name = "bitflags"
-version = "2.4.1"
+version = "2.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07"
+checksum = "8f68f53c83ab957f72c32642f3868eec03eb974d1fb82e453128456482613d36"

 [[package]]
 name = "block-buffer"
@@ -1007,6 +1029,12 @@ dependencies = [
 "generic-array",
 ]

+[[package]]
+name = "boxcar"
+version = "0.2.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2721c3c5a6f0e7f7e607125d963fedeb765f545f67adc9d71ed934693881eb42"
+
 [[package]]
 name = "bstr"
 version = "1.5.0"
@@ -1213,6 +1241,20 @@ version = "0.7.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "afb84c814227b90d6895e01398aee0d8033c00e7466aca416fb6a8e0eb19d8a7"

+[[package]]
+name = "clashmap"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "93bd59c81e2bd87a775ae2de75f070f7e2bfe97363a6ad652f46824564c23e4d"
+dependencies = [
+ "crossbeam-utils",
+ "hashbrown 0.15.2",
+ "lock_api",
+ "parking_lot_core 0.9.8",
+ "polonius-the-crab",
+ "replace_with",
+]
+
 [[package]]
 name = "colorchoice"
 version = "1.0.0"
@@ -1391,6 +1433,7 @@ dependencies = [
 "comfy-table",
 "compute_api",
 "futures",
+ "http-utils",
 "humantime",
 "humantime-serde",
 "hyper 0.14.30",
@@ -1549,7 +1592,7 @@ version = "0.27.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f476fe445d41c9e991fd07515a6f463074b782242ccf4a5b7b1d1012e70824df"
 dependencies = [
- "bitflags 2.4.1",
+ "bitflags 2.8.0",
 "crossterm_winapi",
 "libc",
 "parking_lot 0.12.1",
@@ -1780,16 +1823,29 @@ version = "2.2.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ccf1bedf64cdb9643204a36dd15b19a6ce8e7aa7f7b105868e9f1fad5ffa7d12"
 dependencies = [
- "bitflags 2.4.1",
+ "bitflags 2.8.0",
 "byteorder",
 "chrono",
 "diesel_derives",
 "itoa",
- "pq-sys",
- "r2d2",
 "serde_json",
 ]

+[[package]]
+name = "diesel-async"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "51a307ac00f7c23f526a04a77761a0519b9f0eb2838ebf5b905a58580095bdcb"
+dependencies = [
+ "async-trait",
+ "bb8",
+ "diesel",
+ "futures-util",
+ "scoped-futures",
+ "tokio",
+ "tokio-postgres",
+]
+
 [[package]]
 name = "diesel_derives"
 version = "2.2.1"
@@ -2403,6 +2459,16 @@ dependencies = [
 "wasm-bindgen",
 ]

+[[package]]
+name = "gettid"
+version = "0.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "397256552fed4a9e577850498071831ec8f18ea83368aecc114cab469dcb43e5"
+dependencies = [
+ "libc",
+ "winapi",
+]
+
 [[package]]
 name = "gimli"
 version = "0.31.1"
@@ -2531,6 +2597,12 @@ dependencies = [
 "allocator-api2",
 ]

+[[package]]
+name = "hashbrown"
+version = "0.15.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289"
+
 [[package]]
 name = "hashlink"
 version = "0.9.1"
@@ -2581,6 +2653,15 @@ version = "0.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6fe2267d4ed49bc07b63801559be28c718ea06c4738b7a03c94df7386d2cde46"

+[[package]]
+name = "higher-kinded-types"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "561985554c8b8d4808605c90a5f1979cc6c31a5d20b78465cd59501233c6678e"
+dependencies = [
+ "never-say-never",
+]
+
 [[package]]
 name = "hmac"
 version = "0.12.1"
@@ -2677,6 +2758,38 @@ dependencies = [
 "url",
 ]

+[[package]]
+name = "http-utils"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "backtrace",
+ "bytes",
+ "fail",
+ "flate2",
+ "hyper 0.14.30",
+ "inferno 0.12.0",
+ "itertools 0.10.5",
+ "jemalloc_pprof",
+ "metrics",
+ "once_cell",
+ "pprof",
+ "regex",
+ "routerify",
+ "serde",
+ "serde_json",
+ "serde_path_to_error",
+ "thiserror 1.0.69",
+ "tokio",
+ "tokio-stream",
+ "tokio-util",
+ "tracing",
+ "url",
+ "utils",
+ "uuid",
+ "workspace_hack",
+]
+
 [[package]]
 name = "httparse"
 version = "1.8.0"
@@ -3059,11 +3172,11 @@ dependencies = [

 [[package]]
 name = "inotify"
-version = "0.9.6"
+version = "0.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f8069d3ec154eb856955c1c0fbffefbf5f3c40a104ec912d4797314c1801abff"
+checksum = "f37dccff2791ab604f9babef0ba14fbe0be30bd368dc541e2b08d07c8aa908f3"
 dependencies = [
- "bitflags 1.3.2",
+ "bitflags 2.8.0",
 "inotify-sys",
 "libc",
 ]
@@ -3240,9 +3353,9 @@ dependencies = [

 [[package]]
 name = "kqueue"
-version = "1.0.7"
+version = "1.0.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2c8fc60ba15bf51257aa9807a48a61013db043fcf3a78cb0d916e8e396dcad98"
+checksum = "7447f1ca1b7b563588a205fe93dea8df60fd981423a768bc1c0ded35ed147d0c"
 dependencies = [
 "kqueue-sys",
 "libc",
@@ -3250,9 +3363,9 @@ dependencies = [

 [[package]]
 name = "kqueue-sys"
-version = "1.0.3"
+version = "1.0.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8367585489f01bc55dd27404dcf56b95e6da061a256a666ab23be9ba96a2e587"
+checksum = "ed9625ffda8729b85e45cf04090035ac368927b8cebc34898e7c120f52e4838b"
 dependencies = [
 "bitflags 1.3.2",
 "libc",
@@ -3279,9 +3392,9 @@ dependencies = [

 [[package]]
 name = "libc"
-version = "0.2.167"
+version = "0.2.169"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "09d6582e104315a817dff97f75133544b2e094ee22447d2acf4a74e189ba06fc"
+checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a"

 [[package]]
 name = "libloading"
@@ -3528,14 +3641,14 @@ dependencies = [

 [[package]]
 name = "mio"
-version = "0.8.11"
+version = "1.0.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c"
+checksum = "2886843bf800fba2e3377cff24abf6379b4c4d5c6681eaf9ea5b0d15090450bd"
 dependencies = [
 "libc",
 "log",
 "wasi 0.11.0+wasi-snapshot-preview1",
- "windows-sys 0.48.0",
+ "windows-sys 0.52.0",
 ]

 [[package]]
@@ -3544,6 +3657,12 @@ version = "0.8.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a"

+[[package]]
+name = "never-say-never"
+version = "6.6.666"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cf5a574dadd7941adeaa71823ecba5e28331b8313fb2e1c6a5c7e5981ea53ad6"
+
 [[package]]
 name = "nix"
 version = "0.25.1"
@@ -3575,7 +3694,7 @@ version = "0.27.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2eb04e9c688eff1c89d72b407f168cf79bb9e867a9d3323ed6c01519eb9cc053"
 dependencies = [
- "bitflags 2.4.1",
+ "bitflags 2.8.0",
 "cfg-if",
 "libc",
 "memoffset 0.9.0",
@@ -3593,12 +3712,11 @@ dependencies = [

 [[package]]
 name = "notify"
-version = "6.1.1"
+version = "8.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6205bd8bb1e454ad2e27422015fb5e4f2bcc7e08fa8f27058670d208324a4d2d"
+checksum = "2fee8403b3d66ac7b26aee6e40a897d85dc5ce26f44da36b8b73e987cc52e943"
 dependencies = [
- "bitflags 2.4.1",
- "crossbeam-channel",
+ "bitflags 2.8.0",
 "filetime",
 "fsevent-sys",
 "inotify",
@@ -3606,10 +3724,17 @@ dependencies = [
 "libc",
 "log",
 "mio",
+ "notify-types",
 "walkdir",
- "windows-sys 0.48.0",
+ "windows-sys 0.59.0",
 ]

+[[package]]
+name = "notify-types"
+version = "2.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5e0826a989adedc2a244799e823aece04662b66609d96af8dff7ac6df9a8925d"
+
 [[package]]
 name = "ntapi"
 version = "0.4.1"
@@ -4019,6 +4144,7 @@ dependencies = [
 "futures",
 "hex",
 "hex-literal",
+ "http-utils",
 "humantime",
 "humantime-serde",
 "hyper 0.14.30",
@@ -4119,6 +4245,7 @@ dependencies = [
 "anyhow",
 "bytes",
 "futures",
+ "http-utils",
 "pageserver_api",
 "postgres",
 "reqwest",
@@ -4155,6 +4282,16 @@ dependencies = [
 "workspace_hack",
 ]

+[[package]]
+name = "papaya"
+version = "0.1.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dc7c76487f7eaa00a0fc1d7f88dc6b295aec478d11b0fc79f857b62c2874124c"
+dependencies = [
+ "equivalent",
+ "seize",
+]
+
 [[package]]
 name = "parking"
 version = "2.1.1"
@@ -4421,10 +4558,20 @@ dependencies = [
 "plotters-backend",
 ]

+[[package]]
+name = "polonius-the-crab"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e97ca2c89572ae41bbec1c99498251f87dd5a94e500c5ec19c382dd593dd5ce9"
+dependencies = [
+ "higher-kinded-types",
+ "never-say-never",
+]
+
 [[package]]
 name = "postgres"
-version = "0.19.6"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#8b44892f7851e705810b2cb54504325699966070"
+version = "0.19.7"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -4437,9 +4584,9 @@ dependencies = [
 [[package]]
 name = "postgres-protocol"
 version = "0.6.6"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#8b44892f7851e705810b2cb54504325699966070"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9"
 dependencies = [
- "base64 0.21.1",
+ "base64 0.22.1",
 "byteorder",
 "bytes",
 "fallible-iterator",
@@ -4471,7 +4618,7 @@ dependencies = [
 [[package]]
 name = "postgres-types"
 version = "0.2.6"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#8b44892f7851e705810b2cb54504325699966070"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9"
 dependencies = [
 "bytes",
 "chrono",
@@ -4603,15 +4750,6 @@ version = "0.2.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"

-[[package]]
-name = "pq-sys"
-version = "0.6.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f6cc05d7ea95200187117196eee9edd0644424911821aeb28a18ce60ea0b8793"
-dependencies = [
- "vcpkg",
-]
-
 [[package]]
 name = "pq_proto"
 version = "0.1.0"
@@ -4660,7 +4798,7 @@ version = "0.16.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4"
 dependencies = [
- "bitflags 2.4.1",
+ "bitflags 2.8.0",
 "chrono",
 "flate2",
 "hex",
@@ -4675,7 +4813,7 @@ version = "0.16.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29"
 dependencies = [
- "bitflags 2.4.1",
+ "bitflags 2.8.0",
 "chrono",
 "hex",
 ]
@@ -4781,6 +4919,7 @@ dependencies = [
 "ahash",
 "anyhow",
 "arc-swap",
+ "assert-json-diff",
 "async-compression",
 "async-trait",
 "atomic-take",
@@ -4788,15 +4927,16 @@ dependencies = [
 "aws-sdk-iam",
 "aws-sigv4",
 "base64 0.13.1",
+ "boxcar",
 "bstr",
 "bytes",
 "camino",
 "camino-tempfile",
 "chrono",
 "clap",
+ "clashmap",
 "compute_api",
 "consumption_metrics",
- "dashmap 5.5.0",
 "ecdsa 0.16.9",
 "ed25519-dalek",
 "env_logger 0.10.2",
@@ -4804,6 +4944,7 @@ dependencies = [
 "flate2",
 "framed-websockets",
 "futures",
+ "gettid",
 "hashbrown 0.14.5",
 "hashlink",
 "hex",
@@ -4811,6 +4952,7 @@ dependencies = [
 "hostname",
 "http 1.1.0",
 "http-body-util",
+ "http-utils",
 "humantime",
 "humantime-serde",
 "hyper 0.14.30",
@@ -4826,7 +4968,9 @@ dependencies = [
 "measured",
 "metrics",
 "once_cell",
+ "opentelemetry",
 "p256 0.13.2",
+ "papaya",
 "parking_lot 0.12.1",
 "parquet",
 "parquet_derive",
@@ -4873,6 +5017,9 @@ dependencies = [
 "tokio-tungstenite 0.21.0",
 "tokio-util",
 "tracing",
+ "tracing-log",
+ "tracing-opentelemetry",
+ "tracing-serde",
 "tracing-subscriber",
 "tracing-utils",
 "try-lock",
@@ -4924,17 +5071,6 @@ dependencies = [
 "proc-macro2",
 ]

-[[package]]
-name = "r2d2"
-version = "0.8.10"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "51de85fb3fb6524929c8a2eb85e6b6d363de4e8c48f9e2c2eac4944abc181c93"
-dependencies = [
- "log",
- "parking_lot 0.12.1",
- "scheduled-thread-pool",
-]
-
 [[package]]
 name = "rand"
 version = "0.7.3"
@@ -5215,6 +5351,12 @@ dependencies = [
 "utils",
 ]

+[[package]]
+name = "replace_with"
+version = "0.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e3a8614ee435691de62bcffcf4a66d91b3594bf1428a5722e79103249a095690"
+
 [[package]]
 name = "reqwest"
 version = "0.12.4"
@@ -5494,7 +5636,7 @@ version = "0.38.41"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d7f649912bc1495e167a6edee79151c84b1bad49748cb4f1f1167f459f6224f6"
 dependencies = [
- "bitflags 2.4.1",
+ "bitflags 2.8.0",
 "errno",
 "libc",
 "linux-raw-sys 0.4.14",
@@ -5658,6 +5800,7 @@ dependencies = [
 "futures",
 "hex",
 "http 1.1.0",
+ "http-utils",
 "humantime",
 "hyper 0.14.30",
 "itertools 0.10.5",
@@ -5722,6 +5865,7 @@ dependencies = [
 name = "safekeeper_client"
 version = "0.1.0"
 dependencies = [
+ "http-utils",
 "reqwest",
 "safekeeper_api",
 "serde",
@@ -5749,12 +5893,12 @@ dependencies = [
 ]

 [[package]]
-name = "scheduled-thread-pool"
-version = "0.2.7"
+name = "scoped-futures"
+version = "0.1.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3cbc66816425a074528352f5789333ecff06ca41b36b0b0efdfbb29edc391a19"
+checksum = "1b24aae2d0636530f359e9d5ef0c04669d11c5e756699b27a6a6d845d8329091"
 dependencies = [
- "parking_lot 0.12.1",
+ "pin-project-lite",
 ]

 [[package]]
@@ -5831,6 +5975,16 @@ dependencies = [
 "libc",
 ]

+[[package]]
+name = "seize"
+version = "0.4.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d84b0c858bdd30cb56f5597f8b3bf702ec23829e652cc636a1e5a7b9de46ae93"
+dependencies = [
+ "libc",
+ "windows-sys 0.52.0",
+]
+
 [[package]]
 name = "semver"
 version = "1.0.17"
@@ -6289,10 +6443,12 @@ dependencies = [
 "clap",
 "control_plane",
 "diesel",
+ "diesel-async",
 "diesel_migrations",
 "fail",
 "futures",
 "hex",
+ "http-utils",
 "humantime",
 "hyper 0.14.30",
 "itertools 0.10.5",
@@ -6303,10 +6459,12 @@ dependencies = [
 "pageserver_api",
 "pageserver_client",
 "postgres_connection",
- "r2d2",
 "rand 0.8.5",
 "reqwest",
 "routerify",
+ "rustls 0.23.18",
+ "rustls-native-certs 0.8.0",
+ "scoped-futures",
 "scopeguard",
 "serde",
 "serde_json",
@@ -6314,6 +6472,8 @@ dependencies = [
 "strum_macros",
 "thiserror 1.0.69",
 "tokio",
+ "tokio-postgres",
+ "tokio-postgres-rustls",
 "tokio-util",
 "tracing",
 "utils",
@@ -6556,7 +6716,7 @@ dependencies = [
 "fastrand 2.2.0",
 "once_cell",
 "rustix",
- "windows-sys 0.52.0",
+ "windows-sys 0.59.0",
 ]

 [[package]]
@@ -6768,21 +6928,20 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"

 [[package]]
 name = "tokio"
-version = "1.38.1"
+version = "1.43.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eb2caba9f80616f438e09748d5acda951967e1ea58508ef53d9c6402485a46df"
+checksum = "3d61fa4ffa3de412bfea335c6ecff681de2b609ba3c77ef3e00e521813a9ed9e"
 dependencies = [
 "backtrace",
 "bytes",
 "libc",
 "mio",
- "num_cpus",
 "parking_lot 0.12.1",
 "pin-project-lite",
 "signal-hook-registry",
 "socket2",
 "tokio-macros",
- "windows-sys 0.48.0",
+ "windows-sys 0.52.0",
 ]

 [[package]]
@@ -6813,9 +6972,9 @@ dependencies = [

 [[package]]
 name = "tokio-macros"
-version = "2.3.0"
+version = "2.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a"
+checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8"
 dependencies = [
 "proc-macro2",
 "quote",
@@ -6824,8 +6983,8 @@ dependencies = [

 [[package]]
 name = "tokio-postgres"
-version = "0.7.9"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#8b44892f7851e705810b2cb54504325699966070"
+version = "0.7.10"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9"
 dependencies = [
 "async-trait",
 "byteorder",
@@ -7100,7 +7259,7 @@ version = "0.6.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "403fa3b783d4b626a8ad51d766ab03cb6d2dbfc46b1c5d4448395e6628dc9697"
 dependencies = [
- "bitflags 2.4.1",
+ "bitflags 2.8.0",
 "bytes",
 "http 1.1.0",
 "http-body 1.0.0",
@@ -7454,48 +7613,38 @@ dependencies = [
 "criterion",
 "diatomic-waker",
 "fail",
- "flate2",
 "futures",
 "git-version",
 "hex",
 "hex-literal",
 "humantime",
- "hyper 0.14.30",
 "inferno 0.12.0",
- "itertools 0.10.5",
- "jemalloc_pprof",
 "jsonwebtoken",
 "metrics",
 "nix 0.27.1",
 "once_cell",
 "pin-project-lite",
 "postgres_connection",
- "pprof",
 "pq_proto",
 "rand 0.8.5",
 "regex",
- "routerify",
 "scopeguard",
 "sentry",
 "serde",
 "serde_assert",
 "serde_json",
- "serde_path_to_error",
 "serde_with",
 "signal-hook",
 "strum",
 "strum_macros",
 "thiserror 1.0.69",
 "tokio",
- "tokio-stream",
 "tokio-tar",
 "tokio-util",
 "toml_edit",
 "tracing",
 "tracing-error",
 "tracing-subscriber",
- "url",
- "uuid",
 "walkdir",
 ]

@@ -7515,12 +7664,6 @@ version = "0.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d"

-[[package]]
-name = "vcpkg"
-version = "0.2.15"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
-
 [[package]]
 name = "version_check"
 version = "0.9.4"
@@ -7603,9 +7746,9 @@ dependencies = [

 [[package]]
 name = "walkdir"
-version = "2.3.3"
+version = "2.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "36df944cda56c7d8d8b7496af378e6b16de9284591917d307c9b4d313c44e698"
+checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b"
 dependencies = [
 "same-file",
 "winapi-util",
@@ -7857,6 +8000,15 @@ dependencies = [
 "windows-targets 0.52.6",
 ]

+[[package]]
+name = "windows-sys"
+version = "0.59.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
+dependencies = [
+ "windows-targets 0.52.6",
+]
+
 [[package]]
 name = "windows-targets"
 version = "0.48.0"
@@ -8085,7 +8237,9 @@ dependencies = [
 "tower 0.4.13",
 "tracing",
 "tracing-core",
+ "tracing-log",
 "url",
+ "uuid",
 "zerocopy",
 "zeroize",
 "zstd",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -18,6 +18,7 @@ members = [
    "storage_scrubber",
    "workspace_hack",
    "libs/compute_api",
+    "libs/http-utils",
    "libs/pageserver_api",
    "libs/postgres_ffi",
    "libs/safekeeper_api",
@@ -54,6 +55,7 @@ async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
 atomic-take = "1.1.0"
 backtrace = "0.3.74"
 flate2 = "1.0.26"
+assert-json-diff = "2"
 async-stream = "0.3"
 async-trait = "0.1"
 aws-config = { version = "1.5", default-features = false, features=["rustls", "sso"] }
@@ -77,10 +79,10 @@ camino = "1.1.6"
 cfg-if = "1.0.0"
 chrono = { version = "0.4", default-features = false, features = ["clock"] }
 clap = { version = "4.0", features = ["derive", "env"] }
+clashmap = { version = "1.0", features = ["raw-api"] }
 comfy-table = "7.1"
 const_format = "0.2"
 crc32c = "0.6"
-dashmap = { version = "5.5.0", features = ["raw-api"] }
 diatomic-waker = { version = "0.2.3" }
 either = "1.8"
 enum-map = "2.4.2"
@@ -123,7 +125,7 @@ measured = { version = "0.0.22", features=["lasso"] }
 measured-process = { version = "0.0.22" }
 memoffset = "0.9"
 nix = { version = "0.27", features = ["dir", "fs", "process", "socket", "signal", "poll"] }
-notify = "6.0.0"
+notify = "8.0.0"
 num_cpus = "1.15"
 num-traits = "0.2.15"
 once_cell = "1.13"
@@ -177,7 +179,7 @@ test-context = "0.3"
 thiserror = "1.0"
 tikv-jemallocator = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] }
 tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] }
-tokio = { version = "1.17", features = ["macros"] }
+tokio = { version = "1.41", features = ["macros"] }
 tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
 tokio-io-timeout = "1.2.0"
 tokio-postgres-rustls = "0.12.0"
@@ -193,7 +195,9 @@ tower-http = { version = "0.6.2", features = ["request-id", "trace"] }
 tower-service = "0.3.3"
 tracing = "0.1"
 tracing-error = "0.2"
+tracing-log = "0.2"
 tracing-opentelemetry = "0.28"
+tracing-serde = "0.2.0"
 tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
 try-lock = "0.2.5"
 twox-hash = { version = "1.6.3", default-features = false }
@@ -226,6 +230,7 @@ azure_storage_blobs = { git = "https://github.com/neondatabase/azure-sdk-for-rus
 ## Local libraries
 compute_api = { version = "0.1", path = "./libs/compute_api/" }
 consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
+http-utils = { version = "0.1", path = "./libs/http-utils/" }
 metrics = { version = "0.1", path = "./libs/metrics/" }
 pageserver = { path = "./pageserver" }
 pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
--- a/2
+++ b/2
@@ -45,7 +45,7 @@ COPY --chown=nonroot . .

 ARG ADDITIONAL_RUSTFLAGS
 RUN set -e \
-    && PQ_LIB_DIR=$(pwd)/pg_install/v${STABLE_PG_VERSION}/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo build \
+    && RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo build \
      --bin pg_sni_router  \
      --bin pageserver  \
      --bin pagectl  \
--- a/19
+++ b/19
@@ -10,18 +10,29 @@ ICU_PREFIX_DIR := /usr/local/icu
 # environment variable.
 #
 BUILD_TYPE ?= debug
+WITH_SANITIZERS ?= no
 ifeq ($(BUILD_TYPE),release)
 	PG_CONFIGURE_OPTS = --enable-debug --with-openssl
 	PG_CFLAGS = -O2 -g3 $(CFLAGS)
+	PG_LDFLAGS = $(LDFLAGS)
 	# Unfortunately, `--profile=...` is a nightly feature
 	CARGO_BUILD_FLAGS += --release
 else ifeq ($(BUILD_TYPE),debug)
 	PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend
 	PG_CFLAGS = -O0 -g3 $(CFLAGS)
+	PG_LDFLAGS = $(LDFLAGS)
 else
 	$(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
 endif

+ifeq ($(WITH_SANITIZERS),yes)
+	PG_CFLAGS += -fsanitize=address -fsanitize=undefined -fno-sanitize-recover
+	COPT += -Wno-error # to avoid failing on warnings induced by sanitizers
+	PG_LDFLAGS = -fsanitize=address -fsanitize=undefined -static-libasan -static-libubsan $(LDFLAGS)
+	export CC := gcc
+	export ASAN_OPTIONS := detect_leaks=0
+endif
+
 ifeq ($(shell test -e /home/nonroot/.docker_build && echo -n yes),yes)
 	# Exclude static build openssl, icu for local build (MacOS, Linux)
 	# Only keep for build type release and debug
@@ -33,7 +44,9 @@ endif
 UNAME_S := $(shell uname -s)
 ifeq ($(UNAME_S),Linux)
 	# Seccomp BPF is only available for Linux
-	PG_CONFIGURE_OPTS += --with-libseccomp
+	ifneq ($(WITH_SANITIZERS),yes)
+		PG_CONFIGURE_OPTS += --with-libseccomp
+	endif
 else ifeq ($(UNAME_S),Darwin)
 	PG_CFLAGS += -DUSE_PREFETCH
 	ifndef DISABLE_HOMEBREW
@@ -64,8 +77,6 @@ CARGO_BUILD_FLAGS += $(filter -j1,$(MAKEFLAGS))
 CARGO_CMD_PREFIX += $(if $(filter n,$(MAKEFLAGS)),,+)
 # Force cargo not to print progress bar
 CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1
-# Set PQ_LIB_DIR to make sure `storage_controller` get linked with bundled libpq (through diesel)
-CARGO_CMD_PREFIX += PQ_LIB_DIR=$(POSTGRES_INSTALL_DIR)/v16/lib

 CACHEDIR_TAG_CONTENTS := "Signature: 8a477f597d28d172789f06886806bc55"

@@ -108,7 +119,7 @@ $(POSTGRES_INSTALL_DIR)/build/%/config.status:
 	EXTRA_VERSION=$$(cd $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION && git rev-parse HEAD); \
 	(cd $(POSTGRES_INSTALL_DIR)/build/$$VERSION && \
 	env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION/configure \
-		CFLAGS='$(PG_CFLAGS)' \
+		CFLAGS='$(PG_CFLAGS)' LDFLAGS='$(PG_LDFLAGS)' \
 		$(PG_CONFIGURE_OPTS) --with-extra-version=" ($$EXTRA_VERSION)" \
 		--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$$VERSION > configure.log)

--- a/build-tools.Dockerfile
+++ b/build-tools.Dockerfile
@@ -3,10 +3,17 @@ ARG DEBIAN_VERSION=bookworm
 FROM debian:bookworm-slim AS pgcopydb_builder
 ARG DEBIAN_VERSION

+# Use strict mode for bash to catch errors early
+SHELL ["/bin/bash", "-euo", "pipefail", "-c"]
+
+# By default, /bin/sh used in debian images will treat '\n' as eol,
+# but as we use bash as SHELL, and built-in echo in bash requires '-e' flag for that.
 RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
-    echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc \
+    echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc && \
    echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc

+COPY build_tools/patches/pgcopydbv017.patch /pgcopydbv017.patch
+
 RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \
        set -e && \
        apt update && \
@@ -39,6 +46,7 @@ RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \
        mkdir /tmp/pgcopydb && \
        tar -xzf /tmp/pgcopydb.tar.gz -C /tmp/pgcopydb --strip-components=1 && \
        cd /tmp/pgcopydb && \
+        patch -p1 < /pgcopydbv017.patch && \
        make -s clean && \
        make -s -j12 install && \
        libpq_path=$(find /lib /usr/lib -name "libpq.so.5" | head -n 1) && \
@@ -55,7 +63,8 @@ ARG DEBIAN_VERSION

 # Add nonroot user
 RUN useradd -ms /bin/bash nonroot -b /home
-SHELL ["/bin/bash", "-c"]
+# Use strict mode for bash to catch errors early
+SHELL ["/bin/bash", "-euo", "pipefail", "-c"]

 RUN mkdir -p /pgcopydb/bin && \
    mkdir -p /pgcopydb/lib && \
@@ -66,7 +75,7 @@ COPY --from=pgcopydb_builder /usr/lib/postgresql/16/bin/pgcopydb /pgcopydb/bin/p
 COPY --from=pgcopydb_builder /pgcopydb/lib/libpq.so.5 /pgcopydb/lib/libpq.so.5

 RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
-    echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc \
+    echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc && \
    echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc

 # System deps
@@ -190,8 +199,14 @@ RUN set -e \
 # It includes several bug fixes on top on v2.0 release (https://github.com/linux-test-project/lcov/compare/v2.0...master)
 # And patches from us:
 # - Generates json file with code coverage summary (https://github.com/neondatabase/lcov/commit/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz)
-RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JSON::XS Memory::Process Time::HiRes JSON; do yes | perl -MCPAN -e "CPAN::Shell->notest('install', '$package')"; done \
-    && wget https://github.com/neondatabase/lcov/archive/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz -O lcov.tar.gz \
+RUN set +o pipefail && \
+	 for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JSON::XS Memory::Process Time::HiRes JSON; do \
+		yes | perl -MCPAN -e "CPAN::Shell->notest('install', '$package')";\
+	 done && \
+	set -o pipefail
+# Split into separate step to debug flaky failures here
+RUN wget https://github.com/neondatabase/lcov/archive/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz -O lcov.tar.gz \
+    && ls -laht lcov.tar.gz && sha256sum lcov.tar.gz \
    && echo "61a22a62e20908b8b9e27d890bd0ea31f567a7b9668065589266371dcbca0992  lcov.tar.gz" | sha256sum --check \
    && mkdir -p lcov && tar -xzf lcov.tar.gz -C lcov --strip-components=1 \
    && cd lcov \
@@ -253,7 +268,7 @@ WORKDIR /home/nonroot

 # Rust
 # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.84.0
+ENV RUSTC_VERSION=1.84.1
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
 ARG RUSTFILT_VERSION=0.2.1
@@ -261,6 +276,7 @@ ARG CARGO_HAKARI_VERSION=0.9.33
 ARG CARGO_DENY_VERSION=0.16.2
 ARG CARGO_HACK_VERSION=0.6.33
 ARG CARGO_NEXTEST_VERSION=0.9.85
+ARG CARGO_DIESEL_CLI_VERSION=2.2.6
 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
 	chmod +x rustup-init && \
 	./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \
@@ -274,6 +290,8 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
    cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \
    cargo install cargo-hack          --version ${CARGO_HACK_VERSION} && \
    cargo install cargo-nextest       --version ${CARGO_NEXTEST_VERSION} && \
+    cargo install diesel_cli          --version ${CARGO_DIESEL_CLI_VERSION} \
+                                      --features postgres-bundled --no-default-features && \
    rm -rf /home/nonroot/.cargo/registry && \
    rm -rf /home/nonroot/.cargo/git

--- a/build_tools/patches/pgcopydbv017.patch
+++ b/build_tools/patches/pgcopydbv017.patch
@@ -0,0 +1,57 @@
+diff --git a/src/bin/pgcopydb/copydb.c b/src/bin/pgcopydb/copydb.c
+index d730b03..69a9be9 100644
+--- a/src/bin/pgcopydb/copydb.c
+++ b/src/bin/pgcopydb/copydb.c
+@@ -44,6 +44,7 @@ GUC dstSettings[] = {
+ 	{ "synchronous_commit", "'off'" },
+ 	{ "statement_timeout", "0" },
+ 	{ "lock_timeout", "0" },
+	{ "idle_in_transaction_session_timeout", "0" },
+ 	{ NULL, NULL },
+ };
+ 
+diff --git a/src/bin/pgcopydb/pgsql.c b/src/bin/pgcopydb/pgsql.c
+index 94f2f46..e051ba8 100644
+--- a/src/bin/pgcopydb/pgsql.c
+++ b/src/bin/pgcopydb/pgsql.c
+@@ -2319,6 +2319,11 @@ pgsql_execute_log_error(PGSQL *pgsql,
+ 
+ 	LinesBuffer lbuf = { 0 };
+ 
+	if (message != NULL){
+		// make sure message is writable by splitLines
+		message = strdup(message);
+	}
+
+ 	if (!splitLines(&lbuf, message))
+ 	{
+ 		/* errors have already been logged */
+@@ -2332,6 +2337,7 @@ pgsql_execute_log_error(PGSQL *pgsql,
+ 				  PQbackendPID(pgsql->connection),
+ 				  lbuf.lines[lineNumber]);
+ 	}
+        free(message); // free copy of message we created above
+ 
+ 	if (pgsql->logSQL)
+ 	{
+@@ -3174,11 +3180,18 @@ pgcopy_log_error(PGSQL *pgsql, PGresult *res, const char *context)
+ 		/* errors have already been logged */
+ 		return;
+ 	}
+-
+ 	if (res != NULL)
+ 	{
+ 		char *sqlstate = PQresultErrorField(res, PG_DIAG_SQLSTATE);
+-		strlcpy(pgsql->sqlstate, sqlstate, sizeof(pgsql->sqlstate));
+		if (sqlstate == NULL)
+		{
+			// PQresultErrorField returned NULL!
+			pgsql->sqlstate[0] = '\0';  // Set to an empty string to avoid segfault
+		}
+		else
+		{
+			strlcpy(pgsql->sqlstate, sqlstate, sizeof(pgsql->sqlstate));
+		}
+ 	}
+ 
+ 	char *endpoint =
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
--- a/compute/patches/pg_hint_plan_v16.patch
+++ b/compute/patches/pg_hint_plan_v16.patch
@@ -6,16 +6,16 @@ index da723b8..5328114 100644
 ----
 -- No.A-1-1-3
 CREATE EXTENSION pg_hint_plan;
-+LOG:  Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan
+LOG:  Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan
 -- No.A-1-2-3
 DROP EXTENSION pg_hint_plan;
 -- No.A-1-1-4
 CREATE SCHEMA other_schema;
 CREATE EXTENSION pg_hint_plan SCHEMA other_schema;
-+LOG:  Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan
+LOG:  Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan
 ERROR:  extension "pg_hint_plan" must be installed in schema "hint_plan"
 CREATE EXTENSION pg_hint_plan;
-+LOG:  Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan
+LOG:  Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan
 DROP SCHEMA other_schema;
 ----
 ---- No. A-5-1 comment pattern
@@ -35,7 +35,7 @@ index d372459..6282afe 100644
 SET client_min_messages TO LOG;
 SET pg_hint_plan.enable_hint TO on;
 CREATE EXTENSION file_fdw;
-+LOG:  Sending request to compute_ctl: http://localhost:3080/extension_server/file_fdw
+LOG:  Sending request to compute_ctl: http://localhost:3081/extension_server/file_fdw
 CREATE SERVER file_server FOREIGN DATA WRAPPER file_fdw;
 CREATE USER MAPPING FOR PUBLIC SERVER file_server;
 CREATE FOREIGN TABLE ft1 (id int, val int) SERVER file_server OPTIONS (format 'csv', filename :'filename');
--- a/compute/patches/pg_hint_plan_v17.patch
+++ b/compute/patches/pg_hint_plan_v17.patch
@@ -6,16 +6,16 @@ index e7d68a1..65a056c 100644
 ----
 -- No.A-1-1-3
 CREATE EXTENSION pg_hint_plan;
-+LOG:  Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan
+LOG:  Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan
 -- No.A-1-2-3
 DROP EXTENSION pg_hint_plan;
 -- No.A-1-1-4
 CREATE SCHEMA other_schema;
 CREATE EXTENSION pg_hint_plan SCHEMA other_schema;
-+LOG:  Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan
+LOG:  Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan
 ERROR:  extension "pg_hint_plan" must be installed in schema "hint_plan"
 CREATE EXTENSION pg_hint_plan;
-+LOG:  Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan
+LOG:  Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan
 DROP SCHEMA other_schema;
 ----
 ---- No. A-5-1 comment pattern
@@ -168,7 +168,7 @@ index 017fa4b..98d989b 100644
 SET client_min_messages TO LOG;
 SET pg_hint_plan.enable_hint TO on;
 CREATE EXTENSION file_fdw;
-+LOG:  Sending request to compute_ctl: http://localhost:3080/extension_server/file_fdw
+LOG:  Sending request to compute_ctl: http://localhost:3081/extension_server/file_fdw
 CREATE SERVER file_server FOREIGN DATA WRAPPER file_fdw;
 CREATE USER MAPPING FOR PUBLIC SERVER file_server;
 CREATE FOREIGN TABLE ft1 (id int, val int) SERVER file_server OPTIONS (format 'csv', filename :'filename');
--- a/compute/vm-image-spec-bookworm.yaml
+++ b/compute/vm-image-spec-bookworm.yaml
@@ -47,7 +47,9 @@ files:
      # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap
      # and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD),
      # regardless of hostname (ALL)
-      postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota
+      #
+      # Also allow it to shut down the VM. The fast_import job does that when it's finished.
+      postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff
  - filename: cgconfig.conf
    content: |
      # Configuration for cgroups in VM compute nodes
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -34,18 +34,21 @@
 //!             -r http://pg-ext-s3-gateway \
 //! ```
 use std::collections::HashMap;
+use std::ffi::OsString;
 use std::fs::File;
 use std::path::Path;
 use std::process::exit;
 use std::str::FromStr;
 use std::sync::atomic::Ordering;
 use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock};
+use std::time::SystemTime;
 use std::{thread, time::Duration};

 use anyhow::{Context, Result};
 use chrono::Utc;
-use clap::Arg;
+use clap::Parser;
 use compute_tools::disk_quota::set_disk_quota;
+use compute_tools::http::server::Server;
 use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static;
 use signal_hook::consts::{SIGQUIT, SIGTERM};
 use signal_hook::{consts::SIGINT, iterator::Signals};
@@ -60,7 +63,6 @@ use compute_tools::compute::{
 };
 use compute_tools::configurator::launch_configurator;
 use compute_tools::extension_server::get_pg_version_string;
-use compute_tools::http::launch_http_server;
 use compute_tools::logger::*;
 use compute_tools::monitor::launch_monitor;
 use compute_tools::params::*;
@@ -73,10 +75,109 @@ use utils::failpoint_support;
 // in-case of not-set environment var
 const BUILD_TAG_DEFAULT: &str = "latest";

-fn main() -> Result<()> {
-    let scenario = failpoint_support::init();
+// Compatibility hack: if the control plane specified any remote-ext-config
+// use the default value for extension storage proxy gateway.
+// Remove this once the control plane is updated to pass the gateway URL
+fn parse_remote_ext_config(arg: &str) -> Result<String> {
+    if arg.starts_with("http") {
+        Ok(arg.trim_end_matches('/').to_string())
+    } else {
+        Ok("http://pg-ext-s3-gateway".to_string())
+    }
+}

-    let (build_tag, clap_args) = init()?;
+/// Generate a compute ID if one is not supplied. This exists to keep forward
+/// compatibility tests working, but will be removed in a future iteration.
+fn generate_compute_id() -> String {
+    let now = SystemTime::now();
+
+    format!(
+        "compute-{}",
+        now.duration_since(SystemTime::UNIX_EPOCH)
+            .unwrap()
+            .as_secs()
+    )
+}
+
+#[derive(Parser)]
+#[command(rename_all = "kebab-case")]
+struct Cli {
+    #[arg(short = 'b', long, default_value = "postgres", env = "POSTGRES_PATH")]
+    pub pgbin: String,
+
+    #[arg(short = 'r', long, value_parser = parse_remote_ext_config)]
+    pub remote_ext_config: Option<String>,
+
+    /// The port to bind the external listening HTTP server to. Clients running
+    /// outside the compute will talk to the compute through this port. Keep
+    /// the previous name for this argument around for a smoother release
+    /// with the control plane.
+    ///
+    /// TODO: Remove the alias after the control plane release which teaches the
+    /// control plane about the renamed argument.
+    #[arg(long, alias = "http-port", default_value_t = 3080)]
+    pub external_http_port: u16,
+
+    /// The port to bind the internal listening HTTP server to. Clients like
+    /// the neon extension (for installing remote extensions) and local_proxy.
+    #[arg(long)]
+    pub internal_http_port: Option<u16>,
+
+    #[arg(short = 'D', long, value_name = "DATADIR")]
+    pub pgdata: String,
+
+    #[arg(short = 'C', long, value_name = "DATABASE_URL")]
+    pub connstr: String,
+
+    #[cfg(target_os = "linux")]
+    #[arg(long, default_value = "neon-postgres")]
+    pub cgroup: String,
+
+    #[cfg(target_os = "linux")]
+    #[arg(
+        long,
+        default_value = "host=localhost port=5432 dbname=postgres user=cloud_admin sslmode=disable application_name=vm-monitor"
+    )]
+    pub filecache_connstr: String,
+
+    #[cfg(target_os = "linux")]
+    #[arg(long, default_value = "0.0.0.0:10301")]
+    pub vm_monitor_addr: String,
+
+    #[arg(long, action = clap::ArgAction::SetTrue)]
+    pub resize_swap_on_bind: bool,
+
+    #[arg(long)]
+    pub set_disk_quota_for_fs: Option<String>,
+
+    #[arg(short = 's', long = "spec", group = "spec")]
+    pub spec_json: Option<String>,
+
+    #[arg(short = 'S', long, group = "spec-path")]
+    pub spec_path: Option<OsString>,
+
+    #[arg(short = 'i', long, group = "compute-id", default_value = generate_compute_id())]
+    pub compute_id: String,
+
+    #[arg(short = 'p', long, conflicts_with_all = ["spec", "spec-path"], value_name = "CONTROL_PLANE_API_BASE_URL")]
+    pub control_plane_uri: Option<String>,
+}
+
+fn main() -> Result<()> {
+    let cli = Cli::parse();
+
+    // For historical reasons, the main thread that processes the spec and launches postgres
+    // is synchronous, but we always have this tokio runtime available and we "enter" it so
+    // that you can use tokio::spawn() and tokio::runtime::Handle::current().block_on(...)
+    // from all parts of compute_ctl.
+    let runtime = tokio::runtime::Builder::new_multi_thread()
+        .enable_all()
+        .build()?;
+    let _rt_guard = runtime.enter();
+
+    let build_tag = runtime.block_on(init())?;
+
+    let scenario = failpoint_support::init();

    // enable core dumping for all child processes
    setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?;
@@ -85,13 +186,11 @@ fn main() -> Result<()> {
        // Enter startup tracing context
        let _startup_context_guard = startup_context_from_env();

-        let cli_args = process_cli(&clap_args)?;
+        let cli_spec = try_spec_from_cli(&cli)?;

-        let cli_spec = try_spec_from_cli(&clap_args, &cli_args)?;
+        let compute = wait_spec(build_tag, &cli, cli_spec)?;

-        let wait_spec_result = wait_spec(build_tag, cli_args, cli_spec)?;
-
-        start_postgres(&clap_args, wait_spec_result)?
+        start_postgres(&cli, compute)?

        // Startup is finished, exit the startup tracing span
    };
@@ -108,8 +207,8 @@ fn main() -> Result<()> {
    deinit_and_exit(wait_pg_result);
 }

-fn init() -> Result<(String, clap::ArgMatches)> {
-    init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
+async fn init() -> Result<String> {
+    init_tracing_and_logging(DEFAULT_LOG_LEVEL).await?;

    let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
    thread::spawn(move || {
@@ -123,66 +222,7 @@ fn init() -> Result<(String, clap::ArgMatches)> {
        .to_string();
    info!("build_tag: {build_tag}");

-    Ok((build_tag, cli().get_matches()))
-}
-
-fn process_cli(matches: &clap::ArgMatches) -> Result<ProcessCliResult> {
-    let pgbin_default = "postgres";
-    let pgbin = matches
-        .get_one::<String>("pgbin")
-        .map(|s| s.as_str())
-        .unwrap_or(pgbin_default);
-
-    let ext_remote_storage = matches
-        .get_one::<String>("remote-ext-config")
-        // Compatibility hack: if the control plane specified any remote-ext-config
-        // use the default value for extension storage proxy gateway.
-        // Remove this once the control plane is updated to pass the gateway URL
-        .map(|conf| {
-            if conf.starts_with("http") {
-                conf.trim_end_matches('/')
-            } else {
-                "http://pg-ext-s3-gateway"
-            }
-        });
-
-    let http_port = *matches
-        .get_one::<u16>("http-port")
-        .expect("http-port is required");
-    let pgdata = matches
-        .get_one::<String>("pgdata")
-        .expect("PGDATA path is required");
-    let connstr = matches
-        .get_one::<String>("connstr")
-        .expect("Postgres connection string is required");
-    let spec_json = matches.get_one::<String>("spec");
-    let spec_path = matches.get_one::<String>("spec-path");
-    let resize_swap_on_bind = matches.get_flag("resize-swap-on-bind");
-    let set_disk_quota_for_fs = matches.get_one::<String>("set-disk-quota-for-fs");
-
-    Ok(ProcessCliResult {
-        connstr,
-        pgdata,
-        pgbin,
-        ext_remote_storage,
-        http_port,
-        spec_json,
-        spec_path,
-        resize_swap_on_bind,
-        set_disk_quota_for_fs,
-    })
-}
-
-struct ProcessCliResult<'clap> {
-    connstr: &'clap str,
-    pgdata: &'clap str,
-    pgbin: &'clap str,
-    ext_remote_storage: Option<&'clap str>,
-    http_port: u16,
-    spec_json: Option<&'clap String>,
-    spec_path: Option<&'clap String>,
-    resize_swap_on_bind: bool,
-    set_disk_quota_for_fs: Option<&'clap String>,
+    Ok(build_tag)
 }

 fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> {
@@ -235,19 +275,9 @@ fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> {
    }
 }

-fn try_spec_from_cli(
-    matches: &clap::ArgMatches,
-    ProcessCliResult {
-        spec_json,
-        spec_path,
-        ..
-    }: &ProcessCliResult,
-) -> Result<CliSpecParams> {
-    let compute_id = matches.get_one::<String>("compute-id");
-    let control_plane_uri = matches.get_one::<String>("control-plane-uri");
-
+fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
    // First, try to get cluster spec from the cli argument
-    if let Some(spec_json) = spec_json {
+    if let Some(ref spec_json) = cli.spec_json {
        info!("got spec from cli argument {}", spec_json);
        return Ok(CliSpecParams {
            spec: Some(serde_json::from_str(spec_json)?),
@@ -256,7 +286,7 @@ fn try_spec_from_cli(
    }

    // Second, try to read it from the file if path is provided
-    if let Some(spec_path) = spec_path {
+    if let Some(ref spec_path) = cli.spec_path {
        let file = File::open(Path::new(spec_path))?;
        return Ok(CliSpecParams {
            spec: Some(serde_json::from_reader(file)?),
@@ -264,17 +294,11 @@ fn try_spec_from_cli(
        });
    }

-    let Some(compute_id) = compute_id else {
-        panic!(
-            "compute spec should be provided by one of the following ways: \
-                --spec OR --spec-path OR --control-plane-uri and --compute-id"
-        );
-    };
-    let Some(control_plane_uri) = control_plane_uri else {
-        panic!("must specify both --control-plane-uri and --compute-id or none");
+    if cli.control_plane_uri.is_none() {
+        panic!("must specify --control-plane-uri");
    };

-    match get_spec_from_control_plane(control_plane_uri, compute_id) {
+    match get_spec_from_control_plane(cli.control_plane_uri.as_ref().unwrap(), &cli.compute_id) {
        Ok(spec) => Ok(CliSpecParams {
            spec,
            live_config_allowed: true,
@@ -298,21 +322,12 @@ struct CliSpecParams {

 fn wait_spec(
    build_tag: String,
-    ProcessCliResult {
-        connstr,
-        pgdata,
-        pgbin,
-        ext_remote_storage,
-        resize_swap_on_bind,
-        set_disk_quota_for_fs,
-        http_port,
-        ..
-    }: ProcessCliResult,
+    cli: &Cli,
    CliSpecParams {
        spec,
        live_config_allowed,
    }: CliSpecParams,
-) -> Result<WaitSpecResult> {
+) -> Result<Arc<ComputeNode>> {
    let mut new_state = ComputeState::new();
    let spec_set;

@@ -324,23 +339,25 @@ fn wait_spec(
    } else {
        spec_set = false;
    }
-    let connstr = Url::parse(connstr).context("cannot parse connstr as a URL")?;
+    let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?;
    let conn_conf = postgres::config::Config::from_str(connstr.as_str())
        .context("cannot build postgres config from connstr")?;
    let tokio_conn_conf = tokio_postgres::config::Config::from_str(connstr.as_str())
        .context("cannot build tokio postgres config from connstr")?;
    let compute_node = ComputeNode {
+        compute_id: cli.compute_id.clone(),
        connstr,
        conn_conf,
        tokio_conn_conf,
-        pgdata: pgdata.to_string(),
-        pgbin: pgbin.to_string(),
-        pgversion: get_pg_version_string(pgbin),
-        http_port,
+        pgdata: cli.pgdata.clone(),
+        pgbin: cli.pgbin.clone(),
+        pgversion: get_pg_version_string(&cli.pgbin),
+        external_http_port: cli.external_http_port,
+        internal_http_port: cli.internal_http_port.unwrap_or(cli.external_http_port + 1),
        live_config_allowed,
        state: Mutex::new(new_state),
        state_changed: Condvar::new(),
-        ext_remote_storage: ext_remote_storage.map(|s| s.to_string()),
+        ext_remote_storage: cli.remote_ext_config.clone(),
        ext_download_progress: RwLock::new(HashMap::new()),
        build_tag,
    };
@@ -354,10 +371,13 @@ fn wait_spec(
        compute.prewarm_postgres()?;
    }

-    // Launch http service first, so that we can serve control-plane requests
-    // while configuration is still in progress.
-    let _http_handle =
-        launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread");
+    // Launch the external HTTP server first, so that we can serve control plane
+    // requests while configuration is still in progress.
+    Server::External(cli.external_http_port).launch(&compute);
+
+    // The internal HTTP server could be launched later, but there isn't much
+    // sense in waiting.
+    Server::Internal(cli.internal_http_port.unwrap_or(cli.external_http_port + 1)).launch(&compute);

    if !spec_set {
        // No spec provided, hang waiting for it.
@@ -389,27 +409,12 @@ fn wait_spec(

    launch_lsn_lease_bg_task_for_static(&compute);

-    Ok(WaitSpecResult {
-        compute,
-        resize_swap_on_bind,
-        set_disk_quota_for_fs: set_disk_quota_for_fs.cloned(),
-    })
-}
-
-struct WaitSpecResult {
-    compute: Arc<ComputeNode>,
-    resize_swap_on_bind: bool,
-    set_disk_quota_for_fs: Option<String>,
+    Ok(compute)
 }

 fn start_postgres(
-    // need to allow unused because `matches` is only used if target_os = "linux"
-    #[allow(unused_variables)] matches: &clap::ArgMatches,
-    WaitSpecResult {
-        compute,
-        resize_swap_on_bind,
-        set_disk_quota_for_fs,
-    }: WaitSpecResult,
+    cli: &Cli,
+    compute: Arc<ComputeNode>,
 ) -> Result<(Option<PostgresHandle>, StartPostgresResult)> {
    // We got all we need, update the state.
    let mut state = compute.state.lock().unwrap();
@@ -437,7 +442,7 @@ fn start_postgres(
    let mut delay_exit = false;

    // Resize swap to the desired size if the compute spec says so
-    if let (Some(size_bytes), true) = (swap_size_bytes, resize_swap_on_bind) {
+    if let (Some(size_bytes), true) = (swap_size_bytes, cli.resize_swap_on_bind) {
        // To avoid 'swapoff' hitting postgres startup, we need to run resize-swap to completion
        // *before* starting postgres.
        //
@@ -464,9 +469,9 @@ fn start_postgres(

    // Set disk quota if the compute spec says so
    if let (Some(disk_quota_bytes), Some(disk_quota_fs_mountpoint)) =
-        (disk_quota_bytes, set_disk_quota_for_fs)
+        (disk_quota_bytes, cli.set_disk_quota_for_fs.as_ref())
    {
-        match set_disk_quota(disk_quota_bytes, &disk_quota_fs_mountpoint) {
+        match set_disk_quota(disk_quota_bytes, disk_quota_fs_mountpoint) {
            Ok(()) => {
                let size_mib = disk_quota_bytes as f32 / (1 << 20) as f32; // just for more coherent display.
                info!(%disk_quota_bytes, %size_mib, "set disk quota");
@@ -509,27 +514,6 @@ fn start_postgres(
        if #[cfg(target_os = "linux")] {
            use std::env;
            use tokio_util::sync::CancellationToken;
-            let vm_monitor_addr = matches
-                .get_one::<String>("vm-monitor-addr")
-                .expect("--vm-monitor-addr should always be set because it has a default arg");
-            let file_cache_connstr = matches.get_one::<String>("filecache-connstr");
-            let cgroup = matches.get_one::<String>("cgroup");
-
-            // Only make a runtime if we need to.
-            // Note: it seems like you can make a runtime in an inner scope and
-            // if you start a task in it it won't be dropped. However, make it
-            // in the outermost scope just to be safe.
-            let rt = if env::var_os("AUTOSCALING").is_some() {
-                Some(
-                    tokio::runtime::Builder::new_multi_thread()
-                        .worker_threads(4)
-                        .enable_all()
-                        .build()
-                        .expect("failed to create tokio runtime for monitor")
-                )
-            } else {
-                None
-            };

            // This token is used internally by the monitor to clean up all threads
            let token = CancellationToken::new();
@@ -538,19 +522,22 @@ fn start_postgres(
            let pgconnstr = if disable_lfc_resizing.unwrap_or(false) {
                None
            } else {
-                file_cache_connstr.cloned()
+                Some(cli.filecache_connstr.clone())
            };

-            let vm_monitor = rt.as_ref().map(|rt| {
-                rt.spawn(vm_monitor::start(
+            let vm_monitor = if env::var_os("AUTOSCALING").is_some() {
+                let vm_monitor = tokio::spawn(vm_monitor::start(
                    Box::leak(Box::new(vm_monitor::Args {
-                        cgroup: cgroup.cloned(),
+                        cgroup: Some(cli.cgroup.clone()),
                        pgconnstr,
-                        addr: vm_monitor_addr.clone(),
+                        addr: cli.vm_monitor_addr.clone(),
                    })),
                    token.clone(),
-                ))
-            });
+                ));
+                Some(vm_monitor)
+            } else {
+                None
+            };
        }
    }

@@ -560,8 +547,6 @@ fn start_postgres(
            delay_exit,
            compute,
            #[cfg(target_os = "linux")]
-            rt,
-            #[cfg(target_os = "linux")]
            token,
            #[cfg(target_os = "linux")]
            vm_monitor,
@@ -569,15 +554,13 @@ fn start_postgres(
    ))
 }

-type PostgresHandle = (std::process::Child, std::thread::JoinHandle<()>);
+type PostgresHandle = (std::process::Child, tokio::task::JoinHandle<Result<()>>);

 struct StartPostgresResult {
    delay_exit: bool,
    // passed through from WaitSpecResult
    compute: Arc<ComputeNode>,

-    #[cfg(target_os = "linux")]
-    rt: Option<tokio::runtime::Runtime>,
    #[cfg(target_os = "linux")]
    token: tokio_util::sync::CancellationToken,
    #[cfg(target_os = "linux")]
@@ -596,10 +579,10 @@ fn wait_postgres(pg: Option<PostgresHandle>) -> Result<WaitPostgresResult> {
            .expect("failed to start waiting on Postgres process");
        PG_PID.store(0, Ordering::SeqCst);

-        // Process has exited, so we can join the logs thread.
-        let _ = logs_handle
-            .join()
-            .map_err(|e| tracing::error!("log thread panicked: {:?}", e));
+        // Process has exited. Wait for the log collecting task to finish.
+        let _ = tokio::runtime::Handle::current()
+            .block_on(logs_handle)
+            .map_err(|e| tracing::error!("log task panicked: {:?}", e));

        info!("Postgres exited with code {}, shutting down", ecode);
        exit_code = ecode.code()
@@ -620,8 +603,6 @@ fn cleanup_after_postgres_exit(
        vm_monitor,
        #[cfg(target_os = "linux")]
        token,
-        #[cfg(target_os = "linux")]
-        rt,
    }: StartPostgresResult,
 ) -> Result<bool> {
    // Terminate the vm_monitor so it releases the file watcher on
@@ -634,10 +615,6 @@ fn cleanup_after_postgres_exit(
                token.cancel();
                // Kills the actual task running the monitor
                handle.abort();
-
-                // If handle is some, rt must have been used to produce it, and
-                // hence is also some
-                rt.unwrap().shutdown_timeout(Duration::from_secs(2));
            }
        }
    }
@@ -702,105 +679,6 @@ fn deinit_and_exit(WaitPostgresResult { exit_code }: WaitPostgresResult) -> ! {
    exit(exit_code.unwrap_or(1))
 }

-fn cli() -> clap::Command {
-    // Env variable is set by `cargo`
-    let version = option_env!("CARGO_PKG_VERSION").unwrap_or("unknown");
-    clap::Command::new("compute_ctl")
-        .version(version)
-        .arg(
-            Arg::new("http-port")
-                .long("http-port")
-                .value_name("HTTP_PORT")
-                .default_value("3080")
-                .value_parser(clap::value_parser!(u16))
-                .required(false),
-        )
-        .arg(
-            Arg::new("connstr")
-                .short('C')
-                .long("connstr")
-                .value_name("DATABASE_URL")
-                .required(true),
-        )
-        .arg(
-            Arg::new("pgdata")
-                .short('D')
-                .long("pgdata")
-                .value_name("DATADIR")
-                .required(true),
-        )
-        .arg(
-            Arg::new("pgbin")
-                .short('b')
-                .long("pgbin")
-                .default_value("postgres")
-                .value_name("POSTGRES_PATH"),
-        )
-        .arg(
-            Arg::new("spec")
-                .short('s')
-                .long("spec")
-                .value_name("SPEC_JSON"),
-        )
-        .arg(
-            Arg::new("spec-path")
-                .short('S')
-                .long("spec-path")
-                .value_name("SPEC_PATH"),
-        )
-        .arg(
-            Arg::new("compute-id")
-                .short('i')
-                .long("compute-id")
-                .value_name("COMPUTE_ID"),
-        )
-        .arg(
-            Arg::new("control-plane-uri")
-                .short('p')
-                .long("control-plane-uri")
-                .value_name("CONTROL_PLANE_API_BASE_URI"),
-        )
-        .arg(
-            Arg::new("remote-ext-config")
-                .short('r')
-                .long("remote-ext-config")
-                .value_name("REMOTE_EXT_CONFIG"),
-        )
-        // TODO(fprasx): we currently have default arguments because the cloud PR
-        // to pass them in hasn't been merged yet. We should get rid of them once
-        // the PR is merged.
-        .arg(
-            Arg::new("vm-monitor-addr")
-                .long("vm-monitor-addr")
-                .default_value("0.0.0.0:10301")
-                .value_name("VM_MONITOR_ADDR"),
-        )
-        .arg(
-            Arg::new("cgroup")
-                .long("cgroup")
-                .default_value("neon-postgres")
-                .value_name("CGROUP"),
-        )
-        .arg(
-            Arg::new("filecache-connstr")
-                .long("filecache-connstr")
-                .default_value(
-                    "host=localhost port=5432 dbname=postgres user=cloud_admin sslmode=disable application_name=vm-monitor",
-                )
-                .value_name("FILECACHE_CONNSTR"),
-        )
-        .arg(
-            Arg::new("resize-swap-on-bind")
-                .long("resize-swap-on-bind")
-                .action(clap::ArgAction::SetTrue),
-        )
-        .arg(
-            Arg::new("set-disk-quota-for-fs")
-                .long("set-disk-quota-for-fs")
-                .value_name("SET_DISK_QUOTA_FOR_FS")
-        )
-}
-
 /// When compute_ctl is killed, send also termination signal to sync-safekeepers
 /// to prevent leakage. TODO: it is better to convert compute_ctl to async and
 /// wait for termination which would be easy then.
@@ -810,7 +688,14 @@ fn handle_exit_signal(sig: i32) {
    exit(1);
 }

-#[test]
-fn verify_cli() {
-    cli().debug_assert()
+#[cfg(test)]
+mod test {
+    use clap::CommandFactory;
+
+    use super::Cli;
+
+    #[test]
+    fn verify_cli() {
+        Cli::command().debug_assert()
+    }
 }
--- a/compute_tools/src/bin/fast_import.rs
+++ b/compute_tools/src/bin/fast_import.rs
@@ -60,6 +60,16 @@ struct Args {
    pg_lib_dir: Utf8PathBuf,
    #[clap(long)]
    pg_port: Option<u16>, // port to run postgres on, 5432 is default
+
+    /// Number of CPUs in the system. This is used to configure # of
+    /// parallel worker processes, for index creation.
+    #[clap(long, env = "NEON_IMPORTER_NUM_CPUS")]
+    num_cpus: Option<usize>,
+
+    /// Amount of RAM in the system. This is used to configure shared_buffers
+    /// and maintenance_work_mem.
+    #[clap(long, env = "NEON_IMPORTER_MEMORY_MB")]
+    memory_mb: Option<usize>,
 }

 #[serde_with::serde_as]
@@ -202,7 +212,16 @@ pub(crate) async fn main() -> anyhow::Result<()> {
    .await
    .context("initdb")?;

-    let nproc = num_cpus::get();
+    // If the caller didn't specify CPU / RAM to use for sizing, default to
+    // number of CPUs in the system, and pretty arbitrarily, 256 MB of RAM.
+    let nproc = args.num_cpus.unwrap_or_else(num_cpus::get);
+    let memory_mb = args.memory_mb.unwrap_or(256);
+
+    // Somewhat arbitrarily, use 10 % of memory for shared buffer cache, 70% for
+    // maintenance_work_mem (i.e. for sorting during index creation), and leave the rest
+    // available for misc other stuff that PostgreSQL uses memory for.
+    let shared_buffers_mb = ((memory_mb as f32) * 0.10) as usize;
+    let maintenance_work_mem_mb = ((memory_mb as f32) * 0.70) as usize;

    //
    // Launch postgres process
@@ -212,12 +231,15 @@ pub(crate) async fn main() -> anyhow::Result<()> {
        .arg(&pgdata_dir)
        .args(["-p", &format!("{pg_port}")])
        .args(["-c", "wal_level=minimal"])
-        .args(["-c", "shared_buffers=10GB"])
+        .args(["-c", &format!("shared_buffers={shared_buffers_mb}MB")])
        .args(["-c", "max_wal_senders=0"])
        .args(["-c", "fsync=off"])
        .args(["-c", "full_page_writes=off"])
        .args(["-c", "synchronous_commit=off"])
-        .args(["-c", "maintenance_work_mem=8388608"])
+        .args([
+            "-c",
+            &format!("maintenance_work_mem={maintenance_work_mem_mb}MB"),
+        ])
        .args(["-c", &format!("max_parallel_maintenance_workers={nproc}")])
        .args(["-c", &format!("max_parallel_workers={nproc}")])
        .args(["-c", &format!("max_parallel_workers_per_gather={nproc}")])
@@ -231,6 +253,14 @@ pub(crate) async fn main() -> anyhow::Result<()> {
        ])
        .env_clear()
        .env("LD_LIBRARY_PATH", &pg_lib_dir)
+        .env(
+            "ASAN_OPTIONS",
+            std::env::var("ASAN_OPTIONS").unwrap_or_default(),
+        )
+        .env(
+            "UBSAN_OPTIONS",
+            std::env::var("UBSAN_OPTIONS").unwrap_or_default(),
+        )
        .stdout(std::process::Stdio::piped())
        .stderr(std::process::Stdio::piped())
        .spawn()
--- a/compute_tools/src/catalog.rs
+++ b/compute_tools/src/catalog.rs
@@ -140,5 +140,34 @@ pub async fn get_database_schema(
            warn!("pg_dump stderr: {}", line)
        }
    });
-    Ok(initial_stream.chain(stdout_reader.map(|res| res.map(|b| b.freeze()))))
+
+    #[allow(dead_code)]
+    struct SchemaStream<S> {
+        // We keep a reference to the child process to ensure it stays alive
+        // while the stream is being consumed. When SchemaStream is dropped,
+        // cmd will be dropped, which triggers kill_on_drop and terminates pg_dump
+        cmd: tokio::process::Child,
+        stream: S,
+    }
+
+    impl<S> Stream for SchemaStream<S>
+    where
+        S: Stream<Item = Result<bytes::Bytes, std::io::Error>> + Unpin,
+    {
+        type Item = Result<bytes::Bytes, std::io::Error>;
+
+        fn poll_next(
+            mut self: std::pin::Pin<&mut Self>,
+            cx: &mut std::task::Context<'_>,
+        ) -> std::task::Poll<Option<Self::Item>> {
+            Stream::poll_next(std::pin::Pin::new(&mut self.stream), cx)
+        }
+    }
+
+    let schema_stream = SchemaStream {
+        cmd,
+        stream: initial_stream.chain(stdout_reader.map(|res| res.map(|b| b.freeze()))),
+    };
+
+    Ok(schema_stream)
 }
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -9,7 +9,6 @@ use std::str::FromStr;
 use std::sync::atomic::AtomicU32;
 use std::sync::atomic::Ordering;
 use std::sync::{Arc, Condvar, Mutex, RwLock};
-use std::thread;
 use std::time::Duration;
 use std::time::Instant;

@@ -59,6 +58,8 @@ pub static PG_PID: AtomicU32 = AtomicU32::new(0);

 /// Compute node info shared across several `compute_ctl` threads.
 pub struct ComputeNode {
+    /// The ID of the compute
+    pub compute_id: String,
    // Url type maintains proper escaping
    pub connstr: url::Url,
    // We connect to Postgres from many different places, so build configs once
@@ -81,8 +82,10 @@ pub struct ComputeNode {
    /// - we push spec and it does configuration
    /// - but then it is restarted without any spec again
    pub live_config_allowed: bool,
-    /// The port that the compute's HTTP server listens on
-    pub http_port: u16,
+    /// The port that the compute's external HTTP server listens on
+    pub external_http_port: u16,
+    /// The port that the compute's internal HTTP server listens on
+    pub internal_http_port: u16,
    /// Volatile part of the `ComputeNode`, which should be used under `Mutex`.
    /// To allow HTTP API server to serving status requests, while configuration
    /// is in progress, lock should be held only for short periods of time to do
@@ -546,11 +549,7 @@ impl ComputeNode {
    pub fn check_safekeepers_synced(&self, compute_state: &ComputeState) -> Result<Option<Lsn>> {
        let start_time = Utc::now();

-        // Run actual work with new tokio runtime
-        let rt = tokio::runtime::Builder::new_current_thread()
-            .enable_all()
-            .build()
-            .expect("failed to create rt");
+        let rt = tokio::runtime::Handle::current();
        let result = rt.block_on(self.check_safekeepers_synced_async(compute_state));

        // Record runtime
@@ -597,9 +596,9 @@ impl ComputeNode {
        SYNC_SAFEKEEPERS_PID.store(0, Ordering::SeqCst);

        // Process has exited, so we can join the logs thread.
-        let _ = logs_handle
-            .join()
-            .map_err(|e| tracing::error!("log thread panicked: {:?}", e));
+        let _ = tokio::runtime::Handle::current()
+            .block_on(logs_handle)
+            .map_err(|e| tracing::error!("log task panicked: {:?}", e));

        if !sync_output.status.success() {
            anyhow::bail!(
@@ -634,7 +633,7 @@ impl ComputeNode {
        config::write_postgres_conf(
            &pgdata_path.join("postgresql.conf"),
            &pspec.spec,
-            self.http_port,
+            self.internal_http_port,
        )?;

        // Syncing safekeepers is only safe with primary nodes: if a primary
@@ -784,7 +783,7 @@ impl ComputeNode {
    pub fn start_postgres(
        &self,
        storage_auth_token: Option<String>,
-    ) -> Result<(std::process::Child, std::thread::JoinHandle<()>)> {
+    ) -> Result<(std::process::Child, tokio::task::JoinHandle<Result<()>>)> {
        let pgdata_path = Path::new(&self.pgdata);

        // Run postgres as a child process.
@@ -800,7 +799,7 @@ impl ComputeNode {
            .expect("cannot start postgres process");
        PG_PID.store(pg.id(), Ordering::SeqCst);

-        // Start a thread to collect logs from stderr.
+        // Start a task to collect logs from stderr.
        let stderr = pg.stderr.take().expect("stderr should be captured");
        let logs_handle = handle_postgres_logs(stderr);

@@ -809,20 +808,28 @@ impl ComputeNode {
        Ok((pg, logs_handle))
    }

-    /// Do post configuration of the already started Postgres. This function spawns a background thread to
+    /// Do post configuration of the already started Postgres. This function spawns a background task to
    /// configure the database after applying the compute spec. Currently, it upgrades the neon extension
    /// version. In the future, it may upgrade all 3rd-party extensions.
    #[instrument(skip_all)]
    pub fn post_apply_config(&self) -> Result<()> {
-        let conf = self.get_conn_conf(Some("compute_ctl:post_apply_config"));
-        thread::spawn(move || {
-            let func = || {
-                let mut client = conf.connect(NoTls)?;
+        let conf = self.get_tokio_conn_conf(Some("compute_ctl:post_apply_config"));
+        tokio::spawn(async move {
+            let res = async {
+                let (mut client, connection) = conf.connect(NoTls).await?;
+                tokio::spawn(async move {
+                    if let Err(e) = connection.await {
+                        eprintln!("connection error: {}", e);
+                    }
+                });
+
                handle_neon_extension_upgrade(&mut client)
+                    .await
                    .context("handle_neon_extension_upgrade")?;
                Ok::<_, anyhow::Error>(())
-            };
-            if let Err(err) = func() {
+            }
+            .await;
+            if let Err(err) = res {
                error!("error while post_apply_config: {err:#}");
            }
        });
@@ -919,13 +926,10 @@ impl ComputeNode {
        conf: Arc<tokio_postgres::Config>,
        concurrency: usize,
    ) -> Result<()> {
-        let rt = tokio::runtime::Builder::new_multi_thread()
-            .enable_all()
-            .build()?;
-
        info!("Applying config with max {} concurrency", concurrency);
        debug!("Config: {:?}", spec);

+        let rt = tokio::runtime::Handle::current();
        rt.block_on(async {
            // Proceed with post-startup configuration. Note, that order of operations is important.
            let client = Self::get_maintenance_client(&conf).await?;
@@ -1319,14 +1323,18 @@ impl ComputeNode {
        }

        // Run migrations separately to not hold up cold starts
-        thread::spawn(move || {
-            let conf = conf.as_ref().clone();
-            let mut conf = postgres::config::Config::from(conf);
+        tokio::spawn(async move {
+            let mut conf = conf.as_ref().clone();
            conf.application_name("compute_ctl:migrations");

-            match conf.connect(NoTls) {
-                Ok(mut client) => {
-                    if let Err(e) = handle_migrations(&mut client) {
+            match conf.connect(NoTls).await {
+                Ok((mut client, connection)) => {
+                    tokio::spawn(async move {
+                        if let Err(e) = connection.await {
+                            eprintln!("connection error: {}", e);
+                        }
+                    });
+                    if let Err(e) = handle_migrations(&mut client).await {
                        error!("Failed to run migrations: {}", e);
                    }
                }
@@ -1363,16 +1371,11 @@ impl ComputeNode {
        if let Some(ref pgbouncer_settings) = spec.pgbouncer_settings {
            info!("tuning pgbouncer");

-            let rt = tokio::runtime::Builder::new_current_thread()
-                .enable_all()
-                .build()
-                .expect("failed to create rt");
-
-            // Spawn a thread to do the tuning,
+            // Spawn a background task to do the tuning,
            // so that we don't block the main thread that starts Postgres.
            let pgbouncer_settings = pgbouncer_settings.clone();
-            let _handle = thread::spawn(move || {
-                let res = rt.block_on(tune_pgbouncer(pgbouncer_settings));
+            tokio::spawn(async move {
+                let res = tune_pgbouncer(pgbouncer_settings).await;
                if let Err(err) = res {
                    error!("error while tuning pgbouncer: {err:?}");
                }
@@ -1382,20 +1385,20 @@ impl ComputeNode {
        if let Some(ref local_proxy) = spec.local_proxy_config {
            info!("configuring local_proxy");

-            // Spawn a thread to do the configuration,
+            // Spawn a background task to do the configuration,
            // so that we don't block the main thread that starts Postgres.
            let local_proxy = local_proxy.clone();
-            let _handle = Some(thread::spawn(move || {
+            tokio::spawn(async move {
                if let Err(err) = local_proxy::configure(&local_proxy) {
                    error!("error while configuring local_proxy: {err:?}");
                }
-            }));
+            });
        }

        // Write new config
        let pgdata_path = Path::new(&self.pgdata);
        let postgresql_conf_path = pgdata_path.join("postgresql.conf");
-        config::write_postgres_conf(&postgresql_conf_path, &spec, self.http_port)?;
+        config::write_postgres_conf(&postgresql_conf_path, &spec, self.internal_http_port)?;

        let max_concurrent_connections = spec.reconfigure_concurrency;

@@ -1431,7 +1434,9 @@ impl ComputeNode {
    }

    #[instrument(skip_all)]
-    pub fn start_compute(&self) -> Result<(std::process::Child, std::thread::JoinHandle<()>)> {
+    pub fn start_compute(
+        &self,
+    ) -> Result<(std::process::Child, tokio::task::JoinHandle<Result<()>>)> {
        let compute_state = self.state.lock().unwrap().clone();
        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
        info!(
@@ -1446,16 +1451,11 @@ impl ComputeNode {
        if let Some(pgbouncer_settings) = &pspec.spec.pgbouncer_settings {
            info!("tuning pgbouncer");

-            let rt = tokio::runtime::Builder::new_current_thread()
-                .enable_all()
-                .build()
-                .expect("failed to create rt");
-
-            // Spawn a thread to do the tuning,
+            // Spawn a background task to do the tuning,
            // so that we don't block the main thread that starts Postgres.
            let pgbouncer_settings = pgbouncer_settings.clone();
-            let _handle = thread::spawn(move || {
-                let res = rt.block_on(tune_pgbouncer(pgbouncer_settings));
+            let _handle = tokio::spawn(async move {
+                let res = tune_pgbouncer(pgbouncer_settings).await;
                if let Err(err) = res {
                    error!("error while tuning pgbouncer: {err:?}");
                }
@@ -1465,10 +1465,10 @@ impl ComputeNode {
        if let Some(local_proxy) = &pspec.spec.local_proxy_config {
            info!("configuring local_proxy");

-            // Spawn a thread to do the configuration,
+            // Spawn a background task to do the configuration,
            // so that we don't block the main thread that starts Postgres.
            let local_proxy = local_proxy.clone();
-            let _handle = thread::spawn(move || {
+            let _handle = tokio::spawn(async move {
                if let Err(err) = local_proxy::configure(&local_proxy) {
                    error!("error while configuring local_proxy: {err:?}");
                }
@@ -1487,7 +1487,8 @@ impl ComputeNode {
            extension_server::create_control_files(remote_extensions, &self.pgbin);

            let library_load_start_time = Utc::now();
-            let remote_ext_metrics = self.prepare_preload_libraries(&pspec.spec)?;
+            let rt = tokio::runtime::Handle::current();
+            let remote_ext_metrics = rt.block_on(self.prepare_preload_libraries(&pspec.spec))?;

            let library_load_time = Utc::now()
                .signed_duration_since(library_load_start_time)
@@ -1542,7 +1543,7 @@ impl ComputeNode {
            self.post_apply_config()?;

            let conf = self.get_conn_conf(None);
-            thread::spawn(move || {
+            tokio::task::spawn_blocking(|| {
                let res = get_installed_extensions(conf);
                match res {
                    Ok(extensions) => {
@@ -1891,7 +1892,6 @@ LIMIT 100",
        Ok(ext_version)
    }

-    #[tokio::main]
    pub async fn prepare_preload_libraries(
        &self,
        spec: &ComputeSpec,
--- a/compute_tools/src/configurator.rs
+++ b/compute_tools/src/configurator.rs
@@ -51,9 +51,12 @@ fn configurator_main_loop(compute: &Arc<ComputeNode>) {
 pub fn launch_configurator(compute: &Arc<ComputeNode>) -> thread::JoinHandle<()> {
    let compute = Arc::clone(compute);

+    let runtime = tokio::runtime::Handle::current();
+
    thread::Builder::new()
        .name("compute-configurator".into())
        .spawn(move || {
+            let _rt_guard = runtime.enter();
            configurator_main_loop(&compute);
            info!("configurator thread is exited");
        })
--- a/compute_tools/src/http/mod.rs
+++ b/compute_tools/src/http/mod.rs
@@ -4,11 +4,9 @@ use http::{header::CONTENT_TYPE, StatusCode};
 use serde::Serialize;
 use tracing::error;

-pub use server::launch_http_server;
-
 mod extract;
 mod routes;
-mod server;
+pub mod server;

 /// Convenience response builder for JSON responses
 struct JsonResponse;
--- a/compute_tools/src/http/routes/failpoints.rs
+++ b/compute_tools/src/http/routes/failpoints.rs
@@ -1,7 +1,21 @@
 use axum::response::{IntoResponse, Response};
 use http::StatusCode;
+use serde::{Deserialize, Serialize};
 use tracing::info;
-use utils::failpoint_support::{apply_failpoint, ConfigureFailpointsRequest};
+use utils::failpoint_support::apply_failpoint;
+
+pub type ConfigureFailpointsRequest = Vec<FailpointConfig>;
+
+/// Information for configuring a single fail point
+#[derive(Debug, Serialize, Deserialize)]
+pub struct FailpointConfig {
+    /// Name of the fail point
+    pub name: String,
+    /// List of actions to take, using the format described in `fail::cfg`
+    ///
+    /// We also support `actions = "exit"` to cause the fail point to immediately exit.
+    pub actions: String,
+}

 use crate::http::{extract::Json, JsonResponse};

--- a/compute_tools/src/http/server.rs
+++ b/compute_tools/src/http/server.rs
@@ -1,7 +1,7 @@
 use std::{
+    fmt::Display,
    net::{IpAddr, Ipv6Addr, SocketAddr},
    sync::Arc,
-    thread,
    time::Duration,
 };

@@ -26,46 +26,65 @@ use super::routes::{
 };
 use crate::compute::ComputeNode;

-async fn handle_404() -> Response {
-    StatusCode::NOT_FOUND.into_response()
-}
-
 const X_REQUEST_ID: &str = "x-request-id";

-/// This middleware function allows compute_ctl to generate its own request ID
-/// if one isn't supplied. The control plane will always send one as a UUID. The
-/// neon Postgres extension on the other hand does not send one.
-async fn maybe_add_request_id_header(mut request: Request, next: Next) -> Response {
-    let headers = request.headers_mut();
-
-    if headers.get(X_REQUEST_ID).is_none() {
-        headers.append(X_REQUEST_ID, Uuid::new_v4().to_string().parse().unwrap());
-    }
-
-    next.run(request).await
+/// `compute_ctl` has two servers: internal and external. The internal server
+/// binds to the loopback interface and handles communication from clients on
+/// the compute. The external server is what receives communication from the
+/// control plane, the metrics scraper, etc. We make the distinction because
+/// certain routes in `compute_ctl` only need to be exposed to local processes
+/// like Postgres via the neon extension and local_proxy.
+#[derive(Clone, Copy, Debug)]
+pub enum Server {
+    Internal(u16),
+    External(u16),
 }

-/// Run the HTTP server and wait on it forever.
-#[tokio::main]
-async fn serve(port: u16, compute: Arc<ComputeNode>) {
-    let mut app = Router::new()
-        .route("/check_writability", post(check_writability::is_writable))
-        .route("/configure", post(configure::configure))
-        .route("/database_schema", get(database_schema::get_schema_dump))
-        .route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects))
-        .route(
-            "/extension_server/{*filename}",
-            post(extension_server::download_extension),
-        )
-        .route("/extensions", post(extensions::install_extension))
-        .route("/grants", post(grants::add_grant))
-        .route("/insights", get(insights::get_insights))
-        .route("/metrics", get(metrics::get_metrics))
-        .route("/metrics.json", get(metrics_json::get_metrics))
-        .route("/status", get(status::get_status))
-        .route("/terminate", post(terminate::terminate))
-        .fallback(handle_404)
-        .layer(
+impl Display for Server {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Server::Internal(_) => f.write_str("internal"),
+            Server::External(_) => f.write_str("external"),
+        }
+    }
+}
+
+impl From<Server> for Router<Arc<ComputeNode>> {
+    fn from(server: Server) -> Self {
+        let mut router = Router::<Arc<ComputeNode>>::new();
+
+        router = match server {
+            Server::Internal(_) => {
+                router = router
+                    .route(
+                        "/extension_server/{*filename}",
+                        post(extension_server::download_extension),
+                    )
+                    .route("/extensions", post(extensions::install_extension))
+                    .route("/grants", post(grants::add_grant));
+
+                // Add in any testing support
+                if cfg!(feature = "testing") {
+                    use super::routes::failpoints;
+
+                    router = router.route("/failpoints", post(failpoints::configure_failpoints));
+                }
+
+                router
+            }
+            Server::External(_) => router
+                .route("/check_writability", post(check_writability::is_writable))
+                .route("/configure", post(configure::configure))
+                .route("/database_schema", get(database_schema::get_schema_dump))
+                .route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects))
+                .route("/insights", get(insights::get_insights))
+                .route("/metrics", get(metrics::get_metrics))
+                .route("/metrics.json", get(metrics_json::get_metrics))
+                .route("/status", get(status::get_status))
+                .route("/terminate", post(terminate::terminate)),
+        };
+
+        router.fallback(Server::handle_404).method_not_allowed_fallback(Server::handle_405).layer(
            ServiceBuilder::new()
                // Add this middleware since we assume the request ID exists
                .layer(middleware::from_fn(maybe_add_request_id_header))
@@ -105,45 +124,88 @@ async fn serve(port: u16, compute: Arc<ComputeNode>) {
                )
                .layer(PropagateRequestIdLayer::x_request_id()),
        )
-        .with_state(compute);
+    }
+}

-    // Add in any testing support
-    if cfg!(feature = "testing") {
-        use super::routes::failpoints;
-
-        app = app.route("/failpoints", post(failpoints::configure_failpoints))
+impl Server {
+    async fn handle_404() -> impl IntoResponse {
+        StatusCode::NOT_FOUND
    }

-    // This usually binds to both IPv4 and IPv6 on Linux, see
-    // https://github.com/rust-lang/rust/pull/34440 for more information
-    let addr = SocketAddr::new(IpAddr::from(Ipv6Addr::UNSPECIFIED), port);
-    let listener = match TcpListener::bind(&addr).await {
-        Ok(listener) => listener,
-        Err(e) => {
-            error!(
-                "failed to bind the compute_ctl HTTP server to port {}: {}",
-                port, e
-            );
-            return;
+    async fn handle_405() -> impl IntoResponse {
+        StatusCode::METHOD_NOT_ALLOWED
+    }
+
+    async fn listener(&self) -> Result<TcpListener> {
+        let addr = SocketAddr::new(self.ip(), self.port());
+        let listener = TcpListener::bind(&addr).await?;
+
+        Ok(listener)
+    }
+
+    fn ip(&self) -> IpAddr {
+        match self {
+            // TODO: Change this to Ipv6Addr::LOCALHOST when the GitHub runners
+            // allow binding to localhost
+            Server::Internal(_) => IpAddr::from(Ipv6Addr::UNSPECIFIED),
+            Server::External(_) => IpAddr::from(Ipv6Addr::UNSPECIFIED),
        }
-    };
-
-    if let Ok(local_addr) = listener.local_addr() {
-        info!("compute_ctl HTTP server listening on {}", local_addr);
-    } else {
-        info!("compute_ctl HTTP server listening on port {}", port);
    }

-    if let Err(e) = axum::serve(listener, app).await {
-        error!("compute_ctl HTTP server error: {}", e);
+    fn port(self) -> u16 {
+        match self {
+            Server::Internal(port) => port,
+            Server::External(port) => port,
+        }
+    }
+
+    async fn serve(self, compute: Arc<ComputeNode>) {
+        let listener = self.listener().await.unwrap_or_else(|e| {
+            // If we can't bind, the compute cannot operate correctly
+            panic!(
+                "failed to bind the compute_ctl {} HTTP server to {}: {}",
+                self,
+                SocketAddr::new(self.ip(), self.port()),
+                e
+            );
+        });
+
+        if tracing::enabled!(tracing::Level::INFO) {
+            let local_addr = match listener.local_addr() {
+                Ok(local_addr) => local_addr,
+                Err(_) => SocketAddr::new(self.ip(), self.port()),
+            };
+
+            info!(
+                "compute_ctl {} HTTP server listening at {}",
+                self, local_addr
+            );
+        }
+
+        let router = Router::from(self).with_state(compute);
+
+        if let Err(e) = axum::serve(listener, router).await {
+            error!("compute_ctl {} HTTP server error: {}", self, e);
+        }
+    }
+
+    pub fn launch(self, compute: &Arc<ComputeNode>) {
+        let state = Arc::clone(compute);
+
+        info!("Launching the {} server", self);
+
+        tokio::spawn(self.serve(state));
    }
 }

-/// Launch a separate HTTP server thread and return its `JoinHandle`.
-pub fn launch_http_server(port: u16, state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
-    let state = Arc::clone(state);
+/// This middleware function allows compute_ctl to generate its own request ID
+/// if one isn't supplied. The control plane will always send one as a UUID. The
+/// neon Postgres extension on the other hand does not send one.
+async fn maybe_add_request_id_header(mut request: Request, next: Next) -> Response {
+    let headers = request.headers_mut();
+    if headers.get(X_REQUEST_ID).is_none() {
+        headers.append(X_REQUEST_ID, Uuid::new_v4().to_string().parse().unwrap());
+    }

-    Ok(thread::Builder::new()
-        .name("http-server".into())
-        .spawn(move || serve(port, state))?)
+    next.run(request).await
 }
--- a/compute_tools/src/logger.rs
+++ b/compute_tools/src/logger.rs
@@ -11,7 +11,7 @@ use tracing_subscriber::prelude::*;
 /// set `OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4318`. See
 /// `tracing-utils` package description.
 ///
-pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> {
+pub async fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> {
    // Initialize Logging
    let env_filter = tracing_subscriber::EnvFilter::try_from_default_env()
        .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_log_level));
@@ -22,7 +22,7 @@ pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> {
        .with_writer(std::io::stderr);

    // Initialize OpenTelemetry
-    let otlp_layer = tracing_utils::init_tracing_without_runtime("compute_ctl");
+    let otlp_layer = tracing_utils::init_tracing("compute_ctl").await;

    // Put it all together
    tracing_subscriber::registry()
--- a/compute_tools/src/migration.rs
+++ b/compute_tools/src/migration.rs
@@ -1,6 +1,6 @@
 use anyhow::{Context, Result};
 use fail::fail_point;
-use postgres::{Client, Transaction};
+use tokio_postgres::{Client, Transaction};
 use tracing::{error, info};

 use crate::metrics::DB_MIGRATION_FAILED;
@@ -21,10 +21,11 @@ impl<'m> MigrationRunner<'m> {
    }

    /// Get the current value neon_migration.migration_id
-    fn get_migration_id(&mut self) -> Result<i64> {
+    async fn get_migration_id(&mut self) -> Result<i64> {
        let row = self
            .client
-            .query_one("SELECT id FROM neon_migration.migration_id", &[])?;
+            .query_one("SELECT id FROM neon_migration.migration_id", &[])
+            .await?;

        Ok(row.get::<&str, i64>("id"))
    }
@@ -34,7 +35,7 @@ impl<'m> MigrationRunner<'m> {
    /// This function has a fail point called compute-migration, which can be
    /// used if you would like to fail the application of a series of migrations
    /// at some point.
-    fn update_migration_id(txn: &mut Transaction, migration_id: i64) -> Result<()> {
+    async fn update_migration_id(txn: &mut Transaction<'_>, migration_id: i64) -> Result<()> {
        // We use this fail point in order to check that failing in the
        // middle of applying a series of migrations fails in an expected
        // manner
@@ -59,31 +60,38 @@ impl<'m> MigrationRunner<'m> {
            "UPDATE neon_migration.migration_id SET id = $1",
            &[&migration_id],
        )
+        .await
        .with_context(|| format!("update neon_migration.migration_id to {migration_id}"))?;

        Ok(())
    }

    /// Prepare the migrations the target database for handling migrations
-    fn prepare_database(&mut self) -> Result<()> {
+    async fn prepare_database(&mut self) -> Result<()> {
        self.client
-            .simple_query("CREATE SCHEMA IF NOT EXISTS neon_migration")?;
-        self.client.simple_query("CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)")?;
-        self.client.simple_query(
-            "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING",
-        )?;
+            .simple_query("CREATE SCHEMA IF NOT EXISTS neon_migration")
+            .await?;
+        self.client.simple_query("CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)").await?;
        self.client
-            .simple_query("ALTER SCHEMA neon_migration OWNER TO cloud_admin")?;
+            .simple_query(
+                "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING",
+            )
+            .await?;
        self.client
-            .simple_query("REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC")?;
+            .simple_query("ALTER SCHEMA neon_migration OWNER TO cloud_admin")
+            .await?;
+        self.client
+            .simple_query("REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC")
+            .await?;

        Ok(())
    }

    /// Run an individual migration in a separate transaction block.
-    fn run_migration(client: &mut Client, migration_id: i64, migration: &str) -> Result<()> {
+    async fn run_migration(client: &mut Client, migration_id: i64, migration: &str) -> Result<()> {
        let mut txn = client
            .transaction()
+            .await
            .with_context(|| format!("begin transaction for migration {migration_id}"))?;

        if migration.starts_with("-- SKIP") {
@@ -92,35 +100,38 @@ impl<'m> MigrationRunner<'m> {
            // Even though we are skipping the migration, updating the
            // migration ID should help keep logic easy to understand when
            // trying to understand the state of a cluster.
-            Self::update_migration_id(&mut txn, migration_id)?;
+            Self::update_migration_id(&mut txn, migration_id).await?;
        } else {
            info!("Running migration id={}:\n{}\n", migration_id, migration);

            txn.simple_query(migration)
+                .await
                .with_context(|| format!("apply migration {migration_id}"))?;

-            Self::update_migration_id(&mut txn, migration_id)?;
+            Self::update_migration_id(&mut txn, migration_id).await?;
        }

        txn.commit()
+            .await
            .with_context(|| format!("commit transaction for migration {migration_id}"))?;

        Ok(())
    }

    /// Run the configured set of migrations
-    pub fn run_migrations(mut self) -> Result<()> {
+    pub async fn run_migrations(mut self) -> Result<()> {
        self.prepare_database()
+            .await
            .context("prepare database to handle migrations")?;

-        let mut current_migration = self.get_migration_id()? as usize;
+        let mut current_migration = self.get_migration_id().await? as usize;
        while current_migration < self.migrations.len() {
            // The index lags the migration ID by 1, so the current migration
            // ID is also the next index
            let migration_id = (current_migration + 1) as i64;
            let migration = self.migrations[current_migration];

-            match Self::run_migration(self.client, migration_id, migration) {
+            match Self::run_migration(self.client, migration_id, migration).await {
                Ok(_) => {
                    info!("Finished migration id={}", migration_id);
                }
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -7,7 +7,6 @@ use std::os::unix::fs::PermissionsExt;
 use std::path::Path;
 use std::process::Child;
 use std::str::FromStr;
-use std::thread::JoinHandle;
 use std::time::{Duration, Instant};

 use anyhow::{bail, Result};
@@ -16,6 +15,7 @@ use ini::Ini;
 use notify::{RecursiveMode, Watcher};
 use postgres::config::Config;
 use tokio::io::AsyncBufReadExt;
+use tokio::task::JoinHandle;
 use tokio::time::timeout;
 use tokio_postgres;
 use tokio_postgres::NoTls;
@@ -477,23 +477,13 @@ pub async fn tune_pgbouncer(pgbouncer_config: HashMap<String, String>) -> Result
    Ok(())
 }

-/// Spawn a thread that will read Postgres logs from `stderr`, join multiline logs
+/// Spawn a task that will read Postgres logs from `stderr`, join multiline logs
 /// and send them to the logger. In the future we may also want to add context to
 /// these logs.
-pub fn handle_postgres_logs(stderr: std::process::ChildStderr) -> JoinHandle<()> {
-    std::thread::spawn(move || {
-        let runtime = tokio::runtime::Builder::new_current_thread()
-            .enable_all()
-            .build()
-            .expect("failed to build tokio runtime");
-
-        let res = runtime.block_on(async move {
-            let stderr = tokio::process::ChildStderr::from_std(stderr)?;
-            handle_postgres_logs_async(stderr).await
-        });
-        if let Err(e) = res {
-            tracing::error!("error while processing postgres logs: {}", e);
-        }
+pub fn handle_postgres_logs(stderr: std::process::ChildStderr) -> JoinHandle<Result<()>> {
+    tokio::spawn(async move {
+        let stderr = tokio::process::ChildStderr::from_std(stderr)?;
+        handle_postgres_logs_async(stderr).await
    })
 }

--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -1,8 +1,8 @@
 use anyhow::{anyhow, bail, Result};
-use postgres::Client;
 use reqwest::StatusCode;
 use std::fs::File;
 use std::path::Path;
+use tokio_postgres::Client;
 use tracing::{error, info, instrument, warn};

 use crate::config;
@@ -166,17 +166,17 @@ pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> {
 }

 #[instrument(skip_all)]
-pub fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> {
+pub async fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> {
    info!("handle neon extension upgrade");
    let query = "ALTER EXTENSION neon UPDATE";
    info!("update neon extension version with query: {}", query);
-    client.simple_query(query)?;
+    client.simple_query(query).await?;

    Ok(())
 }

 #[instrument(skip_all)]
-pub fn handle_migrations(client: &mut Client) -> Result<()> {
+pub async fn handle_migrations(client: &mut Client) -> Result<()> {
    info!("handle migrations");

    // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
@@ -206,7 +206,9 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> {
        ),
    ];

-    MigrationRunner::new(client, &migrations).run_migrations()?;
+    MigrationRunner::new(client, &migrations)
+        .run_migrations()
+        .await?;

    Ok(())
 }
@@ -214,7 +216,7 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> {
 /// Connect to the database as superuser and pre-create anon extension
 /// if it is present in shared_preload_libraries
 #[instrument(skip_all)]
-pub fn handle_extension_anon(
+pub async fn handle_extension_anon(
    spec: &ComputeSpec,
    db_owner: &str,
    db_client: &mut Client,
@@ -227,7 +229,7 @@ pub fn handle_extension_anon(
            if !grants_only {
                // check if extension is already initialized using anon.is_initialized()
                let query = "SELECT anon.is_initialized()";
-                match db_client.query(query, &[]) {
+                match db_client.query(query, &[]).await {
                    Ok(rows) => {
                        if !rows.is_empty() {
                            let is_initialized: bool = rows[0].get(0);
@@ -249,7 +251,7 @@ pub fn handle_extension_anon(
                // Users cannot create it themselves, because superuser is required.
                let mut query = "CREATE EXTENSION IF NOT EXISTS anon CASCADE";
                info!("creating anon extension with query: {}", query);
-                match db_client.query(query, &[]) {
+                match db_client.query(query, &[]).await {
                    Ok(_) => {}
                    Err(e) => {
                        error!("anon extension creation failed with error: {}", e);
@@ -259,7 +261,7 @@ pub fn handle_extension_anon(

                // check that extension is installed
                query = "SELECT extname FROM pg_extension WHERE extname = 'anon'";
-                let rows = db_client.query(query, &[])?;
+                let rows = db_client.query(query, &[]).await?;
                if rows.is_empty() {
                    error!("anon extension is not installed");
                    return Ok(());
@@ -268,7 +270,7 @@ pub fn handle_extension_anon(
                // Initialize anon extension
                // This also requires superuser privileges, so users cannot do it themselves.
                query = "SELECT anon.init()";
-                match db_client.query(query, &[]) {
+                match db_client.query(query, &[]).await {
                    Ok(_) => {}
                    Err(e) => {
                        error!("anon.init() failed with error: {}", e);
@@ -279,7 +281,7 @@ pub fn handle_extension_anon(

            // check that extension is installed, if not bail early
            let query = "SELECT extname FROM pg_extension WHERE extname = 'anon'";
-            match db_client.query(query, &[]) {
+            match db_client.query(query, &[]).await {
                Ok(rows) => {
                    if rows.is_empty() {
                        error!("anon extension is not installed");
@@ -294,12 +296,12 @@ pub fn handle_extension_anon(

            let query = format!("GRANT ALL ON SCHEMA anon TO {}", db_owner);
            info!("granting anon extension permissions with query: {}", query);
-            db_client.simple_query(&query)?;
+            db_client.simple_query(&query).await?;

            // Grant permissions to db_owner to use anon extension functions
            let query = format!("GRANT ALL ON ALL FUNCTIONS IN SCHEMA anon TO {}", db_owner);
            info!("granting anon extension permissions with query: {}", query);
-            db_client.simple_query(&query)?;
+            db_client.simple_query(&query).await?;

            // This is needed, because some functions are defined as SECURITY DEFINER.
            // In Postgres SECURITY DEFINER functions are executed with the privileges
@@ -314,16 +316,16 @@ pub fn handle_extension_anon(
                where nsp.nspname = 'anon';", db_owner);

            info!("change anon extension functions owner to db owner");
-            db_client.simple_query(&query)?;
+            db_client.simple_query(&query).await?;

            //  affects views as well
            let query = format!("GRANT ALL ON ALL TABLES IN SCHEMA anon TO {}", db_owner);
            info!("granting anon extension permissions with query: {}", query);
-            db_client.simple_query(&query)?;
+            db_client.simple_query(&query).await?;

            let query = format!("GRANT ALL ON ALL SEQUENCES IN SCHEMA anon TO {}", db_owner);
            info!("granting anon extension permissions with query: {}", query);
-            db_client.simple_query(&query)?;
+            db_client.simple_query(&query).await?;
        }
    }

--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -33,6 +33,7 @@ postgres_backend.workspace = true
 safekeeper_api.workspace = true
 postgres_connection.workspace = true
 storage_broker.workspace = true
+http-utils.workspace = true
 utils.workspace = true
 whoami.workspace = true

--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -261,7 +261,13 @@ fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
    let mut filled_cmd = cmd.env_clear().env("RUST_BACKTRACE", backtrace_setting);

    // Pass through these environment variables to the command
-    for var in ["LLVM_PROFILE_FILE", "FAILPOINTS", "RUST_LOG"] {
+    for var in [
+        "LLVM_PROFILE_FILE",
+        "FAILPOINTS",
+        "RUST_LOG",
+        "ASAN_OPTIONS",
+        "UBSAN_OPTIONS",
+    ] {
        if let Some(val) = std::env::var_os(var) {
            filled_cmd = filled_cmd.env(var, val);
        }
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -552,8 +552,10 @@ struct EndpointCreateCmdArgs {
    lsn: Option<Lsn>,
    #[clap(long)]
    pg_port: Option<u16>,
+    #[clap(long, alias = "http-port")]
+    external_http_port: Option<u16>,
    #[clap(long)]
-    http_port: Option<u16>,
+    internal_http_port: Option<u16>,
    #[clap(long = "pageserver-id")]
    endpoint_pageserver_id: Option<NodeId>,

@@ -1353,7 +1355,8 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
                tenant_id,
                timeline_id,
                args.pg_port,
-                args.http_port,
+                args.external_http_port,
+                args.internal_http_port,
                args.pg_version,
                mode,
                !args.update_catalog,
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -37,6 +37,8 @@
 //! ```
 //!
 use std::collections::BTreeMap;
+use std::net::IpAddr;
+use std::net::Ipv4Addr;
 use std::net::SocketAddr;
 use std::net::TcpStream;
 use std::path::PathBuf;
@@ -73,7 +75,8 @@ pub struct EndpointConf {
    timeline_id: TimelineId,
    mode: ComputeMode,
    pg_port: u16,
-    http_port: u16,
+    external_http_port: u16,
+    internal_http_port: u16,
    pg_version: u32,
    skip_pg_catalog_updates: bool,
    drop_subscriptions_before_start: bool,
@@ -128,7 +131,7 @@ impl ComputeControlPlane {
        1 + self
            .endpoints
            .values()
-            .map(|ep| std::cmp::max(ep.pg_address.port(), ep.http_address.port()))
+            .map(|ep| std::cmp::max(ep.pg_address.port(), ep.external_http_address.port()))
            .max()
            .unwrap_or(self.base_port)
    }
@@ -140,18 +143,27 @@ impl ComputeControlPlane {
        tenant_id: TenantId,
        timeline_id: TimelineId,
        pg_port: Option<u16>,
-        http_port: Option<u16>,
+        external_http_port: Option<u16>,
+        internal_http_port: Option<u16>,
        pg_version: u32,
        mode: ComputeMode,
        skip_pg_catalog_updates: bool,
        drop_subscriptions_before_start: bool,
    ) -> Result<Arc<Endpoint>> {
        let pg_port = pg_port.unwrap_or_else(|| self.get_port());
-        let http_port = http_port.unwrap_or_else(|| self.get_port() + 1);
+        let external_http_port = external_http_port.unwrap_or_else(|| self.get_port() + 1);
+        let internal_http_port = internal_http_port.unwrap_or_else(|| external_http_port + 1);
        let ep = Arc::new(Endpoint {
            endpoint_id: endpoint_id.to_owned(),
-            pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), pg_port),
-            http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), http_port),
+            pg_address: SocketAddr::new(IpAddr::from(Ipv4Addr::LOCALHOST), pg_port),
+            external_http_address: SocketAddr::new(
+                IpAddr::from(Ipv4Addr::UNSPECIFIED),
+                external_http_port,
+            ),
+            internal_http_address: SocketAddr::new(
+                IpAddr::from(Ipv4Addr::LOCALHOST),
+                internal_http_port,
+            ),
            env: self.env.clone(),
            timeline_id,
            mode,
@@ -176,7 +188,8 @@ impl ComputeControlPlane {
                tenant_id,
                timeline_id,
                mode,
-                http_port,
+                external_http_port,
+                internal_http_port,
                pg_port,
                pg_version,
                skip_pg_catalog_updates,
@@ -230,9 +243,10 @@ pub struct Endpoint {
    pub timeline_id: TimelineId,
    pub mode: ComputeMode,

-    // port and address of the Postgres server and `compute_ctl`'s HTTP API
+    // port and address of the Postgres server and `compute_ctl`'s HTTP APIs
    pub pg_address: SocketAddr,
-    pub http_address: SocketAddr,
+    pub external_http_address: SocketAddr,
+    pub internal_http_address: SocketAddr,

    // postgres major version in the format: 14, 15, etc.
    pg_version: u32,
@@ -287,8 +301,15 @@ impl Endpoint {
            serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?;

        Ok(Endpoint {
-            pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.pg_port),
-            http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.http_port),
+            pg_address: SocketAddr::new(IpAddr::from(Ipv4Addr::LOCALHOST), conf.pg_port),
+            external_http_address: SocketAddr::new(
+                IpAddr::from(Ipv4Addr::UNSPECIFIED),
+                conf.external_http_port,
+            ),
+            internal_http_address: SocketAddr::new(
+                IpAddr::from(Ipv4Addr::LOCALHOST),
+                conf.internal_http_port,
+            ),
            endpoint_id,
            env: env.clone(),
            timeline_id: conf.timeline_id,
@@ -650,24 +671,51 @@ impl Endpoint {
            println!("Also at '{}'", conn_str);
        }
        let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl"));
-        cmd.args(["--http-port", &self.http_address.port().to_string()])
-            .args(["--pgdata", self.pgdata().to_str().unwrap()])
-            .args(["--connstr", &conn_str])
-            .args([
-                "--spec-path",
-                self.endpoint_path().join("spec.json").to_str().unwrap(),
-            ])
-            .args([
-                "--pgbin",
-                self.env
-                    .pg_bin_dir(self.pg_version)?
-                    .join("postgres")
-                    .to_str()
-                    .unwrap(),
-            ])
-            .stdin(std::process::Stdio::null())
-            .stderr(logfile.try_clone()?)
-            .stdout(logfile);
+        //cmd.args([
+        //    "--external-http-port",
+        //    &self.external_http_address.port().to_string(),
+        //])
+        //.args([
+        //    "--internal-http-port",
+        //    &self.internal_http_address.port().to_string(),
+        //])
+        cmd.args([
+            "--http-port",
+            &self.external_http_address.port().to_string(),
+        ])
+        .args(["--pgdata", self.pgdata().to_str().unwrap()])
+        .args(["--connstr", &conn_str])
+        .args([
+            "--spec-path",
+            self.endpoint_path().join("spec.json").to_str().unwrap(),
+        ])
+        .args([
+            "--pgbin",
+            self.env
+                .pg_bin_dir(self.pg_version)?
+                .join("postgres")
+                .to_str()
+                .unwrap(),
+        ])
+        // TODO: It would be nice if we generated compute IDs with the same
+        // algorithm as the real control plane.
+        //
+        // TODO: Add this back when
+        // https://github.com/neondatabase/neon/pull/10747 is merged.
+        //
+        //.args([
+        //    "--compute-id",
+        //    &format!(
+        //        "compute-{}",
+        //        SystemTime::now()
+        //            .duration_since(UNIX_EPOCH)
+        //            .unwrap()
+        //            .as_secs()
+        //    ),
+        //])
+        .stdin(std::process::Stdio::null())
+        .stderr(logfile.try_clone()?)
+        .stdout(logfile);

        if let Some(remote_ext_config) = remote_ext_config {
            cmd.args(["--remote-ext-config", remote_ext_config]);
@@ -754,8 +802,8 @@ impl Endpoint {
                reqwest::Method::GET,
                format!(
                    "http://{}:{}/status",
-                    self.http_address.ip(),
-                    self.http_address.port()
+                    self.external_http_address.ip(),
+                    self.external_http_address.port()
                ),
            )
            .send()
@@ -828,8 +876,8 @@ impl Endpoint {
        let response = client
            .post(format!(
                "http://{}:{}/configure",
-                self.http_address.ip(),
-                self.http_address.port()
+                self.external_http_address.ip(),
+                self.external_http_address.port()
            ))
            .header(CONTENT_TYPE.as_str(), "application/json")
            .body(format!(
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -357,6 +357,11 @@ impl PageServerNode {
                .map(serde_json::from_str)
                .transpose()
                .context("Failed to parse 'compaction_algorithm' json")?,
+            compaction_l0_first: settings
+                .remove("compaction_l0_first")
+                .map(|x| x.parse::<bool>())
+                .transpose()
+                .context("Failed to parse 'compaction_l0_first' as a bool")?,
            l0_flush_delay_threshold: settings
                .remove("l0_flush_delay_threshold")
                .map(|x| x.parse::<usize>())
@@ -388,6 +393,11 @@ impl PageServerNode {
                .map(|x| x.parse::<u8>())
                .transpose()
                .context("Failed to parse 'image_creation_check_threshold' as integer")?,
+            image_creation_preempt_threshold: settings
+                .remove("image_creation_preempt_threshold")
+                .map(|x| x.parse::<usize>())
+                .transpose()
+                .context("Failed to parse 'image_creation_preempt_threshold' as integer")?,
            pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
            walreceiver_connect_timeout: settings
                .remove("walreceiver_connect_timeout")
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -17,8 +17,10 @@ use camino::Utf8PathBuf;
 use postgres_connection::PgConnectionConfig;
 use reqwest::{IntoUrl, Method};
 use thiserror::Error;
+
+use http_utils::error::HttpErrorBody;
 use utils::auth::{Claims, Scope};
-use utils::{http::error::HttpErrorBody, id::NodeId};
+use utils::id::NodeId;

 use crate::{
    background_process,
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -221,7 +221,17 @@ impl StorageController {
            "-p",
            &format!("{}", postgres_port),
        ];
-        let exitcode = Command::new(bin_path).args(args).spawn()?.wait().await?;
+        let pg_lib_dir = self.get_pg_lib_dir().await.unwrap();
+        let envs = [
+            ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
+            ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
+        ];
+        let exitcode = Command::new(bin_path)
+            .args(args)
+            .envs(envs)
+            .spawn()?
+            .wait()
+            .await?;

        Ok(exitcode.success())
    }
@@ -242,6 +252,11 @@ impl StorageController {

        let pg_bin_dir = self.get_pg_bin_dir().await?;
        let createdb_path = pg_bin_dir.join("createdb");
+        let pg_lib_dir = self.get_pg_lib_dir().await.unwrap();
+        let envs = [
+            ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
+            ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
+        ];
        let output = Command::new(&createdb_path)
            .args([
                "-h",
@@ -254,6 +269,7 @@ impl StorageController {
                &username(),
                DB_NAME,
            ])
+            .envs(envs)
            .output()
            .await
            .expect("Failed to spawn createdb");
--- a/deny.toml
+++ b/deny.toml
@@ -32,6 +32,7 @@ reason = "the marvin attack only affects private key decryption, not public key
 # https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html
 [licenses]
 allow = [
+    "0BSD",
    "Apache-2.0",
    "BSD-2-Clause",
    "BSD-3-Clause",
--- a/docker-compose/docker_compose_test.sh
+++ b/docker-compose/docker_compose_test.sh
@@ -52,6 +52,7 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do

    if [ $pg_version -ge 16 ]; then
        docker cp ext-src $TEST_CONTAINER_NAME:/
+        docker exec $TEST_CONTAINER_NAME bash -c "apt update && apt install -y libtap-parser-sourcehandler-pgtap-perl"
        # This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail
        # It cannot be moved to Dockerfile now because the database directory is created after the start of the container
        echo Adding dummy config
--- a/docker-compose/ext-src/pgjwt-src/neon-test.sh
+++ b/docker-compose/ext-src/pgjwt-src/neon-test.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+set -ex
+cd "$(dirname "${0}")"
+pg_prove test.sql
--- a/docker-compose/ext-src/pgjwt-src/test-upgrade.patch
+++ b/docker-compose/ext-src/pgjwt-src/test-upgrade.patch
@@ -0,0 +1,15 @@
+diff --git a/test.sql b/test.sql
+index d7a0ca8..f15bc76 100644
+--- a/test.sql
+++ b/test.sql
+@@ -9,9 +9,7 @@
+ \set ON_ERROR_STOP true
+ \set QUIET 1
+ 
+-CREATE EXTENSION pgcrypto;
+-CREATE EXTENSION pgtap;
+-CREATE EXTENSION pgjwt;
+CREATE EXTENSION IF NOT EXISTS pgtap;
+ 
+ BEGIN;
+ SELECT plan(23);
--- a/docker-compose/ext-src/pgjwt-src/test-upgrade.sh
+++ b/docker-compose/ext-src/pgjwt-src/test-upgrade.sh
@@ -0,0 +1,5 @@
+#!/bin/sh
+set -ex
+cd "$(dirname ${0})"
+patch -p1 <test-upgrade.patch
+pg_prove -d contrib_regression test.sql
--- a/docker-compose/test_extensions_upgrade.sh
+++ b/docker-compose/test_extensions_upgrade.sh
@@ -24,7 +24,7 @@ function wait_for_ready {
 }
 function create_extensions() {
  for ext in ${1}; do
-    docker compose exec neon-test-extensions psql -X -v ON_ERROR_STOP=1 -d contrib_regression -c "CREATE EXTENSION IF NOT EXISTS ${ext}"
+    docker compose exec neon-test-extensions psql -X -v ON_ERROR_STOP=1 -d contrib_regression -c "CREATE EXTENSION IF NOT EXISTS ${ext} CASCADE"
  done
 }
 EXTENSIONS='[
@@ -40,7 +40,8 @@ EXTENSIONS='[
 {"extname": "pg_uuidv7", "extdir": "pg_uuidv7-src"},
 {"extname": "roaringbitmap", "extdir": "pg_roaringbitmap-src"},
 {"extname": "semver", "extdir": "pg_semver-src"},
-{"extname": "pg_ivm", "extdir": "pg_ivm-src"}
+{"extname": "pg_ivm", "extdir": "pg_ivm-src"},
+{"extname": "pgjwt", "extdir": "pgjwt-src"}
 ]'
 EXTNAMES=$(echo ${EXTENSIONS} | jq -r '.[].extname' | paste -sd ' ' -)
 TAG=${NEWTAG} docker compose --profile test-extensions up --quiet-pull --build -d
--- a/docs/rfcs/035-safekeeper-dynamic-membership-change.md
+++ b/docs/rfcs/035-safekeeper-dynamic-membership-change.md
@@ -285,10 +285,10 @@ To summarize, list of cplane changes:

 ### storage_controller implementation

-Current 'load everything on startup and keep in memory' easy design is fine.
-Single timeline shouldn't take more than 100 bytes (it's 16 byte tenant_id, 16
-byte timeline_id, int generation, vec of ~3 safekeeper ids plus some flags), so
-10^6 of timelines shouldn't take more than 100MB.
+If desired, we may continue using current 'load everything on startup and keep
+in memory' approach: single timeline shouldn't take more than 100 bytes (it's 16
+byte tenant_id, 16 byte timeline_id, int generation, vec of ~3 safekeeper ids
+plus some flags), so 10^6 of timelines shouldn't take more than 100MB.

 Similar to pageserver attachment Intents storage_controller would have in-memory
 `MigrationRequest` (or its absense) for each timeline and pool of tasks trying
@@ -296,7 +296,7 @@ to make these request reality; this ensures one instance of storage_controller
 won't do several migrations on the same timeline concurrently. In the first
 version it is simpler to have more manual control and no retries, i.e. migration
 failure removes the request. Later we can build retries and automatic
-scheduling/migration. `MigrationRequest` is
+scheduling/migration around. `MigrationRequest` is
 ```
 enum MigrationRequest {
    To(Vec<NodeId>),
@@ -313,9 +313,9 @@ similarly, in the first version it is ok to trigger it manually).
 #### Schema

 `safekeepers` table mirroring current `nodes` should be added, except that for
-`scheduling_policy` field (seems like `status` is a better name for it): it is enough
-to have at least in the beginning only 3 fields: 1) `active` 2) `offline` 3)
-`decomissioned`.
+`scheduling_policy`: it is enough to have at least in the beginning only 3
+fields: 1) `active` 2) `paused` (initially means only not assign new tlis there
+3) `decomissioned` (node is removed).

 `timelines` table:
 ```
@@ -324,18 +324,24 @@ table! {
    timelines (tenant_id, timeline_id) {
        timeline_id -> Varchar,
        tenant_id -> Varchar,
+        start_lsn -> pg_lsn,
        generation -> Int4,
        sk_set -> Array<Int4>, // list of safekeeper ids
-        new_sk_set -> Nullable<Array<Int4>>, // list of safekeeper ids, null if not joint conf
+        new_sk_set -> Nullable<Array<Int8>>, // list of safekeeper ids, null if not joint conf
        cplane_notified_generation -> Int4,
+        deleted_at -> Nullable<Timestamptz>,
    }
 }
 ```

+`start_lsn` is needed to create timeline on safekeepers properly, see below. We
+might also want to add ancestor_timeline_id to preserve the hierarchy, but for
+this RFC it is not needed.
+
 #### API

 Node management is similar to pageserver:
-1) POST `/control/v1/safekeepers` upserts safekeeper.
+1) POST `/control/v1/safekeepers` inserts safekeeper.
 2) GET `/control/v1/safekeepers` lists safekeepers.
 3) GET `/control/v1/safekeepers/:node_id` gets safekeeper.
 4) PUT `/control/v1/safekepers/:node_id/status` changes status to e.g.
@@ -345,25 +351,15 @@ Node management is similar to pageserver:
 Safekeeper deploy scripts should register safekeeper at storage_contorller as
 they currently do with cplane, under the same id.

-Timeline creation/deletion: already existing POST `tenant/:tenant_id/timeline`
-would 1) choose initial set of safekeepers; 2) write to the db initial
-`Configuration` with `INSERT ON CONFLICT DO NOTHING` returning existing row in
-case of conflict; 3) create timeline on the majority of safekeepers (already
-created is ok).
+Timeline creation/deletion will work through already existing POST and DELETE
+`tenant/:tenant_id/timeline`. Cplane is expected to retry both until they
+succeed. See next section on the implementation details.

-We don't want to block timeline creation when one safekeeper is down. Currently
-this is solved by compute implicitly creating timeline on any safekeeper it is
-connected to. This creates ugly timeline state on safekeeper when timeline is
-created, but start LSN is not defined yet. It would be nice to remove this; to
-do that, controller can in the background retry to create timeline on
-safekeeper(s) which missed that during initial creation call. It can do that
-through `pull_timeline` from majority so it doesn't need to remember
-`parent_lsn` in its db.
-
-Timeline deletion removes the row from the db and forwards deletion to the
-current configuration members. Without additional actions deletions might leak,
-see below on this; initially let's ignore these, reporting to cplane success if
-at least one safekeeper deleted the timeline (this will remove s3 data).
+We don't want to block timeline creation/deletion when one safekeeper is down.
+Currently this is crutched by compute implicitly creating timeline on any
+safekeeper it is connected to. This creates ugly timeline state on safekeeper
+when timeline is created, but start LSN is not defined yet. Next section
+describes dealing with this.

 Tenant deletion repeats timeline deletion for all timelines.

@@ -395,26 +391,6 @@ Similar call should be added for the tenant.
 It would be great to have some way of subscribing to the results (apart from
 looking at logs/metrics).

-Migration is executed as described above. One subtlety is that (local) deletion on
-source safekeeper might fail, which is not a problem if we are going to
-decomission the node but leaves garbage otherwise. I'd propose in the first version
-1) Don't attempt deletion at all if node status is `offline`.
-2) If it failed, just issue warning.
-And add PUT `/control/v1/safekeepers/:node_id/scrub` endpoint which would find and
-remove garbage timelines for manual use. It will 1) list all timelines on the
-safekeeper 2) compare each one against configuration storage: if timeline
-doesn't exist at all (had been deleted), it can be deleted. Otherwise, it can
-be deleted under generation number if node is not member of current generation.
-
-Automating this is untrivial; we'd need to register all potential missing
-deletions <tenant_id, timeline_id, generation, node_id> in the same transaction
-which switches configurations. Similarly when timeline is fully deleted to
-prevent cplane operation from blocking when some safekeeper is not available
-deletion should be also registered.
-
-One more task pool should infinitely retry notifying control plane about changed
-safekeeper sets.
-
 3) GET `/control/v1/tenant/:tenant_id/timeline/:timeline_id/` should return
   current in memory state of the timeline and pending `MigrationRequest`,
   if any.
@@ -423,12 +399,153 @@ safekeeper sets.
   migration by switching configuration from the joint to the one with (previous) `sk_set` under CAS
   (incrementing generation as always).

+#### API implementation and reconciliation
+
+For timeline creation/deletion we want to preserve the basic assumption that
+unreachable minority (1 sk of 3) doesn't block their completion, but eventually
+we want to finish creation/deletion on nodes which missed it (unless they are
+removed). Similarly for migration; it may and should finish even though excluded
+members missed their exclusion. And of course e.g. such pending exclusion on
+node C after migration ABC -> ABD must not prevent next migration ABD -> ABE. As
+another example, if some node missed timeline creation it clearly must not block
+migration from it. Hence it is natural to have per safekeeper background
+reconciler which retries these ops until they succeed. There are 3 possible
+operation types, and the type is defined by timeline state (membership
+configuration and whether it is deleted) and safekeeper id: we may need to
+create timeline on sk (node added), locally delete it (node excluded, somewhat
+similar to detach) or globally delete it (timeline is deleted).
+
+Next, on storage controller restart in principle these pending operations can be
+figured out by comparing safekeepers state against storcon state. But it seems
+better to me to materialize them in the database; it is not expensive, avoids
+these startup scans which themselves can fail etc and makes it very easy to see
+outstanding work directly at the source of truth -- the db. So we can add table
+`safekeeper_timeline_pending_ops`
+```
+table! {
+    // timeline_id, sk_id is primary key
+    safekeeper_timeline_pending_ops (sk_id, tenant_id, timeline_id) {
+        sk_id -> int8,
+        tenant_id -> Varchar,
+        timeline_id -> Varchar,
+        generation -> Int4,
+        op_type -> Varchar,
+    }
+}
+```
+
+`op_type` can be `include` (seed from peers and ensure generation is up to
+date), `exclude` (remove locally) and `delete`. Field is actually not strictly
+needed as it can be computed from current configuration, but gives more explicit
+observability.
+
+`generation` is necessary there because after op is done reconciler must remove
+it and not remove another row with higher gen which in theory might appear.
+
+Any insert of row should overwrite (remove) all rows with the same sk and
+timeline id but lower `generation` as next op makes previous obsolete. Insertion
+of `op_type` `delete` overwrites all rows.
+
+About `exclude`: rather than adding explicit safekeeper http endpoint, it is
+reasonable to reuse membership switch endpoint: if safekeeper is not member
+of the configuration it locally removes the timeline on the switch. In this case
+404 should also be considered an 'ok' answer by the caller.
+
+So, main loop of per sk reconcile reads `safekeeper_timeline_pending_ops`
+joined with timeline configuration to get current conf (with generation `n`)
+for the safekeeper and does the jobs, infinitely retrying failures:
+1) If node is member (`include`):
+  - Check if timeline exists on it, if not, call pull_timeline on it from 
+     other members
+  - Call switch configuration to the current
+2) If node is not member (`exclude`):
+  - Call switch configuration to the current, 404 is ok.
+3) If timeline is deleted (`delete`), call delete.
+
+In cases 1 and 2 remove `safekeeper_timeline_pending_ops` for the sk and 
+timeline with generation <= `n` if `op_type` is not `delete`.
+In case 3 also remove `safekeeper_timeline_pending_ops` 
+entry + remove `timelines` entry if there is nothing left  in `safekeeper_timeline_pending_ops` for the timeline.
+
+Let's consider in details how APIs can be implemented from this angle.
+
+Timeline creation. It is assumed that cplane retries it until success, so all
+actions must be idempotent. Now, a tricky point here is timeline start LSN. For
+the initial (tenant creation) call cplane doesn't know it. However, setting
+start_lsn on safekeepers during creation is a good thing -- it provides a
+guarantee that walproposer can always find a common point in WAL histories of
+safekeeper and its own, and so absense of it would be a clear sign of
+corruption. The following sequence works:
+1) Create timeline (or observe that it exists) on pageserver,
+   figuring out last_record_lsn in response.
+2) Choose safekeepers and insert (ON CONFLICT DO NOTHING) timeline row into the
+   db. Note that last_record_lsn returned on the previous step is movable as it
+   changes once ingestion starts, insert must not overwrite it (as well as other
+   fields like membership conf). On the contrary, start_lsn used in the next
+   step must be set to the value in the db. cplane_notified_generation can be set
+   to 1 (initial generation) in insert to avoid notifying cplane about initial 
+   conf as cplane will receive it in timeline creation request anyway.
+3) Issue timeline creation calls to at least majority of safekeepers. Using
+   majority here is not necessary but handy because it guarantees that any live
+   majority will have at least one sk with created timeline and so
+   reconciliation task can use pull_timeline shared with migration instead of
+   create timeline special init case. OFC if timeline is already exists call is
+   ignored.
+4) For minority of safekeepers which could have missed creation insert
+   entries to `safekeeper_timeline_pending_ops`. We won't miss this insertion 
+   because response to cplane is sent only after it has happened, and cplane 
+   retries the call until 200 response.
+
+   There is a small question how request handler (timeline creation in this
+   case) would interact with per sk reconciler. As always I prefer to do the
+   simplest possible thing and here it seems to be just waking it up so it
+   re-reads the db for work to do. Passing work in memory is faster, but
+   that shouldn't matter, and path to scan db for work will exist anyway, 
+   simpler to reuse it.
+
+For pg version / wal segment size: while we may persist them in `timelines`
+table, it is not necessary as initial creation at step 3 can take them from
+pageserver or cplane creation call and later pull_timeline will carry them
+around.
+
+Timeline migration.
+1) CAS to the db to create joint conf, and in the same transaction create
+   `safekeeper_timeline_pending_ops` `include` entries to initialize new members
+   as well as deliver this conf to current ones; poke per sk reconcilers to work
+   on it. Also any conf change should also poke cplane notifier task(s).
+2) Once it becomes possible per alg description above, get out of joint conf
+   with another CAS. Task should get wakeups from per sk reconcilers because 
+   conf switch is required for advancement; however retries should be sleep
+   based as well as LSN advancement might be needed, though in happy path 
+   it isn't. To see whether further transition is possible on wakup migration
+   executor polls safekeepers per the algorithm. CAS creating new conf with only
+   new members should again insert entries to `safekeeper_timeline_pending_ops`
+   to switch them there, as well as `exclude` rows to remove timeline from 
+   old members.
+
+Timeline deletion: just set `deleted_at` on the timeline row and insert
+`safekeeper_timeline_pending_ops` entries in the same xact, the rest is done by
+per sk reconcilers.
+
+When node is removed (set to `decomissioned`), `safekeeper_timeline_pending_ops`
+for it must be cleared in the same transaction.
+
+One more task pool should infinitely retry notifying control plane about changed
+safekeeper sets (trying making `cplane_notified_generation` equal `generation`).
+
 #### Dealing with multiple instances of storage_controller

 Operations described above executed concurrently might create some errors but do
 not prevent progress, so while we normally don't want to run multiple instances
 of storage_controller it is fine to have it temporarily, e.g. during redeploy.

+To harden against some controller instance creating some work in
+`safekeeper_timeline_pending_ops` and then disappearing without anyone pickup up
+the job per sk reconcilers apart from explicit wakups should scan for work
+periodically. It is possible to remove that though if all db updates are
+protected with leadership token/term -- then such scans are needed only after
+leadership is acquired.
+
 Any interactions with db update in-memory controller state, e.g. if migration
 request failed because different one is in progress, controller remembers that
 and tries to finish it.
@@ -545,7 +662,7 @@ Aurora does this but similarly I don't think this is needed.

 We should use Compute <-> safekeeper protocol change to include other (long
 yearned) modifications:
- send data in network order to make arm work.
+- send data in network order without putting whole structs to be arch independent
 - remove term_start_lsn from AppendRequest
 - add horizon to TermHistory
 - add to ProposerGreeting number of connection from this wp to sk
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -204,14 +204,16 @@ impl RemoteExtSpec {

        // Check if extension is present in public or custom.
        // If not, then it is not allowed to be used by this compute.
-        if let Some(public_extensions) = &self.public_extensions {
-            if !public_extensions.contains(&real_ext_name.to_string()) {
-                if let Some(custom_extensions) = &self.custom_extensions {
-                    if !custom_extensions.contains(&real_ext_name.to_string()) {
-                        return Err(anyhow::anyhow!("extension {} is not found", real_ext_name));
-                    }
-                }
-            }
+        if !self
+            .public_extensions
+            .as_ref()
+            .is_some_and(|exts| exts.iter().any(|e| e == real_ext_name))
+            && !self
+                .custom_extensions
+                .as_ref()
+                .is_some_and(|exts| exts.iter().any(|e| e == real_ext_name))
+        {
+            return Err(anyhow::anyhow!("extension {} is not found", real_ext_name));
        }

        match self.extension_data.get(real_ext_name) {
@@ -340,6 +342,102 @@ mod tests {
    use super::*;
    use std::fs::File;

+    #[test]
+    fn allow_installing_remote_extensions() {
+        let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({
+            "public_extensions": null,
+            "custom_extensions": null,
+            "library_index": {},
+            "extension_data": {},
+        }))
+        .unwrap();
+
+        rspec
+            .get_ext("ext", false, "latest", "v17")
+            .expect_err("Extension should not be found");
+
+        let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({
+            "public_extensions": [],
+            "custom_extensions": null,
+            "library_index": {},
+            "extension_data": {},
+        }))
+        .unwrap();
+
+        rspec
+            .get_ext("ext", false, "latest", "v17")
+            .expect_err("Extension should not be found");
+
+        let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({
+            "public_extensions": [],
+            "custom_extensions": [],
+            "library_index": {
+                "ext": "ext"
+            },
+            "extension_data": {
+                "ext": {
+                    "control_data": {
+                        "ext.control": ""
+                    },
+                    "archive_path": ""
+                }
+            },
+        }))
+        .unwrap();
+
+        rspec
+            .get_ext("ext", false, "latest", "v17")
+            .expect_err("Extension should not be found");
+
+        let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({
+            "public_extensions": [],
+            "custom_extensions": ["ext"],
+            "library_index": {
+                "ext": "ext"
+            },
+            "extension_data": {
+                "ext": {
+                    "control_data": {
+                        "ext.control": ""
+                    },
+                    "archive_path": ""
+                }
+            },
+        }))
+        .unwrap();
+
+        rspec
+            .get_ext("ext", false, "latest", "v17")
+            .expect("Extension should be found");
+
+        let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({
+            "public_extensions": ["ext"],
+            "custom_extensions": [],
+            "library_index": {
+                "extlib": "ext",
+            },
+            "extension_data": {
+                "ext": {
+                    "control_data": {
+                        "ext.control": ""
+                    },
+                    "archive_path": ""
+                }
+            },
+        }))
+        .unwrap();
+
+        rspec
+            .get_ext("ext", false, "latest", "v17")
+            .expect("Extension should be found");
+
+        // test library index for the case when library name
+        // doesn't match the extension name
+        rspec
+            .get_ext("extlib", true, "latest", "v17")
+            .expect("Library should be found");
+    }
+
    #[test]
    fn parse_spec_file() {
        let file = File::open("tests/cluster_spec.json").unwrap();
--- a/libs/http-utils/Cargo.toml
+++ b/libs/http-utils/Cargo.toml
@@ -0,0 +1,37 @@
+[package]
+name = "http-utils"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+[dependencies]
+anyhow.workspace = true
+backtrace.workspace = true
+bytes.workspace = true
+inferno.workspace = true
+fail.workspace = true
+flate2.workspace = true
+hyper0.workspace = true
+itertools.workspace = true
+jemalloc_pprof.workspace = true
+once_cell.workspace = true
+pprof.workspace = true
+regex.workspace = true
+routerify.workspace = true
+serde.workspace = true
+serde_json.workspace = true
+serde_path_to_error.workspace = true
+thiserror.workspace = true
+tracing.workspace = true
+tokio.workspace = true
+tokio-util.workspace = true
+url.workspace = true
+uuid.workspace = true
+
+# to use tokio channels as streams, this is faster to compile than async_stream
+# why is it only here? no other crate should use it, streams are rarely needed.
+tokio-stream = { version = "0.1.14" }
+
+metrics.workspace = true
+utils.workspace = true
+workspace_hack = { version = "0.1", path = "../../workspace_hack" }
--- a/libs/http-utils/src/endpoint.rs
+++ b/libs/http-utils/src/endpoint.rs
@@ -1,7 +1,6 @@
-use crate::auth::{AuthError, Claims, SwappableJwtAuth};
-use crate::http::error::{api_error_handler, route_error_handler, ApiError};
-use crate::http::request::{get_query_param, parse_query_param};
+use crate::error::{api_error_handler, route_error_handler, ApiError};
 use crate::pprof;
+use crate::request::{get_query_param, parse_query_param};
 use ::pprof::protos::Message as _;
 use ::pprof::ProfilerGuardBuilder;
 use anyhow::{anyhow, Context};
@@ -19,6 +18,7 @@ use tokio::sync::{mpsc, Mutex, Notify};
 use tokio_stream::wrappers::ReceiverStream;
 use tokio_util::io::ReaderStream;
 use tracing::{debug, info, info_span, warn, Instrument};
+use utils::auth::{AuthError, Claims, SwappableJwtAuth};

 use std::future::Future;
 use std::io::Write as _;
@@ -718,9 +718,9 @@ pub fn check_permission_with(
 #[cfg(test)]
 mod tests {
    use super::*;
-    use futures::future::poll_fn;
    use hyper::service::Service;
    use routerify::RequestServiceBuilder;
+    use std::future::poll_fn;
    use std::net::{IpAddr, SocketAddr};

    #[tokio::test]
--- a/libs/http-utils/src/error.rs
+++ b/libs/http-utils/src/error.rs
@@ -5,6 +5,8 @@ use std::error::Error as StdError;
 use thiserror::Error;
 use tracing::{error, info, warn};

+use utils::auth::AuthError;
+
 #[derive(Debug, Error)]
 pub enum ApiError {
    #[error("Bad request: {0:#?}")]
@@ -96,6 +98,15 @@ impl ApiError {
    }
 }

+impl From<AuthError> for ApiError {
+    fn from(_value: AuthError) -> Self {
+        // Don't pass on the value of the AuthError as a precautionary measure.
+        // Being intentionally vague in public error communication hurts debugability
+        // but it is more secure.
+        ApiError::Forbidden("JWT authentication error".to_string())
+    }
+}
+
 #[derive(Serialize, Deserialize)]
 pub struct HttpErrorBody {
    pub msg: String,
--- a/libs/http-utils/src/failpoints.rs
+++ b/libs/http-utils/src/failpoints.rs
@@ -0,0 +1,50 @@
+use crate::error::ApiError;
+use crate::json::{json_request, json_response};
+
+use hyper::{Body, Request, Response, StatusCode};
+use serde::{Deserialize, Serialize};
+use tokio_util::sync::CancellationToken;
+
+use utils::failpoint_support::apply_failpoint;
+
+pub type ConfigureFailpointsRequest = Vec<FailpointConfig>;
+
+/// Information for configuring a single fail point
+#[derive(Debug, Serialize, Deserialize)]
+pub struct FailpointConfig {
+    /// Name of the fail point
+    pub name: String,
+    /// List of actions to take, using the format described in `fail::cfg`
+    ///
+    /// We also support `actions = "exit"` to cause the fail point to immediately exit.
+    pub actions: String,
+}
+
+/// Configure failpoints through http.
+pub async fn failpoints_handler(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    if !fail::has_failpoints() {
+        return Err(ApiError::BadRequest(anyhow::anyhow!(
+            "Cannot manage failpoints because neon was compiled without failpoints support"
+        )));
+    }
+
+    let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?;
+    for fp in failpoints {
+        tracing::info!("cfg failpoint: {} {}", fp.name, fp.actions);
+
+        // We recognize one extra "action" that's not natively recognized
+        // by the failpoints crate: exit, to immediately kill the process
+        let cfg_result = apply_failpoint(&fp.name, &fp.actions);
+
+        if let Err(err_msg) = cfg_result {
+            return Err(ApiError::BadRequest(anyhow::anyhow!(
+                "Failed to configure failpoints: {err_msg}"
+            )));
+        }
+    }
+
+    json_response(StatusCode::OK, ())
+}
--- a/libs/http-utils/src/json.rs
+++ b/libs/http-utils/src/json.rs
--- a/libs/http-utils/src/lib.rs
+++ b/libs/http-utils/src/lib.rs
@@ -1,8 +1,12 @@
 pub mod endpoint;
 pub mod error;
+pub mod failpoints;
 pub mod json;
+pub mod pprof;
 pub mod request;

+extern crate hyper0 as hyper;
+
 /// Current fast way to apply simple http routing in various Neon binaries.
 /// Re-exported for sake of uniform approach, that could be later replaced with better alternatives, if needed.
 pub use routerify::{ext::RequestExt, RouterBuilder, RouterService};
--- a/libs/http-utils/src/pprof.rs
+++ b/libs/http-utils/src/pprof.rs
--- a/libs/http-utils/src/request.rs
+++ b/libs/http-utils/src/request.rs
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -94,6 +94,7 @@ pub struct ConfigToml {
    pub ondemand_download_behavior_treat_error_as_warn: bool,
    #[serde(with = "humantime_serde")]
    pub background_task_maximum_delay: Duration,
+    pub use_compaction_semaphore: bool,
    pub control_plane_api: Option<reqwest::Url>,
    pub control_plane_api_token: Option<String>,
    pub control_plane_emergency_mode: bool,
@@ -121,6 +122,7 @@ pub struct ConfigToml {
    pub wal_receiver_protocol: PostgresClientProtocol,
    pub page_service_pipelining: PageServicePipeliningConfig,
    pub get_vectored_concurrent_io: GetVectoredConcurrentIo,
+    pub enable_read_path_debugging: Option<bool>,
 }

 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -262,6 +264,8 @@ pub struct TenantConfigToml {
    /// size exceeds `compaction_upper_limit * checkpoint_distance`.
    pub compaction_upper_limit: usize,
    pub compaction_algorithm: crate::models::CompactionAlgorithmSettings,
+    /// If true, compact down L0 across all tenant timelines before doing regular compaction.
+    pub compaction_l0_first: bool,
    /// Level0 delta layer threshold at which to delay layer flushes for compaction backpressure,
    /// such that they take 2x as long, and start waiting for layer flushes during ephemeral layer
    /// rolls. This helps compaction keep up with WAL ingestion, and avoids read amplification
@@ -323,6 +327,10 @@ pub struct TenantConfigToml {
    // Expresed in multiples of checkpoint distance.
    pub image_layer_creation_check_threshold: u8,

+    // How many multiples of L0 `compaction_threshold` will preempt image layer creation and do L0 compaction.
+    // Set to 0 to disable preemption.
+    pub image_creation_preempt_threshold: usize,
+
    /// The length for an explicit LSN lease request.
    /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval.
    #[serde(with = "humantime_serde")]
@@ -466,6 +474,7 @@ impl Default for ConfigToml {
                DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY,
            )
            .unwrap()),
+            use_compaction_semaphore: false,

            control_plane_api: (None),
            control_plane_api_token: (None),
@@ -486,7 +495,7 @@ impl Default for ConfigToml {
                NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
            )),
            image_compression: (DEFAULT_IMAGE_COMPRESSION),
-            timeline_offloading: false,
+            timeline_offloading: true,
            ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
            l0_flush: None,
            virtual_file_io_mode: None,
@@ -506,6 +515,11 @@ impl Default for ConfigToml {
            } else {
                GetVectoredConcurrentIo::SidecarTask
            },
+            enable_read_path_debugging: if cfg!(test) || cfg!(feature = "testing") {
+                Some(true)
+            } else {
+                None
+            },
        }
    }
 }
@@ -533,6 +547,7 @@ pub mod tenant_conf_defaults {
    // most of our pageservers. Compaction ~50 layers requires about 2GB memory (could be reduced later by optimizing L0 hole
    // calculation to avoid loading all keys into the memory). So with this config, we can get a maximum peak compaction usage of 18GB.
    pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 50;
+    pub const DEFAULT_COMPACTION_L0_FIRST: bool = false;

    pub const DEFAULT_COMPACTION_ALGORITHM: crate::models::CompactionAlgorithm =
        crate::models::CompactionAlgorithm::Legacy;
@@ -547,6 +562,10 @@ pub mod tenant_conf_defaults {
    // Relevant: https://github.com/neondatabase/neon/issues/3394
    pub const DEFAULT_GC_PERIOD: &str = "1 hr";
    pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3;
+    // If there are more than threshold * compaction_threshold (that is 3 * 10 in the default config) L0 layers, image
+    // layer creation will end immediately. Set to 0 to disable. The target default will be 3 once we
+    // want to enable this feature.
+    pub const DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD: usize = 0;
    pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
    pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds";
    pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
@@ -578,6 +597,7 @@ impl Default for TenantConfigToml {
            compaction_algorithm: crate::models::CompactionAlgorithmSettings {
                kind: DEFAULT_COMPACTION_ALGORITHM,
            },
+            compaction_l0_first: DEFAULT_COMPACTION_L0_FIRST,
            l0_flush_delay_threshold: None,
            l0_flush_stall_threshold: None,
            l0_flush_wait_upload: DEFAULT_L0_FLUSH_WAIT_UPLOAD,
@@ -605,9 +625,10 @@ impl Default for TenantConfigToml {
            lazy_slru_download: false,
            timeline_get_throttle: crate::models::ThrottleConfig::disabled(),
            image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
+            image_creation_preempt_threshold: DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD,
            lsn_lease_length: LsnLease::DEFAULT_LENGTH,
            lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS,
-            timeline_offloading: false,
+            timeline_offloading: true,
            wal_receiver_protocol_override: None,
            rel_size_v2_enabled: None,
            gc_compaction_enabled: DEFAULT_GC_COMPACTION_ENABLED,
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -464,6 +464,8 @@ pub struct TenantConfigPatch {
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub compaction_algorithm: FieldPatch<CompactionAlgorithmSettings>,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub compaction_l0_first: FieldPatch<bool>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub l0_flush_delay_threshold: FieldPatch<usize>,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub l0_flush_stall_threshold: FieldPatch<usize>,
@@ -498,6 +500,8 @@ pub struct TenantConfigPatch {
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub image_layer_creation_check_threshold: FieldPatch<u8>,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub image_creation_preempt_threshold: FieldPatch<usize>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub lsn_lease_length: FieldPatch<String>,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub lsn_lease_length_for_ts: FieldPatch<String>,
@@ -527,6 +531,7 @@ pub struct TenantConfig {
    pub compaction_upper_limit: Option<usize>,
    // defer parsing compaction_algorithm, like eviction_policy
    pub compaction_algorithm: Option<CompactionAlgorithmSettings>,
+    pub compaction_l0_first: Option<bool>,
    pub l0_flush_delay_threshold: Option<usize>,
    pub l0_flush_stall_threshold: Option<usize>,
    pub l0_flush_wait_upload: Option<bool>,
@@ -544,6 +549,7 @@ pub struct TenantConfig {
    pub lazy_slru_download: Option<bool>,
    pub timeline_get_throttle: Option<ThrottleConfig>,
    pub image_layer_creation_check_threshold: Option<u8>,
+    pub image_creation_preempt_threshold: Option<usize>,
    pub lsn_lease_length: Option<String>,
    pub lsn_lease_length_for_ts: Option<String>,
    pub timeline_offloading: Option<bool>,
@@ -564,6 +570,7 @@ impl TenantConfig {
            mut compaction_threshold,
            mut compaction_upper_limit,
            mut compaction_algorithm,
+            mut compaction_l0_first,
            mut l0_flush_delay_threshold,
            mut l0_flush_stall_threshold,
            mut l0_flush_wait_upload,
@@ -581,6 +588,7 @@ impl TenantConfig {
            mut lazy_slru_download,
            mut timeline_get_throttle,
            mut image_layer_creation_check_threshold,
+            mut image_creation_preempt_threshold,
            mut lsn_lease_length,
            mut lsn_lease_length_for_ts,
            mut timeline_offloading,
@@ -602,6 +610,7 @@ impl TenantConfig {
            .compaction_upper_limit
            .apply(&mut compaction_upper_limit);
        patch.compaction_algorithm.apply(&mut compaction_algorithm);
+        patch.compaction_l0_first.apply(&mut compaction_l0_first);
        patch
            .l0_flush_delay_threshold
            .apply(&mut l0_flush_delay_threshold);
@@ -635,6 +644,9 @@ impl TenantConfig {
        patch
            .image_layer_creation_check_threshold
            .apply(&mut image_layer_creation_check_threshold);
+        patch
+            .image_creation_preempt_threshold
+            .apply(&mut image_creation_preempt_threshold);
        patch.lsn_lease_length.apply(&mut lsn_lease_length);
        patch
            .lsn_lease_length_for_ts
@@ -662,6 +674,7 @@ impl TenantConfig {
            compaction_threshold,
            compaction_upper_limit,
            compaction_algorithm,
+            compaction_l0_first,
            l0_flush_delay_threshold,
            l0_flush_stall_threshold,
            l0_flush_wait_upload,
@@ -679,6 +692,7 @@ impl TenantConfig {
            lazy_slru_download,
            timeline_get_throttle,
            image_layer_creation_check_threshold,
+            image_creation_preempt_threshold,
            lsn_lease_length,
            lsn_lease_length_for_ts,
            timeline_offloading,
--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -76,7 +76,15 @@ impl Conf {
        let mut cmd = Command::new(path);
        cmd.env_clear()
            .env("LD_LIBRARY_PATH", self.pg_lib_dir()?)
-            .env("DYLD_LIBRARY_PATH", self.pg_lib_dir()?);
+            .env("DYLD_LIBRARY_PATH", self.pg_lib_dir()?)
+            .env(
+                "ASAN_OPTIONS",
+                std::env::var("ASAN_OPTIONS").unwrap_or_default(),
+            )
+            .env(
+                "UBSAN_OPTIONS",
+                std::env::var("UBSAN_OPTIONS").unwrap_or_default(),
+            );
        Ok(cmd)
    }

--- a/libs/postgres_initdb/src/lib.rs
+++ b/libs/postgres_initdb/src/lib.rs
@@ -64,6 +64,14 @@ pub async fn do_run_initdb(args: RunInitdbArgs<'_>) -> Result<(), Error> {
        .env_clear()
        .env("LD_LIBRARY_PATH", library_search_path)
        .env("DYLD_LIBRARY_PATH", library_search_path)
+        .env(
+            "ASAN_OPTIONS",
+            std::env::var("ASAN_OPTIONS").unwrap_or_default(),
+        )
+        .env(
+            "UBSAN_OPTIONS",
+            std::env::var("UBSAN_OPTIONS").unwrap_or_default(),
+        )
        .stdin(std::process::Stdio::null())
        // stdout invocation produces the same output every time, we don't need it
        .stdout(std::process::Stdio::null())
--- a/libs/remote_storage/src/config.rs
+++ b/libs/remote_storage/src/config.rs
@@ -7,7 +7,7 @@ use serde::{Deserialize, Serialize};

 use crate::{
    DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT,
-    DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
+    DEFAULT_REMOTE_STORAGE_LOCALFS_CONCURRENCY_LIMIT, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
 };

 /// External backup storage configuration, enough for creating a client for that storage.
@@ -45,11 +45,11 @@ impl RemoteStorageKind {

 impl RemoteStorageConfig {
    /// Helper to fetch the configured concurrency limit.
-    pub fn concurrency_limit(&self) -> Option<usize> {
+    pub fn concurrency_limit(&self) -> usize {
        match &self.storage {
-            RemoteStorageKind::LocalFs { .. } => None,
-            RemoteStorageKind::AwsS3(c) => Some(c.concurrency_limit.into()),
-            RemoteStorageKind::AzureContainer(c) => Some(c.concurrency_limit.into()),
+            RemoteStorageKind::LocalFs { .. } => DEFAULT_REMOTE_STORAGE_LOCALFS_CONCURRENCY_LIMIT,
+            RemoteStorageKind::AwsS3(c) => c.concurrency_limit.into(),
+            RemoteStorageKind::AzureContainer(c) => c.concurrency_limit.into(),
        }
    }
 }
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -65,6 +65,12 @@ pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
 /// Here, a limit of max 20k concurrent connections was noted.
 /// <https://learn.microsoft.com/en-us/answers/questions/1301863/is-there-any-limitation-to-concurrent-connections>
 pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 100;
+/// Set this limit analogously to the S3 limit.
+///
+/// The local filesystem backend doesn't enforce a concurrency limit itself, but this also bounds
+/// the upload queue concurrency. Some tests create thousands of uploads, which slows down the
+/// quadratic scheduling of the upload queue, and there is no point spawning so many Tokio tasks.
+pub const DEFAULT_REMOTE_STORAGE_LOCALFS_CONCURRENCY_LIMIT: usize = 100;
 /// No limits on the client side, which currenltly means 1000 for AWS S3.
 /// <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax>
 pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -21,23 +21,17 @@ bytes.workspace = true
 camino.workspace = true
 chrono.workspace = true
 diatomic-waker.workspace = true
-flate2.workspace = true
 git-version.workspace = true
 hex = { workspace = true, features = ["serde"] }
 humantime.workspace = true
-hyper0 = { workspace = true, features = ["full"] }
 inferno.workspace = true
-itertools.workspace = true
 fail.workspace = true
 futures = { workspace = true }
-jemalloc_pprof.workspace = true
 jsonwebtoken.workspace = true
 nix.workspace = true
 once_cell.workspace = true
 pin-project-lite.workspace = true
-pprof.workspace = true
 regex.workspace = true
-routerify.workspace = true
 serde.workspace = true
 serde_with.workspace = true
 serde_json.workspace = true
@@ -54,8 +48,6 @@ rand.workspace = true
 scopeguard.workspace = true
 strum.workspace = true
 strum_macros.workspace = true
-url.workspace = true
-uuid.workspace = true
 walkdir.workspace = true

 pq_proto.workspace = true
@@ -64,12 +56,6 @@ metrics.workspace = true

 const_format.workspace = true

-# to use tokio channels as streams, this is faster to compile than async_stream
-# why is it only here? no other crate should use it, streams are rarely needed.
-tokio-stream = { version = "0.1.14" }
-
-serde_path_to_error.workspace = true
-
 [dev-dependencies]
 byteorder.workspace = true
 bytes.workspace = true
--- a/libs/utils/scripts/restore_from_wal.sh
+++ b/libs/utils/scripts/restore_from_wal.sh
@@ -39,7 +39,7 @@ function initdb_with_args {
            ;;
    esac

-    eval env -i LD_LIBRARY_PATH="$PG_BIN"/../lib "${cmd[*]}"
+    eval env -i LD_LIBRARY_PATH="$PG_BIN"/../lib ASAN_OPTIONS="${ASAN_OPTIONS-}" UBSAN_OPTIONS="${UBSAN_OPTIONS-}" "${cmd[*]}"
 }

 rm -fr "$DATA_DIR"
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -10,7 +10,7 @@ use jsonwebtoken::{
 };
 use serde::{Deserialize, Serialize};

-use crate::{http::error::ApiError, id::TenantId};
+use crate::id::TenantId;

 /// Algorithm to use. We require EdDSA.
 const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA;
@@ -90,15 +90,6 @@ impl Display for AuthError {
    }
 }

-impl From<AuthError> for ApiError {
-    fn from(_value: AuthError) -> Self {
-        // Don't pass on the value of the AuthError as a precautionary measure.
-        // Being intentionally vague in public error communication hurts debugability
-        // but it is more secure.
-        ApiError::Forbidden("JWT authentication error".to_string())
-    }
-}
-
 pub struct JwtAuth {
    decoding_keys: Vec<DecodingKey>,
    validation: Validation,
--- a/libs/utils/src/backoff.rs
+++ b/libs/utils/src/backoff.rs
@@ -1,4 +1,5 @@
 use std::fmt::{Debug, Display};
+use std::time::Duration;

 use futures::Future;
 use tokio_util::sync::CancellationToken;
@@ -29,6 +30,11 @@ pub async fn exponential_backoff(
    }
 }

+pub fn exponential_backoff_duration(n: u32, base_increment: f64, max_seconds: f64) -> Duration {
+    let seconds = exponential_backoff_duration_seconds(n, base_increment, max_seconds);
+    Duration::from_secs_f64(seconds)
+}
+
 pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 {
    if n == 0 {
        0.0
--- a/libs/utils/src/failpoint_support.rs
+++ b/libs/utils/src/failpoint_support.rs
@@ -1,13 +1,6 @@
 //! Failpoint support code shared between pageserver and safekeepers.

-use crate::http::{
-    error::ApiError,
-    json::{json_request, json_response},
-};
-use hyper::{Body, Request, Response, StatusCode};
-use serde::{Deserialize, Serialize};
 use tokio_util::sync::CancellationToken;
-use tracing::*;

 /// Declare a failpoint that can use to `pause` failpoint action.
 /// We don't want to block the executor thread, hence, spawn_blocking + await.
@@ -184,45 +177,3 @@ fn exit_failpoint() {
    tracing::info!("Exit requested by failpoint");
    std::process::exit(1);
 }
-
-pub type ConfigureFailpointsRequest = Vec<FailpointConfig>;
-
-/// Information for configuring a single fail point
-#[derive(Debug, Serialize, Deserialize)]
-pub struct FailpointConfig {
-    /// Name of the fail point
-    pub name: String,
-    /// List of actions to take, using the format described in `fail::cfg`
-    ///
-    /// We also support `actions = "exit"` to cause the fail point to immediately exit.
-    pub actions: String,
-}
-
-/// Configure failpoints through http.
-pub async fn failpoints_handler(
-    mut request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    if !fail::has_failpoints() {
-        return Err(ApiError::BadRequest(anyhow::anyhow!(
-            "Cannot manage failpoints because neon was compiled without failpoints support"
-        )));
-    }
-
-    let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?;
-    for fp in failpoints {
-        info!("cfg failpoint: {} {}", fp.name, fp.actions);
-
-        // We recognize one extra "action" that's not natively recognized
-        // by the failpoints crate: exit, to immediately kill the process
-        let cfg_result = apply_failpoint(&fp.name, &fp.actions);
-
-        if let Err(err_msg) = cfg_result {
-            return Err(ApiError::BadRequest(anyhow::anyhow!(
-                "Failed to configure failpoints: {err_msg}"
-            )));
-        }
-    }
-
-    json_response(StatusCode::OK, ())
-}
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -2,8 +2,6 @@
 //! between other crates in this repository.
 #![deny(clippy::undocumented_unsafe_blocks)]

-extern crate hyper0 as hyper;
-
 pub mod backoff;

 /// `Lsn` type implements common tasks on Log Sequence Numbers
@@ -33,9 +31,6 @@ pub mod shard;
 mod hex;
 pub use hex::Hex;

-// http endpoint utils
-pub mod http;
-
 // definition of the Generation type for pageserver attachment APIs
 pub mod generation;

@@ -96,8 +91,6 @@ pub mod circuit_breaker;

 pub mod try_rcu;

-pub mod pprof;
-
 pub mod guard_arc_swap;

 // Re-export used in macro. Avoids adding git-version as dep in target crates.
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -5,6 +5,27 @@ use metrics::{IntCounter, IntCounterVec};
 use once_cell::sync::Lazy;
 use strum_macros::{EnumString, VariantNames};

+/// Logs a critical error, similarly to `tracing::error!`. This will:
+///
+/// * Emit an ERROR log message with prefix "CRITICAL:" and a backtrace.
+/// * Trigger a pageable alert (via the metric below).
+/// * Increment libmetrics_tracing_event_count{level="critical"}, and indirectly level="error".
+/// * In debug builds, panic the process.
+///
+/// When including errors in the message, please use {err:?} to include the error cause and original
+/// backtrace.
+#[macro_export]
+macro_rules! critical {
+    ($($arg:tt)*) => {{
+        if cfg!(debug_assertions) {
+            panic!($($arg)*);
+        }
+        $crate::logging::TRACING_EVENT_COUNT_METRIC.inc_critical();
+        let backtrace = std::backtrace::Backtrace::capture();
+        tracing::error!("CRITICAL: {}\n{backtrace}", format!($($arg)*));
+    }};
+}
+
 #[derive(EnumString, strum_macros::Display, VariantNames, Eq, PartialEq, Debug, Clone, Copy)]
 #[strum(serialize_all = "snake_case")]
 pub enum LogFormat {
@@ -25,7 +46,10 @@ impl LogFormat {
    }
 }

-struct TracingEventCountMetric {
+pub struct TracingEventCountMetric {
+    /// CRITICAL is not a `tracing` log level. Instead, we increment it in the `critical!` macro,
+    /// and also emit it as a regular error. These are thus double-counted, but that seems fine.
+    critical: IntCounter,
    error: IntCounter,
    warn: IntCounter,
    info: IntCounter,
@@ -33,7 +57,7 @@ struct TracingEventCountMetric {
    trace: IntCounter,
 }

-static TRACING_EVENT_COUNT_METRIC: Lazy<TracingEventCountMetric> = Lazy::new(|| {
+pub static TRACING_EVENT_COUNT_METRIC: Lazy<TracingEventCountMetric> = Lazy::new(|| {
    let vec = metrics::register_int_counter_vec!(
        "libmetrics_tracing_event_count",
        "Number of tracing events, by level",
@@ -46,6 +70,7 @@ static TRACING_EVENT_COUNT_METRIC: Lazy<TracingEventCountMetric> = Lazy::new(||
 impl TracingEventCountMetric {
    fn new(vec: IntCounterVec) -> Self {
        Self {
+            critical: vec.with_label_values(&["critical"]),
            error: vec.with_label_values(&["error"]),
            warn: vec.with_label_values(&["warn"]),
            info: vec.with_label_values(&["info"]),
@@ -54,6 +79,11 @@ impl TracingEventCountMetric {
        }
    }

+    // Allow public access from `critical!` macro.
+    pub fn inc_critical(&self) {
+        self.critical.inc();
+    }
+
    fn inc_for_level(&self, level: tracing::Level) {
        let counter = match level {
            tracing::Level::ERROR => &self.error,
--- a/libs/vm_monitor/src/filecache.rs
+++ b/libs/vm_monitor/src/filecache.rs
@@ -177,8 +177,8 @@ impl FileCacheState {
        crate::spawn_with_cancel(
            token,
            |res| {
-                if let Err(error) = res {
-                    error!(%error, "postgres error")
+                if let Err(e) = res {
+                    error!(error = format_args!("{e:#}"), "postgres error");
                }
            },
            conn,
@@ -205,7 +205,7 @@ impl FileCacheState {
        {
            Ok(rows) => Ok(rows),
            Err(e) => {
-                error!(error = ?e, "postgres error: {e} -> retrying");
+                error!(error = format_args!("{e:#}"), "postgres error -> retrying");

                let client = FileCacheState::connect(&self.conn_str, self.token.clone())
                    .await
--- a/libs/vm_monitor/src/lib.rs
+++ b/libs/vm_monitor/src/lib.rs
@@ -191,15 +191,12 @@ async fn start_monitor(
    .await;
    let mut monitor = match monitor {
        Ok(Ok(monitor)) => monitor,
-        Ok(Err(error)) => {
-            error!(?error, "failed to create monitor");
+        Ok(Err(e)) => {
+            error!(error = format_args!("{e:#}"), "failed to create monitor");
            return;
        }
        Err(_) => {
-            error!(
-                ?timeout,
-                "creating monitor timed out (probably waiting to receive protocol range)"
-            );
+            error!(?timeout, "creating monitor timed out");
            return;
        }
    };
@@ -207,6 +204,9 @@ async fn start_monitor(

    match monitor.run().await {
        Ok(()) => info!("monitor was killed due to new connection"),
-        Err(e) => error!(error = ?e, "monitor terminated unexpectedly"),
+        Err(e) => error!(
+            error = format_args!("{e:#}"),
+            "monitor terminated unexpectedly"
+        ),
    }
 }
--- a/libs/vm_monitor/src/runner.rs
+++ b/libs/vm_monitor/src/runner.rs
@@ -370,12 +370,16 @@ impl Runner {
                }),
            InboundMsgKind::InvalidMessage { error } => {
                warn!(
-                    %error, id, "received notification of an invalid message we sent"
+                    error = format_args!("{error:#}"),
+                    id, "received notification of an invalid message we sent"
                );
                Ok(None)
            }
            InboundMsgKind::InternalError { error } => {
-                warn!(error, id, "agent experienced an internal error");
+                warn!(
+                    error = format_args!("{error:#}"),
+                    id, "agent experienced an internal error"
+                );
                Ok(None)
            }
            InboundMsgKind::HealthCheck {} => {
@@ -476,7 +480,7 @@ impl Runner {
                                        // gives the outermost cause, and the debug impl
                                        // pretty-prints the error, whereas {:#} contains all the
                                        // causes, but is compact (no newlines).
-                                        warn!(error = format!("{e:#}"), "error handling message");
+                                        warn!(error = format_args!("{e:#}"), "error handling message");
                                        OutboundMsg::new(
                                            OutboundMsgKind::InternalError {
                                                error: e.to_string(),
@@ -492,7 +496,7 @@ impl Runner {
                                    .context("failed to send message")?;
                            }
                            Err(e) => warn!(
-                                error = format!("{e}"),
+                                error = format_args!("{e:#}"),
                                msg = ?msg,
                                "received error message"
                            ),
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -79,6 +79,7 @@ pq_proto.workspace = true
 remote_storage.workspace = true
 storage_broker.workspace = true
 tenant_size_model.workspace = true
+http-utils.workspace = true
 utils.workspace = true
 workspace_hack.workspace = true
 reqwest.workspace = true
--- a/pageserver/client/Cargo.toml
+++ b/pageserver/client/Cargo.toml
@@ -11,6 +11,7 @@ testing = [ "pageserver_api/testing" ]
 pageserver_api.workspace = true
 thiserror.workspace = true
 reqwest = { workspace = true, features = [ "stream" ] }
+http-utils.workspace = true
 utils.workspace = true
 serde.workspace = true
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -1,11 +1,12 @@
 use std::{collections::HashMap, error::Error as _};

 use bytes::Bytes;
-use detach_ancestor::AncestorDetached;
-use pageserver_api::{models::*, shard::TenantShardId};
 use reqwest::{IntoUrl, Method, StatusCode};
+
+use detach_ancestor::AncestorDetached;
+use http_utils::error::HttpErrorBody;
+use pageserver_api::{models::*, shard::TenantShardId};
 use utils::{
-    http::error::HttpErrorBody,
    id::{TenantId, TimelineId},
    lsn::Lsn,
 };
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -592,7 +592,7 @@ fn start_pageserver(
        let router = http::make_router(router_state, launch_ts, http_auth.clone())?
            .build()
            .map_err(|err| anyhow!(err))?;
-        let service = utils::http::RouterService::new(router).unwrap();
+        let service = http_utils::RouterService::new(router).unwrap();
        let server = hyper0::Server::from_tcp(http_listener)?
            .serve(service)
            .with_graceful_shutdown({
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -140,6 +140,10 @@ pub struct PageServerConf {
    /// not terrible.
    pub background_task_maximum_delay: Duration,

+    /// If true, use a separate semaphore for compaction tasks instead of the common background task
+    /// semaphore. Defaults to false.
+    pub use_compaction_semaphore: bool,
+
    pub control_plane_api: Option<Url>,

    /// JWT token for use with the control plane API.
@@ -193,6 +197,10 @@ pub struct PageServerConf {
    pub page_service_pipelining: pageserver_api::config::PageServicePipeliningConfig,

    pub get_vectored_concurrent_io: pageserver_api::config::GetVectoredConcurrentIo,
+
+    /// Enable read path debugging. If enabled, read key errors will print a backtrace of the layer
+    /// files read.
+    pub enable_read_path_debugging: bool,
 }

 /// Token for authentication to safekeepers
@@ -332,6 +340,7 @@ impl PageServerConf {
            test_remote_failures,
            ondemand_download_behavior_treat_error_as_warn,
            background_task_maximum_delay,
+            use_compaction_semaphore,
            control_plane_api,
            control_plane_api_token,
            control_plane_emergency_mode,
@@ -355,6 +364,7 @@ impl PageServerConf {
            wal_receiver_protocol,
            page_service_pipelining,
            get_vectored_concurrent_io,
+            enable_read_path_debugging,
        } = config_toml;

        let mut conf = PageServerConf {
@@ -385,6 +395,7 @@ impl PageServerConf {
            test_remote_failures,
            ondemand_download_behavior_treat_error_as_warn,
            background_task_maximum_delay,
+            use_compaction_semaphore,
            control_plane_api,
            control_plane_emergency_mode,
            heatmap_upload_concurrency,
@@ -440,6 +451,7 @@ impl PageServerConf {
                .unwrap_or_default(),
            virtual_file_io_mode: virtual_file_io_mode.unwrap_or(virtual_file::IoMode::preferred()),
            no_sync: no_sync.unwrap_or(false),
+            enable_read_path_debugging: enable_read_path_debugging.unwrap_or(false),
        };

        // ------------------------------------------------------------
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -8,7 +8,6 @@ use std::time::Duration;

 use crate::controller_upcall_client::ControlPlaneGenerationsApi;
 use crate::metrics;
-use crate::tenant::remote_timeline_client::remote_layer_path;
 use crate::tenant::remote_timeline_client::remote_timeline_path;
 use crate::tenant::remote_timeline_client::LayerFileMetadata;
 use crate::virtual_file::MaybeFatalIo;
@@ -151,10 +150,30 @@ impl FlushOp {
    }
 }

+/// A DeletionNotify can be used to wait for a deletion to have been executed.
+#[derive(Debug)]
+pub struct DeletionNotify {
+    /// Receives the `DeletionListSeq` from `ListWriter` when scheduled in a `DeletionList`.
+    seq_rx: tokio::sync::oneshot::Receiver<DeletionListSeq>,
+    /// Watches the last executed `DeletionListSeq`.
+    executed_rx: tokio::sync::watch::Receiver<DeletionListSeq>,
+}
+
+impl DeletionNotify {
+    /// Waits for the deletion to have been executed.
+    pub async fn notify(mut self) {
+        let Ok(wait_seq) = self.seq_rx.await else {
+            return; // TODO return error
+        };
+        self.executed_rx.wait_for(|&seq| seq >= wait_seq).await.ok();
+    }
+}
+
 #[derive(Clone, Debug)]
 pub struct DeletionQueueClient {
    tx: tokio::sync::mpsc::UnboundedSender<ListWriterQueueMessage>,
    executor_tx: tokio::sync::mpsc::Sender<DeleterMessage>,
+    executed_rx: tokio::sync::watch::Receiver<DeletionListSeq>,

    lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
 }
@@ -177,6 +196,9 @@ impl TenantDeletionList {
    }
 }

+/// Deletion list sequence number. Monotonically increasing, even across restarts.
+type DeletionListSeq = u64;
+
 /// Files ending with this suffix will be ignored and erased
 /// during recovery as startup.
 const TEMP_SUFFIX: &str = "tmp";
@@ -186,8 +208,9 @@ struct DeletionList {
    /// Serialization version, for future use
    version: u8,

-    /// Used for constructing a unique key for each deletion list we write out.
-    sequence: u64,
+    /// Used for constructing a unique key for each deletion list we write out, and to notify
+    /// callers when a deletion has been executed (and will not be retried later).
+    sequence: DeletionListSeq,

    /// To avoid repeating tenant/timeline IDs in every key, we store keys in
    /// nested HashMaps by TenantTimelineID.  Each Tenant only appears once
@@ -215,13 +238,13 @@ struct DeletionHeader {

    /// The highest sequence number (inclusive) that has been validated.  All deletion
    /// lists on disk with a sequence <= this value are safe to execute.
-    validated_sequence: u64,
+    validated_sequence: DeletionListSeq,
 }

 impl DeletionHeader {
    const VERSION_LATEST: u8 = 1;

-    fn new(validated_sequence: u64) -> Self {
+    fn new(validated_sequence: DeletionListSeq) -> Self {
        Self {
            version: Self::VERSION_LATEST,
            validated_sequence,
@@ -243,7 +266,7 @@ impl DeletionHeader {

 impl DeletionList {
    const VERSION_LATEST: u8 = 1;
-    fn new(sequence: u64) -> Self {
+    fn new(sequence: DeletionListSeq) -> Self {
        Self {
            version: Self::VERSION_LATEST,
            sequence,
@@ -461,47 +484,24 @@ impl DeletionQueueClient {
    /// layers until you're sure they can be deleted safely (i.e. remote metadata no longer
    /// references them).
    ///
+    /// The returned `DeletionNotify` can be used to wait for the deletion to execute.
+    ///
    /// The `current_generation` is the generation of this pageserver's current attachment.  The
    /// generations in `layers` are the generations in which those layers were written.
-    pub(crate) async fn push_layers(
+    pub(crate) fn push_layers(
        &self,
        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
        current_generation: Generation,
        layers: Vec<(LayerName, LayerFileMetadata)>,
-    ) -> Result<(), DeletionQueueError> {
-        if current_generation.is_none() {
-            debug!("Enqueuing deletions in legacy mode, skipping queue");
+    ) -> Result<DeletionNotify, DeletionQueueError> {
+        // None generations are not valid for attached tenants: they must always be attached in
+        // a known generation.  None generations are still permitted for layers in the index because
+        // they may be historical.
+        assert!(!current_generation.is_none());

-            let mut layer_paths = Vec::new();
-            for (layer, meta) in layers {
-                layer_paths.push(remote_layer_path(
-                    &tenant_shard_id.tenant_id,
-                    &timeline_id,
-                    meta.shard,
-                    &layer,
-                    meta.generation,
-                ));
-            }
-            self.push_immediate(layer_paths).await?;
-            return self.flush_immediate().await;
-        }
+        let (seq_tx, seq_rx) = tokio::sync::oneshot::channel();

-        self.push_layers_sync(tenant_shard_id, timeline_id, current_generation, layers)
-    }
-
-    /// When a Tenant has a generation, push_layers is always synchronous because
-    /// the ListValidator channel is an unbounded channel.
-    ///
-    /// This can be merged into push_layers when we remove the Generation-less mode
-    /// support (`<https://github.com/neondatabase/neon/issues/5395>`)
-    pub(crate) fn push_layers_sync(
-        &self,
-        tenant_shard_id: TenantShardId,
-        timeline_id: TimelineId,
-        current_generation: Generation,
-        layers: Vec<(LayerName, LayerFileMetadata)>,
-    ) -> Result<(), DeletionQueueError> {
        metrics::DELETION_QUEUE
            .keys_submitted
            .inc_by(layers.len() as u64);
@@ -513,8 +513,14 @@ impl DeletionQueueClient {
                layers,
                generation: current_generation,
                objects: Vec::new(),
+                seq_tx,
            }),
-        )
+        )?;
+
+        Ok(DeletionNotify {
+            seq_rx,
+            executed_rx: self.executed_rx.clone(),
+        })
    }

    /// This is cancel-safe.  If you drop the future the flush may still happen in the background.
@@ -638,6 +644,10 @@ impl DeletionQueue {
        // happen in the backend (persistent), not in this queue.
        let (executor_tx, executor_rx) = tokio::sync::mpsc::channel(16);

+        // Notifies clients about executed deletions.
+        // TODO: recover the last sequence number on startup.
+        let (executed_tx, executed_rx) = tokio::sync::watch::channel(0);
+
        let lsn_table = Arc::new(std::sync::RwLock::new(VisibleLsnUpdates::new()));

        // The deletion queue has an independent cancellation token to
@@ -650,6 +660,7 @@ impl DeletionQueue {
                client: DeletionQueueClient {
                    tx,
                    executor_tx: executor_tx.clone(),
+                    executed_rx,
                    lsn_table: lsn_table.clone(),
                },
                cancel: cancel.clone(),
@@ -660,6 +671,7 @@ impl DeletionQueue {
                    conf,
                    backend_rx,
                    executor_tx,
+                    executed_tx,
                    controller_upcall_client,
                    lsn_table.clone(),
                    cancel.clone(),
@@ -957,14 +969,12 @@ mod test {

        // File should still be there after we push it to the queue (we haven't pushed enough to flush anything)
        info!("Pushing");
-        client
-            .push_layers(
-                tenant_shard_id,
-                TIMELINE_ID,
-                now_generation,
-                [(layer_file_name_1.clone(), layer_metadata)].to_vec(),
-            )
-            .await?;
+        client.push_layers(
+            tenant_shard_id,
+            TIMELINE_ID,
+            now_generation,
+            [(layer_file_name_1.clone(), layer_metadata)].to_vec(),
+        )?;
        assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path);

        assert_local_files(&[], &deletion_prefix);
@@ -1017,14 +1027,12 @@ mod test {
        assert_remote_files(&[&remote_layer_name], &remote_timeline_path);

        tracing::debug!("Pushing...");
-        client
-            .push_layers(
-                tenant_shard_id,
-                TIMELINE_ID,
-                stale_generation,
-                [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
-            )
-            .await?;
+        client.push_layers(
+            tenant_shard_id,
+            TIMELINE_ID,
+            stale_generation,
+            [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
+        )?;

        // We enqueued the operation in a stale generation: it should have failed validation
        tracing::debug!("Flushing...");
@@ -1032,14 +1040,12 @@ mod test {
        assert_remote_files(&[&remote_layer_name], &remote_timeline_path);

        tracing::debug!("Pushing...");
-        client
-            .push_layers(
-                tenant_shard_id,
-                TIMELINE_ID,
-                latest_generation,
-                [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
-            )
-            .await?;
+        client.push_layers(
+            tenant_shard_id,
+            TIMELINE_ID,
+            latest_generation,
+            [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
+        )?;

        // We enqueued the operation in a fresh generation: it should have passed validation
        tracing::debug!("Flushing...");
@@ -1074,28 +1080,24 @@ mod test {
        // generation gets that treatment)
        let remote_layer_file_name_historical =
            ctx.write_remote_layer(EXAMPLE_LAYER_NAME, layer_generation)?;
-        client
-            .push_layers(
-                tenant_shard_id,
-                TIMELINE_ID,
-                now_generation.previous(),
-                [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
-            )
-            .await?;
+        client.push_layers(
+            tenant_shard_id,
+            TIMELINE_ID,
+            now_generation.previous(),
+            [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
+        )?;

        // Inject a deletion in the generation before generation_now: after restart,
        // this deletion should get executed, because we execute deletions in the
        // immediately previous generation on the same node.
        let remote_layer_file_name_previous =
            ctx.write_remote_layer(EXAMPLE_LAYER_NAME_ALT, layer_generation)?;
-        client
-            .push_layers(
-                tenant_shard_id,
-                TIMELINE_ID,
-                now_generation,
-                [(EXAMPLE_LAYER_NAME_ALT.clone(), layer_metadata.clone())].to_vec(),
-            )
-            .await?;
+        client.push_layers(
+            tenant_shard_id,
+            TIMELINE_ID,
+            now_generation,
+            [(EXAMPLE_LAYER_NAME_ALT.clone(), layer_metadata.clone())].to_vec(),
+        )?;

        client.flush().await?;
        assert_remote_files(
@@ -1139,6 +1141,7 @@ pub(crate) mod mock {
    use tracing::info;

    use super::*;
+    use crate::tenant::remote_timeline_client::remote_layer_path;
    use std::sync::atomic::{AtomicUsize, Ordering};

    pub struct ConsumerState {
@@ -1265,6 +1268,7 @@ pub(crate) mod mock {
            DeletionQueueClient {
                tx: self.tx.clone(),
                executor_tx: self.executor_tx.clone(),
+                executed_rx: todo!(),
                lsn_table: self.lsn_table.clone(),
            }
        }
--- a/pageserver/src/deletion_queue/list_writer.rs
+++ b/pageserver/src/deletion_queue/list_writer.rs
@@ -12,6 +12,7 @@

 use super::DeletionHeader;
 use super::DeletionList;
+use super::DeletionListSeq;
 use super::FlushOp;
 use super::ValidatorQueueMessage;

@@ -65,6 +66,9 @@ pub(super) struct DeletionOp {
    /// The _current_ generation of the Tenant shard attachment in which we are enqueuing
    /// this deletion.
    pub(super) generation: Generation,
+
+    /// Return channel for the `DeletionList::sequence` this deletion is included in.
+    pub(super) seq_tx: tokio::sync::oneshot::Sender<DeletionListSeq>,
 }

 #[derive(Debug)]
@@ -175,7 +179,7 @@ impl ListWriter {
    ///
    /// It is not an error for the header to not exist: we return None, and
    /// the caller should act as if validated_sequence is 0
-    async fn load_validated_sequence(&self) -> Result<Option<u64>, anyhow::Error> {
+    async fn load_validated_sequence(&self) -> Result<Option<DeletionListSeq>, anyhow::Error> {
        let header_path = self.conf.deletion_header_path();
        match tokio::fs::read(&header_path).await {
            Ok(header_bytes) => {
@@ -228,7 +232,7 @@ impl ListWriter {

        let temp_extension = format!(".{TEMP_SUFFIX}");
        let header_path = self.conf.deletion_header_path();
-        let mut seqs: Vec<u64> = Vec::new();
+        let mut seqs: Vec<DeletionListSeq> = Vec::new();
        while let Some(dentry) = dir.next_entry().await.fatal_err("read deletion dentry") {
            let file_name = dentry.file_name();
            let dentry_str = file_name.to_string_lossy();
@@ -433,6 +437,9 @@ impl ListWriter {
                            metrics::DELETION_QUEUE.unexpected_errors.inc();
                        }
                    }
+
+                    // Notify the client about the sequence number of this deletion.
+                    op.seq_tx.send(self.pending.sequence).ok();
                }
                ListWriterQueueMessage::Flush(op) => {
                    if self.pending.is_empty() {
--- a/pageserver/src/deletion_queue/validator.rs
+++ b/pageserver/src/deletion_queue/validator.rs
@@ -33,6 +33,7 @@ use crate::virtual_file::MaybeFatalIo;
 use super::deleter::DeleterMessage;
 use super::DeletionHeader;
 use super::DeletionList;
+use super::DeletionListSeq;
 use super::DeletionQueueError;
 use super::FlushOp;
 use super::VisibleLsnUpdates;
@@ -60,6 +61,9 @@ where
    rx: tokio::sync::mpsc::Receiver<ValidatorQueueMessage>,
    tx: tokio::sync::mpsc::Sender<DeleterMessage>,

+    /// Notifies clients about the last executed DeletionList sequence number.
+    executed_tx: tokio::sync::watch::Sender<DeletionListSeq>,
+
    // Client for calling into control plane API for validation of deletes
    controller_upcall_client: Option<C>,

@@ -94,6 +98,7 @@ where
        conf: &'static PageServerConf,
        rx: tokio::sync::mpsc::Receiver<ValidatorQueueMessage>,
        tx: tokio::sync::mpsc::Sender<DeleterMessage>,
+        executed_tx: tokio::sync::watch::Sender<DeletionListSeq>,
        controller_upcall_client: Option<C>,
        lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
        cancel: CancellationToken,
@@ -102,6 +107,7 @@ where
            conf,
            rx,
            tx,
+            executed_tx,
            controller_upcall_client,
            lsn_table,
            pending_lists: Vec::new(),
@@ -161,7 +167,7 @@ where
            tenant_generations.keys().map(|k| (*k, true)).collect()
        };

-        let mut validated_sequence: Option<u64> = None;
+        let mut validated_sequence: Option<DeletionListSeq> = None;

        // Apply the validation results to the pending LSN updates
        for (tenant_id, tenant_lsn_state) in pending_lsn_updates.tenants {
@@ -295,6 +301,7 @@ where
    async fn cleanup_lists(&mut self, list_paths: Vec<Utf8PathBuf>) {
        for list_path in list_paths {
            debug!("Removing deletion list {list_path}");
+            // TODO: this needs to fsync the removal.
            tokio::fs::remove_file(&list_path)
                .await
                .fatal_err("remove deletion list");
@@ -324,6 +331,7 @@ where
        }

        // Drain `validated_lists` into the executor
+        let executed_seq = self.validated_lists.iter().map(|l| l.sequence).max();
        let mut executing_lists = Vec::new();
        for list in self.validated_lists.drain(..) {
            let list_path = self.conf.deletion_list_path(list.sequence);
@@ -340,6 +348,14 @@ where
        // Erase the deletion lists whose keys have all be deleted from remote storage
        self.cleanup_lists(executing_lists).await;

+        // Notify any waiters that the deletions have been executed.
+        //
+        // TODO: this will wait for all pending lists to be deleted. Consider making it more
+        // responsive by processing lists one by one.
+        if let Some(executed_seq) = executed_seq {
+            self.executed_tx.send_replace(executed_seq);
+        }
+
        Ok(())
    }

--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -61,6 +61,7 @@ use crate::{
        remote_timeline_client::LayerFileMetadata,
        secondary::SecondaryTenant,
        storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName, LayerVisibilityHint},
+        tasks::sleep_random,
    },
    CancellableTask, DiskUsageEvictionTask,
 };
@@ -210,14 +211,8 @@ async fn disk_usage_eviction_task(
        info!("disk usage based eviction task finishing");
    };

-    use crate::tenant::tasks::random_init_delay;
-    {
-        if random_init_delay(task_config.period, &cancel)
-            .await
-            .is_err()
-        {
-            return;
-        }
+    if sleep_random(task_config.period, &cancel).await.is_err() {
+        return;
    }

    let mut iteration_no = 0;
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -13,6 +13,12 @@ use enumset::EnumSet;
 use futures::future::join_all;
 use futures::StreamExt;
 use futures::TryFutureExt;
+use http_utils::endpoint::{
+    profile_cpu_handler, profile_heap_handler, prometheus_metrics_handler, request_span,
+};
+use http_utils::failpoints::failpoints_handler;
+use http_utils::request::must_parse_query_param;
+use http_utils::request::{get_request_param, must_get_query_param, parse_query_param};
 use humantime::format_rfc3339;
 use hyper::header;
 use hyper::StatusCode;
@@ -60,13 +66,6 @@ use tokio::time::Instant;
 use tokio_util::io::StreamReader;
 use tokio_util::sync::CancellationToken;
 use tracing::*;
-use utils::auth::JwtAuth;
-use utils::failpoint_support::failpoints_handler;
-use utils::http::endpoint::{
-    profile_cpu_handler, profile_heap_handler, prometheus_metrics_handler, request_span,
-};
-use utils::http::request::must_parse_query_param;
-use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};

 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
@@ -104,6 +103,13 @@ use crate::tenant::OffloadedTimeline;
 use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
 use crate::DEFAULT_PG_VERSION;
 use crate::{disk_usage_eviction_task, tenant};
+use http_utils::{
+    endpoint::{self, attach_openapi_ui, auth_middleware, check_permission_with},
+    error::{ApiError, HttpErrorBody},
+    json::{json_request, json_request_maybe, json_response},
+    request::parse_request_param,
+    RequestExt, RouterBuilder,
+};
 use pageserver_api::models::{
    StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest,
    TimelineInfo,
@@ -111,13 +117,6 @@ use pageserver_api::models::{
 use utils::{
    auth::SwappableJwtAuth,
    generation::Generation,
-    http::{
-        endpoint::{self, attach_openapi_ui, auth_middleware, check_permission_with},
-        error::{ApiError, HttpErrorBody},
-        json::{json_request, json_request_maybe, json_response},
-        request::parse_request_param,
-        RequestExt, RouterBuilder,
-    },
    id::{TenantId, TimelineId},
    lsn::Lsn,
 };
@@ -561,7 +560,7 @@ async fn reload_auth_validation_keys_handler(
    let key_path = config.auth_validation_public_key_path.as_ref().unwrap();
    info!("Reloading public key(s) for verifying JWT tokens from {key_path:?}");

-    match JwtAuth::from_key_path(key_path) {
+    match utils::auth::JwtAuth::from_key_path(key_path) {
        Ok(new_auth) => {
            shared_auth.swap(new_auth);
            json_response(StatusCode::OK, ())
@@ -3393,7 +3392,17 @@ where
                            let status = response.status();
                            info!(%status, "Cancelled request finished successfully")
                        }
-                        Err(e) => error!("Cancelled request finished with an error: {e:?}"),
+                        Err(e) => match e {
+                            ApiError::ShuttingDown | ApiError::ResourceUnavailable(_) => {
+                                // Don't log this at error severity: they are normal during lifecycle of tenants/process
+                                info!("Cancelled request aborted for shutdown")
+                            }
+                            _ => {
+                                // Log these in a highly visible way, because we have no client to send the response to, but
+                                // would like to know that something went wrong.
+                                error!("Cancelled request finished with an error: {e:?}")
+                            }
+                        },
                    }
                }
                // only logging for cancelled panicked request handlers is the tracing_panic_hook,
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -263,14 +263,6 @@ pub(crate) const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json";
 /// data directory at pageserver startup can be automatically removed.
 pub(crate) const TEMP_FILE_SUFFIX: &str = "___temp";

-/// A marker file to mark that a timeline directory was not fully initialized.
-/// If a timeline directory with this marker is encountered at pageserver startup,
-/// the timeline directory and the marker file are both removed.
-/// Full path: `tenants/<tenant_id>/timelines/<timeline_id>___uninit`.
-pub(crate) const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";
-
-pub(crate) const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete";
-
 pub fn is_temporary(path: &Utf8Path) -> bool {
    match path.file_name() {
        Some(name) => name.ends_with(TEMP_FILE_SUFFIX),
@@ -278,25 +270,6 @@ pub fn is_temporary(path: &Utf8Path) -> bool {
    }
 }

-fn ends_with_suffix(path: &Utf8Path, suffix: &str) -> bool {
-    match path.file_name() {
-        Some(name) => name.ends_with(suffix),
-        None => false,
-    }
-}
-
-// FIXME: DO NOT ADD new query methods like this, which will have a next step of parsing timelineid
-// from the directory name. Instead create type "UninitMark(TimelineId)" and only parse it once
-// from the name.
-
-pub(crate) fn is_uninit_mark(path: &Utf8Path) -> bool {
-    ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX)
-}
-
-pub(crate) fn is_delete_mark(path: &Utf8Path) -> bool {
-    ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX)
-}
-
 /// During pageserver startup, we need to order operations not to exhaust tokio worker threads by
 /// blocking.
 ///
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -6,7 +6,7 @@ use std::sync::{Arc, Mutex};
 use std::task::{Context, Poll};
 use std::time::{Duration, Instant};

-use enum_map::EnumMap;
+use enum_map::{Enum as _, EnumMap};
 use futures::Future;
 use metrics::{
    register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
@@ -32,6 +32,7 @@ use utils::id::TimelineId;

 use crate::config::PageServerConf;
 use crate::context::{PageContentKind, RequestContext};
+use crate::pgdatadir_mapping::DatadirModificationStats;
 use crate::task_mgr::TaskKind;
 use crate::tenant::layer_map::LayerMap;
 use crate::tenant::mgr::TenantSlot;
@@ -103,7 +104,7 @@ pub(crate) static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::n
    .expect("failed to define a metric")
 });

-// Buckets for background operations like compaction, GC, size calculation
+// Buckets for background operation duration in seconds, like compaction, GC, size calculation.
 const STORAGE_OP_BUCKETS: &[f64] = &[0.010, 0.100, 1.0, 10.0, 100.0, 1000.0];

 pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
@@ -235,7 +236,7 @@ pub(crate) static GET_VECTORED_LATENCY: Lazy<GetVectoredLatency> = Lazy::new(||

    GetVectoredLatency {
        map: EnumMap::from_array(std::array::from_fn(|task_kind_idx| {
-            let task_kind = <TaskKind as enum_map::Enum>::from_usize(task_kind_idx);
+            let task_kind = TaskKind::from_usize(task_kind_idx);

            if GetVectoredLatency::TRACKED_TASK_KINDS.contains(&task_kind) {
                let task_kind = task_kind.into();
@@ -258,7 +259,7 @@ pub(crate) static SCAN_LATENCY: Lazy<ScanLatency> = Lazy::new(|| {

    ScanLatency {
        map: EnumMap::from_array(std::array::from_fn(|task_kind_idx| {
-            let task_kind = <TaskKind as enum_map::Enum>::from_usize(task_kind_idx);
+            let task_kind = TaskKind::from_usize(task_kind_idx);

            if ScanLatency::TRACKED_TASK_KINDS.contains(&task_kind) {
                let task_kind = task_kind.into();
@@ -299,10 +300,10 @@ static PAGE_CACHE_READ_ACCESSES: Lazy<IntCounterVec> = Lazy::new(|| {

 pub(crate) static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMetrics {
    map: EnumMap::from_array(std::array::from_fn(|task_kind| {
-        let task_kind = <TaskKind as enum_map::Enum>::from_usize(task_kind);
+        let task_kind = TaskKind::from_usize(task_kind);
        let task_kind: &'static str = task_kind.into();
        EnumMap::from_array(std::array::from_fn(|content_kind| {
-            let content_kind = <PageContentKind as enum_map::Enum>::from_usize(content_kind);
+            let content_kind = PageContentKind::from_usize(content_kind);
            let content_kind: &'static str = content_kind.into();
            PageCacheMetricsForTaskKind {
                read_accesses_immutable: {
@@ -1365,10 +1366,7 @@ impl SmgrOpTimer {
    /// The first callers receives Some, subsequent ones None.
    ///
    /// See [`SmgrOpTimerState`] for more context.
-    pub(crate) fn observe_execution_end_flush_start(
-        &mut self,
-        at: Instant,
-    ) -> Option<SmgrOpFlushInProgress> {
+    pub(crate) fn observe_execution_end(&mut self, at: Instant) -> Option<SmgrOpFlushInProgress> {
        // NB: unlike the other observe_* methods, this one take()s.
        #[allow(clippy::question_mark)] // maintain similar code pattern.
        let Some(mut inner) = self.0.take() else {
@@ -1402,7 +1400,6 @@ impl SmgrOpTimer {
            ..
        } = inner;
        Some(SmgrOpFlushInProgress {
-            flush_started_at: at,
            global_micros: global_flush_in_progress_micros,
            per_timeline_micros: per_timeline_flush_in_progress_micros,
        })
@@ -1418,7 +1415,6 @@ impl SmgrOpTimer {
 /// add another `observe_*` method to [`SmgrOpTimer`], follow the existing pattern there,
 /// and remove this struct from the code base.
 pub(crate) struct SmgrOpFlushInProgress {
-    flush_started_at: Instant,
    global_micros: IntCounter,
    per_timeline_micros: IntCounter,
 }
@@ -1437,12 +1433,13 @@ impl Drop for SmgrOpTimer {
        self.observe_throttle_start(now);
        self.observe_throttle_done(ThrottleResult::NotThrottled { end: now });
        self.observe_execution_start(now);
-        self.observe_execution_end_flush_start(now);
+        let maybe_flush_timer = self.observe_execution_end(now);
+        drop(maybe_flush_timer);
    }
 }

 impl SmgrOpFlushInProgress {
-    pub(crate) async fn measure<Fut, O>(mut self, mut fut: Fut) -> O
+    pub(crate) async fn measure<Fut, O>(self, mut started_at: Instant, mut fut: Fut) -> O
    where
        Fut: std::future::Future<Output = O>,
    {
@@ -1454,12 +1451,12 @@ impl SmgrOpFlushInProgress {
        let mut observe_guard = scopeguard::guard(
            || {
                let now = Instant::now();
-                let elapsed = now - self.flush_started_at;
+                let elapsed = now - started_at;
                self.global_micros
                    .inc_by(u64::try_from(elapsed.as_micros()).unwrap());
                self.per_timeline_micros
                    .inc_by(u64::try_from(elapsed.as_micros()).unwrap());
-                self.flush_started_at = now;
+                started_at = now;
            },
            |mut observe| {
                observe();
@@ -1912,7 +1909,7 @@ pub(crate) static COMPUTE_COMMANDS_COUNTERS: Lazy<ComputeCommandCounters> = Lazy

    ComputeCommandCounters {
        map: EnumMap::from_array(std::array::from_fn(|i| {
-            let command = <ComputeCommandKind as enum_map::Enum>::from_usize(i);
+            let command = ComputeCommandKind::from_usize(i);
            let command_str: &'static str = command.into();
            inner.with_label_values(&[command_str])
        })),
@@ -2212,11 +2209,13 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {

 pub struct BackgroundLoopSemaphoreMetrics {
    counters: EnumMap<BackgroundLoopKind, IntCounterPair>,
-    durations: EnumMap<BackgroundLoopKind, Counter>,
+    durations: EnumMap<BackgroundLoopKind, Histogram>,
+    waiting_tasks: EnumMap<BackgroundLoopKind, IntGauge>,
+    running_tasks: EnumMap<BackgroundLoopKind, IntGauge>,
 }

-pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy<BackgroundLoopSemaphoreMetrics> = Lazy::new(
-    || {
+pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy<BackgroundLoopSemaphoreMetrics> =
+    Lazy::new(|| {
        let counters = register_int_counter_pair_vec!(
            "pageserver_background_loop_semaphore_wait_start_count",
            "Counter for background loop concurrency-limiting semaphore acquire calls started",
@@ -2226,45 +2225,101 @@ pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy<BackgroundLoopSemaphoreMetrics
        )
        .unwrap();

-        let durations = register_counter_vec!(
-            "pageserver_background_loop_semaphore_wait_duration_seconds",
-            "Sum of wall clock time spent waiting on the background loop concurrency-limiting semaphore acquire calls",
+        let durations = register_histogram_vec!(
+            "pageserver_background_loop_semaphore_wait_seconds",
+            "Seconds spent waiting on background loop semaphore acquisition",
+            &["task"],
+            vec![0.01, 1.0, 5.0, 10.0, 30.0, 60.0, 180.0, 300.0, 600.0],
+        )
+        .unwrap();
+
+        let waiting_tasks = register_int_gauge_vec!(
+            "pageserver_background_loop_semaphore_waiting_tasks",
+            "Number of background loop tasks waiting for semaphore",
+            &["task"],
+        )
+        .unwrap();
+
+        let running_tasks = register_int_gauge_vec!(
+            "pageserver_background_loop_semaphore_running_tasks",
+            "Number of background loop tasks running concurrently",
            &["task"],
        )
        .unwrap();

        BackgroundLoopSemaphoreMetrics {
-            counters: enum_map::EnumMap::from_array(std::array::from_fn(|i| {
-                let kind = <BackgroundLoopKind as enum_map::Enum>::from_usize(i);
+            counters: EnumMap::from_array(std::array::from_fn(|i| {
+                let kind = BackgroundLoopKind::from_usize(i);
                counters.with_label_values(&[kind.into()])
            })),
-            durations: enum_map::EnumMap::from_array(std::array::from_fn(|i| {
-                let kind = <BackgroundLoopKind as enum_map::Enum>::from_usize(i);
+            durations: EnumMap::from_array(std::array::from_fn(|i| {
+                let kind = BackgroundLoopKind::from_usize(i);
                durations.with_label_values(&[kind.into()])
            })),
+            waiting_tasks: EnumMap::from_array(std::array::from_fn(|i| {
+                let kind = BackgroundLoopKind::from_usize(i);
+                waiting_tasks.with_label_values(&[kind.into()])
+            })),
+            running_tasks: EnumMap::from_array(std::array::from_fn(|i| {
+                let kind = BackgroundLoopKind::from_usize(i);
+                running_tasks.with_label_values(&[kind.into()])
+            })),
        }
-    },
-);
+    });

 impl BackgroundLoopSemaphoreMetrics {
-    pub(crate) fn measure_acquisition(&self, task: BackgroundLoopKind) -> impl Drop + '_ {
-        struct Record<'a> {
-            metrics: &'a BackgroundLoopSemaphoreMetrics,
-            task: BackgroundLoopKind,
-            _counter_guard: metrics::IntCounterPairGuard,
-            start: Instant,
-        }
-        impl Drop for Record<'_> {
-            fn drop(&mut self) {
-                let elapsed = self.start.elapsed().as_secs_f64();
-                self.metrics.durations[self.task].inc_by(elapsed);
-            }
-        }
-        Record {
-            metrics: self,
+    /// Starts recording semaphore metrics. Call `acquired()` on the returned recorder when the
+    /// semaphore is acquired, and drop it when the task completes or is cancelled.
+    pub(crate) fn record(
+        &self,
+        task: BackgroundLoopKind,
+    ) -> BackgroundLoopSemaphoreMetricsRecorder {
+        BackgroundLoopSemaphoreMetricsRecorder::start(self, task)
+    }
+}
+
+/// Records metrics for a background task.
+pub struct BackgroundLoopSemaphoreMetricsRecorder<'a> {
+    metrics: &'a BackgroundLoopSemaphoreMetrics,
+    task: BackgroundLoopKind,
+    start: Instant,
+    wait_counter_guard: Option<metrics::IntCounterPairGuard>,
+}
+
+impl<'a> BackgroundLoopSemaphoreMetricsRecorder<'a> {
+    /// Starts recording semaphore metrics, by recording wait time and incrementing
+    /// `wait_start_count` and `waiting_tasks`.
+    fn start(metrics: &'a BackgroundLoopSemaphoreMetrics, task: BackgroundLoopKind) -> Self {
+        metrics.waiting_tasks[task].inc();
+        Self {
+            metrics,
            task,
-            _counter_guard: self.counters[task].guard(),
            start: Instant::now(),
+            wait_counter_guard: Some(metrics.counters[task].guard()),
+        }
+    }
+
+    /// Signals that the semaphore has been acquired, and updates relevant metrics.
+    pub fn acquired(&mut self) -> Duration {
+        let waited = self.start.elapsed();
+        self.wait_counter_guard.take().expect("already acquired");
+        self.metrics.durations[self.task].observe(waited.as_secs_f64());
+        self.metrics.waiting_tasks[self.task].dec();
+        self.metrics.running_tasks[self.task].inc();
+        waited
+    }
+}
+
+impl Drop for BackgroundLoopSemaphoreMetricsRecorder<'_> {
+    /// The task either completed or was cancelled.
+    fn drop(&mut self) {
+        if self.wait_counter_guard.take().is_some() {
+            // Waiting.
+            self.metrics.durations[self.task].observe(self.start.elapsed().as_secs_f64());
+            self.metrics.waiting_tasks[self.task].dec();
+        } else {
+            // Running.
+            self.metrics.running_tasks[self.task].dec();
        }
    }
 }
@@ -2378,11 +2433,40 @@ pub(crate) struct WalIngestMetrics {
    pub(crate) records_observed: IntCounter,
    pub(crate) records_committed: IntCounter,
    pub(crate) records_filtered: IntCounter,
+    pub(crate) values_committed_metadata_images: IntCounter,
+    pub(crate) values_committed_metadata_deltas: IntCounter,
+    pub(crate) values_committed_data_images: IntCounter,
+    pub(crate) values_committed_data_deltas: IntCounter,
    pub(crate) gap_blocks_zeroed_on_rel_extend: IntCounter,
-    pub(crate) clear_vm_bits_unknown: IntCounterVec,
+}
+
+impl WalIngestMetrics {
+    pub(crate) fn inc_values_committed(&self, stats: &DatadirModificationStats) {
+        if stats.metadata_images > 0 {
+            self.values_committed_metadata_images
+                .inc_by(stats.metadata_images);
+        }
+        if stats.metadata_deltas > 0 {
+            self.values_committed_metadata_deltas
+                .inc_by(stats.metadata_deltas);
+        }
+        if stats.data_images > 0 {
+            self.values_committed_data_images.inc_by(stats.data_images);
+        }
+        if stats.data_deltas > 0 {
+            self.values_committed_data_deltas.inc_by(stats.data_deltas);
+        }
+    }
 }

 pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| {
+    let values_committed = register_int_counter_vec!(
+        "pageserver_wal_ingest_values_committed",
+        "Number of values committed to pageserver storage from WAL records",
+        &["class", "kind"],
+    )
+    .expect("failed to define a metric");
+
    WalIngestMetrics {
    bytes_received: register_int_counter!(
        "pageserver_wal_ingest_bytes_received",
@@ -2409,17 +2493,15 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| {
        "Number of WAL records filtered out due to sharding"
    )
    .expect("failed to define a metric"),
+    values_committed_metadata_images: values_committed.with_label_values(&["metadata", "image"]),
+    values_committed_metadata_deltas: values_committed.with_label_values(&["metadata", "delta"]),
+    values_committed_data_images: values_committed.with_label_values(&["data", "image"]),
+    values_committed_data_deltas: values_committed.with_label_values(&["data", "delta"]),
    gap_blocks_zeroed_on_rel_extend: register_int_counter!(
        "pageserver_gap_blocks_zeroed_on_rel_extend",
        "Total number of zero gap blocks written on relation extends"
    )
    .expect("failed to define a metric"),
-    clear_vm_bits_unknown: register_int_counter_vec!(
-        "pageserver_wal_ingest_clear_vm_bits_unknown",
-        "Number of ignored ClearVmBits operations due to unknown pages/relations",
-        &["entity"],
-    )
-    .expect("failed to define a metric"),
 }
 });

@@ -2486,7 +2568,7 @@ pub(crate) static WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM: Lazy<Histogram> =

 pub(crate) struct WalRedoProcessCounters {
    pub(crate) started: IntCounter,
-    pub(crate) killed_by_cause: enum_map::EnumMap<WalRedoKillCause, IntCounter>,
+    pub(crate) killed_by_cause: EnumMap<WalRedoKillCause, IntCounter>,
    pub(crate) active_stderr_logger_tasks_started: IntCounter,
    pub(crate) active_stderr_logger_tasks_finished: IntCounter,
 }
@@ -2528,7 +2610,7 @@ impl Default for WalRedoProcessCounters {
        Self {
            started,
            killed_by_cause: EnumMap::from_array(std::array::from_fn(|i| {
-                let cause = <WalRedoKillCause as enum_map::Enum>::from_usize(i);
+                let cause = WalRedoKillCause::from_usize(i);
                let cause_str: &'static str = cause.into();
                killed.with_label_values(&[cause_str])
            })),
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -489,7 +489,6 @@ impl timeline::handle::TenantManager<TenantManagerTypes> for TenantManagerWrappe
        let timeline = tenant_shard
            .get_timeline(timeline_id, true)
            .map_err(GetActiveTimelineError::Timeline)?;
-        set_tracing_field_shard_id(&timeline);
        Ok(timeline)
    }
 }
@@ -774,11 +773,11 @@ impl PageServerHandler {

        let batched_msg = match neon_fe_msg {
            PagestreamFeMessage::Exists(req) => {
-                let span = tracing::info_span!(parent: parent_span, "handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn);
                let shard = timeline_handles
                    .get(tenant_id, timeline_id, ShardSelector::Zero)
-                    .instrument(span.clone()) // sets `shard_id` field
                    .await?;
+                debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
+                let span = tracing::info_span!(parent: &parent_span, "handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug());
                let timer = record_op_start_and_throttle(
                    &shard,
                    metrics::SmgrQueryType::GetRelExists,
@@ -793,11 +792,10 @@ impl PageServerHandler {
                }
            }
            PagestreamFeMessage::Nblocks(req) => {
-                let span = tracing::info_span!(parent: parent_span, "handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn);
                let shard = timeline_handles
                    .get(tenant_id, timeline_id, ShardSelector::Zero)
-                    .instrument(span.clone()) // sets `shard_id` field
                    .await?;
+                let span = tracing::info_span!(parent: &parent_span, "handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug());
                let timer = record_op_start_and_throttle(
                    &shard,
                    metrics::SmgrQueryType::GetRelSize,
@@ -812,11 +810,10 @@ impl PageServerHandler {
                }
            }
            PagestreamFeMessage::DbSize(req) => {
-                let span = tracing::info_span!(parent: parent_span, "handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.hdr.request_lsn);
                let shard = timeline_handles
                    .get(tenant_id, timeline_id, ShardSelector::Zero)
-                    .instrument(span.clone()) // sets `shard_id` field
                    .await?;
+                let span = tracing::info_span!(parent: &parent_span, "handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug());
                let timer = record_op_start_and_throttle(
                    &shard,
                    metrics::SmgrQueryType::GetDbSize,
@@ -831,11 +828,10 @@ impl PageServerHandler {
                }
            }
            PagestreamFeMessage::GetSlruSegment(req) => {
-                let span = tracing::info_span!(parent: parent_span, "handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.hdr.request_lsn);
                let shard = timeline_handles
                    .get(tenant_id, timeline_id, ShardSelector::Zero)
-                    .instrument(span.clone()) // sets `shard_id` field
                    .await?;
+                let span = tracing::info_span!(parent: &parent_span, "handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug());
                let timer = record_op_start_and_throttle(
                    &shard,
                    metrics::SmgrQueryType::GetSlruSegment,
@@ -850,12 +846,20 @@ impl PageServerHandler {
                }
            }
            PagestreamFeMessage::GetPage(req) => {
-                let span = tracing::info_span!(parent: parent_span, "handle_get_page_at_lsn_request_batched", req_lsn = %req.hdr.request_lsn);
+                // avoid a somewhat costly Span::record() by constructing the entire span in one go.
+                macro_rules! mkspan {
+                    (before shard routing) => {{
+                        tracing::info_span!(parent: &parent_span, "handle_get_page_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.hdr.request_lsn)
+                    }};
+                    ($shard_id:expr) => {{
+                        tracing::info_span!(parent: &parent_span, "handle_get_page_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.hdr.request_lsn, shard_id = %$shard_id)
+                    }};
+                }

                macro_rules! respond_error {
-                    ($error:expr) => {{
+                    ($span:expr, $error:expr) => {{
                        let error = BatchedFeMessage::RespondError {
-                            span,
+                            span: $span,
                            error: BatchedPageStreamError {
                                req: req.hdr,
                                err: $error,
@@ -868,27 +872,35 @@ impl PageServerHandler {
                let key = rel_block_to_key(req.rel, req.blkno);
                let shard = match timeline_handles
                    .get(tenant_id, timeline_id, ShardSelector::Page(key))
-                    .instrument(span.clone()) // sets `shard_id` field
                    .await
                {
                    Ok(tl) => tl,
-                    Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => {
-                        // We already know this tenant exists in general, because we resolved it at
-                        // start of connection.  Getting a NotFound here indicates that the shard containing
-                        // the requested page is not present on this node: the client's knowledge of shard->pageserver
-                        // mapping is out of date.
-                        //
-                        // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via
-                        // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration
-                        // and talk to a different pageserver.
-                        return respond_error!(PageStreamError::Reconnect(
-                            "getpage@lsn request routed to wrong shard".into()
-                        ));
-                    }
                    Err(e) => {
-                        return respond_error!(e.into());
+                        let span = mkspan!(before shard routing);
+                        match e {
+                            GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_)) => {
+                                // We already know this tenant exists in general, because we resolved it at
+                                // start of connection.  Getting a NotFound here indicates that the shard containing
+                                // the requested page is not present on this node: the client's knowledge of shard->pageserver
+                                // mapping is out of date.
+                                //
+                                // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via
+                                // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration
+                                // and talk to a different pageserver.
+                                return respond_error!(
+                                    span,
+                                    PageStreamError::Reconnect(
+                                        "getpage@lsn request routed to wrong shard".into()
+                                    )
+                                );
+                            }
+                            e => {
+                                return respond_error!(span, e.into());
+                            }
+                        }
                    }
                };
+                let span = mkspan!(shard.tenant_shard_id.shard_slug());

                let timer = record_op_start_and_throttle(
                    &shard,
@@ -910,7 +922,7 @@ impl PageServerHandler {
                {
                    Ok(lsn) => lsn,
                    Err(e) => {
-                        return respond_error!(e);
+                        return respond_error!(span, e);
                    }
                };
                BatchedFeMessage::GetPage {
@@ -922,11 +934,10 @@ impl PageServerHandler {
            }
            #[cfg(feature = "testing")]
            PagestreamFeMessage::Test(req) => {
-                let span = tracing::info_span!(parent: parent_span, "handle_test_request");
                let shard = timeline_handles
                    .get(tenant_id, timeline_id, ShardSelector::Zero)
-                    .instrument(span.clone()) // sets `shard_id` field
                    .await?;
+                let span = tracing::info_span!(parent: &parent_span, "handle_test_request", shard_id = %shard.tenant_shard_id.shard_slug());
                let timer =
                    record_op_start_and_throttle(&shard, metrics::SmgrQueryType::Test, received_at)
                        .await?;
@@ -1063,7 +1074,7 @@ impl PageServerHandler {
        };

        // invoke handler function
-        let (handler_results, span): (
+        let (mut handler_results, span): (
            Vec<Result<(PagestreamBeMessage, SmgrOpTimer), BatchedPageStreamError>>,
            _,
        ) = match batch {
@@ -1190,11 +1201,49 @@ impl PageServerHandler {
            }
        };

+        // We purposefully don't count flush time into the smgr operation timer.
+        //
+        // The reason is that current compute client will not perform protocol processing
+        // if the postgres backend process is doing things other than `->smgr_read()`.
+        // This is especially the case for prefetch.
+        //
+        // If the compute doesn't read from the connection, eventually TCP will backpressure
+        // all the way into our flush call below.
+        //
+        // The timer's underlying metric is used for a storage-internal latency SLO and
+        // we don't want to include latency in it that we can't control.
+        // And as pointed out above, in this case, we don't control the time that flush will take.
+        //
+        // We put each response in the batch onto the wire in a separate pgb_writer.flush()
+        // call, which (all unmeasured) adds syscall overhead but reduces time to first byte
+        // and avoids building up a "giant" contiguous userspace buffer to hold the entire response.
+        // TODO: vectored socket IO would be great, but pgb_writer doesn't support that.
+        let flush_timers = {
+            let flushing_start_time = Instant::now();
+            let mut flush_timers = Vec::with_capacity(handler_results.len());
+            for handler_result in &mut handler_results {
+                let flush_timer = match handler_result {
+                    Ok((_, timer)) => Some(
+                        timer
+                            .observe_execution_end(flushing_start_time)
+                            .expect("we are the first caller"),
+                    ),
+                    Err(_) => {
+                        // TODO: measure errors
+                        None
+                    }
+                };
+                flush_timers.push(flush_timer);
+            }
+            assert_eq!(flush_timers.len(), handler_results.len());
+            flush_timers
+        };
+
        // Map handler result to protocol behavior.
        // Some handler errors cause exit from pagestream protocol.
        // Other handler errors are sent back as an error message and we stay in pagestream protocol.
-        for handler_result in handler_results {
-            let (response_msg, timer) = match handler_result {
+        for (handler_result, flushing_timer) in handler_results.into_iter().zip(flush_timers) {
+            let response_msg = match handler_result {
                Err(e) => match &e.err {
                    PageStreamError::Shutdown => {
                        // If we fail to fulfil a request during shutdown, which may be _because_ of
@@ -1218,16 +1267,14 @@ impl PageServerHandler {
                        span.in_scope(|| {
                            error!("error reading relation or page version: {full:#}")
                        });
-                        (
-                            PagestreamBeMessage::Error(PagestreamErrorResponse {
-                                req: e.req,
-                                message: e.err.to_string(),
-                            }),
-                            None, // TODO: measure errors
-                        )
+
+                        PagestreamBeMessage::Error(PagestreamErrorResponse {
+                            req: e.req,
+                            message: e.err.to_string(),
+                        })
                    }
                },
-                Ok((response_msg, timer)) => (response_msg, Some(timer)),
+                Ok((response_msg, _op_timer_already_observed)) => response_msg,
            };

            //
@@ -1238,30 +1285,12 @@ impl PageServerHandler {
                &response_msg.serialize(protocol_version),
            ))?;

-            // We purposefully don't count flush time into the timer.
-            //
-            // The reason is that current compute client will not perform protocol processing
-            // if the postgres backend process is doing things other than `->smgr_read()`.
-            // This is especially the case for prefetch.
-            //
-            // If the compute doesn't read from the connection, eventually TCP will backpressure
-            // all the way into our flush call below.
-            //
-            // The timer's underlying metric is used for a storage-internal latency SLO and
-            // we don't want to include latency in it that we can't control.
-            // And as pointed out above, in this case, we don't control the time that flush will take.
-            let flushing_timer = timer.map(|mut timer| {
-                timer
-                    .observe_execution_end_flush_start(Instant::now())
-                    .expect("we are the first caller")
-            });
-
            // what we want to do
            let flush_fut = pgb_writer.flush();
            // metric for how long flushing takes
            let flush_fut = match flushing_timer {
                Some(flushing_timer) => {
-                    futures::future::Either::Left(flushing_timer.measure(flush_fut))
+                    futures::future::Either::Left(flushing_timer.measure(Instant::now(), flush_fut))
                }
                None => futures::future::Either::Right(flush_fut),
            };
@@ -1280,8 +1309,6 @@ impl PageServerHandler {
                }
                Ok(())
            }
-            // and log the info! line inside the request span
-            .instrument(span.clone())
            .await?;
        }
        Ok(())
@@ -1342,7 +1369,7 @@ impl PageServerHandler {
            .take()
            .expect("implementation error: timeline_handles should not be locked");

-        let request_span = info_span!("request", shard_id = tracing::field::Empty);
+        let request_span = info_span!("request");
        let ((pgb_reader, timeline_handles), result) = match self.pipelining_config.clone() {
            PageServicePipeliningConfig::Pipelined(pipelining_config) => {
                self.handle_pagerequests_pipelined(
@@ -1692,7 +1719,7 @@ impl PageServerHandler {
        // to distinguish a misbehaving client (asking for old LSN) from a storage issue (data missing at a legitimate LSN).
        if request_lsn < **latest_gc_cutoff_lsn && !timeline.is_gc_blocked_by_lsn_lease_deadline() {
            let gc_info = &timeline.gc_info.read().unwrap();
-            if !gc_info.leases.contains_key(&request_lsn) {
+            if !gc_info.lsn_covered_by_lease(request_lsn) {
                return Err(
                    PageStreamError::BadRequest(format!(
                        "tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
@@ -2036,6 +2063,13 @@ impl PageServerHandler {
            .unwrap()
            .get(tenant_id, timeline_id, ShardSelector::Zero)
            .await?;
+        set_tracing_field_shard_id(&timeline);
+
+        if timeline.is_archived() == Some(true) {
+            // TODO after a grace period, turn this log line into a hard error
+            tracing::warn!("timeline {tenant_id}/{timeline_id} is archived, but got basebackup request for it.");
+            //return Err(QueryError::NotFound("timeline is archived".into()))
+        }

        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
        if let Some(lsn) = lsn {
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -48,7 +48,7 @@ use tracing::{debug, trace, warn};
 use utils::bin_ser::DeserializeError;
 use utils::pausable_failpoint;
 use utils::{bin_ser::BeSer, lsn::Lsn};
-use wal_decoder::serialized_batch::SerializedValueBatch;
+use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta};

 /// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached.
 pub const MAX_AUX_FILE_DELTAS: usize = 1024;
@@ -612,11 +612,18 @@ impl Timeline {
        pausable_failpoint!("find-lsn-for-timestamp-pausable");

        let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn();
+        let gc_cutoff_planned = {
+            let gc_info = self.gc_info.read().unwrap();
+            gc_info.min_cutoff()
+        };
+        // Usually the planned cutoff is newer than the cutoff of the last gc run,
+        // but let's be defensive.
+        let gc_cutoff = gc_cutoff_planned.max(*gc_cutoff_lsn_guard);
        // We use this method to figure out the branching LSN for the new branch, but the
        // GC cutoff could be before the branching point and we cannot create a new branch
        // with LSN < `ancestor_lsn`. Thus, pick the maximum of these two to be
        // on the safe side.
-        let min_lsn = std::cmp::max(*gc_cutoff_lsn_guard, self.get_ancestor_lsn());
+        let min_lsn = std::cmp::max(gc_cutoff, self.get_ancestor_lsn());
        let max_lsn = self.get_last_record_lsn();

        // LSNs are always 8-byte aligned. low/mid/high represent the
@@ -1297,6 +1304,26 @@ impl DatadirModification<'_> {
            .is_some_and(|b| b.has_data())
    }

+    /// Returns statistics about the currently pending modifications.
+    pub(crate) fn stats(&self) -> DatadirModificationStats {
+        let mut stats = DatadirModificationStats::default();
+        for (_, _, value) in self.pending_metadata_pages.values().flatten() {
+            match value {
+                Value::Image(_) => stats.metadata_images += 1,
+                Value::WalRecord(r) if r.will_init() => stats.metadata_images += 1,
+                Value::WalRecord(_) => stats.metadata_deltas += 1,
+            }
+        }
+        for valuemeta in self.pending_data_batch.iter().flat_map(|b| &b.metadata) {
+            match valuemeta {
+                ValueMeta::Serialized(s) if s.will_init => stats.data_images += 1,
+                ValueMeta::Serialized(_) => stats.data_deltas += 1,
+                ValueMeta::Observed(_) => {}
+            }
+        }
+        stats
+    }
+
    /// Set the current lsn
    pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
        ensure!(
@@ -2317,6 +2344,15 @@ impl DatadirModification<'_> {
    }
 }

+/// Statistics for a DatadirModification.
+#[derive(Default)]
+pub struct DatadirModificationStats {
+    pub metadata_images: u64,
+    pub metadata_deltas: u64,
+    pub data_images: u64,
+    pub data_deltas: u64,
+}
+
 /// This struct facilitates accessing either a committed key from the timeline at a
 /// specific LSN, or the latest uncommitted key from a pending modification.
 ///
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -328,8 +328,8 @@ pub enum TaskKind {
    // Eviction. One per timeline.
    Eviction,

-    // Ingest housekeeping (flushing ephemeral layers on time threshold or disk pressure)
-    IngestHousekeeping,
+    // Tenant housekeeping (flush idle ephemeral layers, shut down idle walredo, etc.).
+    TenantHousekeeping,

    /// See [`crate::disk_usage_eviction_task`].
    DiskUsageEviction,
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -20,6 +20,7 @@ use chrono::NaiveDateTime;
 use enumset::EnumSet;
 use futures::stream::FuturesUnordered;
 use futures::StreamExt;
+use itertools::Itertools as _;
 use pageserver_api::models;
 use pageserver_api::models::CompactInfoResponse;
 use pageserver_api::models::LsnLease;
@@ -46,14 +47,18 @@ use std::sync::atomic::AtomicBool;
 use std::sync::Weak;
 use std::time::SystemTime;
 use storage_broker::BrokerClientChannel;
+use timeline::compaction::CompactionOutcome;
 use timeline::compaction::GcCompactionQueue;
 use timeline::import_pgdata;
 use timeline::offload::offload_timeline;
 use timeline::offload::OffloadError;
+use timeline::CompactFlags;
 use timeline::CompactOptions;
+use timeline::CompactionError;
 use timeline::ShutdownMode;
 use tokio::io::BufReader;
 use tokio::sync::watch;
+use tokio::sync::Notify;
 use tokio::task::JoinSet;
 use tokio_util::sync::CancellationToken;
 use tracing::*;
@@ -95,7 +100,6 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::deletion_queue::DeletionQueueClient;
 use crate::deletion_queue::DeletionQueueError;
 use crate::import_datadir;
-use crate::is_uninit_mark;
 use crate::l0_flush::L0FlushGlobalState;
 use crate::metrics::CONCURRENT_INITDBS;
 use crate::metrics::INITDB_RUN_TIME;
@@ -349,6 +353,9 @@ pub struct Tenant {
    /// Overhead of mutex is acceptable because compaction is done with a multi-second period.
    compaction_circuit_breaker: std::sync::Mutex<CircuitBreaker>,

+    /// Signals the tenant compaction loop that there is L0 compaction work to be done.
+    pub(crate) l0_compaction_trigger: Arc<Notify>,
+
    /// Scheduled gc-compaction tasks.
    scheduled_compaction_tasks: std::sync::Mutex<HashMap<TimelineId, Arc<GcCompactionQueue>>>,

@@ -1690,12 +1697,7 @@ impl Tenant {
                    timeline_id,
                    index_part,
                    remote_metadata,
-                    TimelineResources {
-                        remote_client,
-                        pagestream_throttle: self.pagestream_throttle.clone(),
-                        pagestream_throttle_metrics: self.pagestream_throttle_metrics.clone(),
-                        l0_flush_global_state: self.l0_flush_global_state.clone(),
-                    },
+                    self.get_timeline_resources_for(remote_client),
                    LoadTimelineCause::Attach,
                    ctx,
                )
@@ -1793,11 +1795,7 @@ impl Tenant {
            let entry = entry.context("read timeline dir entry")?;
            let entry_path = entry.path();

-            let purge = if crate::is_temporary(entry_path)
-                // TODO: remove uninit mark code (https://github.com/neondatabase/neon/issues/5718)
-                || is_uninit_mark(entry_path)
-                || crate::is_delete_mark(entry_path)
-            {
+            let purge = if crate::is_temporary(entry_path) {
                true
            } else {
                match TimelineId::try_from(entry_path.file_name()) {
@@ -2902,146 +2900,194 @@ impl Tenant {
            .await
    }

-    /// Perform one compaction iteration.
-    /// This function is periodically called by compactor task.
-    /// Also it can be explicitly requested per timeline through page server
-    /// api's 'compact' command.
+    /// Performs one compaction iteration. Called periodically from the compaction loop. Returns
+    /// whether another compaction is needed, if we still have pending work or if we yield for
+    /// immediate L0 compaction.
    ///
-    /// Returns whether we have pending compaction task.
+    /// Compaction can also be explicitly requested for a timeline via the HTTP API.
    async fn compaction_iteration(
        self: &Arc<Self>,
        cancel: &CancellationToken,
        ctx: &RequestContext,
-    ) -> Result<bool, timeline::CompactionError> {
-        // Don't start doing work during shutdown, or when broken, we do not need those in the logs
+    ) -> Result<CompactionOutcome, CompactionError> {
+        // Don't compact inactive tenants.
        if !self.is_active() {
-            return Ok(false);
+            return Ok(CompactionOutcome::Skipped);
        }

-        {
-            let conf = self.tenant_conf.load();
-
-            // Note that compaction usually requires deletions, but we don't respect
-            // may_delete_layers_hint here: that is because tenants in AttachedMulti
-            // should proceed with compaction even if they can't do deletion, to avoid
-            // accumulating dangerously deep stacks of L0 layers.  Deletions will be
-            // enqueued inside RemoteTimelineClient, and executed layer if/when we transition
-            // to AttachedSingle state.
-            if !conf.location.may_upload_layers_hint() {
-                info!("Skipping compaction in location state {:?}", conf.location);
-                return Ok(false);
-            }
+        // Don't compact tenants that can't upload layers. We don't check `may_delete_layers_hint`,
+        // since we need to compact L0 even in AttachedMulti to bound read amplification.
+        let location = self.tenant_conf.load().location;
+        if !location.may_upload_layers_hint() {
+            info!("skipping compaction in location state {location:?}");
+            return Ok(CompactionOutcome::Skipped);
        }

-        // Scan through the hashmap and collect a list of all the timelines,
-        // while holding the lock. Then drop the lock and actually perform the
-        // compactions.  We don't want to block everything else while the
-        // compaction runs.
-        let timelines_to_compact_or_offload;
-        {
-            let timelines = self.timelines.lock().unwrap();
-            timelines_to_compact_or_offload = timelines
-                .iter()
-                .filter_map(|(timeline_id, timeline)| {
-                    let (is_active, (can_offload, _)) =
-                        (timeline.is_active(), timeline.can_offload());
-                    let has_no_unoffloaded_children = {
-                        !timelines
-                            .iter()
-                            .any(|(_id, tl)| tl.get_ancestor_timeline_id() == Some(*timeline_id))
-                    };
-                    let config_allows_offload = self.conf.timeline_offloading
-                        || self
-                            .tenant_conf
-                            .load()
-                            .tenant_conf
-                            .timeline_offloading
-                            .unwrap_or_default();
-                    let can_offload =
-                        can_offload && has_no_unoffloaded_children && config_allows_offload;
-                    if (is_active, can_offload) == (false, false) {
-                        None
-                    } else {
-                        Some((*timeline_id, timeline.clone(), (is_active, can_offload)))
-                    }
-                })
-                .collect::<Vec<_>>();
-            drop(timelines);
-        }
-
-        // Before doing any I/O work, check our circuit breaker
+        // Don't compact if the circuit breaker is tripped.
        if self.compaction_circuit_breaker.lock().unwrap().is_broken() {
-            info!("Skipping compaction due to previous failures");
-            return Ok(false);
+            info!("skipping compaction due to previous failures");
+            return Ok(CompactionOutcome::Skipped);
        }

-        let mut has_pending_task = false;
+        // Collect all timelines to compact, along with offload instructions and L0 counts.
+        let mut compact: Vec<Arc<Timeline>> = Vec::new();
+        let mut offload: HashSet<TimelineId> = HashSet::new();
+        let mut l0_counts: HashMap<TimelineId, usize> = HashMap::new();

-        for (timeline_id, timeline, (can_compact, can_offload)) in &timelines_to_compact_or_offload
        {
-            // pending_task_left == None: cannot compact, maybe still pending tasks
-            // pending_task_left == Some(true): compaction task left
-            // pending_task_left == Some(false): no compaction task left
-            let pending_task_left = if *can_compact {
-                let has_pending_l0_compaction_task = timeline
-                    .compact(cancel, EnumSet::empty(), ctx)
-                    .instrument(info_span!("compact_timeline", %timeline_id))
-                    .await
-                    .inspect_err(|e| match e {
-                        timeline::CompactionError::ShuttingDown => (),
-                        timeline::CompactionError::Offload(_) => {
-                            // Failures to offload timelines do not trip the circuit breaker, because
-                            // they do not do lots of writes the way compaction itself does: it is cheap
-                            // to retry, and it would be bad to stop all compaction because of an issue with offloading.
-                        }
-                        timeline::CompactionError::Other(e) => {
-                            self.compaction_circuit_breaker
-                                .lock()
-                                .unwrap()
-                                .fail(&CIRCUIT_BREAKERS_BROKEN, e);
-                        }
-                    })?;
-                if has_pending_l0_compaction_task {
-                    Some(true)
-                } else {
-                    let queue = {
-                        let guard = self.scheduled_compaction_tasks.lock().unwrap();
-                        guard.get(timeline_id).cloned()
-                    };
-                    if let Some(queue) = queue {
-                        let has_pending_tasks = queue
-                            .iteration(cancel, ctx, &self.gc_block, timeline)
-                            .await?;
-                        Some(has_pending_tasks)
-                    } else {
-                        Some(false)
-                    }
+            let offload_enabled = self.get_timeline_offloading_enabled();
+            let timelines = self.timelines.lock().unwrap();
+            for (&timeline_id, timeline) in timelines.iter() {
+                // Skip inactive timelines.
+                if !timeline.is_active() {
+                    continue;
                }
-            } else {
-                None
-            };
-            has_pending_task |= pending_task_left.unwrap_or(false);
-            if pending_task_left == Some(false) && *can_offload {
-                pausable_failpoint!("before-timeline-auto-offload");
-                match offload_timeline(self, timeline)
-                    .instrument(info_span!("offload_timeline", %timeline_id))
-                    .await
-                {
-                    Err(OffloadError::NotArchived) => {
-                        // Ignore this, we likely raced with unarchival
-                        Ok(())
-                    }
-                    other => other,
-                }?;
+
+                // Schedule the timeline for compaction.
+                compact.push(timeline.clone());
+
+                // Schedule the timeline for offloading if eligible.
+                let can_offload = offload_enabled
+                    && timeline.can_offload().0
+                    && !timelines
+                        .iter()
+                        .any(|(_, tli)| tli.get_ancestor_timeline_id() == Some(timeline_id));
+                if can_offload {
+                    offload.insert(timeline_id);
+                }
+            }
+        } // release timelines lock
+
+        for timeline in &compact {
+            // Collect L0 counts. Can't await while holding lock above.
+            if let Ok(lm) = timeline.layers.read().await.layer_map() {
+                l0_counts.insert(timeline.timeline_id, lm.level0_deltas().len());
            }
        }

+        // Pass 1: L0 compaction across all timelines, in order of L0 count. We prioritize this to
+        // bound read amplification.
+        //
+        // TODO: this may spin on one or more ingest-heavy timelines, starving out image/GC
+        // compaction and offloading. We leave that as a potential problem to solve later. Consider
+        // splitting L0 and image/GC compaction to separate background jobs.
+        if self.get_compaction_l0_first() {
+            let compaction_threshold = self.get_compaction_threshold();
+            let compact_l0 = compact
+                .iter()
+                .map(|tli| (tli, l0_counts.get(&tli.timeline_id).copied().unwrap_or(0)))
+                .filter(|&(_, l0)| l0 >= compaction_threshold)
+                .sorted_by_key(|&(_, l0)| l0)
+                .rev()
+                .map(|(tli, _)| tli.clone())
+                .collect_vec();
+
+            let mut has_pending_l0 = false;
+            for timeline in compact_l0 {
+                let outcome = timeline
+                    .compact(cancel, CompactFlags::OnlyL0Compaction.into(), ctx)
+                    .instrument(info_span!("compact_timeline", timeline_id = %timeline.timeline_id))
+                    .await
+                    .inspect_err(|err| self.maybe_trip_compaction_breaker(err))?;
+                match outcome {
+                    CompactionOutcome::Done => {}
+                    CompactionOutcome::Skipped => {}
+                    CompactionOutcome::Pending => has_pending_l0 = true,
+                    CompactionOutcome::YieldForL0 => has_pending_l0 = true,
+                }
+            }
+            if has_pending_l0 {
+                return Ok(CompactionOutcome::YieldForL0); // do another pass
+            }
+        }
+
+        // Pass 2: image compaction and timeline offloading. If any timelines have accumulated
+        // more L0 layers, they may also be compacted here.
+        //
+        // NB: image compaction may yield if there is pending L0 compaction.
+        //
+        // TODO: it will only yield if there is pending L0 compaction on the same timeline. If a
+        // different timeline needs compaction, it won't. It should check `l0_compaction_trigger`.
+        // We leave this for a later PR.
+        //
+        // TODO: consider ordering timelines by some priority, e.g. time since last full compaction,
+        // amount of L1 delta debt or garbage, offload-eligible timelines first, etc.
+        let mut has_pending = false;
+        for timeline in compact {
+            if !timeline.is_active() {
+                continue;
+            }
+
+            let mut outcome = timeline
+                .compact(cancel, EnumSet::default(), ctx)
+                .instrument(info_span!("compact_timeline", timeline_id = %timeline.timeline_id))
+                .await
+                .inspect_err(|err| self.maybe_trip_compaction_breaker(err))?;
+
+            // If we're done compacting, check the scheduled GC compaction queue for more work.
+            if outcome == CompactionOutcome::Done {
+                let queue = self
+                    .scheduled_compaction_tasks
+                    .lock()
+                    .unwrap()
+                    .get(&timeline.timeline_id)
+                    .cloned();
+                if let Some(queue) = queue {
+                    outcome = queue
+                        .iteration(cancel, ctx, &self.gc_block, &timeline)
+                        .await?;
+                }
+            }
+
+            // If we're done compacting, offload the timeline if requested.
+            if outcome == CompactionOutcome::Done && offload.contains(&timeline.timeline_id) {
+                pausable_failpoint!("before-timeline-auto-offload");
+                offload_timeline(self, &timeline)
+                    .instrument(info_span!("offload_timeline", timeline_id = %timeline.timeline_id))
+                    .await
+                    .or_else(|err| match err {
+                        // Ignore this, we likely raced with unarchival.
+                        OffloadError::NotArchived => Ok(()),
+                        err => Err(err),
+                    })?;
+            }
+
+            match outcome {
+                CompactionOutcome::Done => {}
+                CompactionOutcome::Skipped => {}
+                CompactionOutcome::Pending => has_pending = true,
+                // This mostly makes sense when the L0-only pass above is enabled, since there's
+                // otherwise no guarantee that we'll start with the timeline that has high L0.
+                CompactionOutcome::YieldForL0 => return Ok(CompactionOutcome::YieldForL0),
+            }
+        }
+
+        // Success! Untrip the breaker if necessary.
        self.compaction_circuit_breaker
            .lock()
            .unwrap()
            .success(&CIRCUIT_BREAKERS_UNBROKEN);

-        Ok(has_pending_task)
+        match has_pending {
+            true => Ok(CompactionOutcome::Pending),
+            false => Ok(CompactionOutcome::Done),
+        }
+    }
+
+    /// Trips the compaction circuit breaker if appropriate.
+    pub(crate) fn maybe_trip_compaction_breaker(&self, err: &CompactionError) {
+        match err {
+            CompactionError::ShuttingDown => (),
+            // Offload failures don't trip the circuit breaker, since they're cheap to retry and
+            // shouldn't block compaction.
+            CompactionError::Offload(_) => {}
+            CompactionError::Other(err) => {
+                self.compaction_circuit_breaker
+                    .lock()
+                    .unwrap()
+                    .fail(&CIRCUIT_BREAKERS_BROKEN, err);
+            }
+        }
    }

    /// Cancel scheduled compaction tasks
@@ -3088,32 +3134,28 @@ impl Tenant {
        Ok(rx)
    }

-    // Call through to all timelines to freeze ephemeral layers if needed.  Usually
-    // this happens during ingest: this background housekeeping is for freezing layers
-    // that are open but haven't been written to for some time.
-    async fn ingest_housekeeping(&self) {
-        // Scan through the hashmap and collect a list of all the timelines,
-        // while holding the lock. Then drop the lock and actually perform the
-        // compactions.  We don't want to block everything else while the
-        // compaction runs.
-        let timelines = {
-            self.timelines
-                .lock()
-                .unwrap()
-                .values()
-                .filter_map(|timeline| {
-                    if timeline.is_active() {
-                        Some(timeline.clone())
-                    } else {
-                        None
-                    }
-                })
-                .collect::<Vec<_>>()
-        };
+    /// Performs periodic housekeeping, via the tenant housekeeping background task.
+    async fn housekeeping(&self) {
+        // Call through to all timelines to freeze ephemeral layers as needed. This usually happens
+        // during ingest, but we don't want idle timelines to hold open layers for too long.
+        let timelines = self
+            .timelines
+            .lock()
+            .unwrap()
+            .values()
+            .filter(|tli| tli.is_active())
+            .cloned()
+            .collect_vec();

-        for timeline in &timelines {
+        for timeline in timelines {
            timeline.maybe_freeze_ephemeral_layer().await;
        }
+
+        // Shut down walredo if idle.
+        const WALREDO_IDLE_TIMEOUT: Duration = Duration::from_secs(180);
+        if let Some(ref walredo_mgr) = self.walredo_mgr {
+            walredo_mgr.maybe_quiesce(WALREDO_IDLE_TIMEOUT);
+        }
    }

    pub fn timeline_has_no_attached_children(&self, timeline_id: TimelineId) -> bool {
@@ -3823,6 +3865,13 @@ impl Tenant {
            .unwrap_or(self.conf.default_tenant_conf.compaction_upper_limit)
    }

+    pub fn get_compaction_l0_first(&self) -> bool {
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
+        tenant_conf
+            .compaction_l0_first
+            .unwrap_or(self.conf.default_tenant_conf.compaction_l0_first)
+    }
+
    pub fn get_gc_horizon(&self) -> u64 {
        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
@@ -3877,6 +3926,16 @@ impl Tenant {
            .unwrap_or(self.conf.default_tenant_conf.lsn_lease_length)
    }

+    pub fn get_timeline_offloading_enabled(&self) -> bool {
+        if self.conf.timeline_offloading {
+            return true;
+        }
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
+        tenant_conf
+            .timeline_offloading
+            .unwrap_or(self.conf.default_tenant_conf.timeline_offloading)
+    }
+
    /// Generate an up-to-date TenantManifest based on the state of this Tenant.
    fn build_tenant_manifest(&self) -> TenantManifest {
        let timelines_offloaded = self.timelines_offloaded.lock().unwrap();
@@ -4115,6 +4174,7 @@ impl Tenant {
                // use an extremely long backoff.
                Some(Duration::from_secs(3600 * 24)),
            )),
+            l0_compaction_trigger: Arc::new(Notify::new()),
            scheduled_compaction_tasks: Mutex::new(Default::default()),
            activate_now_sem: tokio::sync::Semaphore::new(0),
            attach_wal_lag_cooldown: Arc::new(std::sync::OnceLock::new()),
@@ -4642,22 +4702,26 @@ impl Tenant {

        // check against last actual 'latest_gc_cutoff' first
        let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn();
-        src_timeline
-            .check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn)
-            .context(format!(
-                "invalid branch start lsn: less than latest GC cutoff {}",
-                *latest_gc_cutoff_lsn,
-            ))
-            .map_err(CreateTimelineError::AncestorLsn)?;
-
-        // and then the planned GC cutoff
        {
            let gc_info = src_timeline.gc_info.read().unwrap();
-            let cutoff = gc_info.min_cutoff();
-            if start_lsn < cutoff {
-                return Err(CreateTimelineError::AncestorLsn(anyhow::anyhow!(
-                    "invalid branch start lsn: less than planned GC cutoff {cutoff}"
-                )));
+            let planned_cutoff = gc_info.min_cutoff();
+            if gc_info.lsn_covered_by_lease(start_lsn) {
+                tracing::info!("skipping comparison of {start_lsn} with gc cutoff {} and planned gc cutoff {planned_cutoff} due to lsn lease", *latest_gc_cutoff_lsn);
+            } else {
+                src_timeline
+                    .check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn)
+                    .context(format!(
+                        "invalid branch start lsn: less than latest GC cutoff {}",
+                        *latest_gc_cutoff_lsn,
+                    ))
+                    .map_err(CreateTimelineError::AncestorLsn)?;
+
+                // and then the planned GC cutoff
+                if start_lsn < planned_cutoff {
+                    return Err(CreateTimelineError::AncestorLsn(anyhow::anyhow!(
+                        "invalid branch start lsn: less than planned GC cutoff {planned_cutoff}"
+                    )));
+                }
            }
        }

@@ -5019,12 +5083,19 @@ impl Tenant {
        )
    }

-    /// Call this before constructing a timeline, to build its required structures
+    /// Builds required resources for a new timeline.
    fn build_timeline_resources(&self, timeline_id: TimelineId) -> TimelineResources {
+        let remote_client = self.build_timeline_remote_client(timeline_id);
+        self.get_timeline_resources_for(remote_client)
+    }
+
+    /// Builds timeline resources for the given remote client.
+    fn get_timeline_resources_for(&self, remote_client: RemoteTimelineClient) -> TimelineResources {
        TimelineResources {
-            remote_client: self.build_timeline_remote_client(timeline_id),
+            remote_client,
            pagestream_throttle: self.pagestream_throttle.clone(),
            pagestream_throttle_metrics: self.pagestream_throttle_metrics.clone(),
+            l0_compaction_trigger: self.l0_compaction_trigger.clone(),
            l0_flush_global_state: self.l0_flush_global_state.clone(),
        }
    }
@@ -5470,6 +5541,7 @@ pub(crate) mod harness {
                compaction_threshold: Some(tenant_conf.compaction_threshold),
                compaction_upper_limit: Some(tenant_conf.compaction_upper_limit),
                compaction_algorithm: Some(tenant_conf.compaction_algorithm),
+                compaction_l0_first: Some(tenant_conf.compaction_l0_first),
                l0_flush_delay_threshold: tenant_conf.l0_flush_delay_threshold,
                l0_flush_stall_threshold: tenant_conf.l0_flush_stall_threshold,
                l0_flush_wait_upload: Some(tenant_conf.l0_flush_wait_upload),
@@ -5491,6 +5563,9 @@ pub(crate) mod harness {
                image_layer_creation_check_threshold: Some(
                    tenant_conf.image_layer_creation_check_threshold,
                ),
+                image_creation_preempt_threshold: Some(
+                    tenant_conf.image_creation_preempt_threshold,
+                ),
                lsn_lease_length: Some(tenant_conf.lsn_lease_length),
                lsn_lease_length_for_ts: Some(tenant_conf.lsn_lease_length_for_ts),
                timeline_offloading: Some(tenant_conf.timeline_offloading),
@@ -7694,6 +7769,18 @@ mod tests {
            }

            tline.freeze_and_flush().await?;
+            // Force layers to L1
+            tline
+                .compact(
+                    &cancel,
+                    {
+                        let mut flags = EnumSet::new();
+                        flags.insert(CompactFlags::ForceL0Compaction);
+                        flags
+                    },
+                    &ctx,
+                )
+                .await?;

            if iter % 5 == 0 {
                let (_, before_delta_file_accessed) =
@@ -7706,6 +7793,7 @@ mod tests {
                            let mut flags = EnumSet::new();
                            flags.insert(CompactFlags::ForceImageLayerCreation);
                            flags.insert(CompactFlags::ForceRepartition);
+                            flags.insert(CompactFlags::ForceL0Compaction);
                            flags
                        },
                        &ctx,
@@ -8152,6 +8240,8 @@ mod tests {

        let cancel = CancellationToken::new();

+        // Image layer creation happens on the disk_consistent_lsn so we need to force set it now.
+        tline.force_set_disk_consistent_lsn(Lsn(0x40));
        tline
            .compact(
                &cancel,
@@ -8165,8 +8255,7 @@ mod tests {
            )
            .await
            .unwrap();
-
-        // Image layers are created at last_record_lsn
+        // Image layers are created at repartition LSN
        let images = tline
            .inspect_image_layers(Lsn(0x40), &ctx, io_concurrency.clone())
            .await
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -285,6 +285,10 @@ pub struct TenantConfOpt {
    #[serde(default)]
    pub compaction_algorithm: Option<CompactionAlgorithmSettings>,

+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[serde(default)]
+    pub compaction_l0_first: Option<bool>,
+
    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(default)]
    pub l0_flush_delay_threshold: Option<usize>,
@@ -357,6 +361,9 @@ pub struct TenantConfOpt {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub image_layer_creation_check_threshold: Option<u8>,

+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub image_creation_preempt_threshold: Option<usize>,
+
    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(with = "humantime_serde")]
    #[serde(default)]
@@ -413,6 +420,9 @@ impl TenantConfOpt {
                .as_ref()
                .unwrap_or(&global_conf.compaction_algorithm)
                .clone(),
+            compaction_l0_first: self
+                .compaction_l0_first
+                .unwrap_or(global_conf.compaction_l0_first),
            l0_flush_delay_threshold: self
                .l0_flush_delay_threshold
                .or(global_conf.l0_flush_delay_threshold),
@@ -453,6 +463,9 @@ impl TenantConfOpt {
            image_layer_creation_check_threshold: self
                .image_layer_creation_check_threshold
                .unwrap_or(global_conf.image_layer_creation_check_threshold),
+            image_creation_preempt_threshold: self
+                .image_creation_preempt_threshold
+                .unwrap_or(global_conf.image_creation_preempt_threshold),
            lsn_lease_length: self
                .lsn_lease_length
                .unwrap_or(global_conf.lsn_lease_length),
@@ -460,7 +473,7 @@ impl TenantConfOpt {
                .lsn_lease_length_for_ts
                .unwrap_or(global_conf.lsn_lease_length_for_ts),
            timeline_offloading: self
-                .lazy_slru_download
+                .timeline_offloading
                .unwrap_or(global_conf.timeline_offloading),
            wal_receiver_protocol_override: self
                .wal_receiver_protocol_override
@@ -487,6 +500,7 @@ impl TenantConfOpt {
            mut compaction_threshold,
            mut compaction_upper_limit,
            mut compaction_algorithm,
+            mut compaction_l0_first,
            mut l0_flush_delay_threshold,
            mut l0_flush_stall_threshold,
            mut l0_flush_wait_upload,
@@ -504,6 +518,7 @@ impl TenantConfOpt {
            mut lazy_slru_download,
            mut timeline_get_throttle,
            mut image_layer_creation_check_threshold,
+            mut image_creation_preempt_threshold,
            mut lsn_lease_length,
            mut lsn_lease_length_for_ts,
            mut timeline_offloading,
@@ -531,6 +546,7 @@ impl TenantConfOpt {
            .compaction_upper_limit
            .apply(&mut compaction_upper_limit);
        patch.compaction_algorithm.apply(&mut compaction_algorithm);
+        patch.compaction_l0_first.apply(&mut compaction_l0_first);
        patch
            .l0_flush_delay_threshold
            .apply(&mut l0_flush_delay_threshold);
@@ -578,6 +594,9 @@ impl TenantConfOpt {
        patch
            .image_layer_creation_check_threshold
            .apply(&mut image_layer_creation_check_threshold);
+        patch
+            .image_creation_preempt_threshold
+            .apply(&mut image_creation_preempt_threshold);
        patch
            .lsn_lease_length
            .map(|v| humantime::parse_duration(&v))?
@@ -609,6 +628,7 @@ impl TenantConfOpt {
            compaction_threshold,
            compaction_upper_limit,
            compaction_algorithm,
+            compaction_l0_first,
            l0_flush_delay_threshold,
            l0_flush_stall_threshold,
            l0_flush_wait_upload,
@@ -626,6 +646,7 @@ impl TenantConfOpt {
            lazy_slru_download,
            timeline_get_throttle,
            image_layer_creation_check_threshold,
+            image_creation_preempt_threshold,
            lsn_lease_length,
            lsn_lease_length_for_ts,
            timeline_offloading,
@@ -670,6 +691,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
            compaction_period: value.compaction_period.map(humantime),
            compaction_threshold: value.compaction_threshold,
            compaction_upper_limit: value.compaction_upper_limit,
+            compaction_l0_first: value.compaction_l0_first,
            l0_flush_delay_threshold: value.l0_flush_delay_threshold,
            l0_flush_stall_threshold: value.l0_flush_stall_threshold,
            l0_flush_wait_upload: value.l0_flush_wait_upload,
@@ -689,6 +711,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
            lazy_slru_download: value.lazy_slru_download,
            timeline_get_throttle: value.timeline_get_throttle,
            image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
+            image_creation_preempt_threshold: value.image_creation_preempt_threshold,
            lsn_lease_length: value.lsn_lease_length.map(humantime),
            lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime),
            timeline_offloading: value.timeline_offloading,
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -2816,8 +2816,8 @@ where
 }

 use {
-    crate::tenant::gc_result::GcResult, pageserver_api::models::TimelineGcRequest,
-    utils::http::error::ApiError,
+    crate::tenant::gc_result::GcResult, http_utils::error::ApiError,
+    pageserver_api::models::TimelineGcRequest,
 };

 #[cfg(test)]
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -437,8 +437,7 @@ impl RemoteTimelineClient {
            .conf
            .remote_storage_config
            .as_ref()
-            .and_then(|r| r.concurrency_limit())
-            .unwrap_or(0);
+            .map_or(0, |r| r.concurrency_limit());
        let mut upload_queue = self.upload_queue.lock().unwrap();
        upload_queue.initialize_with_current_remote_index_part(index_part, inprogress_limit)?;
        self.update_remote_physical_size_gauge(Some(index_part));
@@ -461,8 +460,7 @@ impl RemoteTimelineClient {
            .conf
            .remote_storage_config
            .as_ref()
-            .and_then(|r| r.concurrency_limit())
-            .unwrap_or(0);
+            .map_or(0, |r| r.concurrency_limit());
        let mut upload_queue = self.upload_queue.lock().unwrap();
        upload_queue.initialize_empty_remote(local_metadata, inprogress_limit)?;
        self.update_remote_physical_size_gauge(None);
@@ -484,8 +482,7 @@ impl RemoteTimelineClient {
            .conf
            .remote_storage_config
            .as_ref()
-            .and_then(|r| r.concurrency_limit())
-            .unwrap_or(0);
+            .map_or(0, |r| r.concurrency_limit());

        let mut upload_queue = self.upload_queue.lock().unwrap();
        upload_queue.initialize_with_current_remote_index_part(index_part, inprogress_limit)?;
@@ -520,7 +517,7 @@ impl RemoteTimelineClient {
            if let Ok(queue) = queue_locked.initialized_mut() {
                let blocked_deletions = std::mem::take(&mut queue.blocked_deletions);
                for d in blocked_deletions {
-                    if let Err(e) = self.deletion_queue_client.push_layers_sync(
+                    if let Err(e) = self.deletion_queue_client.push_layers(
                        self.tenant_shard_id,
                        self.timeline_id,
                        self.generation,
@@ -2154,7 +2151,7 @@ impl RemoteTimelineClient {
                                self.generation,
                                delete.layers.clone(),
                            )
-                            .await
+                            .map(|_| ())
                            .map_err(|e| anyhow::anyhow!(e))
                    }
                }
--- a/pageserver/src/tenant/secondary/heatmap_uploader.rs
+++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs
@@ -9,13 +9,14 @@ use crate::{
    metrics::SECONDARY_MODE,
    tenant::{
        config::AttachmentMode,
-        mgr::GetTenantError,
-        mgr::TenantManager,
+        mgr::{GetTenantError, TenantManager},
        remote_timeline_client::remote_heatmap_path,
        span::debug_assert_current_span_has_tenant_id,
        tasks::{warn_when_period_overrun, BackgroundLoopKind},
        Tenant,
    },
+    virtual_file::VirtualFile,
+    TEMP_FILE_SUFFIX,
 };

 use futures::Future;
@@ -32,7 +33,10 @@ use super::{
 };
 use tokio_util::sync::CancellationToken;
 use tracing::{info_span, instrument, Instrument};
-use utils::{backoff, completion::Barrier, yielding_loop::yielding_loop};
+use utils::{
+    backoff, completion::Barrier, crashsafe::path_with_suffix_extension,
+    yielding_loop::yielding_loop,
+};

 pub(super) async fn heatmap_uploader_task(
    tenant_manager: Arc<TenantManager>,
@@ -461,6 +465,18 @@ async fn upload_tenant_heatmap(
        }
    }

+    // After a successful upload persist the fresh heatmap to disk.
+    // When restarting, the tenant will read the heatmap from disk
+    // and additively generate a new heatmap (see [`Timeline::generate_heatmap`]).
+    // If the heatmap is stale, the additive generation can lead to keeping previously
+    // evicted timelines on the secondarie's disk.
+    let tenant_shard_id = tenant.get_tenant_shard_id();
+    let heatmap_path = tenant.conf.tenant_heatmap_path(tenant_shard_id);
+    let temp_path = path_with_suffix_extension(&heatmap_path, TEMP_FILE_SUFFIX);
+    if let Err(err) = VirtualFile::crashsafe_overwrite(heatmap_path, temp_path, bytes).await {
+        tracing::warn!("Non fatal IO error writing to disk after heatmap upload: {err}");
+    }
+
    tracing::info!("Successfully uploaded {size} byte heatmap to {path}");

    Ok(UploadHeatmapOutcome::Uploaded(LastUploadState {
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -44,7 +44,7 @@ pub(crate) use layer::{EvictionError, Layer, ResidentLayer};

 use self::inmemory_layer::InMemoryLayerFileId;

-use super::timeline::GetVectoredError;
+use super::timeline::{GetVectoredError, ReadPath};
 use super::PageReconstructError;

 pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
@@ -262,6 +262,8 @@ pub(crate) struct ValuesReconstructState {

    pub(crate) io_concurrency: IoConcurrency,
    num_active_ios: Arc<AtomicUsize>,
+
+    pub(crate) read_path: Option<ReadPath>,
 }

 /// The level of IO concurrency to be used on the read path
@@ -609,6 +611,7 @@ impl ValuesReconstructState {
            delta_layers_visited: 0,
            io_concurrency,
            num_active_ios: Arc::new(AtomicUsize::new(0)),
+            read_path: None,
        }
    }

--- a/pageserver/src/tenant/storage_layer/batch_split_writer.rs
+++ b/pageserver/src/tenant/storage_layer/batch_split_writer.rs
@@ -166,6 +166,10 @@ impl BatchLayerWriter {
        // END: catch every error and do the recovery in the above section
        Ok(generated_layers)
    }
+
+    pub fn pending_layer_num(&self) -> usize {
+        self.generated_layer_writers.len()
+    }
 }

 /// An image writer that takes images and produces multiple image layers.
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -353,7 +353,6 @@ impl Layer {
    /// while the guard exists.
    ///
    /// Returns None if the layer is currently evicted or becoming evicted.
-    #[cfg(test)]
    pub(crate) async fn keep_resident(&self) -> Option<ResidentLayer> {
        let downloaded = self.0.inner.get().and_then(|rowe| rowe.get())?;

@@ -530,7 +529,6 @@ impl ResidentOrWantedEvicted {
    /// This is not used on the read path (anything that calls
    /// [`LayerInner::get_or_maybe_download`]) because it was decided that reads always win
    /// evictions, and part of that winning is using [`ResidentOrWantedEvicted::get_and_upgrade`].
-    #[cfg(test)]
    fn get(&self) -> Option<Arc<DownloadedLayer>> {
        match self {
            ResidentOrWantedEvicted::Resident(strong) => Some(strong.clone()),
--- a/Show More
+++ b/Show More