Stop using mold linker

There isn't anything wrong with with 'mold' as such, but it also doesn't give any benefit. Better to stick to the defaults when there's no reason to deviate. 'mold' might be a little faster, but it's insignificant when doing a full build. It might make a difference when doing an incremental build, but docker builds are not incremental. I didn't remove 'mold' from the build-tools image yet, as I'm not sure if the image might still be used to build old images that are still using 'mold', or perhaps to build PRs created earlier. I will remove 'mold' from build-tools later as a separate PR, after some time has passed.
pageserver: improve tenant housekeeping task (#10725 )
2026-05-17 21:20:37 +00:00 · 2025-02-09 16:43:37 +02:00 · 2025-02-08 12:42:55 +00:00 · 2025-02-08 11:02:13 +00:00 · 2025-02-08 09:28:09 +00:00 · 2025-02-07 19:45:39 +00:00
203 changed files with 8076 additions and 3554 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -24,3 +24,4 @@
 !storage_controller/
 !vendor/postgres-*/
 !workspace_hack/
+!build_tools/patches
--- a/.github/ISSUE_TEMPLATE/bug-template.md
+++ b/.github/ISSUE_TEMPLATE/bug-template.md
@@ -3,6 +3,7 @@ name: Bug Template
 about: Used for describing bugs
 title: ''
 labels: t/bug
+type: Bug
 assignees: ''

 ---
--- a/.github/ISSUE_TEMPLATE/epic-template.md
+++ b/.github/ISSUE_TEMPLATE/epic-template.md
@@ -4,6 +4,7 @@ about: A set of related tasks contributing towards specific outcome, comprising
  more than 1 week of work.
 title: 'Epic: '
 labels: t/Epic
+type: Epic
 assignees: ''

 ---
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -27,3 +27,4 @@ config-variables:
  - SLACK_ON_CALL_QA_STAGING_STREAM
  - DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN
  - SLACK_ON_CALL_STORAGE_STAGING_STREAM
+  - SLACK_CICD_CHANNEL_ID
--- a/.github/actions/neon-project-create/action.yml
+++ b/.github/actions/neon-project-create/action.yml
@@ -41,7 +41,10 @@ inputs:
    description: 'Path to directory containing libpq library - it is caller responsibility to provision the libpq library'
    required: false
    default: '/tmp/neon/pg_install/v16/lib'
-  
+  project_settings:
+    description: 'A JSON object with project settings'
+    required: false
+    default: '{}'

 outputs:
  dsn:
@@ -73,7 +76,7 @@ runs:
              \"provisioner\": \"k8s-neonvm\",
              \"autoscaling_limit_min_cu\": ${MIN_CU},
              \"autoscaling_limit_max_cu\": ${MAX_CU},
-              \"settings\": { }
+              \"settings\": ${PROJECT_SETTINGS}
            }
          }")

@@ -92,12 +95,12 @@ runs:
        if [ "${SHARD_SPLIT_PROJECT}" = "true" ]; then
          # determine tenant ID
          TENANT_ID=`${PSQL} ${dsn} -t -A -c "SHOW neon.tenant_id"`
-          
+
          echo "Splitting project ${project_id} with tenant_id ${TENANT_ID} into $((SHARD_COUNT)) shards with stripe size $((STRIPE_SIZE))"

          echo "Sending PUT request to https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/shard_split"
          echo "with body {\"new_shard_count\": $((SHARD_COUNT)), \"new_stripe_size\": $((STRIPE_SIZE))}"
-          
+
          # we need an ADMIN API KEY to invoke storage controller API for shard splitting (bash -u above checks that the variable is set)
          curl -X PUT \
            "https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/shard_split" \
@@ -118,3 +121,4 @@ runs:
        STRIPE_SIZE: ${{ inputs.stripe_size }}
        PSQL: ${{ inputs.psql_path }}
        LD_LIBRARY_PATH: ${{ inputs.libpq_lib_path }}
+        PROJECT_SETTINGS: ${{ inputs.project_settings }}
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -121,6 +121,8 @@ runs:
        export DEFAULT_PG_VERSION=${PG_VERSION#v}
        export LD_LIBRARY_PATH=${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/lib
        export BENCHMARK_CONNSTR=${BENCHMARK_CONNSTR:-}
+        export ASAN_OPTIONS=detect_leaks=0:detect_stack_use_after_return=0:abort_on_error=1:strict_string_checks=1:check_initialization_order=1:strict_init_order=1
+        export UBSAN_OPTIONS=abort_on_error=1:print_stacktrace=1

        if [ "${BUILD_TYPE}" = "remote" ]; then
          export REMOTE_ENV=1
--- a/.github/file-filters.yaml
+++ b/.github/file-filters.yaml
@@ -1,4 +1,5 @@
 rust_code: ['**/*.rs', '**/Cargo.toml', '**/Cargo.lock']
+rust_dependencies: ['**/Cargo.lock']

 v14: ['vendor/postgres-v14/**', 'Makefile', 'pgxn/**']
 v15: ['vendor/postgres-v15/**', 'Makefile', 'pgxn/**']
--- a/.github/workflows/_benchmarking_preparation.yml
+++ b/.github/workflows/_benchmarking_preparation.yml
@@ -17,7 +17,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        platform: [ aws-rds-postgres, aws-aurora-serverless-v2-postgres, neon ]
+        platform: [ aws-rds-postgres, aws-aurora-serverless-v2-postgres, neon, neon_pg17 ]
        database: [ clickbench, tpch, userexample ]

    env:
@@ -41,6 +41,9 @@ jobs:
          neon)
            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
            ;;
+          neon_pg17)
+            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR_PG17 }}
+            ;;
          aws-rds-postgres)
            CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }}
            ;;
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -20,7 +20,7 @@ on:
        required: true
        type: string
      test-cfg:
-        description: 'a json object of postgres versions and lfc states to run regression tests on'
+        description: 'a json object of postgres versions and lfc/sanitizers states to build and run regression tests on'
        required: true
        type: string

@@ -48,6 +48,8 @@ jobs:
      # io_uring will account the memory of the CQ and SQ as locked.
      # More details: https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391
      options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
+    strategy:
+      matrix: ${{ fromJSON(format('{{"include":{0}}}', inputs.test-cfg)) }}
    env:
      BUILD_TYPE: ${{ inputs.build-type }}
      GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }}
@@ -87,6 +89,7 @@ jobs:
      - name: Set env variables
        env:
          ARCH: ${{ inputs.arch }}
+          SANITIZERS: ${{ matrix.sanitizers }}
        run: |
          CARGO_FEATURES="--features testing"
          if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then
@@ -99,8 +102,14 @@ jobs:
            cov_prefix=""
            CARGO_FLAGS="--locked --release"
          fi
+          if [[ $SANITIZERS == 'enabled' ]]; then
+            make_vars="WITH_SANITIZERS=yes"
+          else
+            make_vars=""
+          fi
          {
            echo "cov_prefix=${cov_prefix}"
+            echo "make_vars=${make_vars}"
            echo "CARGO_FEATURES=${CARGO_FEATURES}"
            echo "CARGO_FLAGS=${CARGO_FLAGS}"
            echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo"
@@ -136,35 +145,39 @@ jobs:

      - name: Build postgres v14
        if: steps.cache_pg_14.outputs.cache-hit != 'true'
-        run: mold -run make postgres-v14 -j$(nproc)
+        run: make ${make_vars} postgres-v14 -j$(nproc)

      - name: Build postgres v15
        if: steps.cache_pg_15.outputs.cache-hit != 'true'
-        run: mold -run make postgres-v15 -j$(nproc)
+        run: make ${make_vars} postgres-v15 -j$(nproc)

      - name: Build postgres v16
        if: steps.cache_pg_16.outputs.cache-hit != 'true'
-        run: mold -run make postgres-v16 -j$(nproc)
+        run: make ${make_vars} postgres-v16 -j$(nproc)

      - name: Build postgres v17
        if: steps.cache_pg_17.outputs.cache-hit != 'true'
-        run: mold -run make postgres-v17 -j$(nproc)
+        run: make ${make_vars} postgres-v17 -j$(nproc)

      - name: Build neon extensions
-        run: mold -run make neon-pg-ext -j$(nproc)
+        run: make ${make_vars} neon-pg-ext -j$(nproc)

      - name: Build walproposer-lib
-        run: mold -run make walproposer-lib -j$(nproc)
+        run: make ${make_vars} walproposer-lib -j$(nproc)

      - name: Run cargo build
+        env:
+          WITH_TESTS: ${{ matrix.sanitizers != 'enabled' && '--tests' || '' }}
        run: |
-          ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
+          export ASAN_OPTIONS=detect_leaks=0
+          ${cov_prefix} cargo build $CARGO_FLAGS $CARGO_FEATURES --bins ${WITH_TESTS}

      # Do install *before* running rust tests because they might recompile the
      # binaries with different features/flags.
      - name: Install rust binaries
        env:
          ARCH: ${{ inputs.arch }}
+          SANITIZERS: ${{ matrix.sanitizers }}
        run: |
          # Install target binaries
          mkdir -p /tmp/neon/bin/
@@ -179,7 +192,7 @@ jobs:
          done

          # Install test executables and write list of all binaries (for code coverage)
-          if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then
+          if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' && $SANITIZERS != 'enabled' ]]; then
            # Keep bloated coverage data files away from the rest of the artifact
            mkdir -p /tmp/coverage/

@@ -212,6 +225,7 @@ jobs:
          role-duration-seconds: 18000 # 5 hours

      - name: Run rust tests
+        if: ${{ matrix.sanitizers != 'enabled' }}
        env:
          NEXTEST_RETRIES: 3
        run: |
@@ -267,6 +281,27 @@ jobs:
          path: /tmp/neon
          aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

+      - name: Check diesel schema
+        if: inputs.build-type == 'release' && inputs.arch == 'x64'
+        env:
+          DATABASE_URL: postgresql://localhost:1235/storage_controller
+          POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
+        run: |
+          export ASAN_OPTIONS=detect_leaks=0
+          /tmp/neon/bin/neon_local init
+          /tmp/neon/bin/neon_local storage_controller start
+
+          diesel print-schema > storage_controller/src/schema.rs
+
+          if [ -n "$(git diff storage_controller/src/schema.rs)" ]; then
+            echo >&2 "Uncommitted changes in diesel schema"
+
+            git diff .
+            exit 1
+          fi
+
+          /tmp/neon/bin/neon_local storage_controller stop
+
      # XXX: keep this after the binaries.list is formed, so the coverage can properly work later
      - name: Merge and upload coverage data
        if: inputs.build-type == 'debug'
@@ -299,7 +334,7 @@ jobs:
      - name: Pytest regression tests
        continue-on-error: ${{ matrix.lfc_state == 'with-lfc' && inputs.build-type == 'debug' }}
        uses: ./.github/actions/run-python-test-set
-        timeout-minutes: 60
+        timeout-minutes: ${{ matrix.sanitizers != 'enabled' && 60 || 180 }}
        with:
          build_type: ${{ inputs.build-type }}
          test_selection: regress
@@ -317,6 +352,7 @@ jobs:
          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
          PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task
          USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }}
+          SANITIZERS: ${{ matrix.sanitizers }}

      # Temporary disable this step until we figure out why it's so flaky
      # Ref https://github.com/neondatabase/neon/issues/4540
--- a/.github/workflows/_check-codestyle-rust.yml
+++ b/.github/workflows/_check-codestyle-rust.yml
@@ -16,6 +16,9 @@ defaults:
  run:
    shell: bash -euxo pipefail {0}

+# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
+permissions: {}
+
 jobs:
  check-codestyle-rust:
    strategy:
@@ -84,8 +87,3 @@ jobs:
        run: |
          cargo hakari generate --diff  # workspace-hack Cargo.toml is up-to-date
          cargo hakari manage-deps --dry-run  # all workspace crates depend on workspace-hack
-
-      # https://github.com/EmbarkStudios/cargo-deny
-      - name: Check rust licenses/bans/advisories/sources
-        if: ${{ !cancelled() }}
-        run: cargo deny check --hide-inclusion-graph
--- a/.github/workflows/approved-for-ci-run.yml
+++ b/.github/workflows/approved-for-ci-run.yml
@@ -67,9 +67,9 @@ jobs:

      - uses: actions/checkout@v4
        with:
-          ref: main
+          ref: ${{ github.event.pull_request.head.sha }}
          token: ${{ secrets.CI_ACCESS_TOKEN }}
-      
+
      - name: Look for existing PR
        id: get-pr
        env:
@@ -77,7 +77,7 @@ jobs:
        run: |
          ALREADY_CREATED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${BRANCH} --base main --json number --jq '.[].number')"
          echo "ALREADY_CREATED=${ALREADY_CREATED}" >> ${GITHUB_OUTPUT}
-      
+
      - name: Get changed labels
        id: get-labels
        if: steps.get-pr.outputs.ALREADY_CREATED != ''
@@ -94,8 +94,6 @@ jobs:
          echo "LABELS_TO_ADD=${LABELS_TO_ADD}" >> ${GITHUB_OUTPUT}
          echo "LABELS_TO_REMOVE=${LABELS_TO_REMOVE}" >> ${GITHUB_OUTPUT}

-      - run: gh pr checkout "${PR_NUMBER}"
-
      - run: git checkout -b "${BRANCH}"

      - run: git push --force origin "${BRANCH}"
@@ -103,7 +101,7 @@ jobs:

      - name: Create a Pull Request for CI run (if required)
        if: steps.get-pr.outputs.ALREADY_CREATED == ''
-        env: 
+        env:
          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
        run: |
          cat << EOF > body.md
@@ -140,7 +138,7 @@ jobs:

      - run: git push --force origin "${BRANCH}"
        if: steps.get-pr.outputs.ALREADY_CREATED != ''
-             
+
  cleanup:
    # Close PRs and delete branchs if the original PR is closed.

--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -63,11 +63,15 @@ jobs:
      fail-fast: false
      matrix:
        include:
-          - DEFAULT_PG_VERSION: 16
+          - PG_VERSION: 16
            PLATFORM: "neon-staging"
            region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
            RUNNER: [ self-hosted, us-east-2, x64 ]
-          - DEFAULT_PG_VERSION: 16
+          - PG_VERSION: 17
+            PLATFORM: "neon-staging"
+            region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
+            RUNNER: [ self-hosted, us-east-2, x64 ]
+          - PG_VERSION: 16
            PLATFORM: "azure-staging"
            region_id: 'azure-eastus2'
            RUNNER: [ self-hosted, eastus2, x64 ]
@@ -75,7 +79,7 @@ jobs:
      TEST_PG_BENCH_DURATIONS_MATRIX: "300"
      TEST_PG_BENCH_SCALES_MATRIX: "10,100"
      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
-      DEFAULT_PG_VERSION: ${{ matrix.DEFAULT_PG_VERSION }}
+      PG_VERSION: ${{ matrix.PG_VERSION }}
      TEST_OUTPUT: /tmp/test_output
      BUILD_TYPE: remote
      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
@@ -112,7 +116,7 @@ jobs:
      uses: ./.github/actions/neon-project-create
      with:
        region_id: ${{ matrix.region_id }}
-        postgres_version: ${{ env.DEFAULT_PG_VERSION }}
+        postgres_version: ${{ env.PG_VERSION }}
        api_key: ${{ secrets.NEON_STAGING_API_KEY }}

    - name: Run benchmark
@@ -122,7 +126,7 @@ jobs:
        test_selection: performance
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
-        pg_version: ${{ env.DEFAULT_PG_VERSION }}
+        pg_version: ${{ env.PG_VERSION }}
        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
        # Set --sparse-ordering option of pytest-order plugin
        # to ensure tests are running in order of appears in the file.
@@ -313,7 +317,11 @@ jobs:
                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_azure"',   "image": "neondatabase/build-tools:pinned-bookworm" },
                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-new",      "db_size": "10gb","runner": '"$runner_azure"',   "image": "neondatabase/build-tools:pinned-bookworm" },
                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-new",      "db_size": "50gb","runner": '"$runner_azure"',   "image": "neondatabase/build-tools:pinned-bookworm" },
-                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }]
+                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" },
+                      { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier",       "db_size": "3gb" ,"runner": '"$runner_default"', "image": "'"$image_default"'" },
+                      { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new",            "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" },
+                      { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new-many-tables","db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" },
+                      { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new",            "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }]
        }'

        if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
@@ -329,12 +337,15 @@ jobs:
        matrix='{
          "platform": [
            "neonvm-captest-reuse"
+          ],
+          "pg_version" : [
+            16,17
          ]
        }'

        if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
-          matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" },
-                                                     { "platform": "rds-aurora"   }]')
+          matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 16, "platform": "rds-postgres" },
+                                                     { "pg_version": 16, "platform": "rds-aurora"   }]')
        fi

        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -346,14 +357,14 @@ jobs:
          "platform": [
            "neonvm-captest-reuse"
          ],
-          "scale": [
-            "10"
+          "pg_version" : [
+            16,17
          ]
        }'

        if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
-          matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" },
-                                                     { "platform": "rds-aurora",   "scale": "10" }]')
+          matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 16, "platform": "rds-postgres" },
+                                                     { "pg_version": 16, "platform": "rds-aurora"   }]')
        fi

        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -378,7 +389,7 @@ jobs:
      TEST_PG_BENCH_DURATIONS_MATRIX: "60m"
      TEST_PG_BENCH_SCALES_MATRIX: ${{ matrix.db_size }}
      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
-      DEFAULT_PG_VERSION: ${{ matrix.pg_version }}
+      PG_VERSION: ${{ matrix.pg_version }}
      TEST_OUTPUT: /tmp/test_output
      BUILD_TYPE: remote
      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
@@ -416,7 +427,7 @@ jobs:
      uses: ./.github/actions/neon-project-create
      with:
        region_id: ${{ matrix.region_id }}
-        postgres_version: ${{ env.DEFAULT_PG_VERSION }}
+        postgres_version: ${{ env.PG_VERSION }}
        api_key: ${{ secrets.NEON_STAGING_API_KEY }}
        compute_units: ${{ (contains(matrix.platform, 'captest-freetier') && '[0.25, 0.25]') || '[1, 1]' }}

@@ -447,7 +458,7 @@ jobs:

        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-    # we want to compare Neon project OLTP throughput and latency at scale factor 10 GB 
+    # we want to compare Neon project OLTP throughput and latency at scale factor 10 GB
    # without (neonvm-captest-new)
    # and with (neonvm-captest-new-many-tables) many relations in the database
    - name: Create many relations before the run
@@ -459,7 +470,7 @@ jobs:
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 21600 -k test_perf_many_relations
-        pg_version: ${{ env.DEFAULT_PG_VERSION }}
+        pg_version: ${{ env.PG_VERSION }}
        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
      env:
        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
@@ -475,7 +486,7 @@ jobs:
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_init
-        pg_version: ${{ env.DEFAULT_PG_VERSION }}
+        pg_version: ${{ env.PG_VERSION }}
        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
      env:
        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
@@ -490,7 +501,7 @@ jobs:
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_simple_update
-        pg_version: ${{ env.DEFAULT_PG_VERSION }}
+        pg_version: ${{ env.PG_VERSION }}
        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
      env:
        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
@@ -505,7 +516,7 @@ jobs:
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_select_only
-        pg_version: ${{ env.DEFAULT_PG_VERSION }}
+        pg_version: ${{ env.PG_VERSION }}
        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
      env:
        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
@@ -549,14 +560,19 @@ jobs:
        include:
          - PLATFORM: "neonvm-captest-pgvector"
            RUNNER: [ self-hosted, us-east-2, x64 ]
+            postgres_version: 16
+          - PLATFORM: "neonvm-captest-pgvector-pg17"
+            RUNNER: [ self-hosted, us-east-2, x64 ]
+            postgres_version: 17
          - PLATFORM: "azure-captest-pgvector"
            RUNNER: [ self-hosted, eastus2, x64 ]
+            postgres_version: 16

    env:
      TEST_PG_BENCH_DURATIONS_MATRIX: "15m"
      TEST_PG_BENCH_SCALES_MATRIX: "1"
      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
-      DEFAULT_PG_VERSION: 16
+      PG_VERSION: ${{ matrix.postgres_version }}
      TEST_OUTPUT: /tmp/test_output
      BUILD_TYPE: remote

@@ -574,32 +590,20 @@ jobs:
    steps:
    - uses: actions/checkout@v4

-    # until https://github.com/neondatabase/neon/issues/8275 is fixed we temporarily install postgresql-16
-    # instead of using Neon artifacts containing pgbench
-    - name: Install postgresql-16 where pytest expects it
-      run: |
-        # Just to make it easier to test things locally on macOS (with arm64)
-        arch=$(uname -m | sed 's/x86_64/amd64/g' | sed 's/aarch64/arm64/g')
+    - name: Configure AWS credentials
+      uses: aws-actions/configure-aws-credentials@v4
+      with:
+        aws-region: eu-central-1
+        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        role-duration-seconds: 18000 # 5 hours

-        cd /home/nonroot
-        wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-17/libpq5_17.2-1.pgdg120+1_${arch}.deb"
-        wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.6-1.pgdg120+1_${arch}.deb"
-        wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.6-1.pgdg120+1_${arch}.deb"
-        dpkg -x libpq5_17.2-1.pgdg120+1_${arch}.deb pg
-        dpkg -x postgresql-16_16.6-1.pgdg120+1_${arch}.deb pg
-        dpkg -x postgresql-client-16_16.6-1.pgdg120+1_${arch}.deb pg
-
-        mkdir -p /tmp/neon/pg_install/v16/bin
-        ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench
-        ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql    /tmp/neon/pg_install/v16/bin/psql
-        ln -s /home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu     /tmp/neon/pg_install/v16/lib
-
-        LD_LIBRARY_PATH="/home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu:${LD_LIBRARY_PATH:-}"
-        export LD_LIBRARY_PATH
-        echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >> ${GITHUB_ENV}
-
-        /tmp/neon/pg_install/v16/bin/pgbench --version
-        /tmp/neon/pg_install/v16/bin/psql --version
+    - name: Download Neon artifact
+      uses: ./.github/actions/download
+      with:
+        name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
+        path: /tmp/neon/
+        prefix: latest
+        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

    - name: Set up Connection String
      id: set-up-connstr
@@ -608,6 +612,9 @@ jobs:
          neonvm-captest-pgvector)
            CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}
            ;;
+          neonvm-captest-pgvector-pg17)
+            CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR_PG17 }}
+            ;;
          azure-captest-pgvector)
            CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR_AZURE }}
            ;;
@@ -619,13 +626,6 @@ jobs:

        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-    - name: Configure AWS credentials
-      uses: aws-actions/configure-aws-credentials@v4
-      with:
-        aws-region: eu-central-1
-        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
-        role-duration-seconds: 18000 # 5 hours
-
    - name: Benchmark pgvector hnsw indexing
      uses: ./.github/actions/run-python-test-set
      with:
@@ -634,7 +634,7 @@ jobs:
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 21600 -k test_pgvector_indexing
-        pg_version: ${{ env.DEFAULT_PG_VERSION }}
+        pg_version: ${{ env.PG_VERSION }}
        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
      env:
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -649,7 +649,7 @@ jobs:
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 21600
-        pg_version: ${{ env.DEFAULT_PG_VERSION }}
+        pg_version: ${{ env.PG_VERSION }}
        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
      env:
        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
@@ -696,7 +696,7 @@ jobs:

    env:
      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
-      DEFAULT_PG_VERSION: 16
+      PG_VERSION: ${{ matrix.pg_version }}
      TEST_OUTPUT: /tmp/test_output
      TEST_OLAP_COLLECT_EXPLAIN: ${{ github.event.inputs.collect_olap_explain }}
      TEST_OLAP_COLLECT_PG_STAT_STATEMENTS: ${{ github.event.inputs.collect_pg_stat_statements }}
@@ -739,7 +739,18 @@ jobs:
      run: |
        case "${PLATFORM}" in
          neonvm-captest-reuse)
-            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR }}
+            case "${PG_VERSION}" in
+              16)
+                CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR }}
+                ;;
+              17)
+                CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_CONNSTR_PG17 }}
+                ;;
+              *)
+                echo >&2 "Unsupported PG_VERSION=${PG_VERSION} for PLATFORM=${PLATFORM}"
+                exit 1
+                ;;
+            esac
            ;;
          rds-aurora)
            CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_CLICKBENCH_10M_CONNSTR }}
@@ -763,7 +774,7 @@ jobs:
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 43200 -k test_clickbench
-        pg_version: ${{ env.DEFAULT_PG_VERSION }}
+        pg_version: ${{ env.PG_VERSION }}
        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
      env:
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -799,7 +810,7 @@ jobs:
    # We might change it after https://github.com/neondatabase/neon/issues/2900.
    #
    # *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB)
-    if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
+    # if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
    permissions:
      contents: write
      statuses: write
@@ -812,12 +823,11 @@ jobs:

    env:
      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
-      DEFAULT_PG_VERSION: 16
+      PG_VERSION: ${{ matrix.pg_version }}
      TEST_OUTPUT: /tmp/test_output
      BUILD_TYPE: remote
      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
      PLATFORM: ${{ matrix.platform }}
-      TEST_OLAP_SCALE: ${{ matrix.scale }}

    runs-on: [ self-hosted, us-east-2, x64 ]
    container:
@@ -849,13 +859,24 @@ jobs:
      run: |
        case "${PLATFORM}" in
          neonvm-captest-reuse)
-            ENV_PLATFORM=CAPTEST_TPCH
+            case "${PG_VERSION}" in
+              16)
+                CONNSTR_SECRET_NAME="BENCHMARK_CAPTEST_TPCH_S10_CONNSTR"
+                ;;
+              17)
+                CONNSTR_SECRET_NAME="BENCHMARK_CAPTEST_TPCH_CONNSTR_PG17"
+                ;;
+              *)
+                echo >&2 "Unsupported PG_VERSION=${PG_VERSION} for PLATFORM=${PLATFORM}"
+                exit 1
+                ;;
+            esac
            ;;
          rds-aurora)
-            ENV_PLATFORM=RDS_AURORA_TPCH
+            CONNSTR_SECRET_NAME="BENCHMARK_RDS_AURORA_TPCH_S10_CONNSTR"
            ;;
          rds-postgres)
-            ENV_PLATFORM=RDS_POSTGRES_TPCH
+            CONNSTR_SECRET_NAME="BENCHMARK_RDS_POSTGRES_TPCH_S10_CONNSTR"
            ;;
          *)
            echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'"
@@ -863,7 +884,6 @@ jobs:
            ;;
        esac

-        CONNSTR_SECRET_NAME="BENCHMARK_${ENV_PLATFORM}_S${TEST_OLAP_SCALE}_CONNSTR"
        echo "CONNSTR_SECRET_NAME=${CONNSTR_SECRET_NAME}" >> $GITHUB_ENV

    - name: Set up Connection String
@@ -881,13 +901,13 @@ jobs:
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 21600 -k test_tpch
-        pg_version: ${{ env.DEFAULT_PG_VERSION }}
+        pg_version: ${{ env.PG_VERSION }}
        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
      env:
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
-        TEST_OLAP_SCALE: ${{ matrix.scale }}
+        TEST_OLAP_SCALE: 10

    - name: Create Allure report
      id: create-allure-report
@@ -909,7 +929,7 @@ jobs:
        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

  user-examples-compare:
-    if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
+    # if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
    permissions:
      contents: write
      statuses: write
@@ -922,7 +942,7 @@ jobs:

    env:
      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
-      DEFAULT_PG_VERSION: 16
+      PG_VERSION: ${{ matrix.pg_version }}
      TEST_OUTPUT: /tmp/test_output
      BUILD_TYPE: remote
      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
@@ -959,7 +979,18 @@ jobs:
      run: |
        case "${PLATFORM}" in
          neonvm-captest-reuse)
-            CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_CAPTEST_CONNSTR }}
+            case "${PG_VERSION}" in
+              16)
+                CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_CAPTEST_CONNSTR }}
+                ;;
+              17)
+                CONNSTR=${{ secrets.BENCHMARK_CAPTEST_USER_EXAMPLE_CONNSTR_PG17 }}
+                ;;
+              *)
+                echo >&2 "Unsupported PG_VERSION=${PG_VERSION} for PLATFORM=${PLATFORM}"
+                exit 1
+                ;;
+            esac
            ;;
          rds-aurora)
            CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_RDS_AURORA_CONNSTR }}
@@ -983,7 +1014,7 @@ jobs:
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 21600 -k test_user_examples
-        pg_version: ${{ env.DEFAULT_PG_VERSION }}
+        pg_version: ${{ env.PG_VERSION }}
        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
      env:
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -45,6 +45,26 @@ jobs:
            run cancel-previous-in-concurrency-group.yml \
              --field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}"

+  files-changed:
+    needs: [ check-permissions ]
+    runs-on: [ self-hosted, small ]
+    timeout-minutes: 3
+    outputs:
+      check-rust-dependencies: ${{ steps.files-changed.outputs.rust_dependencies }}
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+
+      - name: Check for file changes
+        uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36  # v3.0.2
+        id: files-changed
+        with:
+          token: ${{ secrets.GITHUB_TOKEN }}
+          filters: .github/file-filters.yaml
+
  tag:
    needs: [ check-permissions ]
    runs-on: [ self-hosted, small ]
@@ -170,6 +190,14 @@ jobs:
      archs: '["x64", "arm64"]'
    secrets: inherit

+  check-dependencies-rust:
+    needs: [ files-changed, build-build-tools-image ]
+    if: ${{ needs.files-changed.outputs.check-rust-dependencies == 'true' }}
+    uses: ./.github/workflows/cargo-deny.yml
+    with:
+      build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
+    secrets: inherit
+
  build-and-test-locally:
    needs: [ tag, build-build-tools-image ]
    strategy:
@@ -654,7 +682,7 @@ jobs:
          push: true
          pull: true
          file: compute/compute-node.Dockerfile
-          target: neon-pg-ext-test
+          target: extension-tests
          cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }}
          tags: |
            neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }}
@@ -1332,6 +1360,8 @@ jobs:
      - build-and-test-locally
      - check-codestyle-python
      - check-codestyle-rust
+      - check-dependencies-rust
+      - files-changed
      - promote-images-dev
      - test-images
      - trigger-custom-extensions-build-and-wait
@@ -1344,4 +1374,11 @@ jobs:
        if: |
          contains(needs.*.result, 'failure')
          || contains(needs.*.result, 'cancelled')
-          || contains(needs.*.result, 'skipped')
+          || (needs.check-dependencies-rust.result == 'skipped' && needs.files-changed.outputs.check-rust-dependencies == 'true')
+          || needs.build-and-test-locally.result == 'skipped'
+          || needs.check-codestyle-python.result == 'skipped'
+          || needs.check-codestyle-rust.result == 'skipped'
+          || needs.files-changed.result == 'skipped'
+          || needs.promote-images-dev.result == 'skipped'
+          || needs.test-images.result == 'skipped'
+          || needs.trigger-custom-extensions-build-and-wait.result == 'skipped'
--- a/.github/workflows/build_and_test_with_sanitizers.yml
+++ b/.github/workflows/build_and_test_with_sanitizers.yml
@@ -0,0 +1,133 @@
+name: Build and Test with Sanitizers
+
+on:
+  schedule:
+    # * is a special character in YAML so you have to quote this string
+    #          ┌───────────── minute (0 - 59)
+    #          │ ┌───────────── hour (0 - 23)
+    #          │ │ ┌───────────── day of the month (1 - 31)
+    #          │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
+    #          │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
+    - cron:   '0 1 * * *' # run once a day, timezone is utc
+  workflow_dispatch:
+
+defaults:
+  run:
+    shell: bash -euxo pipefail {0}
+
+concurrency:
+  # Allow only one workflow per any non-`main` branch.
+  group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
+  cancel-in-progress: true
+
+env:
+  RUST_BACKTRACE: 1
+  COPT: '-Werror'
+
+jobs:
+  tag:
+    runs-on: [ self-hosted, small ]
+    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
+    outputs:
+      build-tag: ${{steps.build-tag.outputs.tag}}
+
+    steps:
+      # Need `fetch-depth: 0` to count the number of commits in the branch
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Get build tag
+        run: |
+          echo run:$GITHUB_RUN_ID
+          echo ref:$GITHUB_REF_NAME
+          echo rev:$(git rev-list --count HEAD)
+          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
+            echo "tag=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
+          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
+            echo "tag=release-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
+          elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
+            echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
+          elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then
+            echo "tag=release-compute-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
+          else
+            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release', 'release-proxy', 'release-compute'"
+            echo "tag=$GITHUB_RUN_ID" >> $GITHUB_OUTPUT
+          fi
+        shell: bash
+        id: build-tag
+
+  build-build-tools-image:
+    uses: ./.github/workflows/build-build-tools-image.yml
+    secrets: inherit
+
+  build-and-test-locally:
+    needs: [ tag, build-build-tools-image ]
+    strategy:
+      fail-fast: false
+      matrix:
+        arch: [ x64, arm64 ]
+        build-type: [ release ]
+    uses: ./.github/workflows/_build-and-test-locally.yml
+    with:
+      arch: ${{ matrix.arch }}
+      build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
+      build-tag: ${{ needs.tag.outputs.build-tag }}
+      build-type: ${{ matrix.build-type }}
+      test-cfg: '[{"pg_version":"v17", "sanitizers": "enabled"}]'
+    secrets: inherit
+
+
+  create-test-report:
+    needs: [ build-and-test-locally, build-build-tools-image ]
+    if: ${{ !cancelled() }}
+    permissions:
+      id-token: write # aws-actions/configure-aws-credentials
+      statuses: write
+      contents: write
+      pull-requests: write
+    outputs:
+      report-url: ${{ steps.create-allure-report.outputs.report-url }}
+
+    runs-on: [ self-hosted, small ]
+    container:
+      image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+      options: --init
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Create Allure report
+        if: ${{ !cancelled() }}
+        id: create-allure-report
+        uses: ./.github/actions/allure-report-generate
+        with:
+          store-test-results-into-db: true
+          aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        env:
+          REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
+
+      - uses: actions/github-script@v7
+        if: ${{ !cancelled() }}
+        with:
+          # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
+          retries: 5
+          script: |
+            const report = {
+              reportUrl:     "${{ steps.create-allure-report.outputs.report-url }}",
+              reportJsonUrl: "${{ steps.create-allure-report.outputs.report-json-url }}",
+            }
+
+            const coverage = {}
+
+            const script = require("./scripts/comment-test-report.js")
+            await script({
+              github,
+              context,
+              fetch,
+              report,
+              coverage,
+            })
--- a/.github/workflows/cargo-deny.yml
+++ b/.github/workflows/cargo-deny.yml
@@ -0,0 +1,57 @@
+name: cargo deny checks
+
+on:
+  workflow_call:
+    inputs:
+      build-tools-image:
+        required: false
+        type: string
+  schedule:
+    - cron: '0 0 * * *'
+
+jobs:
+  cargo-deny:
+    strategy:
+      matrix:
+        ref: >-
+          ${{
+            fromJSON(
+              github.event_name == 'schedule'
+                && '["main","release","release-proxy","release-compute"]'
+                || format('["{0}"]', github.sha)
+            )
+          }}
+
+    runs-on: [self-hosted, small]
+
+    container:
+      image: ${{ inputs.build-tools-image || 'neondatabase/build-tools:pinned' }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+      options: --init
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ matrix.ref }}
+
+      - name: Check rust licenses/bans/advisories/sources
+        env:
+          CARGO_DENY_TARGET: >-
+            ${{ github.event_name == 'schedule' && 'advisories' || 'all' }}
+        run: cargo deny check --hide-inclusion-graph $CARGO_DENY_TARGET
+
+      - name: Post to a Slack channel
+        if: ${{ github.event_name == 'schedule' && failure() }}
+        uses: slackapi/slack-github-action@v2
+        with:
+          method: chat.postMessage
+          token: ${{ secrets.SLACK_BOT_TOKEN }}
+          payload: |
+            channel: ${{ vars.SLACK_CICD_CHANNEL_ID }}
+            text: |
+              Periodic cargo-deny on ${{ matrix.ref }}: ${{ job.status }}
+              <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
+              Pinging @oncall-devprod.
--- a/.github/workflows/pg-clients.yml
+++ b/.github/workflows/pg-clients.yml
@@ -12,8 +12,8 @@ on:
  pull_request:
    paths:
      - '.github/workflows/pg-clients.yml'
-      - 'test_runner/pg_clients/**'
-      - 'test_runner/logical_repl/**'
+      - 'test_runner/pg_clients/**/*.py'
+      - 'test_runner/logical_repl/**/*.py'
      - 'poetry.lock'
  workflow_dispatch:

@@ -104,6 +104,8 @@ jobs:
        with:
          api_key: ${{ secrets.NEON_STAGING_API_KEY }}
          postgres_version: ${{ env.DEFAULT_PG_VERSION }}
+          project_settings: >-
+            {"enable_logical_replication": true}

      - name: Run tests
        uses: ./.github/actions/run-python-test-set
--- a/.github/workflows/pre-merge-checks.yml
+++ b/.github/workflows/pre-merge-checks.yml
@@ -59,7 +59,10 @@ jobs:
          echo "${RUST_CHANGED_FILES}"

  build-build-tools-image:
-    if: needs.get-changed-files.outputs.python-changed == 'true'
+    if: |
+      false
+      || needs.get-changed-files.outputs.python-changed == 'true'
+      || needs.get-changed-files.outputs.rust-changed == 'true'
    needs: [ get-changed-files ]
    uses: ./.github/workflows/build-build-tools-image.yml
    with:
@@ -92,7 +95,8 @@ jobs:
  # - conclusion
  # - neon-cloud-e2e
  conclusion:
-    if: always()
+    # Do not run job on Pull Requests as it interferes with the `conclusion` job from the `build_and_test` workflow
+    if: always() && github.event_name == 'merge_group'
    permissions:
      statuses: write # for `github.repos.createCommitStatus(...)`
      contents: write
@@ -124,6 +128,8 @@ jobs:
      - name: Fail the job if any of the dependencies do not succeed or skipped
        run: exit 1
        if: |
-          (contains(needs.check-codestyle-python.result, 'skipped') && needs.get-changed-files.outputs.python-changed == 'true')
+          false
+          || (needs.check-codestyle-python.result == 'skipped' && needs.get-changed-files.outputs.python-changed == 'true')
+          || (needs.check-codestyle-rust.result   == 'skipped' && needs.get-changed-files.outputs.rust-changed   == 'true')
          || contains(needs.*.result, 'failure')
          || contains(needs.*.result, 'cancelled')
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -206,6 +206,16 @@ dependencies = [
 "syn 2.0.90",
 ]

+[[package]]
+name = "assert-json-diff"
+version = "2.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "47e4f2b81832e72834d7518d8487a0396a28cc408186a2e8854c0f98011faf12"
+dependencies = [
+ "serde",
+ "serde_json",
+]
+
 [[package]]
 name = "async-channel"
 version = "1.9.0"
@@ -978,7 +988,7 @@ version = "0.70.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f49d8fed880d473ea71efb9bf597651e77201bdd4893efe54c9e5d65ae04ce6f"
 dependencies = [
- "bitflags 2.4.1",
+ "bitflags 2.8.0",
 "cexpr",
 "clang-sys",
 "itertools 0.12.1",
@@ -1006,9 +1016,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"

 [[package]]
 name = "bitflags"
-version = "2.4.1"
+version = "2.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07"
+checksum = "8f68f53c83ab957f72c32642f3868eec03eb974d1fb82e453128456482613d36"

 [[package]]
 name = "block-buffer"
@@ -1019,6 +1029,12 @@ dependencies = [
 "generic-array",
 ]

+[[package]]
+name = "boxcar"
+version = "0.2.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2721c3c5a6f0e7f7e607125d963fedeb765f545f67adc9d71ed934693881eb42"
+
 [[package]]
 name = "bstr"
 version = "1.5.0"
@@ -1225,6 +1241,20 @@ version = "0.7.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "afb84c814227b90d6895e01398aee0d8033c00e7466aca416fb6a8e0eb19d8a7"

+[[package]]
+name = "clashmap"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "93bd59c81e2bd87a775ae2de75f070f7e2bfe97363a6ad652f46824564c23e4d"
+dependencies = [
+ "crossbeam-utils",
+ "hashbrown 0.15.2",
+ "lock_api",
+ "parking_lot_core 0.9.8",
+ "polonius-the-crab",
+ "replace_with",
+]
+
 [[package]]
 name = "colorchoice"
 version = "1.0.0"
@@ -1312,7 +1342,7 @@ dependencies = [
 "tar",
 "thiserror 1.0.69",
 "tokio",
- "tokio-postgres 0.7.7",
+ "tokio-postgres",
 "tokio-stream",
 "tokio-util",
 "tower 0.5.2",
@@ -1421,7 +1451,7 @@ dependencies = [
 "storage_broker",
 "thiserror 1.0.69",
 "tokio",
- "tokio-postgres 0.7.7",
+ "tokio-postgres",
 "tokio-util",
 "toml",
 "toml_edit",
@@ -1561,7 +1591,7 @@ version = "0.27.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f476fe445d41c9e991fd07515a6f463074b782242ccf4a5b7b1d1012e70824df"
 dependencies = [
- "bitflags 2.4.1",
+ "bitflags 2.8.0",
 "crossterm_winapi",
 "libc",
 "parking_lot 0.12.1",
@@ -1792,7 +1822,7 @@ version = "2.2.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ccf1bedf64cdb9643204a36dd15b19a6ce8e7aa7f7b105868e9f1fad5ffa7d12"
 dependencies = [
- "bitflags 2.4.1",
+ "bitflags 2.8.0",
 "byteorder",
 "chrono",
 "diesel_derives",
@@ -1812,7 +1842,7 @@ dependencies = [
 "futures-util",
 "scoped-futures",
 "tokio",
- "tokio-postgres 0.7.12",
+ "tokio-postgres",
 ]

 [[package]]
@@ -2428,6 +2458,16 @@ dependencies = [
 "wasm-bindgen",
 ]

+[[package]]
+name = "gettid"
+version = "0.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "397256552fed4a9e577850498071831ec8f18ea83368aecc114cab469dcb43e5"
+dependencies = [
+ "libc",
+ "winapi",
+]
+
 [[package]]
 name = "gimli"
 version = "0.31.1"
@@ -2556,6 +2596,12 @@ dependencies = [
 "allocator-api2",
 ]

+[[package]]
+name = "hashbrown"
+version = "0.15.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289"
+
 [[package]]
 name = "hashlink"
 version = "0.9.1"
@@ -2606,6 +2652,15 @@ version = "0.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6fe2267d4ed49bc07b63801559be28c718ea06c4738b7a03c94df7386d2cde46"

+[[package]]
+name = "higher-kinded-types"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "561985554c8b8d4808605c90a5f1979cc6c31a5d20b78465cd59501233c6678e"
+dependencies = [
+ "never-say-never",
+]
+
 [[package]]
 name = "hmac"
 version = "0.12.1"
@@ -3084,11 +3139,11 @@ dependencies = [

 [[package]]
 name = "inotify"
-version = "0.9.6"
+version = "0.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f8069d3ec154eb856955c1c0fbffefbf5f3c40a104ec912d4797314c1801abff"
+checksum = "f37dccff2791ab604f9babef0ba14fbe0be30bd368dc541e2b08d07c8aa908f3"
 dependencies = [
- "bitflags 1.3.2",
+ "bitflags 2.8.0",
 "inotify-sys",
 "libc",
 ]
@@ -3265,9 +3320,9 @@ dependencies = [

 [[package]]
 name = "kqueue"
-version = "1.0.7"
+version = "1.0.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2c8fc60ba15bf51257aa9807a48a61013db043fcf3a78cb0d916e8e396dcad98"
+checksum = "7447f1ca1b7b563588a205fe93dea8df60fd981423a768bc1c0ded35ed147d0c"
 dependencies = [
 "kqueue-sys",
 "libc",
@@ -3275,9 +3330,9 @@ dependencies = [

 [[package]]
 name = "kqueue-sys"
-version = "1.0.3"
+version = "1.0.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8367585489f01bc55dd27404dcf56b95e6da061a256a666ab23be9ba96a2e587"
+checksum = "ed9625ffda8729b85e45cf04090035ac368927b8cebc34898e7c120f52e4838b"
 dependencies = [
 "bitflags 1.3.2",
 "libc",
@@ -3304,9 +3359,9 @@ dependencies = [

 [[package]]
 name = "libc"
-version = "0.2.167"
+version = "0.2.169"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "09d6582e104315a817dff97f75133544b2e094ee22447d2acf4a74e189ba06fc"
+checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a"

 [[package]]
 name = "libloading"
@@ -3553,14 +3608,14 @@ dependencies = [

 [[package]]
 name = "mio"
-version = "0.8.11"
+version = "1.0.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c"
+checksum = "2886843bf800fba2e3377cff24abf6379b4c4d5c6681eaf9ea5b0d15090450bd"
 dependencies = [
 "libc",
 "log",
 "wasi 0.11.0+wasi-snapshot-preview1",
- "windows-sys 0.48.0",
+ "windows-sys 0.52.0",
 ]

 [[package]]
@@ -3569,6 +3624,12 @@ version = "0.8.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a"

+[[package]]
+name = "never-say-never"
+version = "6.6.666"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cf5a574dadd7941adeaa71823ecba5e28331b8313fb2e1c6a5c7e5981ea53ad6"
+
 [[package]]
 name = "nix"
 version = "0.25.1"
@@ -3600,7 +3661,7 @@ version = "0.27.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2eb04e9c688eff1c89d72b407f168cf79bb9e867a9d3323ed6c01519eb9cc053"
 dependencies = [
- "bitflags 2.4.1",
+ "bitflags 2.8.0",
 "cfg-if",
 "libc",
 "memoffset 0.9.0",
@@ -3618,12 +3679,11 @@ dependencies = [

 [[package]]
 name = "notify"
-version = "6.1.1"
+version = "8.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6205bd8bb1e454ad2e27422015fb5e4f2bcc7e08fa8f27058670d208324a4d2d"
+checksum = "2fee8403b3d66ac7b26aee6e40a897d85dc5ce26f44da36b8b73e987cc52e943"
 dependencies = [
- "bitflags 2.4.1",
- "crossbeam-channel",
+ "bitflags 2.8.0",
 "filetime",
 "fsevent-sys",
 "inotify",
@@ -3631,10 +3691,17 @@ dependencies = [
 "libc",
 "log",
 "mio",
+ "notify-types",
 "walkdir",
- "windows-sys 0.48.0",
+ "windows-sys 0.59.0",
 ]

+[[package]]
+name = "notify-types"
+version = "2.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5e0826a989adedc2a244799e823aece04662b66609d96af8dff7ac6df9a8925d"
+
 [[package]]
 name = "ntapi"
 version = "0.4.1"
@@ -4060,8 +4127,8 @@ dependencies = [
 "pageserver_compaction",
 "pin-project-lite",
 "postgres",
- "postgres-protocol 0.6.4",
- "postgres-types 0.2.4",
+ "postgres-protocol",
+ "postgres-types",
 "postgres_backend",
 "postgres_connection",
 "postgres_ffi",
@@ -4092,7 +4159,7 @@ dependencies = [
 "tokio",
 "tokio-epoll-uring",
 "tokio-io-timeout",
- "tokio-postgres 0.7.7",
+ "tokio-postgres",
 "tokio-stream",
 "tokio-tar",
 "tokio-util",
@@ -4150,7 +4217,7 @@ dependencies = [
 "serde",
 "thiserror 1.0.69",
 "tokio",
- "tokio-postgres 0.7.7",
+ "tokio-postgres",
 "tokio-stream",
 "tokio-util",
 "utils",
@@ -4180,6 +4247,16 @@ dependencies = [
 "workspace_hack",
 ]

+[[package]]
+name = "papaya"
+version = "0.1.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dc7c76487f7eaa00a0fc1d7f88dc6b295aec478d11b0fc79f857b62c2874124c"
+dependencies = [
+ "equivalent",
+ "seize",
+]
+
 [[package]]
 name = "parking"
 version = "2.1.1"
@@ -4446,48 +4523,40 @@ dependencies = [
 "plotters-backend",
 ]

+[[package]]
+name = "polonius-the-crab"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e97ca2c89572ae41bbec1c99498251f87dd5a94e500c5ec19c382dd593dd5ce9"
+dependencies = [
+ "higher-kinded-types",
+ "never-say-never",
+]
+
 [[package]]
 name = "postgres"
-version = "0.19.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473"
+version = "0.19.7"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9"
 dependencies = [
 "bytes",
 "fallible-iterator",
 "futures-util",
 "log",
 "tokio",
- "tokio-postgres 0.7.7",
+ "tokio-postgres",
 ]

 [[package]]
 name = "postgres-protocol"
-version = "0.6.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473"
-dependencies = [
- "base64 0.20.0",
- "byteorder",
- "bytes",
- "fallible-iterator",
- "hmac",
- "lazy_static",
- "md-5",
- "memchr",
- "rand 0.8.5",
- "sha2",
- "stringprep",
-]
-
-[[package]]
-name = "postgres-protocol"
-version = "0.6.7"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "acda0ebdebc28befa84bee35e651e4c5f09073d668c7aed4cf7e23c3cda84b23"
+version = "0.6.6"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9"
 dependencies = [
 "base64 0.22.1",
 "byteorder",
 "bytes",
 "fallible-iterator",
 "hmac",
+ "lazy_static",
 "md-5",
 "memchr",
 "rand 0.8.5",
@@ -4513,23 +4582,13 @@ dependencies = [

 [[package]]
 name = "postgres-types"
-version = "0.2.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473"
+version = "0.2.6"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9"
 dependencies = [
 "bytes",
+ "chrono",
 "fallible-iterator",
- "postgres-protocol 0.6.4",
-]
-
-[[package]]
-name = "postgres-types"
-version = "0.2.8"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f66ea23a2d0e5734297357705193335e0a957696f34bed2f2faefacb2fec336f"
-dependencies = [
- "bytes",
- "fallible-iterator",
- "postgres-protocol 0.6.7",
+ "postgres-protocol",
 ]

 [[package]]
@@ -4554,7 +4613,7 @@ dependencies = [
 "serde",
 "thiserror 1.0.69",
 "tokio",
- "tokio-postgres 0.7.7",
+ "tokio-postgres",
 "tokio-postgres-rustls",
 "tokio-rustls 0.26.0",
 "tokio-util",
@@ -4569,7 +4628,7 @@ dependencies = [
 "itertools 0.10.5",
 "once_cell",
 "postgres",
- "tokio-postgres 0.7.7",
+ "tokio-postgres",
 "url",
 ]

@@ -4663,7 +4722,7 @@ dependencies = [
 "byteorder",
 "bytes",
 "itertools 0.10.5",
- "postgres-protocol 0.6.4",
+ "postgres-protocol",
 "rand 0.8.5",
 "serde",
 "thiserror 1.0.69",
@@ -4704,7 +4763,7 @@ version = "0.16.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4"
 dependencies = [
- "bitflags 2.4.1",
+ "bitflags 2.8.0",
 "chrono",
 "flate2",
 "hex",
@@ -4719,7 +4778,7 @@ version = "0.16.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29"
 dependencies = [
- "bitflags 2.4.1",
+ "bitflags 2.8.0",
 "chrono",
 "hex",
 ]
@@ -4825,6 +4884,7 @@ dependencies = [
 "ahash",
 "anyhow",
 "arc-swap",
+ "assert-json-diff",
 "async-compression",
 "async-trait",
 "atomic-take",
@@ -4832,15 +4892,16 @@ dependencies = [
 "aws-sdk-iam",
 "aws-sigv4",
 "base64 0.13.1",
+ "boxcar",
 "bstr",
 "bytes",
 "camino",
 "camino-tempfile",
 "chrono",
 "clap",
+ "clashmap",
 "compute_api",
 "consumption_metrics",
- "dashmap 5.5.0",
 "ecdsa 0.16.9",
 "ed25519-dalek",
 "env_logger 0.10.2",
@@ -4848,6 +4909,7 @@ dependencies = [
 "flate2",
 "framed-websockets",
 "futures",
+ "gettid",
 "hashbrown 0.14.5",
 "hashlink",
 "hex",
@@ -4870,7 +4932,9 @@ dependencies = [
 "measured",
 "metrics",
 "once_cell",
+ "opentelemetry",
 "p256 0.13.2",
+ "papaya",
 "parking_lot 0.12.1",
 "parquet",
 "parquet_derive",
@@ -4911,12 +4975,15 @@ dependencies = [
 "tikv-jemalloc-ctl",
 "tikv-jemallocator",
 "tokio",
- "tokio-postgres 0.7.7",
+ "tokio-postgres",
 "tokio-postgres2",
 "tokio-rustls 0.26.0",
 "tokio-tungstenite 0.21.0",
 "tokio-util",
 "tracing",
+ "tracing-log",
+ "tracing-opentelemetry",
+ "tracing-serde",
 "tracing-subscriber",
 "tracing-utils",
 "try-lock",
@@ -5248,6 +5315,12 @@ dependencies = [
 "utils",
 ]

+[[package]]
+name = "replace_with"
+version = "0.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e3a8614ee435691de62bcffcf4a66d91b3594bf1428a5722e79103249a095690"
+
 [[package]]
 name = "reqwest"
 version = "0.12.4"
@@ -5527,7 +5600,7 @@ version = "0.38.41"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d7f649912bc1495e167a6edee79151c84b1bad49748cb4f1f1167f459f6224f6"
 dependencies = [
- "bitflags 2.4.1",
+ "bitflags 2.8.0",
 "errno",
 "libc",
 "linux-raw-sys 0.4.14",
@@ -5699,7 +5772,7 @@ dependencies = [
 "pageserver_api",
 "parking_lot 0.12.1",
 "postgres",
- "postgres-protocol 0.6.4",
+ "postgres-protocol",
 "postgres_backend",
 "postgres_ffi",
 "pprof",
@@ -5723,7 +5796,7 @@ dependencies = [
 "tikv-jemallocator",
 "tokio",
 "tokio-io-timeout",
- "tokio-postgres 0.7.7",
+ "tokio-postgres",
 "tokio-stream",
 "tokio-tar",
 "tokio-util",
@@ -5864,6 +5937,16 @@ dependencies = [
 "libc",
 ]

+[[package]]
+name = "seize"
+version = "0.4.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d84b0c858bdd30cb56f5597f8b3bf702ec23829e652cc636a1e5a7b9de46ae93"
+dependencies = [
+ "libc",
+ "windows-sys 0.52.0",
+]
+
 [[package]]
 name = "semver"
 version = "1.0.17"
@@ -6340,6 +6423,8 @@ dependencies = [
 "rand 0.8.5",
 "reqwest",
 "routerify",
+ "rustls 0.23.18",
+ "rustls-native-certs 0.8.0",
 "scoped-futures",
 "scopeguard",
 "serde",
@@ -6348,6 +6433,8 @@ dependencies = [
 "strum_macros",
 "thiserror 1.0.69",
 "tokio",
+ "tokio-postgres",
+ "tokio-postgres-rustls",
 "tokio-util",
 "tracing",
 "utils",
@@ -6393,7 +6480,7 @@ dependencies = [
 "serde_json",
 "storage_controller_client",
 "tokio",
- "tokio-postgres 0.7.7",
+ "tokio-postgres",
 "tokio-postgres-rustls",
 "tokio-stream",
 "tokio-util",
@@ -6590,7 +6677,7 @@ dependencies = [
 "fastrand 2.2.0",
 "once_cell",
 "rustix",
- "windows-sys 0.52.0",
+ "windows-sys 0.59.0",
 ]

 [[package]]
@@ -6802,21 +6889,20 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"

 [[package]]
 name = "tokio"
-version = "1.38.1"
+version = "1.43.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eb2caba9f80616f438e09748d5acda951967e1ea58508ef53d9c6402485a46df"
+checksum = "3d61fa4ffa3de412bfea335c6ecff681de2b609ba3c77ef3e00e521813a9ed9e"
 dependencies = [
 "backtrace",
 "bytes",
 "libc",
 "mio",
- "num_cpus",
 "parking_lot 0.12.1",
 "pin-project-lite",
 "signal-hook-registry",
 "socket2",
 "tokio-macros",
- "windows-sys 0.48.0",
+ "windows-sys 0.52.0",
 ]

 [[package]]
@@ -6847,9 +6933,9 @@ dependencies = [

 [[package]]
 name = "tokio-macros"
-version = "2.3.0"
+version = "2.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a"
+checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8"
 dependencies = [
 "proc-macro2",
 "quote",
@@ -6858,8 +6944,8 @@ dependencies = [

 [[package]]
 name = "tokio-postgres"
-version = "0.7.7"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473"
+version = "0.7.10"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9"
 dependencies = [
 "async-trait",
 "byteorder",
@@ -6872,32 +6958,8 @@ dependencies = [
 "percent-encoding",
 "phf",
 "pin-project-lite",
- "postgres-protocol 0.6.4",
- "postgres-types 0.2.4",
- "socket2",
- "tokio",
- "tokio-util",
-]
-
-[[package]]
-name = "tokio-postgres"
-version = "0.7.12"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3b5d3742945bc7d7f210693b0c58ae542c6fd47b17adbbda0885f3dcb34a6bdb"
-dependencies = [
- "async-trait",
- "byteorder",
- "bytes",
- "fallible-iterator",
- "futures-channel",
- "futures-util",
- "log",
- "parking_lot 0.12.1",
- "percent-encoding",
- "phf",
- "pin-project-lite",
- "postgres-protocol 0.6.7",
- "postgres-types 0.2.8",
+ "postgres-protocol",
+ "postgres-types",
 "rand 0.8.5",
 "socket2",
 "tokio",
@@ -6914,7 +6976,7 @@ dependencies = [
 "ring",
 "rustls 0.23.18",
 "tokio",
- "tokio-postgres 0.7.7",
+ "tokio-postgres",
 "tokio-rustls 0.26.0",
 "x509-certificate",
 ]
@@ -6935,6 +6997,7 @@ dependencies = [
 "pin-project-lite",
 "postgres-protocol2",
 "postgres-types2",
+ "serde",
 "tokio",
 "tokio-util",
 ]
@@ -7157,7 +7220,7 @@ version = "0.6.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "403fa3b783d4b626a8ad51d766ab03cb6d2dbfc46b1c5d4448395e6628dc9697"
 dependencies = [
- "bitflags 2.4.1",
+ "bitflags 2.8.0",
 "bytes",
 "http 1.1.0",
 "http-body 1.0.0",
@@ -7591,7 +7654,7 @@ dependencies = [
 "serde_json",
 "sysinfo",
 "tokio",
- "tokio-postgres 0.7.7",
+ "tokio-postgres",
 "tokio-util",
 "tracing",
 "tracing-subscriber",
@@ -7654,9 +7717,9 @@ dependencies = [

 [[package]]
 name = "walkdir"
-version = "2.3.3"
+version = "2.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "36df944cda56c7d8d8b7496af378e6b16de9284591917d307c9b4d313c44e698"
+checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b"
 dependencies = [
 "same-file",
 "winapi-util",
@@ -7908,6 +7971,15 @@ dependencies = [
 "windows-targets 0.52.6",
 ]

+[[package]]
+name = "windows-sys"
+version = "0.59.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
+dependencies = [
+ "windows-targets 0.52.6",
+]
+
 [[package]]
 name = "windows-targets"
 version = "0.48.0"
@@ -8136,6 +8208,7 @@ dependencies = [
 "tower 0.4.13",
 "tracing",
 "tracing-core",
+ "tracing-log",
 "url",
 "zerocopy",
 "zeroize",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -54,6 +54,7 @@ async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
 atomic-take = "1.1.0"
 backtrace = "0.3.74"
 flate2 = "1.0.26"
+assert-json-diff = "2"
 async-stream = "0.3"
 async-trait = "0.1"
 aws-config = { version = "1.5", default-features = false, features=["rustls", "sso"] }
@@ -77,10 +78,10 @@ camino = "1.1.6"
 cfg-if = "1.0.0"
 chrono = { version = "0.4", default-features = false, features = ["clock"] }
 clap = { version = "4.0", features = ["derive", "env"] }
+clashmap = { version = "1.0", features = ["raw-api"] }
 comfy-table = "7.1"
 const_format = "0.2"
 crc32c = "0.6"
-dashmap = { version = "5.5.0", features = ["raw-api"] }
 diatomic-waker = { version = "0.2.3" }
 either = "1.8"
 enum-map = "2.4.2"
@@ -123,7 +124,7 @@ measured = { version = "0.0.22", features=["lasso"] }
 measured-process = { version = "0.0.22" }
 memoffset = "0.9"
 nix = { version = "0.27", features = ["dir", "fs", "process", "socket", "signal", "poll"] }
-notify = "6.0.0"
+notify = "8.0.0"
 num_cpus = "1.15"
 num-traits = "0.2.15"
 once_cell = "1.13"
@@ -177,7 +178,7 @@ test-context = "0.3"
 thiserror = "1.0"
 tikv-jemallocator = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] }
 tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] }
-tokio = { version = "1.17", features = ["macros"] }
+tokio = { version = "1.41", features = ["macros"] }
 tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
 tokio-io-timeout = "1.2.0"
 tokio-postgres-rustls = "0.12.0"
@@ -193,7 +194,9 @@ tower-http = { version = "0.6.2", features = ["request-id", "trace"] }
 tower-service = "0.3.3"
 tracing = "0.1"
 tracing-error = "0.2"
+tracing-log = "0.2"
 tracing-opentelemetry = "0.28"
+tracing-serde = "0.2.0"
 tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
 try-lock = "0.2.5"
 twox-hash = { version = "1.6.3", default-features = false }
--- a/6
+++ b/6
@@ -24,7 +24,7 @@ COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh

 ENV BUILD_TYPE=release
 RUN set -e \
-    && mold -run make -j $(nproc) -s neon-pg-ext \
+    && make -j $(nproc) -s neon-pg-ext \
    && rm -rf pg_install/build \
    && tar -C pg_install -czf /home/nonroot/postgres_install.tar.gz .

@@ -45,7 +45,7 @@ COPY --chown=nonroot . .

 ARG ADDITIONAL_RUSTFLAGS
 RUN set -e \
-    && RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo build \
+    && RUSTFLAGS="-Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo build \
      --bin pg_sni_router  \
      --bin pageserver  \
      --bin pagectl  \
@@ -64,6 +64,7 @@ ARG DEFAULT_PG_VERSION
 WORKDIR /data

 RUN set -e \
+    && echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries \
    && apt update \
    && apt install -y \
        libreadline-dev \
@@ -72,6 +73,7 @@ RUN set -e \
 	# System postgres for use with client libraries (e.g. in storage controller)
        postgresql-15 \
        openssl \
+    && rm -f /etc/apt/apt.conf.d/80-retries \
    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \
    && useradd -d /data neon \
    && chown -R neon:neon /data
--- a/17
+++ b/17
@@ -10,18 +10,29 @@ ICU_PREFIX_DIR := /usr/local/icu
 # environment variable.
 #
 BUILD_TYPE ?= debug
+WITH_SANITIZERS ?= no
 ifeq ($(BUILD_TYPE),release)
 	PG_CONFIGURE_OPTS = --enable-debug --with-openssl
 	PG_CFLAGS = -O2 -g3 $(CFLAGS)
+	PG_LDFLAGS = $(LDFLAGS)
 	# Unfortunately, `--profile=...` is a nightly feature
 	CARGO_BUILD_FLAGS += --release
 else ifeq ($(BUILD_TYPE),debug)
 	PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend
 	PG_CFLAGS = -O0 -g3 $(CFLAGS)
+	PG_LDFLAGS = $(LDFLAGS)
 else
 	$(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
 endif

+ifeq ($(WITH_SANITIZERS),yes)
+	PG_CFLAGS += -fsanitize=address -fsanitize=undefined -fno-sanitize-recover
+	COPT += -Wno-error # to avoid failing on warnings induced by sanitizers
+	PG_LDFLAGS = -fsanitize=address -fsanitize=undefined -static-libasan -static-libubsan $(LDFLAGS)
+	export CC := gcc
+	export ASAN_OPTIONS := detect_leaks=0
+endif
+
 ifeq ($(shell test -e /home/nonroot/.docker_build && echo -n yes),yes)
 	# Exclude static build openssl, icu for local build (MacOS, Linux)
 	# Only keep for build type release and debug
@@ -33,7 +44,9 @@ endif
 UNAME_S := $(shell uname -s)
 ifeq ($(UNAME_S),Linux)
 	# Seccomp BPF is only available for Linux
-	PG_CONFIGURE_OPTS += --with-libseccomp
+	ifneq ($(WITH_SANITIZERS),yes)
+		PG_CONFIGURE_OPTS += --with-libseccomp
+	endif
 else ifeq ($(UNAME_S),Darwin)
 	PG_CFLAGS += -DUSE_PREFETCH
 	ifndef DISABLE_HOMEBREW
@@ -106,7 +119,7 @@ $(POSTGRES_INSTALL_DIR)/build/%/config.status:
 	EXTRA_VERSION=$$(cd $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION && git rev-parse HEAD); \
 	(cd $(POSTGRES_INSTALL_DIR)/build/$$VERSION && \
 	env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION/configure \
-		CFLAGS='$(PG_CFLAGS)' \
+		CFLAGS='$(PG_CFLAGS)' LDFLAGS='$(PG_LDFLAGS)' \
 		$(PG_CONFIGURE_OPTS) --with-extra-version=" ($$EXTRA_VERSION)" \
 		--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$$VERSION > configure.log)

--- a/build-tools.Dockerfile
+++ b/build-tools.Dockerfile
@@ -3,6 +3,17 @@ ARG DEBIAN_VERSION=bookworm
 FROM debian:bookworm-slim AS pgcopydb_builder
 ARG DEBIAN_VERSION

+# Use strict mode for bash to catch errors early
+SHELL ["/bin/bash", "-euo", "pipefail", "-c"]
+
+# By default, /bin/sh used in debian images will treat '\n' as eol,
+# but as we use bash as SHELL, and built-in echo in bash requires '-e' flag for that.
+RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
+    echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc && \
+    echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc
+
+COPY build_tools/patches/pgcopydbv017.patch /pgcopydbv017.patch
+
 RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \
        set -e && \
        apt update && \
@@ -35,6 +46,7 @@ RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \
        mkdir /tmp/pgcopydb && \
        tar -xzf /tmp/pgcopydb.tar.gz -C /tmp/pgcopydb --strip-components=1 && \
        cd /tmp/pgcopydb && \
+        patch -p1 < /pgcopydbv017.patch && \
        make -s clean && \
        make -s -j12 install && \
        libpq_path=$(find /lib /usr/lib -name "libpq.so.5" | head -n 1) && \
@@ -51,7 +63,8 @@ ARG DEBIAN_VERSION

 # Add nonroot user
 RUN useradd -ms /bin/bash nonroot -b /home
-SHELL ["/bin/bash", "-c"]
+# Use strict mode for bash to catch errors early
+SHELL ["/bin/bash", "-euo", "pipefail", "-c"]

 RUN mkdir -p /pgcopydb/bin && \
    mkdir -p /pgcopydb/lib && \
@@ -61,6 +74,10 @@ RUN mkdir -p /pgcopydb/bin && \
 COPY --from=pgcopydb_builder /usr/lib/postgresql/16/bin/pgcopydb /pgcopydb/bin/pgcopydb
 COPY --from=pgcopydb_builder /pgcopydb/lib/libpq.so.5 /pgcopydb/lib/libpq.so.5

+RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
+    echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc && \
+    echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc
+
 # System deps
 #
 # 'gdb' is included so that we get backtraces of core dumps produced in
@@ -182,8 +199,14 @@ RUN set -e \
 # It includes several bug fixes on top on v2.0 release (https://github.com/linux-test-project/lcov/compare/v2.0...master)
 # And patches from us:
 # - Generates json file with code coverage summary (https://github.com/neondatabase/lcov/commit/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz)
-RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JSON::XS Memory::Process Time::HiRes JSON; do yes | perl -MCPAN -e "CPAN::Shell->notest('install', '$package')"; done \
-    && wget https://github.com/neondatabase/lcov/archive/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz -O lcov.tar.gz \
+RUN set +o pipefail && \
+	 for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JSON::XS Memory::Process Time::HiRes JSON; do \
+		yes | perl -MCPAN -e "CPAN::Shell->notest('install', '$package')";\
+	 done && \
+	set -o pipefail
+# Split into separate step to debug flaky failures here
+RUN wget https://github.com/neondatabase/lcov/archive/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz -O lcov.tar.gz \
+    && ls -laht lcov.tar.gz && sha256sum lcov.tar.gz \
    && echo "61a22a62e20908b8b9e27d890bd0ea31f567a7b9668065589266371dcbca0992  lcov.tar.gz" | sha256sum --check \
    && mkdir -p lcov && tar -xzf lcov.tar.gz -C lcov --strip-components=1 \
    && cd lcov \
@@ -218,6 +241,8 @@ RUN wget -O /tmp/libicu-${ICU_VERSION}.tgz https://github.com/unicode-org/icu/re
 USER nonroot:nonroot
 WORKDIR /home/nonroot

+RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /home/nonroot/.curlrc
+
 # Python
 ENV PYTHON_VERSION=3.11.10 \
    PYENV_ROOT=/home/nonroot/.pyenv \
@@ -243,7 +268,7 @@ WORKDIR /home/nonroot

 # Rust
 # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.84.0
+ENV RUSTC_VERSION=1.84.1
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
 ARG RUSTFILT_VERSION=0.2.1
@@ -251,6 +276,7 @@ ARG CARGO_HAKARI_VERSION=0.9.33
 ARG CARGO_DENY_VERSION=0.16.2
 ARG CARGO_HACK_VERSION=0.6.33
 ARG CARGO_NEXTEST_VERSION=0.9.85
+ARG CARGO_DIESEL_CLI_VERSION=2.2.6
 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
 	chmod +x rustup-init && \
 	./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \
@@ -264,6 +290,8 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
    cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \
    cargo install cargo-hack          --version ${CARGO_HACK_VERSION} && \
    cargo install cargo-nextest       --version ${CARGO_NEXTEST_VERSION} && \
+    cargo install diesel_cli          --version ${CARGO_DIESEL_CLI_VERSION} \
+                                      --features postgres-bundled --no-default-features && \
    rm -rf /home/nonroot/.cargo/registry && \
    rm -rf /home/nonroot/.cargo/git

--- a/build_tools/patches/pgcopydbv017.patch
+++ b/build_tools/patches/pgcopydbv017.patch
@@ -0,0 +1,57 @@
+diff --git a/src/bin/pgcopydb/copydb.c b/src/bin/pgcopydb/copydb.c
+index d730b03..69a9be9 100644
+--- a/src/bin/pgcopydb/copydb.c
+++ b/src/bin/pgcopydb/copydb.c
+@@ -44,6 +44,7 @@ GUC dstSettings[] = {
+ 	{ "synchronous_commit", "'off'" },
+ 	{ "statement_timeout", "0" },
+ 	{ "lock_timeout", "0" },
+	{ "idle_in_transaction_session_timeout", "0" },
+ 	{ NULL, NULL },
+ };
+ 
+diff --git a/src/bin/pgcopydb/pgsql.c b/src/bin/pgcopydb/pgsql.c
+index 94f2f46..e051ba8 100644
+--- a/src/bin/pgcopydb/pgsql.c
+++ b/src/bin/pgcopydb/pgsql.c
+@@ -2319,6 +2319,11 @@ pgsql_execute_log_error(PGSQL *pgsql,
+ 
+ 	LinesBuffer lbuf = { 0 };
+ 
+	if (message != NULL){
+		// make sure message is writable by splitLines
+		message = strdup(message);
+	}
+
+ 	if (!splitLines(&lbuf, message))
+ 	{
+ 		/* errors have already been logged */
+@@ -2332,6 +2337,7 @@ pgsql_execute_log_error(PGSQL *pgsql,
+ 				  PQbackendPID(pgsql->connection),
+ 				  lbuf.lines[lineNumber]);
+ 	}
+        free(message); // free copy of message we created above
+ 
+ 	if (pgsql->logSQL)
+ 	{
+@@ -3174,11 +3180,18 @@ pgcopy_log_error(PGSQL *pgsql, PGresult *res, const char *context)
+ 		/* errors have already been logged */
+ 		return;
+ 	}
+-
+ 	if (res != NULL)
+ 	{
+ 		char *sqlstate = PQresultErrorField(res, PG_DIAG_SQLSTATE);
+-		strlcpy(pgsql->sqlstate, sqlstate, sizeof(pgsql->sqlstate));
+		if (sqlstate == NULL)
+		{
+			// PQresultErrorField returned NULL!
+			pgsql->sqlstate[0] = '\0';  // Set to an empty string to avoid segfault
+		}
+		else
+		{
+			strlcpy(pgsql->sqlstate, sqlstate, sizeof(pgsql->sqlstate));
+		}
+ 	}
+ 
+ 	char *endpoint =
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
--- a/compute/patches/contrib_pg16.patch
+++ b/compute/patches/contrib_pg16.patch
@@ -0,0 +1,242 @@
+diff --git a/contrib/amcheck/expected/check_heap.out b/contrib/amcheck/expected/check_heap.out
+index 979e5e8..2375b45 100644
+--- a/contrib/amcheck/expected/check_heap.out
+++ b/contrib/amcheck/expected/check_heap.out
+@@ -80,12 +80,9 @@ INSERT INTO heaptest (a, b)
+ -- same transaction.  The heaptest table is smaller than the default
+ -- wal_skip_threshold, so a wal_level=minimal commit reads the table into
+ -- shared_buffers.  A transaction delays that and excludes any autovacuum.
+-SET allow_in_place_tablespaces = true;
+-CREATE TABLESPACE regress_test_stats_tblspc LOCATION '';
+ SELECT sum(reads) AS stats_bulkreads_before
+   FROM pg_stat_io WHERE context = 'bulkread' \gset
+ BEGIN;
+-ALTER TABLE heaptest SET TABLESPACE regress_test_stats_tblspc;
+ -- Check that valid options are not rejected nor corruption reported
+ -- for a non-empty table
+ SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'none');
+@@ -118,14 +115,6 @@ SELECT pg_stat_force_next_flush();
+  
+ (1 row)
+ 
+-SELECT sum(reads) AS stats_bulkreads_after
+-  FROM pg_stat_io WHERE context = 'bulkread' \gset
+-SELECT :stats_bulkreads_after > :stats_bulkreads_before;
+- ?column? 
+-----------
+- t
+-(1 row)
+-
+ CREATE ROLE regress_heaptest_role;
+ -- verify permissions are checked (error due to function not callable)
+ SET ROLE regress_heaptest_role;
+@@ -233,7 +222,6 @@ ERROR:  cannot check relation "test_foreign_table"
+ DETAIL:  This operation is not supported for foreign tables.
+ -- cleanup
+ DROP TABLE heaptest;
+-DROP TABLESPACE regress_test_stats_tblspc;
+ DROP TABLE test_partition;
+ DROP TABLE test_partitioned;
+ DROP OWNED BY regress_heaptest_role; -- permissions
+diff --git a/contrib/amcheck/sql/check_heap.sql b/contrib/amcheck/sql/check_heap.sql
+index 1745bae..3b429c3 100644
+--- a/contrib/amcheck/sql/check_heap.sql
+++ b/contrib/amcheck/sql/check_heap.sql
+@@ -40,12 +40,9 @@ INSERT INTO heaptest (a, b)
+ -- same transaction.  The heaptest table is smaller than the default
+ -- wal_skip_threshold, so a wal_level=minimal commit reads the table into
+ -- shared_buffers.  A transaction delays that and excludes any autovacuum.
+-SET allow_in_place_tablespaces = true;
+-CREATE TABLESPACE regress_test_stats_tblspc LOCATION '';
+ SELECT sum(reads) AS stats_bulkreads_before
+   FROM pg_stat_io WHERE context = 'bulkread' \gset
+ BEGIN;
+-ALTER TABLE heaptest SET TABLESPACE regress_test_stats_tblspc;
+ -- Check that valid options are not rejected nor corruption reported
+ -- for a non-empty table
+ SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'none');
+@@ -58,9 +55,6 @@ COMMIT;
+ --   ALTER TABLE ... SET TABLESPACE ...
+ -- causing an additional bulkread, which should be reflected in pg_stat_io.
+ SELECT pg_stat_force_next_flush();
+-SELECT sum(reads) AS stats_bulkreads_after
+-  FROM pg_stat_io WHERE context = 'bulkread' \gset
+-SELECT :stats_bulkreads_after > :stats_bulkreads_before;
+ 
+ CREATE ROLE regress_heaptest_role;
+ 
+@@ -140,7 +134,6 @@ SELECT * FROM verify_heapam('test_foreign_table',
+ 
+ -- cleanup
+ DROP TABLE heaptest;
+-DROP TABLESPACE regress_test_stats_tblspc;
+ DROP TABLE test_partition;
+ DROP TABLE test_partitioned;
+ DROP OWNED BY regress_heaptest_role; -- permissions
+diff --git a/contrib/citext/expected/create_index_acl.out b/contrib/citext/expected/create_index_acl.out
+index 33be13a..70a406c 100644
+--- a/contrib/citext/expected/create_index_acl.out
+++ b/contrib/citext/expected/create_index_acl.out
+@@ -5,9 +5,6 @@
+ -- owner having as few applicable privileges as possible.  (The privileges.sql
+ -- regress_sro_user tests look for the opposite defect; they confirm that
+ -- DefineIndex() uses the table owner userid where necessary.)
+-SET allow_in_place_tablespaces = true;
+-CREATE TABLESPACE regress_create_idx_tblspace LOCATION '';
+-RESET allow_in_place_tablespaces;
+ BEGIN;
+ CREATE ROLE regress_minimal;
+ CREATE SCHEMA s;
+@@ -49,11 +46,9 @@ ALTER TABLE s.x OWNER TO regress_minimal;
+ -- Empty-table DefineIndex()
+ CREATE UNIQUE INDEX u0rows ON s.x USING btree
+   ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops)
+-  TABLESPACE regress_create_idx_tblspace
+   WHERE s.index_row_if(y);
+ ALTER TABLE s.x ADD CONSTRAINT e0rows EXCLUDE USING btree
+   ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=)
+-  USING INDEX TABLESPACE regress_create_idx_tblspace
+   WHERE (s.index_row_if(y));
+ -- Make the table nonempty.
+ INSERT INTO s.x VALUES ('foo'), ('bar');
+@@ -66,11 +61,9 @@ RESET search_path;
+ GRANT EXECUTE ON FUNCTION s.index_this_expr TO regress_minimal;
+ CREATE UNIQUE INDEX u2rows ON s.x USING btree
+   ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops)
+-  TABLESPACE regress_create_idx_tblspace
+   WHERE s.index_row_if(y);
+ ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree
+   ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=)
+-  USING INDEX TABLESPACE regress_create_idx_tblspace
+   WHERE (s.index_row_if(y));
+ -- Shall not find s.coll via search_path, despite the s.const->public.setter
+ -- call having set search_path=s during expression planning.  Suppress the
+@@ -78,9 +71,7 @@ ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree
+ \set VERBOSITY sqlstate
+ ALTER TABLE s.x ADD CONSTRAINT underqualified EXCLUDE USING btree
+   ((s.index_this_expr(y, s.const())) COLLATE coll WITH s.=)
+-  USING INDEX TABLESPACE regress_create_idx_tblspace
+   WHERE (s.index_row_if(y));
+ ERROR:  42704
+ \set VERBOSITY default
+ ROLLBACK;
+-DROP TABLESPACE regress_create_idx_tblspace;
+diff --git a/contrib/citext/sql/create_index_acl.sql b/contrib/citext/sql/create_index_acl.sql
+index 10b5225..ae442e1 100644
+--- a/contrib/citext/sql/create_index_acl.sql
+++ b/contrib/citext/sql/create_index_acl.sql
+@@ -6,10 +6,6 @@
+ -- regress_sro_user tests look for the opposite defect; they confirm that
+ -- DefineIndex() uses the table owner userid where necessary.)
+ 
+-SET allow_in_place_tablespaces = true;
+-CREATE TABLESPACE regress_create_idx_tblspace LOCATION '';
+-RESET allow_in_place_tablespaces;
+-
+ BEGIN;
+ CREATE ROLE regress_minimal;
+ CREATE SCHEMA s;
+@@ -51,11 +47,9 @@ ALTER TABLE s.x OWNER TO regress_minimal;
+ -- Empty-table DefineIndex()
+ CREATE UNIQUE INDEX u0rows ON s.x USING btree
+   ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops)
+-  TABLESPACE regress_create_idx_tblspace
+   WHERE s.index_row_if(y);
+ ALTER TABLE s.x ADD CONSTRAINT e0rows EXCLUDE USING btree
+   ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=)
+-  USING INDEX TABLESPACE regress_create_idx_tblspace
+   WHERE (s.index_row_if(y));
+ -- Make the table nonempty.
+ INSERT INTO s.x VALUES ('foo'), ('bar');
+@@ -68,11 +62,9 @@ RESET search_path;
+ GRANT EXECUTE ON FUNCTION s.index_this_expr TO regress_minimal;
+ CREATE UNIQUE INDEX u2rows ON s.x USING btree
+   ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops)
+-  TABLESPACE regress_create_idx_tblspace
+   WHERE s.index_row_if(y);
+ ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree
+   ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=)
+-  USING INDEX TABLESPACE regress_create_idx_tblspace
+   WHERE (s.index_row_if(y));
+ -- Shall not find s.coll via search_path, despite the s.const->public.setter
+ -- call having set search_path=s during expression planning.  Suppress the
+@@ -80,9 +72,7 @@ ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree
+ \set VERBOSITY sqlstate
+ ALTER TABLE s.x ADD CONSTRAINT underqualified EXCLUDE USING btree
+   ((s.index_this_expr(y, s.const())) COLLATE coll WITH s.=)
+-  USING INDEX TABLESPACE regress_create_idx_tblspace
+   WHERE (s.index_row_if(y));
+ \set VERBOSITY default
+ ROLLBACK;
+ 
+-DROP TABLESPACE regress_create_idx_tblspace;
+diff --git a/contrib/file_fdw/expected/file_fdw.out b/contrib/file_fdw/expected/file_fdw.out
+index 72304e0..ebe131b 100644
+--- a/contrib/file_fdw/expected/file_fdw.out
+++ b/contrib/file_fdw/expected/file_fdw.out
+@@ -4,6 +4,7 @@
+ -- directory paths are passed to us in environment variables
+ \getenv abs_srcdir PG_ABS_SRCDIR
+ -- Clean up in case a prior regression run failed
+SET compute_query_id TO 'off';
+ SET client_min_messages TO 'warning';
+ DROP ROLE IF EXISTS regress_file_fdw_superuser, regress_file_fdw_user, regress_no_priv_user;
+ RESET client_min_messages;
+diff --git a/contrib/file_fdw/sql/file_fdw.sql b/contrib/file_fdw/sql/file_fdw.sql
+index f0548e1..848a08c 100644
+--- a/contrib/file_fdw/sql/file_fdw.sql
+++ b/contrib/file_fdw/sql/file_fdw.sql
+@@ -6,6 +6,7 @@
+ \getenv abs_srcdir PG_ABS_SRCDIR
+ 
+ -- Clean up in case a prior regression run failed
+SET compute_query_id TO 'off';
+ SET client_min_messages TO 'warning';
+ DROP ROLE IF EXISTS regress_file_fdw_superuser, regress_file_fdw_user, regress_no_priv_user;
+ RESET client_min_messages;
+diff --git a/contrib/pageinspect/expected/gist.out b/contrib/pageinspect/expected/gist.out
+index d1adbab..38b52ac 100644
+--- a/contrib/pageinspect/expected/gist.out
+++ b/contrib/pageinspect/expected/gist.out
+@@ -10,25 +10,6 @@ BEGIN;
+ CREATE TABLE test_gist AS SELECT point(i,i) p, i::text t FROM
+     generate_series(1,1000) i;
+ CREATE INDEX test_gist_idx ON test_gist USING gist (p);
+--- Page 0 is the root, the rest are leaf pages
+-SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 0));
+- lsn | nsn | rightlink  | flags 
+------+-----+------------+-------
+- 0/1 | 0/0 | 4294967295 | {}
+-(1 row)
+-
+-SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 1));
+- lsn | nsn | rightlink  | flags  
+------+-----+------------+--------
+- 0/1 | 0/0 | 4294967295 | {leaf}
+-(1 row)
+-
+-SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 2));
+- lsn | nsn | rightlink | flags  
+------+-----+-----------+--------
+- 0/1 | 0/0 |         1 | {leaf}
+-(1 row)
+-
+ COMMIT;
+ SELECT * FROM gist_page_items(get_raw_page('test_gist_idx', 0), 'test_gist_idx');
+  itemoffset |   ctid    | itemlen | dead |             keys              
+diff --git a/contrib/pageinspect/sql/gist.sql b/contrib/pageinspect/sql/gist.sql
+index d263542..607992f 100644
+--- a/contrib/pageinspect/sql/gist.sql
+++ b/contrib/pageinspect/sql/gist.sql
+@@ -12,11 +12,6 @@ CREATE TABLE test_gist AS SELECT point(i,i) p, i::text t FROM
+     generate_series(1,1000) i;
+ CREATE INDEX test_gist_idx ON test_gist USING gist (p);
+ 
+--- Page 0 is the root, the rest are leaf pages
+-SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 0));
+-SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 1));
+-SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 2));
+-
+ COMMIT;
+ 
+ SELECT * FROM gist_page_items(get_raw_page('test_gist_idx', 0), 'test_gist_idx');
--- a/compute/patches/contrib_pg17.patch
+++ b/compute/patches/contrib_pg17.patch
@@ -0,0 +1,196 @@
+diff --git a/contrib/amcheck/expected/check_heap.out b/contrib/amcheck/expected/check_heap.out
+index 979e5e8..2375b45 100644
+--- a/contrib/amcheck/expected/check_heap.out
+++ b/contrib/amcheck/expected/check_heap.out
+@@ -80,12 +80,9 @@ INSERT INTO heaptest (a, b)
+ -- same transaction.  The heaptest table is smaller than the default
+ -- wal_skip_threshold, so a wal_level=minimal commit reads the table into
+ -- shared_buffers.  A transaction delays that and excludes any autovacuum.
+-SET allow_in_place_tablespaces = true;
+-CREATE TABLESPACE regress_test_stats_tblspc LOCATION '';
+ SELECT sum(reads) AS stats_bulkreads_before
+   FROM pg_stat_io WHERE context = 'bulkread' \gset
+ BEGIN;
+-ALTER TABLE heaptest SET TABLESPACE regress_test_stats_tblspc;
+ -- Check that valid options are not rejected nor corruption reported
+ -- for a non-empty table
+ SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'none');
+@@ -118,14 +115,6 @@ SELECT pg_stat_force_next_flush();
+  
+ (1 row)
+ 
+-SELECT sum(reads) AS stats_bulkreads_after
+-  FROM pg_stat_io WHERE context = 'bulkread' \gset
+-SELECT :stats_bulkreads_after > :stats_bulkreads_before;
+- ?column? 
+-----------
+- t
+-(1 row)
+-
+ CREATE ROLE regress_heaptest_role;
+ -- verify permissions are checked (error due to function not callable)
+ SET ROLE regress_heaptest_role;
+@@ -233,7 +222,6 @@ ERROR:  cannot check relation "test_foreign_table"
+ DETAIL:  This operation is not supported for foreign tables.
+ -- cleanup
+ DROP TABLE heaptest;
+-DROP TABLESPACE regress_test_stats_tblspc;
+ DROP TABLE test_partition;
+ DROP TABLE test_partitioned;
+ DROP OWNED BY regress_heaptest_role; -- permissions
+diff --git a/contrib/amcheck/sql/check_heap.sql b/contrib/amcheck/sql/check_heap.sql
+index 1745bae..3b429c3 100644
+--- a/contrib/amcheck/sql/check_heap.sql
+++ b/contrib/amcheck/sql/check_heap.sql
+@@ -40,12 +40,9 @@ INSERT INTO heaptest (a, b)
+ -- same transaction.  The heaptest table is smaller than the default
+ -- wal_skip_threshold, so a wal_level=minimal commit reads the table into
+ -- shared_buffers.  A transaction delays that and excludes any autovacuum.
+-SET allow_in_place_tablespaces = true;
+-CREATE TABLESPACE regress_test_stats_tblspc LOCATION '';
+ SELECT sum(reads) AS stats_bulkreads_before
+   FROM pg_stat_io WHERE context = 'bulkread' \gset
+ BEGIN;
+-ALTER TABLE heaptest SET TABLESPACE regress_test_stats_tblspc;
+ -- Check that valid options are not rejected nor corruption reported
+ -- for a non-empty table
+ SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'none');
+@@ -58,9 +55,6 @@ COMMIT;
+ --   ALTER TABLE ... SET TABLESPACE ...
+ -- causing an additional bulkread, which should be reflected in pg_stat_io.
+ SELECT pg_stat_force_next_flush();
+-SELECT sum(reads) AS stats_bulkreads_after
+-  FROM pg_stat_io WHERE context = 'bulkread' \gset
+-SELECT :stats_bulkreads_after > :stats_bulkreads_before;
+ 
+ CREATE ROLE regress_heaptest_role;
+ 
+@@ -140,7 +134,6 @@ SELECT * FROM verify_heapam('test_foreign_table',
+ 
+ -- cleanup
+ DROP TABLE heaptest;
+-DROP TABLESPACE regress_test_stats_tblspc;
+ DROP TABLE test_partition;
+ DROP TABLE test_partitioned;
+ DROP OWNED BY regress_heaptest_role; -- permissions
+diff --git a/contrib/citext/expected/create_index_acl.out b/contrib/citext/expected/create_index_acl.out
+index 33be13a..70a406c 100644
+--- a/contrib/citext/expected/create_index_acl.out
+++ b/contrib/citext/expected/create_index_acl.out
+@@ -5,9 +5,6 @@
+ -- owner having as few applicable privileges as possible.  (The privileges.sql
+ -- regress_sro_user tests look for the opposite defect; they confirm that
+ -- DefineIndex() uses the table owner userid where necessary.)
+-SET allow_in_place_tablespaces = true;
+-CREATE TABLESPACE regress_create_idx_tblspace LOCATION '';
+-RESET allow_in_place_tablespaces;
+ BEGIN;
+ CREATE ROLE regress_minimal;
+ CREATE SCHEMA s;
+@@ -49,11 +46,9 @@ ALTER TABLE s.x OWNER TO regress_minimal;
+ -- Empty-table DefineIndex()
+ CREATE UNIQUE INDEX u0rows ON s.x USING btree
+   ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops)
+-  TABLESPACE regress_create_idx_tblspace
+   WHERE s.index_row_if(y);
+ ALTER TABLE s.x ADD CONSTRAINT e0rows EXCLUDE USING btree
+   ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=)
+-  USING INDEX TABLESPACE regress_create_idx_tblspace
+   WHERE (s.index_row_if(y));
+ -- Make the table nonempty.
+ INSERT INTO s.x VALUES ('foo'), ('bar');
+@@ -66,11 +61,9 @@ RESET search_path;
+ GRANT EXECUTE ON FUNCTION s.index_this_expr TO regress_minimal;
+ CREATE UNIQUE INDEX u2rows ON s.x USING btree
+   ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops)
+-  TABLESPACE regress_create_idx_tblspace
+   WHERE s.index_row_if(y);
+ ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree
+   ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=)
+-  USING INDEX TABLESPACE regress_create_idx_tblspace
+   WHERE (s.index_row_if(y));
+ -- Shall not find s.coll via search_path, despite the s.const->public.setter
+ -- call having set search_path=s during expression planning.  Suppress the
+@@ -78,9 +71,7 @@ ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree
+ \set VERBOSITY sqlstate
+ ALTER TABLE s.x ADD CONSTRAINT underqualified EXCLUDE USING btree
+   ((s.index_this_expr(y, s.const())) COLLATE coll WITH s.=)
+-  USING INDEX TABLESPACE regress_create_idx_tblspace
+   WHERE (s.index_row_if(y));
+ ERROR:  42704
+ \set VERBOSITY default
+ ROLLBACK;
+-DROP TABLESPACE regress_create_idx_tblspace;
+diff --git a/contrib/citext/sql/create_index_acl.sql b/contrib/citext/sql/create_index_acl.sql
+index 10b5225..ae442e1 100644
+--- a/contrib/citext/sql/create_index_acl.sql
+++ b/contrib/citext/sql/create_index_acl.sql
+@@ -6,10 +6,6 @@
+ -- regress_sro_user tests look for the opposite defect; they confirm that
+ -- DefineIndex() uses the table owner userid where necessary.)
+ 
+-SET allow_in_place_tablespaces = true;
+-CREATE TABLESPACE regress_create_idx_tblspace LOCATION '';
+-RESET allow_in_place_tablespaces;
+-
+ BEGIN;
+ CREATE ROLE regress_minimal;
+ CREATE SCHEMA s;
+@@ -51,11 +47,9 @@ ALTER TABLE s.x OWNER TO regress_minimal;
+ -- Empty-table DefineIndex()
+ CREATE UNIQUE INDEX u0rows ON s.x USING btree
+   ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops)
+-  TABLESPACE regress_create_idx_tblspace
+   WHERE s.index_row_if(y);
+ ALTER TABLE s.x ADD CONSTRAINT e0rows EXCLUDE USING btree
+   ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=)
+-  USING INDEX TABLESPACE regress_create_idx_tblspace
+   WHERE (s.index_row_if(y));
+ -- Make the table nonempty.
+ INSERT INTO s.x VALUES ('foo'), ('bar');
+@@ -68,11 +62,9 @@ RESET search_path;
+ GRANT EXECUTE ON FUNCTION s.index_this_expr TO regress_minimal;
+ CREATE UNIQUE INDEX u2rows ON s.x USING btree
+   ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops)
+-  TABLESPACE regress_create_idx_tblspace
+   WHERE s.index_row_if(y);
+ ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree
+   ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=)
+-  USING INDEX TABLESPACE regress_create_idx_tblspace
+   WHERE (s.index_row_if(y));
+ -- Shall not find s.coll via search_path, despite the s.const->public.setter
+ -- call having set search_path=s during expression planning.  Suppress the
+@@ -80,9 +72,7 @@ ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree
+ \set VERBOSITY sqlstate
+ ALTER TABLE s.x ADD CONSTRAINT underqualified EXCLUDE USING btree
+   ((s.index_this_expr(y, s.const())) COLLATE coll WITH s.=)
+-  USING INDEX TABLESPACE regress_create_idx_tblspace
+   WHERE (s.index_row_if(y));
+ \set VERBOSITY default
+ ROLLBACK;
+ 
+-DROP TABLESPACE regress_create_idx_tblspace;
+diff --git a/contrib/file_fdw/expected/file_fdw.out b/contrib/file_fdw/expected/file_fdw.out
+index 86c148a..81bdb2c 100644
+--- a/contrib/file_fdw/expected/file_fdw.out
+++ b/contrib/file_fdw/expected/file_fdw.out
+@@ -4,6 +4,7 @@
+ -- directory paths are passed to us in environment variables
+ \getenv abs_srcdir PG_ABS_SRCDIR
+ -- Clean up in case a prior regression run failed
+SET compute_query_id TO 'off';
+ SET client_min_messages TO 'warning';
+ DROP ROLE IF EXISTS regress_file_fdw_superuser, regress_file_fdw_user, regress_no_priv_user;
+ RESET client_min_messages;
+diff --git a/contrib/file_fdw/sql/file_fdw.sql b/contrib/file_fdw/sql/file_fdw.sql
+index f0548e1..848a08c 100644
+--- a/contrib/file_fdw/sql/file_fdw.sql
+++ b/contrib/file_fdw/sql/file_fdw.sql
+@@ -6,6 +6,7 @@
+ \getenv abs_srcdir PG_ABS_SRCDIR
+ 
+ -- Clean up in case a prior regression run failed
+SET compute_query_id TO 'off';
+ SET client_min_messages TO 'warning';
+ DROP ROLE IF EXISTS regress_file_fdw_superuser, regress_file_fdw_user, regress_no_priv_user;
+ RESET client_min_messages;
--- a/compute/patches/pg_graphql.patch
+++ b/compute/patches/pg_graphql.patch
@@ -0,0 +1,19 @@
+commit ec6a491d126882966a696f9ad5d3698935361d55
+Author: Alexey Masterov <alexeymasterov@neon.tech>
+Date:   Tue Dec 17 10:25:00 2024 +0100
+
+    Changes required to run tests on Neon
+
+diff --git a/test/expected/permissions_functions.out b/test/expected/permissions_functions.out
+index 1e9fbc2..94cbe25 100644
+--- a/test/expected/permissions_functions.out
+++ b/test/expected/permissions_functions.out
+@@ -64,7 +64,7 @@ begin;
+     select current_user;
+  current_user 
+ --------------
+- postgres
+ cloud_admin
+ (1 row)
+ 
+     -- revoke default access from the public role for new functions
--- a/compute/vm-image-spec-bookworm.yaml
+++ b/compute/vm-image-spec-bookworm.yaml
@@ -47,7 +47,9 @@ files:
      # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap
      # and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD),
      # regardless of hostname (ALL)
-      postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota
+      #
+      # Also allow it to shut down the VM. The fast_import job does that when it's finished.
+      postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff
  - filename: cgconfig.conf
    content: |
      # Configuration for cgroups in VM compute nodes
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -34,6 +34,7 @@
 //!             -r http://pg-ext-s3-gateway \
 //! ```
 use std::collections::HashMap;
+use std::ffi::OsString;
 use std::fs::File;
 use std::path::Path;
 use std::process::exit;
@@ -44,7 +45,7 @@ use std::{thread, time::Duration};

 use anyhow::{Context, Result};
 use chrono::Utc;
-use clap::Arg;
+use clap::Parser;
 use compute_tools::disk_quota::set_disk_quota;
 use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static;
 use signal_hook::consts::{SIGQUIT, SIGTERM};
@@ -73,10 +74,75 @@ use utils::failpoint_support;
 // in-case of not-set environment var
 const BUILD_TAG_DEFAULT: &str = "latest";

-fn main() -> Result<()> {
-    let scenario = failpoint_support::init();
+// Compatibility hack: if the control plane specified any remote-ext-config
+// use the default value for extension storage proxy gateway.
+// Remove this once the control plane is updated to pass the gateway URL
+fn parse_remote_ext_config(arg: &str) -> Result<String> {
+    if arg.starts_with("http") {
+        Ok(arg.trim_end_matches('/').to_string())
+    } else {
+        Ok("http://pg-ext-s3-gateway".to_string())
+    }
+}

-    let (build_tag, clap_args) = init()?;
+#[derive(Parser)]
+#[command(rename_all = "kebab-case")]
+struct Cli {
+    #[arg(short = 'b', long, default_value = "postgres", env = "POSTGRES_PATH")]
+    pub pgbin: String,
+
+    #[arg(short = 'r', long, value_parser = parse_remote_ext_config)]
+    pub remote_ext_config: Option<String>,
+
+    #[arg(long, default_value_t = 3080)]
+    pub http_port: u16,
+
+    #[arg(short = 'D', long, value_name = "DATADIR")]
+    pub pgdata: String,
+
+    #[arg(short = 'C', long, value_name = "DATABASE_URL")]
+    pub connstr: String,
+
+    #[cfg(target_os = "linux")]
+    #[arg(long, default_value = "neon-postgres")]
+    pub cgroup: String,
+
+    #[cfg(target_os = "linux")]
+    #[arg(
+        long,
+        default_value = "host=localhost port=5432 dbname=postgres user=cloud_admin sslmode=disable application_name=vm-monitor"
+    )]
+    pub filecache_connstr: String,
+
+    #[cfg(target_os = "linux")]
+    #[arg(long, default_value = "0.0.0.0:10301")]
+    pub vm_monitor_addr: String,
+
+    #[arg(long, action = clap::ArgAction::SetTrue)]
+    pub resize_swap_on_bind: bool,
+
+    #[arg(long)]
+    pub set_disk_quota_for_fs: Option<String>,
+
+    #[arg(short = 's', long = "spec", group = "spec")]
+    pub spec_json: Option<String>,
+
+    #[arg(short = 'S', long, group = "spec-path")]
+    pub spec_path: Option<OsString>,
+
+    #[arg(short = 'i', long, group = "compute-id", conflicts_with_all = ["spec", "spec-path"])]
+    pub compute_id: Option<String>,
+
+    #[arg(short = 'p', long, conflicts_with_all = ["spec", "spec-path"], requires = "compute-id", value_name = "CONTROL_PLANE_API_BASE_URL")]
+    pub control_plane_uri: Option<String>,
+}
+
+fn main() -> Result<()> {
+    let cli = Cli::parse();
+
+    let build_tag = init()?;
+
+    let scenario = failpoint_support::init();

    // enable core dumping for all child processes
    setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?;
@@ -85,13 +151,11 @@ fn main() -> Result<()> {
        // Enter startup tracing context
        let _startup_context_guard = startup_context_from_env();

-        let cli_args = process_cli(&clap_args)?;
+        let cli_spec = try_spec_from_cli(&cli)?;

-        let cli_spec = try_spec_from_cli(&clap_args, &cli_args)?;
+        let compute = wait_spec(build_tag, &cli, cli_spec)?;

-        let wait_spec_result = wait_spec(build_tag, cli_args, cli_spec)?;
-
-        start_postgres(&clap_args, wait_spec_result)?
+        start_postgres(&cli, compute)?

        // Startup is finished, exit the startup tracing span
    };
@@ -108,7 +172,7 @@ fn main() -> Result<()> {
    deinit_and_exit(wait_pg_result);
 }

-fn init() -> Result<(String, clap::ArgMatches)> {
+fn init() -> Result<String> {
    init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;

    let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
@@ -123,66 +187,7 @@ fn init() -> Result<(String, clap::ArgMatches)> {
        .to_string();
    info!("build_tag: {build_tag}");

-    Ok((build_tag, cli().get_matches()))
-}
-
-fn process_cli(matches: &clap::ArgMatches) -> Result<ProcessCliResult> {
-    let pgbin_default = "postgres";
-    let pgbin = matches
-        .get_one::<String>("pgbin")
-        .map(|s| s.as_str())
-        .unwrap_or(pgbin_default);
-
-    let ext_remote_storage = matches
-        .get_one::<String>("remote-ext-config")
-        // Compatibility hack: if the control plane specified any remote-ext-config
-        // use the default value for extension storage proxy gateway.
-        // Remove this once the control plane is updated to pass the gateway URL
-        .map(|conf| {
-            if conf.starts_with("http") {
-                conf.trim_end_matches('/')
-            } else {
-                "http://pg-ext-s3-gateway"
-            }
-        });
-
-    let http_port = *matches
-        .get_one::<u16>("http-port")
-        .expect("http-port is required");
-    let pgdata = matches
-        .get_one::<String>("pgdata")
-        .expect("PGDATA path is required");
-    let connstr = matches
-        .get_one::<String>("connstr")
-        .expect("Postgres connection string is required");
-    let spec_json = matches.get_one::<String>("spec");
-    let spec_path = matches.get_one::<String>("spec-path");
-    let resize_swap_on_bind = matches.get_flag("resize-swap-on-bind");
-    let set_disk_quota_for_fs = matches.get_one::<String>("set-disk-quota-for-fs");
-
-    Ok(ProcessCliResult {
-        connstr,
-        pgdata,
-        pgbin,
-        ext_remote_storage,
-        http_port,
-        spec_json,
-        spec_path,
-        resize_swap_on_bind,
-        set_disk_quota_for_fs,
-    })
-}
-
-struct ProcessCliResult<'clap> {
-    connstr: &'clap str,
-    pgdata: &'clap str,
-    pgbin: &'clap str,
-    ext_remote_storage: Option<&'clap str>,
-    http_port: u16,
-    spec_json: Option<&'clap String>,
-    spec_path: Option<&'clap String>,
-    resize_swap_on_bind: bool,
-    set_disk_quota_for_fs: Option<&'clap String>,
+    Ok(build_tag)
 }

 fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> {
@@ -235,19 +240,9 @@ fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> {
    }
 }

-fn try_spec_from_cli(
-    matches: &clap::ArgMatches,
-    ProcessCliResult {
-        spec_json,
-        spec_path,
-        ..
-    }: &ProcessCliResult,
-) -> Result<CliSpecParams> {
-    let compute_id = matches.get_one::<String>("compute-id");
-    let control_plane_uri = matches.get_one::<String>("control-plane-uri");
-
+fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
    // First, try to get cluster spec from the cli argument
-    if let Some(spec_json) = spec_json {
+    if let Some(ref spec_json) = cli.spec_json {
        info!("got spec from cli argument {}", spec_json);
        return Ok(CliSpecParams {
            spec: Some(serde_json::from_str(spec_json)?),
@@ -256,7 +251,7 @@ fn try_spec_from_cli(
    }

    // Second, try to read it from the file if path is provided
-    if let Some(spec_path) = spec_path {
+    if let Some(ref spec_path) = cli.spec_path {
        let file = File::open(Path::new(spec_path))?;
        return Ok(CliSpecParams {
            spec: Some(serde_json::from_reader(file)?),
@@ -264,17 +259,20 @@ fn try_spec_from_cli(
        });
    }

-    let Some(compute_id) = compute_id else {
+    if cli.compute_id.is_none() {
        panic!(
            "compute spec should be provided by one of the following ways: \
                --spec OR --spec-path OR --control-plane-uri and --compute-id"
        );
    };
-    let Some(control_plane_uri) = control_plane_uri else {
+    if cli.control_plane_uri.is_none() {
        panic!("must specify both --control-plane-uri and --compute-id or none");
    };

-    match get_spec_from_control_plane(control_plane_uri, compute_id) {
+    match get_spec_from_control_plane(
+        cli.control_plane_uri.as_ref().unwrap(),
+        cli.compute_id.as_ref().unwrap(),
+    ) {
        Ok(spec) => Ok(CliSpecParams {
            spec,
            live_config_allowed: true,
@@ -298,21 +296,12 @@ struct CliSpecParams {

 fn wait_spec(
    build_tag: String,
-    ProcessCliResult {
-        connstr,
-        pgdata,
-        pgbin,
-        ext_remote_storage,
-        resize_swap_on_bind,
-        set_disk_quota_for_fs,
-        http_port,
-        ..
-    }: ProcessCliResult,
+    cli: &Cli,
    CliSpecParams {
        spec,
        live_config_allowed,
    }: CliSpecParams,
-) -> Result<WaitSpecResult> {
+) -> Result<Arc<ComputeNode>> {
    let mut new_state = ComputeState::new();
    let spec_set;

@@ -324,7 +313,7 @@ fn wait_spec(
    } else {
        spec_set = false;
    }
-    let connstr = Url::parse(connstr).context("cannot parse connstr as a URL")?;
+    let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?;
    let conn_conf = postgres::config::Config::from_str(connstr.as_str())
        .context("cannot build postgres config from connstr")?;
    let tokio_conn_conf = tokio_postgres::config::Config::from_str(connstr.as_str())
@@ -333,14 +322,14 @@ fn wait_spec(
        connstr,
        conn_conf,
        tokio_conn_conf,
-        pgdata: pgdata.to_string(),
-        pgbin: pgbin.to_string(),
-        pgversion: get_pg_version_string(pgbin),
-        http_port,
+        pgdata: cli.pgdata.clone(),
+        pgbin: cli.pgbin.clone(),
+        pgversion: get_pg_version_string(&cli.pgbin),
+        http_port: cli.http_port,
        live_config_allowed,
        state: Mutex::new(new_state),
        state_changed: Condvar::new(),
-        ext_remote_storage: ext_remote_storage.map(|s| s.to_string()),
+        ext_remote_storage: cli.remote_ext_config.clone(),
        ext_download_progress: RwLock::new(HashMap::new()),
        build_tag,
    };
@@ -357,7 +346,7 @@ fn wait_spec(
    // Launch http service first, so that we can serve control-plane requests
    // while configuration is still in progress.
    let _http_handle =
-        launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread");
+        launch_http_server(cli.http_port, &compute).expect("cannot launch http endpoint thread");

    if !spec_set {
        // No spec provided, hang waiting for it.
@@ -389,27 +378,12 @@ fn wait_spec(

    launch_lsn_lease_bg_task_for_static(&compute);

-    Ok(WaitSpecResult {
-        compute,
-        resize_swap_on_bind,
-        set_disk_quota_for_fs: set_disk_quota_for_fs.cloned(),
-    })
-}
-
-struct WaitSpecResult {
-    compute: Arc<ComputeNode>,
-    resize_swap_on_bind: bool,
-    set_disk_quota_for_fs: Option<String>,
+    Ok(compute)
 }

 fn start_postgres(
-    // need to allow unused because `matches` is only used if target_os = "linux"
-    #[allow(unused_variables)] matches: &clap::ArgMatches,
-    WaitSpecResult {
-        compute,
-        resize_swap_on_bind,
-        set_disk_quota_for_fs,
-    }: WaitSpecResult,
+    cli: &Cli,
+    compute: Arc<ComputeNode>,
 ) -> Result<(Option<PostgresHandle>, StartPostgresResult)> {
    // We got all we need, update the state.
    let mut state = compute.state.lock().unwrap();
@@ -437,7 +411,7 @@ fn start_postgres(
    let mut delay_exit = false;

    // Resize swap to the desired size if the compute spec says so
-    if let (Some(size_bytes), true) = (swap_size_bytes, resize_swap_on_bind) {
+    if let (Some(size_bytes), true) = (swap_size_bytes, cli.resize_swap_on_bind) {
        // To avoid 'swapoff' hitting postgres startup, we need to run resize-swap to completion
        // *before* starting postgres.
        //
@@ -464,9 +438,9 @@ fn start_postgres(

    // Set disk quota if the compute spec says so
    if let (Some(disk_quota_bytes), Some(disk_quota_fs_mountpoint)) =
-        (disk_quota_bytes, set_disk_quota_for_fs)
+        (disk_quota_bytes, cli.set_disk_quota_for_fs.as_ref())
    {
-        match set_disk_quota(disk_quota_bytes, &disk_quota_fs_mountpoint) {
+        match set_disk_quota(disk_quota_bytes, disk_quota_fs_mountpoint) {
            Ok(()) => {
                let size_mib = disk_quota_bytes as f32 / (1 << 20) as f32; // just for more coherent display.
                info!(%disk_quota_bytes, %size_mib, "set disk quota");
@@ -509,13 +483,7 @@ fn start_postgres(
        if #[cfg(target_os = "linux")] {
            use std::env;
            use tokio_util::sync::CancellationToken;
-            let vm_monitor_addr = matches
-                .get_one::<String>("vm-monitor-addr")
-                .expect("--vm-monitor-addr should always be set because it has a default arg");
-            let file_cache_connstr = matches.get_one::<String>("filecache-connstr");
-            let cgroup = matches.get_one::<String>("cgroup");

-            // Only make a runtime if we need to.
            // Note: it seems like you can make a runtime in an inner scope and
            // if you start a task in it it won't be dropped. However, make it
            // in the outermost scope just to be safe.
@@ -538,15 +506,15 @@ fn start_postgres(
            let pgconnstr = if disable_lfc_resizing.unwrap_or(false) {
                None
            } else {
-                file_cache_connstr.cloned()
+                Some(cli.filecache_connstr.clone())
            };

            let vm_monitor = rt.as_ref().map(|rt| {
                rt.spawn(vm_monitor::start(
                    Box::leak(Box::new(vm_monitor::Args {
-                        cgroup: cgroup.cloned(),
+                        cgroup: Some(cli.cgroup.clone()),
                        pgconnstr,
-                        addr: vm_monitor_addr.clone(),
+                        addr: cli.vm_monitor_addr.clone(),
                    })),
                    token.clone(),
                ))
@@ -702,105 +670,6 @@ fn deinit_and_exit(WaitPostgresResult { exit_code }: WaitPostgresResult) -> ! {
    exit(exit_code.unwrap_or(1))
 }

-fn cli() -> clap::Command {
-    // Env variable is set by `cargo`
-    let version = option_env!("CARGO_PKG_VERSION").unwrap_or("unknown");
-    clap::Command::new("compute_ctl")
-        .version(version)
-        .arg(
-            Arg::new("http-port")
-                .long("http-port")
-                .value_name("HTTP_PORT")
-                .default_value("3080")
-                .value_parser(clap::value_parser!(u16))
-                .required(false),
-        )
-        .arg(
-            Arg::new("connstr")
-                .short('C')
-                .long("connstr")
-                .value_name("DATABASE_URL")
-                .required(true),
-        )
-        .arg(
-            Arg::new("pgdata")
-                .short('D')
-                .long("pgdata")
-                .value_name("DATADIR")
-                .required(true),
-        )
-        .arg(
-            Arg::new("pgbin")
-                .short('b')
-                .long("pgbin")
-                .default_value("postgres")
-                .value_name("POSTGRES_PATH"),
-        )
-        .arg(
-            Arg::new("spec")
-                .short('s')
-                .long("spec")
-                .value_name("SPEC_JSON"),
-        )
-        .arg(
-            Arg::new("spec-path")
-                .short('S')
-                .long("spec-path")
-                .value_name("SPEC_PATH"),
-        )
-        .arg(
-            Arg::new("compute-id")
-                .short('i')
-                .long("compute-id")
-                .value_name("COMPUTE_ID"),
-        )
-        .arg(
-            Arg::new("control-plane-uri")
-                .short('p')
-                .long("control-plane-uri")
-                .value_name("CONTROL_PLANE_API_BASE_URI"),
-        )
-        .arg(
-            Arg::new("remote-ext-config")
-                .short('r')
-                .long("remote-ext-config")
-                .value_name("REMOTE_EXT_CONFIG"),
-        )
-        // TODO(fprasx): we currently have default arguments because the cloud PR
-        // to pass them in hasn't been merged yet. We should get rid of them once
-        // the PR is merged.
-        .arg(
-            Arg::new("vm-monitor-addr")
-                .long("vm-monitor-addr")
-                .default_value("0.0.0.0:10301")
-                .value_name("VM_MONITOR_ADDR"),
-        )
-        .arg(
-            Arg::new("cgroup")
-                .long("cgroup")
-                .default_value("neon-postgres")
-                .value_name("CGROUP"),
-        )
-        .arg(
-            Arg::new("filecache-connstr")
-                .long("filecache-connstr")
-                .default_value(
-                    "host=localhost port=5432 dbname=postgres user=cloud_admin sslmode=disable application_name=vm-monitor",
-                )
-                .value_name("FILECACHE_CONNSTR"),
-        )
-        .arg(
-            Arg::new("resize-swap-on-bind")
-                .long("resize-swap-on-bind")
-                .action(clap::ArgAction::SetTrue),
-        )
-        .arg(
-            Arg::new("set-disk-quota-for-fs")
-                .long("set-disk-quota-for-fs")
-                .value_name("SET_DISK_QUOTA_FOR_FS")
-        )
-}
-
 /// When compute_ctl is killed, send also termination signal to sync-safekeepers
 /// to prevent leakage. TODO: it is better to convert compute_ctl to async and
 /// wait for termination which would be easy then.
@@ -810,7 +679,14 @@ fn handle_exit_signal(sig: i32) {
    exit(1);
 }

-#[test]
-fn verify_cli() {
-    cli().debug_assert()
+#[cfg(test)]
+mod test {
+    use clap::CommandFactory;
+
+    use super::Cli;
+
+    #[test]
+    fn verify_cli() {
+        Cli::command().debug_assert()
+    }
 }
--- a/compute_tools/src/bin/fast_import.rs
+++ b/compute_tools/src/bin/fast_import.rs
@@ -231,6 +231,14 @@ pub(crate) async fn main() -> anyhow::Result<()> {
        ])
        .env_clear()
        .env("LD_LIBRARY_PATH", &pg_lib_dir)
+        .env(
+            "ASAN_OPTIONS",
+            std::env::var("ASAN_OPTIONS").unwrap_or_default(),
+        )
+        .env(
+            "UBSAN_OPTIONS",
+            std::env::var("UBSAN_OPTIONS").unwrap_or_default(),
+        )
        .stdout(std::process::Stdio::piped())
        .stderr(std::process::Stdio::piped())
        .spawn()
--- a/compute_tools/src/extension_server.rs
+++ b/compute_tools/src/extension_server.rs
@@ -85,7 +85,7 @@ use tracing::info;
 use tracing::log::warn;
 use zstd::stream::read::Decoder;

-use crate::metrics::{REMOTE_EXT_REQUESTS_FAILED, REMOTE_EXT_REQUESTS_TOTAL, UNKNOWN_HTTP_STATUS};
+use crate::metrics::{REMOTE_EXT_REQUESTS_TOTAL, UNKNOWN_HTTP_STATUS};

 fn get_pg_config(argument: &str, pgbin: &str) -> String {
    // gives the result of `pg_config [argument]`
@@ -258,24 +258,19 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) {
 async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Result<Bytes> {
    let uri = format!("{}/{}", ext_remote_storage, ext_path);

-    info!("Download extension {:?} from uri {:?}", ext_path, uri);
-
-    REMOTE_EXT_REQUESTS_TOTAL.with_label_values(&[]).inc();
+    info!("Download extension {} from uri {}", ext_path, uri);

    match do_extension_server_request(&uri).await {
        Ok(resp) => {
-            info!(
-                "Successfully downloaded remote extension data {:?}",
-                ext_path
-            );
+            info!("Successfully downloaded remote extension data {}", ext_path);
+            REMOTE_EXT_REQUESTS_TOTAL
+                .with_label_values(&[&StatusCode::OK.to_string()])
+                .inc();
            Ok(resp)
        }
        Err((msg, status)) => {
-            let status_str = status
-                .map(|s| s.to_string())
-                .unwrap_or(UNKNOWN_HTTP_STATUS.to_string());
-            REMOTE_EXT_REQUESTS_FAILED
-                .with_label_values(&[&status_str])
+            REMOTE_EXT_REQUESTS_TOTAL
+                .with_label_values(&[&status])
                .inc();
            bail!(msg);
        }
@@ -283,12 +278,15 @@ async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Res
 }

 // Do a single remote extensions server request.
-// Return result or (error message + status code) in case of any failures.
-async fn do_extension_server_request(uri: &str) -> Result<Bytes, (String, Option<StatusCode>)> {
+// Return result or (error message + stringified status code) in case of any failures.
+async fn do_extension_server_request(uri: &str) -> Result<Bytes, (String, String)> {
    let resp = reqwest::get(uri).await.map_err(|e| {
        (
-            format!("could not perform remote extensions server request: {}", e),
-            None,
+            format!(
+                "could not perform remote extensions server request: {:?}",
+                e
+            ),
+            UNKNOWN_HTTP_STATUS.to_string(),
        )
    })?;
    let status = resp.status();
@@ -297,22 +295,22 @@ async fn do_extension_server_request(uri: &str) -> Result<Bytes, (String, Option
        StatusCode::OK => match resp.bytes().await {
            Ok(resp) => Ok(resp),
            Err(e) => Err((
-                format!("could not read remote extensions server response: {}", e),
+                format!("could not read remote extensions server response: {:?}", e),
                // It's fine to return and report error with status as 200 OK,
                // because we still failed to read the response.
-                Some(status),
+                status.to_string(),
            )),
        },
        StatusCode::SERVICE_UNAVAILABLE => Err((
            "remote extensions server is temporarily unavailable".to_string(),
-            Some(status),
+            status.to_string(),
        )),
        _ => Err((
            format!(
                "unexpected remote extensions server response status code: {}",
                status
            ),
-            Some(status),
+            status.to_string(),
        )),
    }
 }
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -68,35 +68,6 @@ paths:
              schema:
                $ref: "#/components/schemas/ComputeInsights"

-  /installed_extensions:
-    get:
-      tags:
-      - Info
-      summary: Get installed extensions.
-      description: ""
-      operationId: getInstalledExtensions
-      responses:
-        200:
-          description: List of installed extensions
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/InstalledExtensions"
-  /info:
-    get:
-      tags:
-      - Info
-      summary: Get info about the compute pod / VM.
-      description: ""
-      operationId: getInfo
-      responses:
-        200:
-          description: Info
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Info"
-
  /dbs_and_roles:
    get:
      tags:
--- a/compute_tools/src/http/routes/info.rs
+++ b/compute_tools/src/http/routes/info.rs
@@ -1,11 +0,0 @@
-use axum::response::Response;
-use compute_api::responses::InfoResponse;
-use http::StatusCode;
-
-use crate::http::JsonResponse;
-
-/// Get information about the physical characteristics about the compute.
-pub(in crate::http) async fn get_info() -> Response {
-    let num_cpus = num_cpus::get_physical();
-    JsonResponse::success(StatusCode::OK, &InfoResponse { num_cpus })
-}
--- a/compute_tools/src/http/routes/installed_extensions.rs
+++ b/compute_tools/src/http/routes/installed_extensions.rs
@@ -1,33 +0,0 @@
-use std::sync::Arc;
-
-use axum::{extract::State, response::Response};
-use compute_api::responses::ComputeStatus;
-use http::StatusCode;
-use tokio::task;
-
-use crate::{compute::ComputeNode, http::JsonResponse, installed_extensions};
-
-/// Get a list of installed extensions.
-pub(in crate::http) async fn get_installed_extensions(
-    State(compute): State<Arc<ComputeNode>>,
-) -> Response {
-    let status = compute.get_status();
-    if status != ComputeStatus::Running {
-        return JsonResponse::invalid_status(status);
-    }
-
-    let conf = compute.get_conn_conf(None);
-    let res = task::spawn_blocking(move || installed_extensions::get_installed_extensions(conf))
-        .await
-        .unwrap();
-
-    match res {
-        Ok(installed_extensions) => {
-            JsonResponse::success(StatusCode::OK, Some(installed_extensions))
-        }
-        Err(e) => JsonResponse::error(
-            StatusCode::INTERNAL_SERVER_ERROR,
-            format!("failed to get list of installed extensions: {e}"),
-        ),
-    }
-}
--- a/compute_tools/src/http/routes/mod.rs
+++ b/compute_tools/src/http/routes/mod.rs
@@ -10,9 +10,7 @@ pub(in crate::http) mod extension_server;
 pub(in crate::http) mod extensions;
 pub(in crate::http) mod failpoints;
 pub(in crate::http) mod grants;
-pub(in crate::http) mod info;
 pub(in crate::http) mod insights;
-pub(in crate::http) mod installed_extensions;
 pub(in crate::http) mod metrics;
 pub(in crate::http) mod metrics_json;
 pub(in crate::http) mod status;
--- a/compute_tools/src/http/server.rs
+++ b/compute_tools/src/http/server.rs
@@ -22,8 +22,7 @@ use uuid::Uuid;

 use super::routes::{
    check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions,
-    grants, info as info_route, insights, installed_extensions, metrics, metrics_json, status,
-    terminate,
+    grants, insights, metrics, metrics_json, status, terminate,
 };
 use crate::compute::ComputeNode;

@@ -60,12 +59,7 @@ async fn serve(port: u16, compute: Arc<ComputeNode>) {
        )
        .route("/extensions", post(extensions::install_extension))
        .route("/grants", post(grants::add_grant))
-        .route("/info", get(info_route::get_info))
        .route("/insights", get(insights::get_insights))
-        .route(
-            "/installed_extensions",
-            get(installed_extensions::get_installed_extensions),
-        )
        .route("/metrics", get(metrics::get_metrics))
        .route("/metrics.json", get(metrics_json::get_metrics))
        .route("/status", get(status::get_status))
--- a/compute_tools/src/metrics.rs
+++ b/compute_tools/src/metrics.rs
@@ -32,16 +32,7 @@ pub const UNKNOWN_HTTP_STATUS: &str = "unknown";
 pub(crate) static CPLANE_REQUESTS_TOTAL: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "compute_ctl_cplane_requests_total",
-        "Total number of control plane requests made by compute_ctl",
-        &["rpc"]
-    )
-    .expect("failed to define a metric")
-});
-
-pub(crate) static CPLANE_REQUESTS_FAILED: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "compute_ctl_cplane_requests_failed_total",
-        "Total number of failed control plane requests made by compute_ctl",
+        "Total number of control plane requests made by compute_ctl by status",
        &["rpc", "http_status"]
    )
    .expect("failed to define a metric")
@@ -62,18 +53,9 @@ pub(crate) static DB_MIGRATION_FAILED: Lazy<IntCounterVec> = Lazy::new(|| {
 pub(crate) static REMOTE_EXT_REQUESTS_TOTAL: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "compute_ctl_remote_ext_requests_total",
-        "Total number of requests made by compute_ctl to download extensions from S3 proxy",
+        "Total number of requests made by compute_ctl to download extensions from S3 proxy by status",
        // Do not use any labels like extension name yet.
        // We can add them later if needed.
-        &[]
-    )
-    .expect("failed to define a metric")
-});
-
-pub(crate) static REMOTE_EXT_REQUESTS_FAILED: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "compute_ctl_remote_ext_requests_failed_total",
-        "Total number of failed requests to S3 proxy",
        &["http_status"]
    )
    .expect("failed to define a metric")
@@ -82,9 +64,7 @@ pub(crate) static REMOTE_EXT_REQUESTS_FAILED: Lazy<IntCounterVec> = Lazy::new(||
 pub fn collect() -> Vec<MetricFamily> {
    let mut metrics = INSTALLED_EXTENSIONS.collect();
    metrics.extend(CPLANE_REQUESTS_TOTAL.collect());
-    metrics.extend(CPLANE_REQUESTS_FAILED.collect());
-    metrics.extend(DB_MIGRATION_FAILED.collect());
    metrics.extend(REMOTE_EXT_REQUESTS_TOTAL.collect());
-    metrics.extend(REMOTE_EXT_REQUESTS_FAILED.collect());
+    metrics.extend(DB_MIGRATION_FAILED.collect());
    metrics
 }
--- a/compute_tools/src/migration.rs
+++ b/compute_tools/src/migration.rs
@@ -125,7 +125,7 @@ impl<'m> MigrationRunner<'m> {
                    info!("Finished migration id={}", migration_id);
                }
                Err(e) => {
-                    error!("Failed to run migration id={}: {}", migration_id, e);
+                    error!("Failed to run migration id={}: {:?}", migration_id, e);
                    DB_MIGRATION_FAILED
                        .with_label_values(&[migration_id.to_string().as_str()])
                        .inc();
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -6,9 +6,7 @@ use std::path::Path;
 use tracing::{error, info, instrument, warn};

 use crate::config;
-use crate::metrics::{
-    CPlaneRequestRPC, CPLANE_REQUESTS_FAILED, CPLANE_REQUESTS_TOTAL, UNKNOWN_HTTP_STATUS,
-};
+use crate::metrics::{CPlaneRequestRPC, CPLANE_REQUESTS_TOTAL, UNKNOWN_HTTP_STATUS};
 use crate::migration::MigrationRunner;
 use crate::params::PG_HBA_ALL_MD5;
 use crate::pg_helpers::*;
@@ -22,7 +20,7 @@ use compute_api::spec::ComputeSpec;
 fn do_control_plane_request(
    uri: &str,
    jwt: &str,
-) -> Result<ControlPlaneSpecResponse, (bool, String, Option<StatusCode>)> {
+) -> Result<ControlPlaneSpecResponse, (bool, String, String)> {
    let resp = reqwest::blocking::Client::new()
        .get(uri)
        .header("Authorization", format!("Bearer {}", jwt))
@@ -30,8 +28,8 @@ fn do_control_plane_request(
        .map_err(|e| {
            (
                true,
-                format!("could not perform spec request to control plane: {}", e),
-                None,
+                format!("could not perform spec request to control plane: {:?}", e),
+                UNKNOWN_HTTP_STATUS.to_string(),
            )
        })?;

@@ -41,14 +39,14 @@ fn do_control_plane_request(
            Ok(spec_resp) => Ok(spec_resp),
            Err(e) => Err((
                true,
-                format!("could not deserialize control plane response: {}", e),
-                Some(status),
+                format!("could not deserialize control plane response: {:?}", e),
+                status.to_string(),
            )),
        },
        StatusCode::SERVICE_UNAVAILABLE => Err((
            true,
            "control plane is temporarily unavailable".to_string(),
-            Some(status),
+            status.to_string(),
        )),
        StatusCode::BAD_GATEWAY => {
            // We have a problem with intermittent 502 errors now
@@ -57,7 +55,7 @@ fn do_control_plane_request(
            Err((
                true,
                "control plane request failed with 502".to_string(),
-                Some(status),
+                status.to_string(),
            ))
        }
        // Another code, likely 500 or 404, means that compute is unknown to the control plane
@@ -65,7 +63,7 @@ fn do_control_plane_request(
        _ => Err((
            false,
            format!("unexpected control plane response status code: {}", status),
-            Some(status),
+            status.to_string(),
        )),
    }
 }
@@ -92,26 +90,28 @@ pub fn get_spec_from_control_plane(
    // - no spec for compute yet (Empty state) -> return Ok(None)
    // - got spec -> return Ok(Some(spec))
    while attempt < 4 {
-        CPLANE_REQUESTS_TOTAL
-            .with_label_values(&[CPlaneRequestRPC::GetSpec.as_str()])
-            .inc();
        spec = match do_control_plane_request(&cp_uri, &jwt) {
-            Ok(spec_resp) => match spec_resp.status {
-                ControlPlaneComputeStatus::Empty => Ok(None),
-                ControlPlaneComputeStatus::Attached => {
-                    if let Some(spec) = spec_resp.spec {
-                        Ok(Some(spec))
-                    } else {
-                        bail!("compute is attached, but spec is empty")
+            Ok(spec_resp) => {
+                CPLANE_REQUESTS_TOTAL
+                    .with_label_values(&[
+                        CPlaneRequestRPC::GetSpec.as_str(),
+                        &StatusCode::OK.to_string(),
+                    ])
+                    .inc();
+                match spec_resp.status {
+                    ControlPlaneComputeStatus::Empty => Ok(None),
+                    ControlPlaneComputeStatus::Attached => {
+                        if let Some(spec) = spec_resp.spec {
+                            Ok(Some(spec))
+                        } else {
+                            bail!("compute is attached, but spec is empty")
+                        }
                    }
                }
-            },
+            }
            Err((retry, msg, status)) => {
-                let status_str = status
-                    .map(|s| s.to_string())
-                    .unwrap_or(UNKNOWN_HTTP_STATUS.to_string());
-                CPLANE_REQUESTS_FAILED
-                    .with_label_values(&[CPlaneRequestRPC::GetSpec.as_str(), &status_str])
+                CPLANE_REQUESTS_TOTAL
+                    .with_label_values(&[CPlaneRequestRPC::GetSpec.as_str(), &status])
                    .inc();
                if retry {
                    Err(anyhow!(msg))
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -261,7 +261,13 @@ fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
    let mut filled_cmd = cmd.env_clear().env("RUST_BACKTRACE", backtrace_setting);

    // Pass through these environment variables to the command
-    for var in ["LLVM_PROFILE_FILE", "FAILPOINTS", "RUST_LOG"] {
+    for var in [
+        "LLVM_PROFILE_FILE",
+        "FAILPOINTS",
+        "RUST_LOG",
+        "ASAN_OPTIONS",
+        "UBSAN_OPTIONS",
+    ] {
        if let Some(val) = std::env::var_os(var) {
            filled_cmd = filled_cmd.env(var, val);
        }
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -388,6 +388,11 @@ impl PageServerNode {
                .map(|x| x.parse::<u8>())
                .transpose()
                .context("Failed to parse 'image_creation_check_threshold' as integer")?,
+            image_creation_preempt_threshold: settings
+                .remove("image_creation_preempt_threshold")
+                .map(|x| x.parse::<usize>())
+                .transpose()
+                .context("Failed to parse 'image_creation_preempt_threshold' as integer")?,
            pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
            walreceiver_connect_timeout: settings
                .remove("walreceiver_connect_timeout")
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -221,7 +221,17 @@ impl StorageController {
            "-p",
            &format!("{}", postgres_port),
        ];
-        let exitcode = Command::new(bin_path).args(args).spawn()?.wait().await?;
+        let pg_lib_dir = self.get_pg_lib_dir().await.unwrap();
+        let envs = [
+            ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
+            ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
+        ];
+        let exitcode = Command::new(bin_path)
+            .args(args)
+            .envs(envs)
+            .spawn()?
+            .wait()
+            .await?;

        Ok(exitcode.success())
    }
@@ -242,6 +252,11 @@ impl StorageController {

        let pg_bin_dir = self.get_pg_bin_dir().await?;
        let createdb_path = pg_bin_dir.join("createdb");
+        let pg_lib_dir = self.get_pg_lib_dir().await.unwrap();
+        let envs = [
+            ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
+            ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
+        ];
        let output = Command::new(&createdb_path)
            .args([
                "-h",
@@ -254,6 +269,7 @@ impl StorageController {
                &username(),
                DB_NAME,
            ])
+            .envs(envs)
            .output()
            .await
            .expect("Failed to spawn createdb");
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -10,8 +10,8 @@ use pageserver_api::{
    controller_api::{
        AvailabilityZone, NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse,
        SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest, ShardSchedulingPolicy,
-        ShardsPreferredAzsRequest, SkSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse,
-        TenantPolicyRequest,
+        ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, SkSchedulingPolicy,
+        TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest,
    },
    models::{
        EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
@@ -800,7 +800,7 @@ async fn main() -> anyhow::Result<()> {
                    .collect(),
            };
            storcon_client
-                .dispatch::<ShardsPreferredAzsRequest, ()>(
+                .dispatch::<ShardsPreferredAzsRequest, ShardsPreferredAzsResponse>(
                    Method::PUT,
                    "control/v1/preferred_azs".to_string(),
                    Some(req),
--- a/deny.toml
+++ b/deny.toml
@@ -32,6 +32,7 @@ reason = "the marvin attack only affects private key decryption, not public key
 # https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html
 [licenses]
 allow = [
+    "0BSD",
    "Apache-2.0",
    "BSD-2-Clause",
    "BSD-3-Clause",
--- a/docker-compose/compute_wrapper/Dockerfile
+++ b/docker-compose/compute_wrapper/Dockerfile
@@ -7,11 +7,12 @@ FROM $REPOSITORY/${COMPUTE_IMAGE}:$TAG
 ARG COMPUTE_IMAGE

 USER root
-RUN apt-get update &&       \
+RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
+    apt-get update &&       \
    apt-get install -y curl \
                       jq   \
                       netcat-openbsd
 #This is required for the pg_hintplan test
-RUN mkdir -p /ext-src/pg_hint_plan-src && chown postgres /ext-src/pg_hint_plan-src 
+RUN mkdir -p /ext-src/pg_hint_plan-src /postgres/contrib/file_fdw && chown postgres /ext-src/pg_hint_plan-src /postgres/contrib/file_fdw

-USER postgres
+USER postgres
--- a/docker-compose/docker_compose_test.sh
+++ b/docker-compose/docker_compose_test.sh
@@ -31,7 +31,7 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
    echo "clean up containers if exists"
    cleanup
    PG_TEST_VERSION=$((pg_version < 16 ? 16 : pg_version))
-    PG_VERSION=$pg_version PG_TEST_VERSION=$PG_TEST_VERSION docker compose --profile test-extensions -f $COMPOSE_FILE up --build -d
+    PG_VERSION=$pg_version PG_TEST_VERSION=$PG_TEST_VERSION docker compose --profile test-extensions -f $COMPOSE_FILE up --quiet-pull --build -d

    echo "wait until the compute is ready. timeout after 60s. "
    cnt=0
@@ -51,6 +51,8 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
    done

    if [ $pg_version -ge 16 ]; then
+        docker cp ext-src $TEST_CONTAINER_NAME:/
+        docker exec $TEST_CONTAINER_NAME bash -c "apt update && apt install -y libtap-parser-sourcehandler-pgtap-perl"
        # This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail
        # It cannot be moved to Dockerfile now because the database directory is created after the start of the container
        echo Adding dummy config
@@ -60,17 +62,32 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
        docker cp $TEST_CONTAINER_NAME:/ext-src/pg_hint_plan-src/data $TMPDIR/data
        docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/ext-src/pg_hint_plan-src/
        rm -rf $TMPDIR
+        # The following block does the same for the contrib/file_fdw test
+        TMPDIR=$(mktemp -d)
+        docker cp $TEST_CONTAINER_NAME:/postgres/contrib/file_fdw/data $TMPDIR/data
+        docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/postgres/contrib/file_fdw/data
+        rm -rf $TMPDIR
+        # Apply patches
+        cat ../compute/patches/contrib_pg${pg_version}.patch | docker exec -i $TEST_CONTAINER_NAME bash -c "(cd /postgres && patch -p1)"
        # We are running tests now
-        if ! docker exec -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \
-            $TEST_CONTAINER_NAME /run-tests.sh | tee testout.txt
-        then
-            FAILED=$(tail -1 testout.txt)
-            for d in $FAILED
-            do
-                mkdir $d
-                docker cp $TEST_CONTAINER_NAME:/ext-src/$d/regression.diffs $d || true
-                docker cp $TEST_CONTAINER_NAME:/ext-src/$d/regression.out $d || true
-                cat $d/regression.out $d/regression.diffs || true
+        rm -f testout.txt testout_contrib.txt
+        docker exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src \
+        $TEST_CONTAINER_NAME /run-tests.sh /ext-src | tee testout.txt && EXT_SUCCESS=1 || EXT_SUCCESS=0
+        docker exec -e SKIP=start-scripts,postgres_fdw,ltree_plpython,jsonb_plpython,jsonb_plperl,hstore_plpython,hstore_plperl,dblink,bool_plperl \
+        $TEST_CONTAINER_NAME /run-tests.sh /postgres/contrib | tee testout_contrib.txt && CONTRIB_SUCCESS=1 || CONTRIB_SUCCESS=0
+        if [ $EXT_SUCCESS -eq 0 ] || [ $CONTRIB_SUCCESS -eq 0 ]; then
+            CONTRIB_FAILED=
+            FAILED=
+            [ $EXT_SUCCESS -eq 0 ] && FAILED=$(tail -1 testout.txt | awk '{for(i=1;i<=NF;i++){print "/ext-src/"$i;}}')
+            [ $CONTRIB_SUCCESS -eq 0 ] && CONTRIB_FAILED=$(tail -1 testout_contrib.txt | awk '{for(i=0;i<=NF;i++){print "/postgres/contrib/"$i;}}')
+            for d in $FAILED $CONTRIB_FAILED; do
+                dn="$(basename $d)"
+                rm -rf $dn
+                mkdir $dn
+                docker cp $TEST_CONTAINER_NAME:$d/regression.diffs $dn || [ $? -eq 1 ]
+                docker cp $TEST_CONTAINER_NAME:$d/regression.out $dn || [ $? -eq 1 ]
+                cat $dn/regression.out $dn/regression.diffs || true
+                rm -rf $dn
            done
        rm -rf $FAILED
        exit 1
--- a/docker-compose/ext-src/pg_graphql-src/neon-test.sh
+++ b/docker-compose/ext-src/pg_graphql-src/neon-test.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+set -ex
+cd "$(dirname "${0}")"
+dropdb --if-exists contrib_regression
+createdb contrib_regression
+PGXS="$(dirname "$(pg_config --pgxs)" )"
+REGRESS="${PGXS}/../test/regress/pg_regress"
+TESTDIR="test"
+TESTS=$(ls "${TESTDIR}/sql" | sort )
+TESTS=${TESTS//\.sql/}
+psql -v ON_ERROR_STOP=1 -f test/fixtures.sql -d contrib_regression
+${REGRESS} --use-existing --dbname=contrib_regression --inputdir=${TESTDIR} ${TESTS}
+
--- a/docker-compose/ext-src/pgjwt-src/neon-test.sh
+++ b/docker-compose/ext-src/pgjwt-src/neon-test.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+set -ex
+cd "$(dirname "${0}")"
+pg_prove test.sql
--- a/docker-compose/ext-src/pgjwt-src/test-upgrade.patch
+++ b/docker-compose/ext-src/pgjwt-src/test-upgrade.patch
@@ -0,0 +1,15 @@
+diff --git a/test.sql b/test.sql
+index d7a0ca8..f15bc76 100644
+--- a/test.sql
+++ b/test.sql
+@@ -9,9 +9,7 @@
+ \set ON_ERROR_STOP true
+ \set QUIET 1
+ 
+-CREATE EXTENSION pgcrypto;
+-CREATE EXTENSION pgtap;
+-CREATE EXTENSION pgjwt;
+CREATE EXTENSION IF NOT EXISTS pgtap;
+ 
+ BEGIN;
+ SELECT plan(23);
--- a/docker-compose/ext-src/pgjwt-src/test-upgrade.sh
+++ b/docker-compose/ext-src/pgjwt-src/test-upgrade.sh
@@ -0,0 +1,5 @@
+#!/bin/sh
+set -ex
+cd "$(dirname ${0})"
+patch -p1 <test-upgrade.patch
+pg_prove test.sql
--- a/docker-compose/run-tests.sh
+++ b/docker-compose/run-tests.sh
@@ -1,17 +1,22 @@
 #!/bin/bash
 set -x

-cd /ext-src || exit 2
+extdir=${1}
+
+cd "${extdir}" || exit 2
 FAILED=
-LIST=$( (echo -e "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u)
-for d in ${LIST}
-do
-       [ -d "${d}" ] || continue
-       if ! psql -w -c "select 1" >/dev/null; then
-          FAILED="${d} ${FAILED}"
-          break
-       fi
+LIST=$( (echo -e "${SKIP//","/"\n"}"; ls) | sort | uniq -u)
+for d in ${LIST}; do
+    [ -d "${d}" ] || continue
+    if ! psql -w -c "select 1" >/dev/null; then
+      FAILED="${d} ${FAILED}"
+      break
+    fi
+    if [ -f "${d}/neon-test.sh" ]; then
+       "${d}/neon-test.sh" || FAILED="${d} ${FAILED}"
+    else
       USE_PGXS=1 make -C "${d}" installcheck || FAILED="${d} ${FAILED}"
+    fi
 done
 [ -z "${FAILED}" ] && exit 0
 echo "${FAILED}"
--- a/docker-compose/test_extensions_upgrade.sh
+++ b/docker-compose/test_extensions_upgrade.sh
@@ -24,7 +24,7 @@ function wait_for_ready {
 }
 function create_extensions() {
  for ext in ${1}; do
-    docker compose exec neon-test-extensions psql -X -v ON_ERROR_STOP=1 -d contrib_regression -c "CREATE EXTENSION IF NOT EXISTS ${ext}"
+    docker compose exec neon-test-extensions psql -X -v ON_ERROR_STOP=1 -d contrib_regression -c "CREATE EXTENSION IF NOT EXISTS ${ext} CASCADE"
  done
 }
 EXTENSIONS='[
@@ -40,7 +40,8 @@ EXTENSIONS='[
 {"extname": "pg_uuidv7", "extdir": "pg_uuidv7-src"},
 {"extname": "roaringbitmap", "extdir": "pg_roaringbitmap-src"},
 {"extname": "semver", "extdir": "pg_semver-src"},
-{"extname": "pg_ivm", "extdir": "pg_ivm-src"}
+{"extname": "pg_ivm", "extdir": "pg_ivm-src"},
+{"extname": "pgjwt", "extdir": "pgjwt-src"}
 ]'
 EXTNAMES=$(echo ${EXTENSIONS} | jq -r '.[].extname' | paste -sd ' ' -)
 TAG=${NEWTAG} docker compose --profile test-extensions up --quiet-pull --build -d
--- a/docs/rfcs/035-safekeeper-dynamic-membership-change.md
+++ b/docs/rfcs/035-safekeeper-dynamic-membership-change.md
@@ -285,10 +285,10 @@ To summarize, list of cplane changes:

 ### storage_controller implementation

-Current 'load everything on startup and keep in memory' easy design is fine.
-Single timeline shouldn't take more than 100 bytes (it's 16 byte tenant_id, 16
-byte timeline_id, int generation, vec of ~3 safekeeper ids plus some flags), so
-10^6 of timelines shouldn't take more than 100MB.
+If desired, we may continue using current 'load everything on startup and keep
+in memory' approach: single timeline shouldn't take more than 100 bytes (it's 16
+byte tenant_id, 16 byte timeline_id, int generation, vec of ~3 safekeeper ids
+plus some flags), so 10^6 of timelines shouldn't take more than 100MB.

 Similar to pageserver attachment Intents storage_controller would have in-memory
 `MigrationRequest` (or its absense) for each timeline and pool of tasks trying
@@ -296,7 +296,7 @@ to make these request reality; this ensures one instance of storage_controller
 won't do several migrations on the same timeline concurrently. In the first
 version it is simpler to have more manual control and no retries, i.e. migration
 failure removes the request. Later we can build retries and automatic
-scheduling/migration. `MigrationRequest` is
+scheduling/migration around. `MigrationRequest` is
 ```
 enum MigrationRequest {
    To(Vec<NodeId>),
@@ -313,9 +313,9 @@ similarly, in the first version it is ok to trigger it manually).
 #### Schema

 `safekeepers` table mirroring current `nodes` should be added, except that for
-`scheduling_policy` field (seems like `status` is a better name for it): it is enough
-to have at least in the beginning only 3 fields: 1) `active` 2) `offline` 3)
-`decomissioned`.
+`scheduling_policy`: it is enough to have at least in the beginning only 3
+fields: 1) `active` 2) `paused` (initially means only not assign new tlis there
+3) `decomissioned` (node is removed).

 `timelines` table:
 ```
@@ -324,18 +324,24 @@ table! {
    timelines (tenant_id, timeline_id) {
        timeline_id -> Varchar,
        tenant_id -> Varchar,
+        start_lsn -> pg_lsn,
        generation -> Int4,
        sk_set -> Array<Int4>, // list of safekeeper ids
-        new_sk_set -> Nullable<Array<Int4>>, // list of safekeeper ids, null if not joint conf
+        new_sk_set -> Nullable<Array<Int8>>, // list of safekeeper ids, null if not joint conf
        cplane_notified_generation -> Int4,
+        deleted_at -> Nullable<Timestamptz>,
    }
 }
 ```

+`start_lsn` is needed to create timeline on safekeepers properly, see below. We
+might also want to add ancestor_timeline_id to preserve the hierarchy, but for
+this RFC it is not needed.
+
 #### API

 Node management is similar to pageserver:
-1) POST `/control/v1/safekeepers` upserts safekeeper.
+1) POST `/control/v1/safekeepers` inserts safekeeper.
 2) GET `/control/v1/safekeepers` lists safekeepers.
 3) GET `/control/v1/safekeepers/:node_id` gets safekeeper.
 4) PUT `/control/v1/safekepers/:node_id/status` changes status to e.g.
@@ -345,25 +351,15 @@ Node management is similar to pageserver:
 Safekeeper deploy scripts should register safekeeper at storage_contorller as
 they currently do with cplane, under the same id.

-Timeline creation/deletion: already existing POST `tenant/:tenant_id/timeline`
-would 1) choose initial set of safekeepers; 2) write to the db initial
-`Configuration` with `INSERT ON CONFLICT DO NOTHING` returning existing row in
-case of conflict; 3) create timeline on the majority of safekeepers (already
-created is ok).
+Timeline creation/deletion will work through already existing POST and DELETE
+`tenant/:tenant_id/timeline`. Cplane is expected to retry both until they
+succeed. See next section on the implementation details.

-We don't want to block timeline creation when one safekeeper is down. Currently
-this is solved by compute implicitly creating timeline on any safekeeper it is
-connected to. This creates ugly timeline state on safekeeper when timeline is
-created, but start LSN is not defined yet. It would be nice to remove this; to
-do that, controller can in the background retry to create timeline on
-safekeeper(s) which missed that during initial creation call. It can do that
-through `pull_timeline` from majority so it doesn't need to remember
-`parent_lsn` in its db.
-
-Timeline deletion removes the row from the db and forwards deletion to the
-current configuration members. Without additional actions deletions might leak,
-see below on this; initially let's ignore these, reporting to cplane success if
-at least one safekeeper deleted the timeline (this will remove s3 data).
+We don't want to block timeline creation/deletion when one safekeeper is down.
+Currently this is crutched by compute implicitly creating timeline on any
+safekeeper it is connected to. This creates ugly timeline state on safekeeper
+when timeline is created, but start LSN is not defined yet. Next section
+describes dealing with this.

 Tenant deletion repeats timeline deletion for all timelines.

@@ -395,26 +391,6 @@ Similar call should be added for the tenant.
 It would be great to have some way of subscribing to the results (apart from
 looking at logs/metrics).

-Migration is executed as described above. One subtlety is that (local) deletion on
-source safekeeper might fail, which is not a problem if we are going to
-decomission the node but leaves garbage otherwise. I'd propose in the first version
-1) Don't attempt deletion at all if node status is `offline`.
-2) If it failed, just issue warning.
-And add PUT `/control/v1/safekeepers/:node_id/scrub` endpoint which would find and
-remove garbage timelines for manual use. It will 1) list all timelines on the
-safekeeper 2) compare each one against configuration storage: if timeline
-doesn't exist at all (had been deleted), it can be deleted. Otherwise, it can
-be deleted under generation number if node is not member of current generation.
-
-Automating this is untrivial; we'd need to register all potential missing
-deletions <tenant_id, timeline_id, generation, node_id> in the same transaction
-which switches configurations. Similarly when timeline is fully deleted to
-prevent cplane operation from blocking when some safekeeper is not available
-deletion should be also registered.
-
-One more task pool should infinitely retry notifying control plane about changed
-safekeeper sets.
-
 3) GET `/control/v1/tenant/:tenant_id/timeline/:timeline_id/` should return
   current in memory state of the timeline and pending `MigrationRequest`,
   if any.
@@ -423,12 +399,153 @@ safekeeper sets.
   migration by switching configuration from the joint to the one with (previous) `sk_set` under CAS
   (incrementing generation as always).

+#### API implementation and reconciliation
+
+For timeline creation/deletion we want to preserve the basic assumption that
+unreachable minority (1 sk of 3) doesn't block their completion, but eventually
+we want to finish creation/deletion on nodes which missed it (unless they are
+removed). Similarly for migration; it may and should finish even though excluded
+members missed their exclusion. And of course e.g. such pending exclusion on
+node C after migration ABC -> ABD must not prevent next migration ABD -> ABE. As
+another example, if some node missed timeline creation it clearly must not block
+migration from it. Hence it is natural to have per safekeeper background
+reconciler which retries these ops until they succeed. There are 3 possible
+operation types, and the type is defined by timeline state (membership
+configuration and whether it is deleted) and safekeeper id: we may need to
+create timeline on sk (node added), locally delete it (node excluded, somewhat
+similar to detach) or globally delete it (timeline is deleted).
+
+Next, on storage controller restart in principle these pending operations can be
+figured out by comparing safekeepers state against storcon state. But it seems
+better to me to materialize them in the database; it is not expensive, avoids
+these startup scans which themselves can fail etc and makes it very easy to see
+outstanding work directly at the source of truth -- the db. So we can add table
+`safekeeper_timeline_pending_ops`
+```
+table! {
+    // timeline_id, sk_id is primary key
+    safekeeper_timeline_pending_ops (sk_id, tenant_id, timeline_id) {
+        sk_id -> int8,
+        tenant_id -> Varchar,
+        timeline_id -> Varchar,
+        generation -> Int4,
+        op_type -> Varchar,
+    }
+}
+```
+
+`op_type` can be `include` (seed from peers and ensure generation is up to
+date), `exclude` (remove locally) and `delete`. Field is actually not strictly
+needed as it can be computed from current configuration, but gives more explicit
+observability.
+
+`generation` is necessary there because after op is done reconciler must remove
+it and not remove another row with higher gen which in theory might appear.
+
+Any insert of row should overwrite (remove) all rows with the same sk and
+timeline id but lower `generation` as next op makes previous obsolete. Insertion
+of `op_type` `delete` overwrites all rows.
+
+About `exclude`: rather than adding explicit safekeeper http endpoint, it is
+reasonable to reuse membership switch endpoint: if safekeeper is not member
+of the configuration it locally removes the timeline on the switch. In this case
+404 should also be considered an 'ok' answer by the caller.
+
+So, main loop of per sk reconcile reads `safekeeper_timeline_pending_ops`
+joined with timeline configuration to get current conf (with generation `n`)
+for the safekeeper and does the jobs, infinitely retrying failures:
+1) If node is member (`include`):
+  - Check if timeline exists on it, if not, call pull_timeline on it from 
+     other members
+  - Call switch configuration to the current
+2) If node is not member (`exclude`):
+  - Call switch configuration to the current, 404 is ok.
+3) If timeline is deleted (`delete`), call delete.
+
+In cases 1 and 2 remove `safekeeper_timeline_pending_ops` for the sk and 
+timeline with generation <= `n` if `op_type` is not `delete`.
+In case 3 also remove `safekeeper_timeline_pending_ops` 
+entry + remove `timelines` entry if there is nothing left  in `safekeeper_timeline_pending_ops` for the timeline.
+
+Let's consider in details how APIs can be implemented from this angle.
+
+Timeline creation. It is assumed that cplane retries it until success, so all
+actions must be idempotent. Now, a tricky point here is timeline start LSN. For
+the initial (tenant creation) call cplane doesn't know it. However, setting
+start_lsn on safekeepers during creation is a good thing -- it provides a
+guarantee that walproposer can always find a common point in WAL histories of
+safekeeper and its own, and so absense of it would be a clear sign of
+corruption. The following sequence works:
+1) Create timeline (or observe that it exists) on pageserver,
+   figuring out last_record_lsn in response.
+2) Choose safekeepers and insert (ON CONFLICT DO NOTHING) timeline row into the
+   db. Note that last_record_lsn returned on the previous step is movable as it
+   changes once ingestion starts, insert must not overwrite it (as well as other
+   fields like membership conf). On the contrary, start_lsn used in the next
+   step must be set to the value in the db. cplane_notified_generation can be set
+   to 1 (initial generation) in insert to avoid notifying cplane about initial 
+   conf as cplane will receive it in timeline creation request anyway.
+3) Issue timeline creation calls to at least majority of safekeepers. Using
+   majority here is not necessary but handy because it guarantees that any live
+   majority will have at least one sk with created timeline and so
+   reconciliation task can use pull_timeline shared with migration instead of
+   create timeline special init case. OFC if timeline is already exists call is
+   ignored.
+4) For minority of safekeepers which could have missed creation insert
+   entries to `safekeeper_timeline_pending_ops`. We won't miss this insertion 
+   because response to cplane is sent only after it has happened, and cplane 
+   retries the call until 200 response.
+
+   There is a small question how request handler (timeline creation in this
+   case) would interact with per sk reconciler. As always I prefer to do the
+   simplest possible thing and here it seems to be just waking it up so it
+   re-reads the db for work to do. Passing work in memory is faster, but
+   that shouldn't matter, and path to scan db for work will exist anyway, 
+   simpler to reuse it.
+
+For pg version / wal segment size: while we may persist them in `timelines`
+table, it is not necessary as initial creation at step 3 can take them from
+pageserver or cplane creation call and later pull_timeline will carry them
+around.
+
+Timeline migration.
+1) CAS to the db to create joint conf, and in the same transaction create
+   `safekeeper_timeline_pending_ops` `include` entries to initialize new members
+   as well as deliver this conf to current ones; poke per sk reconcilers to work
+   on it. Also any conf change should also poke cplane notifier task(s).
+2) Once it becomes possible per alg description above, get out of joint conf
+   with another CAS. Task should get wakeups from per sk reconcilers because 
+   conf switch is required for advancement; however retries should be sleep
+   based as well as LSN advancement might be needed, though in happy path 
+   it isn't. To see whether further transition is possible on wakup migration
+   executor polls safekeepers per the algorithm. CAS creating new conf with only
+   new members should again insert entries to `safekeeper_timeline_pending_ops`
+   to switch them there, as well as `exclude` rows to remove timeline from 
+   old members.
+
+Timeline deletion: just set `deleted_at` on the timeline row and insert
+`safekeeper_timeline_pending_ops` entries in the same xact, the rest is done by
+per sk reconcilers.
+
+When node is removed (set to `decomissioned`), `safekeeper_timeline_pending_ops`
+for it must be cleared in the same transaction.
+
+One more task pool should infinitely retry notifying control plane about changed
+safekeeper sets (trying making `cplane_notified_generation` equal `generation`).
+
 #### Dealing with multiple instances of storage_controller

 Operations described above executed concurrently might create some errors but do
 not prevent progress, so while we normally don't want to run multiple instances
 of storage_controller it is fine to have it temporarily, e.g. during redeploy.

+To harden against some controller instance creating some work in
+`safekeeper_timeline_pending_ops` and then disappearing without anyone pickup up
+the job per sk reconcilers apart from explicit wakups should scan for work
+periodically. It is possible to remove that though if all db updates are
+protected with leadership token/term -- then such scans are needed only after
+leadership is acquired.
+
 Any interactions with db update in-memory controller state, e.g. if migration
 request failed because different one is in progress, controller remembers that
 and tries to finish it.
@@ -545,7 +662,7 @@ Aurora does this but similarly I don't think this is needed.

 We should use Compute <-> safekeeper protocol change to include other (long
 yearned) modifications:
- send data in network order to make arm work.
+- send data in network order without putting whole structs to be arch independent
 - remove term_start_lsn from AppendRequest
 - add horizon to TermHistory
 - add to ProposerGreeting number of connection from this wp to sk
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -15,11 +15,6 @@ pub struct GenericAPIError {
    pub error: String,
 }

-#[derive(Debug, Clone, Serialize)]
-pub struct InfoResponse {
-    pub num_cpus: usize,
-}
-
 #[derive(Debug, Clone, Serialize)]
 pub struct ExtensionInstallResponse {
    pub extension: PgIdent,
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -204,14 +204,16 @@ impl RemoteExtSpec {

        // Check if extension is present in public or custom.
        // If not, then it is not allowed to be used by this compute.
-        if let Some(public_extensions) = &self.public_extensions {
-            if !public_extensions.contains(&real_ext_name.to_string()) {
-                if let Some(custom_extensions) = &self.custom_extensions {
-                    if !custom_extensions.contains(&real_ext_name.to_string()) {
-                        return Err(anyhow::anyhow!("extension {} is not found", real_ext_name));
-                    }
-                }
-            }
+        if !self
+            .public_extensions
+            .as_ref()
+            .is_some_and(|exts| exts.iter().any(|e| e == real_ext_name))
+            && !self
+                .custom_extensions
+                .as_ref()
+                .is_some_and(|exts| exts.iter().any(|e| e == real_ext_name))
+        {
+            return Err(anyhow::anyhow!("extension {} is not found", real_ext_name));
        }

        match self.extension_data.get(real_ext_name) {
@@ -340,6 +342,102 @@ mod tests {
    use super::*;
    use std::fs::File;

+    #[test]
+    fn allow_installing_remote_extensions() {
+        let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({
+            "public_extensions": null,
+            "custom_extensions": null,
+            "library_index": {},
+            "extension_data": {},
+        }))
+        .unwrap();
+
+        rspec
+            .get_ext("ext", false, "latest", "v17")
+            .expect_err("Extension should not be found");
+
+        let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({
+            "public_extensions": [],
+            "custom_extensions": null,
+            "library_index": {},
+            "extension_data": {},
+        }))
+        .unwrap();
+
+        rspec
+            .get_ext("ext", false, "latest", "v17")
+            .expect_err("Extension should not be found");
+
+        let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({
+            "public_extensions": [],
+            "custom_extensions": [],
+            "library_index": {
+                "ext": "ext"
+            },
+            "extension_data": {
+                "ext": {
+                    "control_data": {
+                        "ext.control": ""
+                    },
+                    "archive_path": ""
+                }
+            },
+        }))
+        .unwrap();
+
+        rspec
+            .get_ext("ext", false, "latest", "v17")
+            .expect_err("Extension should not be found");
+
+        let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({
+            "public_extensions": [],
+            "custom_extensions": ["ext"],
+            "library_index": {
+                "ext": "ext"
+            },
+            "extension_data": {
+                "ext": {
+                    "control_data": {
+                        "ext.control": ""
+                    },
+                    "archive_path": ""
+                }
+            },
+        }))
+        .unwrap();
+
+        rspec
+            .get_ext("ext", false, "latest", "v17")
+            .expect("Extension should be found");
+
+        let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({
+            "public_extensions": ["ext"],
+            "custom_extensions": [],
+            "library_index": {
+                "extlib": "ext",
+            },
+            "extension_data": {
+                "ext": {
+                    "control_data": {
+                        "ext.control": ""
+                    },
+                    "archive_path": ""
+                }
+            },
+        }))
+        .unwrap();
+
+        rspec
+            .get_ext("ext", false, "latest", "v17")
+            .expect("Extension should be found");
+
+        // test library index for the case when library name
+        // doesn't match the extension name
+        rspec
+            .get_ext("extlib", true, "latest", "v17")
+            .expect("Library should be found");
+    }
+
    #[test]
    fn parse_spec_file() {
        let file = File::open("tests/cluster_spec.json").unwrap();
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -94,6 +94,7 @@ pub struct ConfigToml {
    pub ondemand_download_behavior_treat_error_as_warn: bool,
    #[serde(with = "humantime_serde")]
    pub background_task_maximum_delay: Duration,
+    pub use_compaction_semaphore: bool,
    pub control_plane_api: Option<reqwest::Url>,
    pub control_plane_api_token: Option<String>,
    pub control_plane_emergency_mode: bool,
@@ -323,6 +324,10 @@ pub struct TenantConfigToml {
    // Expresed in multiples of checkpoint distance.
    pub image_layer_creation_check_threshold: u8,

+    // How many multiples of L0 `compaction_threshold` will preempt image layer creation and do L0 compaction.
+    // Set to 0 to disable preemption.
+    pub image_creation_preempt_threshold: usize,
+
    /// The length for an explicit LSN lease request.
    /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval.
    #[serde(with = "humantime_serde")]
@@ -466,6 +471,7 @@ impl Default for ConfigToml {
                DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY,
            )
            .unwrap()),
+            use_compaction_semaphore: false,

            control_plane_api: (None),
            control_plane_api_token: (None),
@@ -547,6 +553,10 @@ pub mod tenant_conf_defaults {
    // Relevant: https://github.com/neondatabase/neon/issues/3394
    pub const DEFAULT_GC_PERIOD: &str = "1 hr";
    pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3;
+    // If there are more than threshold * compaction_threshold (that is 3 * 10 in the default config) L0 layers, image
+    // layer creation will end immediately. Set to 0 to disable. The target default will be 3 once we
+    // want to enable this feature.
+    pub const DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD: usize = 0;
    pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
    pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds";
    pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
@@ -605,6 +615,7 @@ impl Default for TenantConfigToml {
            lazy_slru_download: false,
            timeline_get_throttle: crate::models::ThrottleConfig::disabled(),
            image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
+            image_creation_preempt_threshold: DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD,
            lsn_lease_length: LsnLease::DEFAULT_LENGTH,
            lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS,
            timeline_offloading: false,
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -282,7 +282,7 @@ pub struct TimelineCreateRequest {
    pub mode: TimelineCreateRequestMode,
 }

-#[derive(Serialize, Deserialize, Clone, Debug)]
+#[derive(Serialize, Deserialize, Clone)]
 #[serde(untagged)]
 pub enum TimelineCreateRequestMode {
    Branch {
@@ -307,7 +307,7 @@ pub enum TimelineCreateRequestMode {
    },
 }

-#[derive(Serialize, Deserialize, Clone, Debug)]
+#[derive(Serialize, Deserialize, Clone)]
 pub struct TimelineCreateRequestModeImportPgdata {
    pub location: ImportPgdataLocation,
    pub idempotency_key: ImportPgdataIdempotencyKey,
@@ -326,7 +326,7 @@ pub enum ImportPgdataLocation {
    },
 }

-#[derive(Serialize, Deserialize, Clone, Debug)]
+#[derive(Serialize, Deserialize, Clone)]
 #[serde(transparent)]
 pub struct ImportPgdataIdempotencyKey(pub String);

@@ -498,6 +498,8 @@ pub struct TenantConfigPatch {
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub image_layer_creation_check_threshold: FieldPatch<u8>,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub image_creation_preempt_threshold: FieldPatch<usize>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub lsn_lease_length: FieldPatch<String>,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub lsn_lease_length_for_ts: FieldPatch<String>,
@@ -544,6 +546,7 @@ pub struct TenantConfig {
    pub lazy_slru_download: Option<bool>,
    pub timeline_get_throttle: Option<ThrottleConfig>,
    pub image_layer_creation_check_threshold: Option<u8>,
+    pub image_creation_preempt_threshold: Option<usize>,
    pub lsn_lease_length: Option<String>,
    pub lsn_lease_length_for_ts: Option<String>,
    pub timeline_offloading: Option<bool>,
@@ -581,6 +584,7 @@ impl TenantConfig {
            mut lazy_slru_download,
            mut timeline_get_throttle,
            mut image_layer_creation_check_threshold,
+            mut image_creation_preempt_threshold,
            mut lsn_lease_length,
            mut lsn_lease_length_for_ts,
            mut timeline_offloading,
@@ -635,6 +639,9 @@ impl TenantConfig {
        patch
            .image_layer_creation_check_threshold
            .apply(&mut image_layer_creation_check_threshold);
+        patch
+            .image_creation_preempt_threshold
+            .apply(&mut image_creation_preempt_threshold);
        patch.lsn_lease_length.apply(&mut lsn_lease_length);
        patch
            .lsn_lease_length_for_ts
@@ -679,6 +686,7 @@ impl TenantConfig {
            lazy_slru_download,
            timeline_get_throttle,
            image_layer_creation_check_threshold,
+            image_creation_preempt_threshold,
            lsn_lease_length,
            lsn_lease_length_for_ts,
            timeline_offloading,
--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -76,7 +76,15 @@ impl Conf {
        let mut cmd = Command::new(path);
        cmd.env_clear()
            .env("LD_LIBRARY_PATH", self.pg_lib_dir()?)
-            .env("DYLD_LIBRARY_PATH", self.pg_lib_dir()?);
+            .env("DYLD_LIBRARY_PATH", self.pg_lib_dir()?)
+            .env(
+                "ASAN_OPTIONS",
+                std::env::var("ASAN_OPTIONS").unwrap_or_default(),
+            )
+            .env(
+                "UBSAN_OPTIONS",
+                std::env::var("UBSAN_OPTIONS").unwrap_or_default(),
+            );
        Ok(cmd)
    }

--- a/libs/postgres_initdb/src/lib.rs
+++ b/libs/postgres_initdb/src/lib.rs
@@ -64,6 +64,14 @@ pub async fn do_run_initdb(args: RunInitdbArgs<'_>) -> Result<(), Error> {
        .env_clear()
        .env("LD_LIBRARY_PATH", library_search_path)
        .env("DYLD_LIBRARY_PATH", library_search_path)
+        .env(
+            "ASAN_OPTIONS",
+            std::env::var("ASAN_OPTIONS").unwrap_or_default(),
+        )
+        .env(
+            "UBSAN_OPTIONS",
+            std::env::var("UBSAN_OPTIONS").unwrap_or_default(),
+        )
        .stdin(std::process::Stdio::null())
        // stdout invocation produces the same output every time, we don't need it
        .stdout(std::process::Stdio::null())
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -182,6 +182,13 @@ pub struct CancelKeyData {
    pub cancel_key: i32,
 }

+pub fn id_to_cancel_key(id: u64) -> CancelKeyData {
+    CancelKeyData {
+        backend_pid: (id >> 32) as i32,
+        cancel_key: (id & 0xffffffff) as i32,
+    }
+}
+
 impl fmt::Display for CancelKeyData {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let hi = (self.backend_pid as u64) << 32;
--- a/libs/proxy/tokio-postgres2/Cargo.toml
+++ b/libs/proxy/tokio-postgres2/Cargo.toml
@@ -19,3 +19,4 @@ postgres-protocol2 = { path = "../postgres-protocol2" }
 postgres-types2 = { path = "../postgres-types2" }
 tokio = { workspace = true, features = ["io-util", "time", "net"] }
 tokio-util = { workspace = true, features = ["codec"] }
+serde = { workspace = true, features = ["derive"] }
--- a/libs/proxy/tokio-postgres2/src/cancel_token.rs
+++ b/libs/proxy/tokio-postgres2/src/cancel_token.rs
@@ -3,12 +3,13 @@ use crate::tls::TlsConnect;

 use crate::{cancel_query, client::SocketConfig, tls::MakeTlsConnect};
 use crate::{cancel_query_raw, Error};
+use serde::{Deserialize, Serialize};
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio::net::TcpStream;

 /// The capability to request cancellation of in-progress queries on a
 /// connection.
-#[derive(Clone)]
+#[derive(Clone, Serialize, Deserialize)]
 pub struct CancelToken {
    pub socket_config: Option<SocketConfig>,
    pub ssl_mode: SslMode,
--- a/libs/proxy/tokio-postgres2/src/client.rs
+++ b/libs/proxy/tokio-postgres2/src/client.rs
@@ -18,6 +18,7 @@ use fallible_iterator::FallibleIterator;
 use futures_util::{future, ready, TryStreamExt};
 use parking_lot::Mutex;
 use postgres_protocol2::message::{backend::Message, frontend};
+use serde::{Deserialize, Serialize};
 use std::collections::HashMap;
 use std::fmt;
 use std::sync::Arc;
@@ -137,7 +138,7 @@ impl InnerClient {
    }
 }

-#[derive(Clone)]
+#[derive(Clone, Serialize, Deserialize)]
 pub struct SocketConfig {
    pub host: Host,
    pub port: u16,
--- a/libs/proxy/tokio-postgres2/src/config.rs
+++ b/libs/proxy/tokio-postgres2/src/config.rs
@@ -7,6 +7,7 @@ use crate::tls::MakeTlsConnect;
 use crate::tls::TlsConnect;
 use crate::{Client, Connection, Error};
 use postgres_protocol2::message::frontend::StartupMessageParams;
+use serde::{Deserialize, Serialize};
 use std::fmt;
 use std::str;
 use std::time::Duration;
@@ -16,7 +17,7 @@ pub use postgres_protocol2::authentication::sasl::ScramKeys;
 use tokio::net::TcpStream;

 /// TLS configuration.
-#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)]
 #[non_exhaustive]
 pub enum SslMode {
    /// Do not use TLS.
@@ -50,7 +51,7 @@ pub enum ReplicationMode {
 }

 /// A host specification.
-#[derive(Debug, Clone, PartialEq, Eq)]
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
 pub enum Host {
    /// A TCP hostname.
    Tcp(String),
--- a/libs/remote_storage/src/config.rs
+++ b/libs/remote_storage/src/config.rs
@@ -7,7 +7,7 @@ use serde::{Deserialize, Serialize};

 use crate::{
    DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT,
-    DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
+    DEFAULT_REMOTE_STORAGE_LOCALFS_CONCURRENCY_LIMIT, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
 };

 /// External backup storage configuration, enough for creating a client for that storage.
@@ -45,11 +45,11 @@ impl RemoteStorageKind {

 impl RemoteStorageConfig {
    /// Helper to fetch the configured concurrency limit.
-    pub fn concurrency_limit(&self) -> Option<usize> {
+    pub fn concurrency_limit(&self) -> usize {
        match &self.storage {
-            RemoteStorageKind::LocalFs { .. } => None,
-            RemoteStorageKind::AwsS3(c) => Some(c.concurrency_limit.into()),
-            RemoteStorageKind::AzureContainer(c) => Some(c.concurrency_limit.into()),
+            RemoteStorageKind::LocalFs { .. } => DEFAULT_REMOTE_STORAGE_LOCALFS_CONCURRENCY_LIMIT,
+            RemoteStorageKind::AwsS3(c) => c.concurrency_limit.into(),
+            RemoteStorageKind::AzureContainer(c) => c.concurrency_limit.into(),
        }
    }
 }
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -65,6 +65,12 @@ pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
 /// Here, a limit of max 20k concurrent connections was noted.
 /// <https://learn.microsoft.com/en-us/answers/questions/1301863/is-there-any-limitation-to-concurrent-connections>
 pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 100;
+/// Set this limit analogously to the S3 limit.
+///
+/// The local filesystem backend doesn't enforce a concurrency limit itself, but this also bounds
+/// the upload queue concurrency. Some tests create thousands of uploads, which slows down the
+/// quadratic scheduling of the upload queue, and there is no point spawning so many Tokio tasks.
+pub const DEFAULT_REMOTE_STORAGE_LOCALFS_CONCURRENCY_LIMIT: usize = 100;
 /// No limits on the client side, which currenltly means 1000 for AWS S3.
 /// <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax>
 pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;
--- a/libs/utils/scripts/restore_from_wal.sh
+++ b/libs/utils/scripts/restore_from_wal.sh
@@ -39,7 +39,7 @@ function initdb_with_args {
            ;;
    esac

-    eval env -i LD_LIBRARY_PATH="$PG_BIN"/../lib "${cmd[*]}"
+    eval env -i LD_LIBRARY_PATH="$PG_BIN"/../lib ASAN_OPTIONS="${ASAN_OPTIONS-}" UBSAN_OPTIONS="${UBSAN_OPTIONS-}" "${cmd[*]}"
 }

 rm -fr "$DATA_DIR"
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -5,6 +5,27 @@ use metrics::{IntCounter, IntCounterVec};
 use once_cell::sync::Lazy;
 use strum_macros::{EnumString, VariantNames};

+/// Logs a critical error, similarly to `tracing::error!`. This will:
+///
+/// * Emit an ERROR log message with prefix "CRITICAL:" and a backtrace.
+/// * Trigger a pageable alert (via the metric below).
+/// * Increment libmetrics_tracing_event_count{level="critical"}, and indirectly level="error".
+/// * In debug builds, panic the process.
+///
+/// When including errors in the message, please use {err:?} to include the error cause and original
+/// backtrace.
+#[macro_export]
+macro_rules! critical {
+    ($($arg:tt)*) => {{
+        if cfg!(debug_assertions) {
+            panic!($($arg)*);
+        }
+        $crate::logging::TRACING_EVENT_COUNT_METRIC.inc_critical();
+        let backtrace = std::backtrace::Backtrace::capture();
+        tracing::error!("CRITICAL: {}\n{backtrace}", format!($($arg)*));
+    }};
+}
+
 #[derive(EnumString, strum_macros::Display, VariantNames, Eq, PartialEq, Debug, Clone, Copy)]
 #[strum(serialize_all = "snake_case")]
 pub enum LogFormat {
@@ -25,7 +46,10 @@ impl LogFormat {
    }
 }

-struct TracingEventCountMetric {
+pub struct TracingEventCountMetric {
+    /// CRITICAL is not a `tracing` log level. Instead, we increment it in the `critical!` macro,
+    /// and also emit it as a regular error. These are thus double-counted, but that seems fine.
+    critical: IntCounter,
    error: IntCounter,
    warn: IntCounter,
    info: IntCounter,
@@ -33,7 +57,7 @@ struct TracingEventCountMetric {
    trace: IntCounter,
 }

-static TRACING_EVENT_COUNT_METRIC: Lazy<TracingEventCountMetric> = Lazy::new(|| {
+pub static TRACING_EVENT_COUNT_METRIC: Lazy<TracingEventCountMetric> = Lazy::new(|| {
    let vec = metrics::register_int_counter_vec!(
        "libmetrics_tracing_event_count",
        "Number of tracing events, by level",
@@ -46,6 +70,7 @@ static TRACING_EVENT_COUNT_METRIC: Lazy<TracingEventCountMetric> = Lazy::new(||
 impl TracingEventCountMetric {
    fn new(vec: IntCounterVec) -> Self {
        Self {
+            critical: vec.with_label_values(&["critical"]),
            error: vec.with_label_values(&["error"]),
            warn: vec.with_label_values(&["warn"]),
            info: vec.with_label_values(&["info"]),
@@ -54,6 +79,11 @@ impl TracingEventCountMetric {
        }
    }

+    // Allow public access from `critical!` macro.
+    pub fn inc_critical(&self) {
+        self.critical.inc();
+    }
+
    fn inc_for_level(&self, level: tracing::Level) {
        let counter = match level {
            tracing::Level::ERROR => &self.error,
--- a/libs/vm_monitor/src/filecache.rs
+++ b/libs/vm_monitor/src/filecache.rs
@@ -177,8 +177,8 @@ impl FileCacheState {
        crate::spawn_with_cancel(
            token,
            |res| {
-                if let Err(error) = res {
-                    error!(%error, "postgres error")
+                if let Err(e) = res {
+                    error!(error = format_args!("{e:#}"), "postgres error");
                }
            },
            conn,
@@ -205,7 +205,7 @@ impl FileCacheState {
        {
            Ok(rows) => Ok(rows),
            Err(e) => {
-                error!(error = ?e, "postgres error: {e} -> retrying");
+                error!(error = format_args!("{e:#}"), "postgres error -> retrying");

                let client = FileCacheState::connect(&self.conn_str, self.token.clone())
                    .await
--- a/libs/vm_monitor/src/lib.rs
+++ b/libs/vm_monitor/src/lib.rs
@@ -191,15 +191,12 @@ async fn start_monitor(
    .await;
    let mut monitor = match monitor {
        Ok(Ok(monitor)) => monitor,
-        Ok(Err(error)) => {
-            error!(?error, "failed to create monitor");
+        Ok(Err(e)) => {
+            error!(error = format_args!("{e:#}"), "failed to create monitor");
            return;
        }
        Err(_) => {
-            error!(
-                ?timeout,
-                "creating monitor timed out (probably waiting to receive protocol range)"
-            );
+            error!(?timeout, "creating monitor timed out");
            return;
        }
    };
@@ -207,6 +204,9 @@ async fn start_monitor(

    match monitor.run().await {
        Ok(()) => info!("monitor was killed due to new connection"),
-        Err(e) => error!(error = ?e, "monitor terminated unexpectedly"),
+        Err(e) => error!(
+            error = format_args!("{e:#}"),
+            "monitor terminated unexpectedly"
+        ),
    }
 }
--- a/libs/vm_monitor/src/runner.rs
+++ b/libs/vm_monitor/src/runner.rs
@@ -370,12 +370,16 @@ impl Runner {
                }),
            InboundMsgKind::InvalidMessage { error } => {
                warn!(
-                    %error, id, "received notification of an invalid message we sent"
+                    error = format_args!("{error:#}"),
+                    id, "received notification of an invalid message we sent"
                );
                Ok(None)
            }
            InboundMsgKind::InternalError { error } => {
-                warn!(error, id, "agent experienced an internal error");
+                warn!(
+                    error = format_args!("{error:#}"),
+                    id, "agent experienced an internal error"
+                );
                Ok(None)
            }
            InboundMsgKind::HealthCheck {} => {
@@ -476,7 +480,7 @@ impl Runner {
                                        // gives the outermost cause, and the debug impl
                                        // pretty-prints the error, whereas {:#} contains all the
                                        // causes, but is compact (no newlines).
-                                        warn!(error = format!("{e:#}"), "error handling message");
+                                        warn!(error = format_args!("{e:#}"), "error handling message");
                                        OutboundMsg::new(
                                            OutboundMsgKind::InternalError {
                                                error: e.to_string(),
@@ -492,7 +496,7 @@ impl Runner {
                                    .context("failed to send message")?;
                            }
                            Err(e) => warn!(
-                                error = format!("{e}"),
+                                error = format_args!("{e:#}"),
                                msg = ?msg,
                                "received error message"
                            ),
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -36,7 +36,7 @@ itertools.workspace = true
 md5.workspace = true
 nix.workspace = true
 # hack to get the number of worker threads tokio uses
-num_cpus = { version = "1.15" }
+num_cpus.workspace = true
 num-traits.workspace = true
 once_cell.workspace = true
 pin-project-lite.workspace = true
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -140,6 +140,10 @@ pub struct PageServerConf {
    /// not terrible.
    pub background_task_maximum_delay: Duration,

+    /// If true, use a separate semaphore for compaction tasks instead of the common background task
+    /// semaphore. Defaults to false.
+    pub use_compaction_semaphore: bool,
+
    pub control_plane_api: Option<Url>,

    /// JWT token for use with the control plane API.
@@ -332,6 +336,7 @@ impl PageServerConf {
            test_remote_failures,
            ondemand_download_behavior_treat_error_as_warn,
            background_task_maximum_delay,
+            use_compaction_semaphore,
            control_plane_api,
            control_plane_api_token,
            control_plane_emergency_mode,
@@ -385,6 +390,7 @@ impl PageServerConf {
            test_remote_failures,
            ondemand_download_behavior_treat_error_as_warn,
            background_task_maximum_delay,
+            use_compaction_semaphore,
            control_plane_api,
            control_plane_emergency_mode,
            heatmap_upload_concurrency,
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -8,7 +8,6 @@ use std::time::Duration;

 use crate::controller_upcall_client::ControlPlaneGenerationsApi;
 use crate::metrics;
-use crate::tenant::remote_timeline_client::remote_layer_path;
 use crate::tenant::remote_timeline_client::remote_timeline_path;
 use crate::tenant::remote_timeline_client::LayerFileMetadata;
 use crate::virtual_file::MaybeFatalIo;
@@ -463,45 +462,18 @@ impl DeletionQueueClient {
    ///
    /// The `current_generation` is the generation of this pageserver's current attachment.  The
    /// generations in `layers` are the generations in which those layers were written.
-    pub(crate) async fn push_layers(
+    pub(crate) fn push_layers(
        &self,
        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
        current_generation: Generation,
        layers: Vec<(LayerName, LayerFileMetadata)>,
    ) -> Result<(), DeletionQueueError> {
-        if current_generation.is_none() {
-            debug!("Enqueuing deletions in legacy mode, skipping queue");
+        // None generations are not valid for attached tenants: they must always be attached in
+        // a known generation.  None generations are still permitted for layers in the index because
+        // they may be historical.
+        assert!(!current_generation.is_none());

-            let mut layer_paths = Vec::new();
-            for (layer, meta) in layers {
-                layer_paths.push(remote_layer_path(
-                    &tenant_shard_id.tenant_id,
-                    &timeline_id,
-                    meta.shard,
-                    &layer,
-                    meta.generation,
-                ));
-            }
-            self.push_immediate(layer_paths).await?;
-            return self.flush_immediate().await;
-        }
-
-        self.push_layers_sync(tenant_shard_id, timeline_id, current_generation, layers)
-    }
-
-    /// When a Tenant has a generation, push_layers is always synchronous because
-    /// the ListValidator channel is an unbounded channel.
-    ///
-    /// This can be merged into push_layers when we remove the Generation-less mode
-    /// support (`<https://github.com/neondatabase/neon/issues/5395>`)
-    pub(crate) fn push_layers_sync(
-        &self,
-        tenant_shard_id: TenantShardId,
-        timeline_id: TimelineId,
-        current_generation: Generation,
-        layers: Vec<(LayerName, LayerFileMetadata)>,
-    ) -> Result<(), DeletionQueueError> {
        metrics::DELETION_QUEUE
            .keys_submitted
            .inc_by(layers.len() as u64);
@@ -957,14 +929,12 @@ mod test {

        // File should still be there after we push it to the queue (we haven't pushed enough to flush anything)
        info!("Pushing");
-        client
-            .push_layers(
-                tenant_shard_id,
-                TIMELINE_ID,
-                now_generation,
-                [(layer_file_name_1.clone(), layer_metadata)].to_vec(),
-            )
-            .await?;
+        client.push_layers(
+            tenant_shard_id,
+            TIMELINE_ID,
+            now_generation,
+            [(layer_file_name_1.clone(), layer_metadata)].to_vec(),
+        )?;
        assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path);

        assert_local_files(&[], &deletion_prefix);
@@ -1017,14 +987,12 @@ mod test {
        assert_remote_files(&[&remote_layer_name], &remote_timeline_path);

        tracing::debug!("Pushing...");
-        client
-            .push_layers(
-                tenant_shard_id,
-                TIMELINE_ID,
-                stale_generation,
-                [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
-            )
-            .await?;
+        client.push_layers(
+            tenant_shard_id,
+            TIMELINE_ID,
+            stale_generation,
+            [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
+        )?;

        // We enqueued the operation in a stale generation: it should have failed validation
        tracing::debug!("Flushing...");
@@ -1032,14 +1000,12 @@ mod test {
        assert_remote_files(&[&remote_layer_name], &remote_timeline_path);

        tracing::debug!("Pushing...");
-        client
-            .push_layers(
-                tenant_shard_id,
-                TIMELINE_ID,
-                latest_generation,
-                [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
-            )
-            .await?;
+        client.push_layers(
+            tenant_shard_id,
+            TIMELINE_ID,
+            latest_generation,
+            [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
+        )?;

        // We enqueued the operation in a fresh generation: it should have passed validation
        tracing::debug!("Flushing...");
@@ -1074,28 +1040,24 @@ mod test {
        // generation gets that treatment)
        let remote_layer_file_name_historical =
            ctx.write_remote_layer(EXAMPLE_LAYER_NAME, layer_generation)?;
-        client
-            .push_layers(
-                tenant_shard_id,
-                TIMELINE_ID,
-                now_generation.previous(),
-                [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
-            )
-            .await?;
+        client.push_layers(
+            tenant_shard_id,
+            TIMELINE_ID,
+            now_generation.previous(),
+            [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
+        )?;

        // Inject a deletion in the generation before generation_now: after restart,
        // this deletion should get executed, because we execute deletions in the
        // immediately previous generation on the same node.
        let remote_layer_file_name_previous =
            ctx.write_remote_layer(EXAMPLE_LAYER_NAME_ALT, layer_generation)?;
-        client
-            .push_layers(
-                tenant_shard_id,
-                TIMELINE_ID,
-                now_generation,
-                [(EXAMPLE_LAYER_NAME_ALT.clone(), layer_metadata.clone())].to_vec(),
-            )
-            .await?;
+        client.push_layers(
+            tenant_shard_id,
+            TIMELINE_ID,
+            now_generation,
+            [(EXAMPLE_LAYER_NAME_ALT.clone(), layer_metadata.clone())].to_vec(),
+        )?;

        client.flush().await?;
        assert_remote_files(
@@ -1139,6 +1101,7 @@ pub(crate) mod mock {
    use tracing::info;

    use super::*;
+    use crate::tenant::remote_timeline_client::remote_layer_path;
    use std::sync::atomic::{AtomicUsize, Ordering};

    pub struct ConsumerState {
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -61,6 +61,7 @@ use crate::{
        remote_timeline_client::LayerFileMetadata,
        secondary::SecondaryTenant,
        storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName, LayerVisibilityHint},
+        tasks::sleep_random,
    },
    CancellableTask, DiskUsageEvictionTask,
 };
@@ -210,14 +211,8 @@ async fn disk_usage_eviction_task(
        info!("disk usage based eviction task finishing");
    };

-    use crate::tenant::tasks::random_init_delay;
-    {
-        if random_init_delay(task_config.period, &cancel)
-            .await
-            .is_err()
-        {
-            return;
-        }
+    if sleep_random(task_config.period, &cancel).await.is_err() {
+        return;
    }

    let mut iteration_no = 0;
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1472,7 +1472,13 @@ async fn layer_download_handler(
    let downloaded = timeline
        .download_layer(&layer_name)
        .await
-        .map_err(ApiError::InternalServerError)?;
+        .map_err(|e| match e {
+            tenant::storage_layer::layer::DownloadError::TimelineShutdown
+            | tenant::storage_layer::layer::DownloadError::DownloadCancelled => {
+                ApiError::ShuttingDown
+            }
+            other => ApiError::InternalServerError(other.into()),
+        })?;

    match downloaded {
        Some(true) => json_response(StatusCode::OK, ()),
@@ -3169,12 +3175,16 @@ async fn put_tenant_timeline_import_basebackup(

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);

-    let span = info_span!("import_basebackup", tenant_id=%tenant_id, timeline_id=%timeline_id, base_lsn=%base_lsn, end_lsn=%end_lsn, pg_version=%pg_version);
+    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+
+    let span = info_span!("import_basebackup",
+        tenant_id=%tenant_id, timeline_id=%timeline_id, shard_id=%tenant_shard_id.shard_slug(),
+        base_lsn=%base_lsn, end_lsn=%end_lsn, pg_version=%pg_version);
    async move {
        let state = get_state(&request);
        let tenant = state
            .tenant_manager
-            .get_attached_tenant_shard(TenantShardId::unsharded(tenant_id))?;
+            .get_attached_tenant_shard(tenant_shard_id)?;

        let broker_client = state.broker_client.clone();

@@ -3383,7 +3393,17 @@ where
                            let status = response.status();
                            info!(%status, "Cancelled request finished successfully")
                        }
-                        Err(e) => error!("Cancelled request finished with an error: {e:?}"),
+                        Err(e) => match e {
+                            ApiError::ShuttingDown | ApiError::ResourceUnavailable(_) => {
+                                // Don't log this at error severity: they are normal during lifecycle of tenants/process
+                                info!("Cancelled request aborted for shutdown")
+                            }
+                            _ => {
+                                // Log these in a highly visible way, because we have no client to send the response to, but
+                                // would like to know that something went wrong.
+                                error!("Cancelled request finished with an error: {e:?}")
+                            }
+                        },
                    }
                }
                // only logging for cancelled panicked request handlers is the tracing_panic_hook,
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -263,14 +263,6 @@ pub(crate) const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json";
 /// data directory at pageserver startup can be automatically removed.
 pub(crate) const TEMP_FILE_SUFFIX: &str = "___temp";

-/// A marker file to mark that a timeline directory was not fully initialized.
-/// If a timeline directory with this marker is encountered at pageserver startup,
-/// the timeline directory and the marker file are both removed.
-/// Full path: `tenants/<tenant_id>/timelines/<timeline_id>___uninit`.
-pub(crate) const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";
-
-pub(crate) const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete";
-
 pub fn is_temporary(path: &Utf8Path) -> bool {
    match path.file_name() {
        Some(name) => name.ends_with(TEMP_FILE_SUFFIX),
@@ -278,25 +270,6 @@ pub fn is_temporary(path: &Utf8Path) -> bool {
    }
 }

-fn ends_with_suffix(path: &Utf8Path, suffix: &str) -> bool {
-    match path.file_name() {
-        Some(name) => name.ends_with(suffix),
-        None => false,
-    }
-}
-
-// FIXME: DO NOT ADD new query methods like this, which will have a next step of parsing timelineid
-// from the directory name. Instead create type "UninitMark(TimelineId)" and only parse it once
-// from the name.
-
-pub(crate) fn is_uninit_mark(path: &Utf8Path) -> bool {
-    ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX)
-}
-
-pub(crate) fn is_delete_mark(path: &Utf8Path) -> bool {
-    ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX)
-}
-
 /// During pageserver startup, we need to order operations not to exhaust tokio worker threads by
 /// blocking.
 ///
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1,4 +1,13 @@
-use enum_map::EnumMap;
+use std::collections::HashMap;
+use std::num::NonZeroUsize;
+use std::pin::Pin;
+use std::sync::atomic::AtomicU64;
+use std::sync::{Arc, Mutex};
+use std::task::{Context, Poll};
+use std::time::{Duration, Instant};
+
+use enum_map::{Enum as _, EnumMap};
+use futures::Future;
 use metrics::{
    register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
    register_int_counter, register_int_counter_pair_vec, register_int_counter_vec,
@@ -11,13 +20,27 @@ use pageserver_api::config::{
    PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
    PageServiceProtocolPipelinedExecutionStrategy,
 };
+use pageserver_api::models::InMemoryLayerInfo;
 use pageserver_api::shard::TenantShardId;
+use pin_project_lite::pin_project;
 use postgres_backend::{is_expected_io_error, QueryError};
 use pq_proto::framed::ConnectionError;
-use strum::{EnumCount, VariantNames};
+
+use strum::{EnumCount, IntoEnumIterator as _, VariantNames};
 use strum_macros::{IntoStaticStr, VariantNames};
 use utils::id::TimelineId;

+use crate::config::PageServerConf;
+use crate::context::{PageContentKind, RequestContext};
+use crate::pgdatadir_mapping::DatadirModificationStats;
+use crate::task_mgr::TaskKind;
+use crate::tenant::layer_map::LayerMap;
+use crate::tenant::mgr::TenantSlot;
+use crate::tenant::storage_layer::{InMemoryLayer, PersistentLayerDesc};
+use crate::tenant::tasks::BackgroundLoopKind;
+use crate::tenant::throttle::ThrottleResult;
+use crate::tenant::Timeline;
+
 /// Prometheus histogram buckets (in seconds) for operations in the critical
 /// path. In other words, operations that directly affect that latency of user
 /// queries.
@@ -81,7 +104,7 @@ pub(crate) static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::n
    .expect("failed to define a metric")
 });

-// Buckets for background operations like compaction, GC, size calculation
+// Buckets for background operation duration in seconds, like compaction, GC, size calculation.
 const STORAGE_OP_BUCKETS: &[f64] = &[0.010, 0.100, 1.0, 10.0, 100.0, 1000.0];

 pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
@@ -94,11 +117,38 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy<Histogram> = Lazy::new(|| {
+/// Measures layers visited per read (i.e. read amplification).
+///
+/// NB: for a batch, we count all visited layers towards each read. While the cost of layer visits
+/// are amortized across the batch, and some layers may not intersect with a given key, each visited
+/// layer contributes directly to the observed latency for every read in the batch, which is what we
+/// care about.
+pub(crate) static LAYERS_PER_READ: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
+        "pageserver_layers_per_read",
+        "Layers visited to serve a single read (read amplification). In a batch, all visited layers count towards every read.",
+        &["tenant_id", "shard_id", "timeline_id"],
+        // Low resolution to reduce cardinality.
+        vec![1.0, 5.0, 10.0, 25.0, 50.0, 100.0],
+    )
+    .expect("failed to define a metric")
+});
+
+pub(crate) static LAYERS_PER_READ_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
-        "pageserver_layers_visited_per_vectored_read_global",
-        "Average number of layers visited to reconstruct one key",
-        vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
+        "pageserver_layers_per_read_global",
+        "Layers visited to serve a single read (read amplification). In a batch, all visited layers count towards every read.",
+        vec![1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
+    )
+    .expect("failed to define a metric")
+});
+
+pub(crate) static DELTAS_PER_READ_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
+    // We expect this to be low because of Postgres checkpoints. Let's see if that holds.
+    register_histogram!(
+        "pageserver_deltas_per_read_global",
+        "Number of delta pages applied to image page per read",
+        vec![0.0, 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0],
    )
    .expect("failed to define a metric")
 });
@@ -186,7 +236,7 @@ pub(crate) static GET_VECTORED_LATENCY: Lazy<GetVectoredLatency> = Lazy::new(||

    GetVectoredLatency {
        map: EnumMap::from_array(std::array::from_fn(|task_kind_idx| {
-            let task_kind = <TaskKind as enum_map::Enum>::from_usize(task_kind_idx);
+            let task_kind = TaskKind::from_usize(task_kind_idx);

            if GetVectoredLatency::TRACKED_TASK_KINDS.contains(&task_kind) {
                let task_kind = task_kind.into();
@@ -209,7 +259,7 @@ pub(crate) static SCAN_LATENCY: Lazy<ScanLatency> = Lazy::new(|| {

    ScanLatency {
        map: EnumMap::from_array(std::array::from_fn(|task_kind_idx| {
-            let task_kind = <TaskKind as enum_map::Enum>::from_usize(task_kind_idx);
+            let task_kind = TaskKind::from_usize(task_kind_idx);

            if ScanLatency::TRACKED_TASK_KINDS.contains(&task_kind) {
                let task_kind = task_kind.into();
@@ -250,10 +300,10 @@ static PAGE_CACHE_READ_ACCESSES: Lazy<IntCounterVec> = Lazy::new(|| {

 pub(crate) static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMetrics {
    map: EnumMap::from_array(std::array::from_fn(|task_kind| {
-        let task_kind = <TaskKind as enum_map::Enum>::from_usize(task_kind);
+        let task_kind = TaskKind::from_usize(task_kind);
        let task_kind: &'static str = task_kind.into();
        EnumMap::from_array(std::array::from_fn(|content_kind| {
-            let content_kind = <PageContentKind as enum_map::Enum>::from_usize(content_kind);
+            let content_kind = PageContentKind::from_usize(content_kind);
            let content_kind: &'static str = content_kind.into();
            PageCacheMetricsForTaskKind {
                read_accesses_immutable: {
@@ -443,18 +493,38 @@ static PITR_HISTORY_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-#[derive(strum_macros::EnumString, strum_macros::Display, strum_macros::IntoStaticStr)]
+#[derive(
+    strum_macros::EnumIter,
+    strum_macros::EnumString,
+    strum_macros::Display,
+    strum_macros::IntoStaticStr,
+)]
 #[strum(serialize_all = "kebab_case")]
-pub(crate) enum MetricLayerKind {
+pub(crate) enum LayerKind {
    Delta,
    Image,
 }

+#[derive(
+    strum_macros::EnumIter,
+    strum_macros::EnumString,
+    strum_macros::Display,
+    strum_macros::IntoStaticStr,
+)]
+#[strum(serialize_all = "kebab_case")]
+pub(crate) enum LayerLevel {
+    // We don't track the currently open ephemeral layer, since there's always exactly 1 and its
+    // size changes. See `TIMELINE_EPHEMERAL_BYTES`.
+    Frozen,
+    L0,
+    L1,
+}
+
 static TIMELINE_LAYER_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_layer_bytes",
-        "Sum of layer physical sizes in bytes",
-        &["tenant_id", "shard_id", "timeline_id", "kind"]
+        "Sum of frozen, L0, and L1 layer physical sizes in bytes (excluding the open ephemeral layer)",
+        &["tenant_id", "shard_id", "timeline_id", "level", "kind"]
    )
    .expect("failed to define a metric")
 });
@@ -462,8 +532,8 @@ static TIMELINE_LAYER_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
 static TIMELINE_LAYER_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_layer_count",
-        "Number of layers that exist",
-        &["tenant_id", "shard_id", "timeline_id", "kind"]
+        "Number of frozen, L0, and L1 layers (excluding the open ephemeral layer)",
+        &["tenant_id", "shard_id", "timeline_id", "level", "kind"]
    )
    .expect("failed to define a metric")
 });
@@ -1843,7 +1913,7 @@ pub(crate) static COMPUTE_COMMANDS_COUNTERS: Lazy<ComputeCommandCounters> = Lazy

    ComputeCommandCounters {
        map: EnumMap::from_array(std::array::from_fn(|i| {
-            let command = <ComputeCommandKind as enum_map::Enum>::from_usize(i);
+            let command = ComputeCommandKind::from_usize(i);
            let command_str: &'static str = command.into();
            inner.with_label_values(&[command_str])
        })),
@@ -2143,11 +2213,13 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {

 pub struct BackgroundLoopSemaphoreMetrics {
    counters: EnumMap<BackgroundLoopKind, IntCounterPair>,
-    durations: EnumMap<BackgroundLoopKind, Counter>,
+    durations: EnumMap<BackgroundLoopKind, Histogram>,
+    waiting_tasks: EnumMap<BackgroundLoopKind, IntGauge>,
+    running_tasks: EnumMap<BackgroundLoopKind, IntGauge>,
 }

-pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy<BackgroundLoopSemaphoreMetrics> = Lazy::new(
-    || {
+pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy<BackgroundLoopSemaphoreMetrics> =
+    Lazy::new(|| {
        let counters = register_int_counter_pair_vec!(
            "pageserver_background_loop_semaphore_wait_start_count",
            "Counter for background loop concurrency-limiting semaphore acquire calls started",
@@ -2157,45 +2229,101 @@ pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy<BackgroundLoopSemaphoreMetrics
        )
        .unwrap();

-        let durations = register_counter_vec!(
-            "pageserver_background_loop_semaphore_wait_duration_seconds",
-            "Sum of wall clock time spent waiting on the background loop concurrency-limiting semaphore acquire calls",
+        let durations = register_histogram_vec!(
+            "pageserver_background_loop_semaphore_wait_seconds",
+            "Seconds spent waiting on background loop semaphore acquisition",
+            &["task"],
+            vec![0.01, 1.0, 5.0, 10.0, 30.0, 60.0, 180.0, 300.0, 600.0],
+        )
+        .unwrap();
+
+        let waiting_tasks = register_int_gauge_vec!(
+            "pageserver_background_loop_semaphore_waiting_tasks",
+            "Number of background loop tasks waiting for semaphore",
+            &["task"],
+        )
+        .unwrap();
+
+        let running_tasks = register_int_gauge_vec!(
+            "pageserver_background_loop_semaphore_running_tasks",
+            "Number of background loop tasks running concurrently",
            &["task"],
        )
        .unwrap();

        BackgroundLoopSemaphoreMetrics {
-            counters: enum_map::EnumMap::from_array(std::array::from_fn(|i| {
-                let kind = <BackgroundLoopKind as enum_map::Enum>::from_usize(i);
+            counters: EnumMap::from_array(std::array::from_fn(|i| {
+                let kind = BackgroundLoopKind::from_usize(i);
                counters.with_label_values(&[kind.into()])
            })),
-            durations: enum_map::EnumMap::from_array(std::array::from_fn(|i| {
-                let kind = <BackgroundLoopKind as enum_map::Enum>::from_usize(i);
+            durations: EnumMap::from_array(std::array::from_fn(|i| {
+                let kind = BackgroundLoopKind::from_usize(i);
                durations.with_label_values(&[kind.into()])
            })),
+            waiting_tasks: EnumMap::from_array(std::array::from_fn(|i| {
+                let kind = BackgroundLoopKind::from_usize(i);
+                waiting_tasks.with_label_values(&[kind.into()])
+            })),
+            running_tasks: EnumMap::from_array(std::array::from_fn(|i| {
+                let kind = BackgroundLoopKind::from_usize(i);
+                running_tasks.with_label_values(&[kind.into()])
+            })),
        }
-    },
-);
+    });

 impl BackgroundLoopSemaphoreMetrics {
-    pub(crate) fn measure_acquisition(&self, task: BackgroundLoopKind) -> impl Drop + '_ {
-        struct Record<'a> {
-            metrics: &'a BackgroundLoopSemaphoreMetrics,
-            task: BackgroundLoopKind,
-            _counter_guard: metrics::IntCounterPairGuard,
-            start: Instant,
-        }
-        impl Drop for Record<'_> {
-            fn drop(&mut self) {
-                let elapsed = self.start.elapsed().as_secs_f64();
-                self.metrics.durations[self.task].inc_by(elapsed);
-            }
-        }
-        Record {
-            metrics: self,
+    /// Starts recording semaphore metrics. Call `acquired()` on the returned recorder when the
+    /// semaphore is acquired, and drop it when the task completes or is cancelled.
+    pub(crate) fn record(
+        &self,
+        task: BackgroundLoopKind,
+    ) -> BackgroundLoopSemaphoreMetricsRecorder {
+        BackgroundLoopSemaphoreMetricsRecorder::start(self, task)
+    }
+}
+
+/// Records metrics for a background task.
+pub struct BackgroundLoopSemaphoreMetricsRecorder<'a> {
+    metrics: &'a BackgroundLoopSemaphoreMetrics,
+    task: BackgroundLoopKind,
+    start: Instant,
+    wait_counter_guard: Option<metrics::IntCounterPairGuard>,
+}
+
+impl<'a> BackgroundLoopSemaphoreMetricsRecorder<'a> {
+    /// Starts recording semaphore metrics, by recording wait time and incrementing
+    /// `wait_start_count` and `waiting_tasks`.
+    fn start(metrics: &'a BackgroundLoopSemaphoreMetrics, task: BackgroundLoopKind) -> Self {
+        metrics.waiting_tasks[task].inc();
+        Self {
+            metrics,
            task,
-            _counter_guard: self.counters[task].guard(),
            start: Instant::now(),
+            wait_counter_guard: Some(metrics.counters[task].guard()),
+        }
+    }
+
+    /// Signals that the semaphore has been acquired, and updates relevant metrics.
+    pub fn acquired(&mut self) -> Duration {
+        let waited = self.start.elapsed();
+        self.wait_counter_guard.take().expect("already acquired");
+        self.metrics.durations[self.task].observe(waited.as_secs_f64());
+        self.metrics.waiting_tasks[self.task].dec();
+        self.metrics.running_tasks[self.task].inc();
+        waited
+    }
+}
+
+impl Drop for BackgroundLoopSemaphoreMetricsRecorder<'_> {
+    /// The task either completed or was cancelled.
+    fn drop(&mut self) {
+        if self.wait_counter_guard.take().is_some() {
+            // Waiting.
+            self.metrics.durations[self.task].observe(self.start.elapsed().as_secs_f64());
+            self.metrics.waiting_tasks[self.task].dec();
+        } else {
+            // Running.
+            self.metrics.running_tasks[self.task].dec();
        }
    }
 }
@@ -2309,11 +2437,40 @@ pub(crate) struct WalIngestMetrics {
    pub(crate) records_observed: IntCounter,
    pub(crate) records_committed: IntCounter,
    pub(crate) records_filtered: IntCounter,
+    pub(crate) values_committed_metadata_images: IntCounter,
+    pub(crate) values_committed_metadata_deltas: IntCounter,
+    pub(crate) values_committed_data_images: IntCounter,
+    pub(crate) values_committed_data_deltas: IntCounter,
    pub(crate) gap_blocks_zeroed_on_rel_extend: IntCounter,
-    pub(crate) clear_vm_bits_unknown: IntCounterVec,
+}
+
+impl WalIngestMetrics {
+    pub(crate) fn inc_values_committed(&self, stats: &DatadirModificationStats) {
+        if stats.metadata_images > 0 {
+            self.values_committed_metadata_images
+                .inc_by(stats.metadata_images);
+        }
+        if stats.metadata_deltas > 0 {
+            self.values_committed_metadata_deltas
+                .inc_by(stats.metadata_deltas);
+        }
+        if stats.data_images > 0 {
+            self.values_committed_data_images.inc_by(stats.data_images);
+        }
+        if stats.data_deltas > 0 {
+            self.values_committed_data_deltas.inc_by(stats.data_deltas);
+        }
+    }
 }

 pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| {
+    let values_committed = register_int_counter_vec!(
+        "pageserver_wal_ingest_values_committed",
+        "Number of values committed to pageserver storage from WAL records",
+        &["class", "kind"],
+    )
+    .expect("failed to define a metric");
+
    WalIngestMetrics {
    bytes_received: register_int_counter!(
        "pageserver_wal_ingest_bytes_received",
@@ -2340,17 +2497,15 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| {
        "Number of WAL records filtered out due to sharding"
    )
    .expect("failed to define a metric"),
+    values_committed_metadata_images: values_committed.with_label_values(&["metadata", "image"]),
+    values_committed_metadata_deltas: values_committed.with_label_values(&["metadata", "delta"]),
+    values_committed_data_images: values_committed.with_label_values(&["data", "image"]),
+    values_committed_data_deltas: values_committed.with_label_values(&["data", "delta"]),
    gap_blocks_zeroed_on_rel_extend: register_int_counter!(
        "pageserver_gap_blocks_zeroed_on_rel_extend",
        "Total number of zero gap blocks written on relation extends"
    )
    .expect("failed to define a metric"),
-    clear_vm_bits_unknown: register_int_counter_vec!(
-        "pageserver_wal_ingest_clear_vm_bits_unknown",
-        "Number of ignored ClearVmBits operations due to unknown pages/relations",
-        &["entity"],
-    )
-    .expect("failed to define a metric"),
 }
 });

@@ -2417,7 +2572,7 @@ pub(crate) static WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM: Lazy<Histogram> =

 pub(crate) struct WalRedoProcessCounters {
    pub(crate) started: IntCounter,
-    pub(crate) killed_by_cause: enum_map::EnumMap<WalRedoKillCause, IntCounter>,
+    pub(crate) killed_by_cause: EnumMap<WalRedoKillCause, IntCounter>,
    pub(crate) active_stderr_logger_tasks_started: IntCounter,
    pub(crate) active_stderr_logger_tasks_finished: IntCounter,
 }
@@ -2459,7 +2614,7 @@ impl Default for WalRedoProcessCounters {
        Self {
            started,
            killed_by_cause: EnumMap::from_array(std::array::from_fn(|i| {
-                let cause = <WalRedoKillCause as enum_map::Enum>::from_usize(i);
+                let cause = WalRedoKillCause::from_usize(i);
                let cause_str: &'static str = cause.into();
                killed.with_label_values(&[cause_str])
            })),
@@ -2590,10 +2745,7 @@ pub(crate) struct TimelineMetrics {
    pub disk_consistent_lsn_gauge: IntGauge,
    pub pitr_history_size: UIntGauge,
    pub archival_size: UIntGauge,
-    pub(crate) layer_size_image: UIntGauge,
-    pub(crate) layer_count_image: UIntGauge,
-    pub(crate) layer_size_delta: UIntGauge,
-    pub(crate) layer_count_delta: UIntGauge,
+    pub layers_per_read: Histogram,
    pub standby_horizon_gauge: IntGauge,
    pub resident_physical_size_gauge: UIntGauge,
    pub visible_physical_size_gauge: UIntGauge,
@@ -2691,40 +2843,8 @@ impl TimelineMetrics {
            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
            .unwrap();

-        let layer_size_image = TIMELINE_LAYER_SIZE
-            .get_metric_with_label_values(&[
-                &tenant_id,
-                &shard_id,
-                &timeline_id,
-                MetricLayerKind::Image.into(),
-            ])
-            .unwrap();
-
-        let layer_count_image = TIMELINE_LAYER_COUNT
-            .get_metric_with_label_values(&[
-                &tenant_id,
-                &shard_id,
-                &timeline_id,
-                MetricLayerKind::Image.into(),
-            ])
-            .unwrap();
-
-        let layer_size_delta = TIMELINE_LAYER_SIZE
-            .get_metric_with_label_values(&[
-                &tenant_id,
-                &shard_id,
-                &timeline_id,
-                MetricLayerKind::Delta.into(),
-            ])
-            .unwrap();
-
-        let layer_count_delta = TIMELINE_LAYER_COUNT
-            .get_metric_with_label_values(&[
-                &tenant_id,
-                &shard_id,
-                &timeline_id,
-                MetricLayerKind::Delta.into(),
-            ])
+        let layers_per_read = LAYERS_PER_READ
+            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
            .unwrap();

        let standby_horizon_gauge = STANDBY_HORIZON
@@ -2791,10 +2911,7 @@ impl TimelineMetrics {
            disk_consistent_lsn_gauge,
            pitr_history_size,
            archival_size,
-            layer_size_image,
-            layer_count_image,
-            layer_size_delta,
-            layer_count_delta,
+            layers_per_read,
            standby_horizon_gauge,
            resident_physical_size_gauge,
            visible_physical_size_gauge,
@@ -2837,6 +2954,92 @@ impl TimelineMetrics {
            .add(duration);
    }

+    /// Generates TIMELINE_LAYER labels for a persistent layer.
+    fn make_layer_labels(&self, layer_desc: &PersistentLayerDesc) -> [&str; 5] {
+        let level = match LayerMap::is_l0(&layer_desc.key_range, layer_desc.is_delta()) {
+            true => LayerLevel::L0,
+            false => LayerLevel::L1,
+        };
+        let kind = match layer_desc.is_delta() {
+            true => LayerKind::Delta,
+            false => LayerKind::Image,
+        };
+        [
+            &self.tenant_id,
+            &self.shard_id,
+            &self.timeline_id,
+            level.into(),
+            kind.into(),
+        ]
+    }
+
+    /// Generates TIMELINE_LAYER labels for a frozen ephemeral layer.
+    fn make_frozen_layer_labels(&self, _layer: &InMemoryLayer) -> [&str; 5] {
+        [
+            &self.tenant_id,
+            &self.shard_id,
+            &self.timeline_id,
+            LayerLevel::Frozen.into(),
+            LayerKind::Delta.into(), // by definition
+        ]
+    }
+
+    /// Removes a frozen ephemeral layer to TIMELINE_LAYER metrics.
+    pub fn dec_frozen_layer(&self, layer: &InMemoryLayer) {
+        assert!(matches!(layer.info(), InMemoryLayerInfo::Frozen { .. }));
+        let labels = self.make_frozen_layer_labels(layer);
+        let size = layer.try_len().expect("frozen layer should have no writer");
+        TIMELINE_LAYER_COUNT
+            .get_metric_with_label_values(&labels)
+            .unwrap()
+            .dec();
+        TIMELINE_LAYER_SIZE
+            .get_metric_with_label_values(&labels)
+            .unwrap()
+            .sub(size);
+    }
+
+    /// Adds a frozen ephemeral layer to TIMELINE_LAYER metrics.
+    pub fn inc_frozen_layer(&self, layer: &InMemoryLayer) {
+        assert!(matches!(layer.info(), InMemoryLayerInfo::Frozen { .. }));
+        let labels = self.make_frozen_layer_labels(layer);
+        let size = layer.try_len().expect("frozen layer should have no writer");
+        TIMELINE_LAYER_COUNT
+            .get_metric_with_label_values(&labels)
+            .unwrap()
+            .inc();
+        TIMELINE_LAYER_SIZE
+            .get_metric_with_label_values(&labels)
+            .unwrap()
+            .add(size);
+    }
+
+    /// Removes a persistent layer from TIMELINE_LAYER metrics.
+    pub fn dec_layer(&self, layer_desc: &PersistentLayerDesc) {
+        let labels = self.make_layer_labels(layer_desc);
+        TIMELINE_LAYER_COUNT
+            .get_metric_with_label_values(&labels)
+            .unwrap()
+            .dec();
+        TIMELINE_LAYER_SIZE
+            .get_metric_with_label_values(&labels)
+            .unwrap()
+            .sub(layer_desc.file_size);
+    }
+
+    /// Adds a persistent layer to TIMELINE_LAYER metrics.
+    pub fn inc_layer(&self, layer_desc: &PersistentLayerDesc) {
+        let labels = self.make_layer_labels(layer_desc);
+        TIMELINE_LAYER_COUNT
+            .get_metric_with_label_values(&labels)
+            .unwrap()
+            .inc();
+        TIMELINE_LAYER_SIZE
+            .get_metric_with_label_values(&labels)
+            .unwrap()
+            .add(layer_desc.file_size);
+    }
+
    pub(crate) fn shutdown(&self) {
        let was_shutdown = self
            .shutdown
@@ -2869,30 +3072,16 @@ impl TimelineMetrics {
        let _ = TIMELINE_ARCHIVE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        let _ = PITR_HISTORY_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);

-        let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[
-            tenant_id,
-            shard_id,
-            timeline_id,
-            MetricLayerKind::Image.into(),
-        ]);
-        let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[
-            tenant_id,
-            shard_id,
-            timeline_id,
-            MetricLayerKind::Image.into(),
-        ]);
-        let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[
-            tenant_id,
-            shard_id,
-            timeline_id,
-            MetricLayerKind::Delta.into(),
-        ]);
-        let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[
-            tenant_id,
-            shard_id,
-            timeline_id,
-            MetricLayerKind::Delta.into(),
-        ]);
+        for ref level in LayerLevel::iter() {
+            for ref kind in LayerKind::iter() {
+                let labels: [&str; 5] =
+                    [tenant_id, shard_id, timeline_id, level.into(), kind.into()];
+                let _ = TIMELINE_LAYER_SIZE.remove_label_values(&labels);
+                let _ = TIMELINE_LAYER_COUNT.remove_label_values(&labels);
+            }
+        }
+
+        let _ = LAYERS_PER_READ.remove_label_values(&[tenant_id, shard_id, timeline_id]);

        let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
@@ -2974,24 +3163,6 @@ pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
    // we leave the BROKEN_TENANTS_SET entry if any
 }

-use futures::Future;
-use pin_project_lite::pin_project;
-use std::collections::HashMap;
-use std::num::NonZeroUsize;
-use std::pin::Pin;
-use std::sync::atomic::AtomicU64;
-use std::sync::{Arc, Mutex};
-use std::task::{Context, Poll};
-use std::time::{Duration, Instant};
-
-use crate::config::PageServerConf;
-use crate::context::{PageContentKind, RequestContext};
-use crate::task_mgr::TaskKind;
-use crate::tenant::mgr::TenantSlot;
-use crate::tenant::tasks::BackgroundLoopKind;
-use crate::tenant::throttle::ThrottleResult;
-use crate::tenant::Timeline;
-
 /// Maintain a per timeline gauge in addition to the global gauge.
 pub(crate) struct PerTimelineRemotePhysicalSizeGauge {
    last_set: AtomicU64,
@@ -3862,7 +4033,8 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) {

    // histograms
    [
-        &VEC_READ_NUM_LAYERS_VISITED,
+        &LAYERS_PER_READ_GLOBAL,
+        &DELTAS_PER_READ_GLOBAL,
        &WAIT_LSN_TIME,
        &WAL_REDO_TIME,
        &WAL_REDO_RECORDS_HISTOGRAM,
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -489,7 +489,6 @@ impl timeline::handle::TenantManager<TenantManagerTypes> for TenantManagerWrappe
        let timeline = tenant_shard
            .get_timeline(timeline_id, true)
            .map_err(GetActiveTimelineError::Timeline)?;
-        set_tracing_field_shard_id(&timeline);
        Ok(timeline)
    }
 }
@@ -774,11 +773,11 @@ impl PageServerHandler {

        let batched_msg = match neon_fe_msg {
            PagestreamFeMessage::Exists(req) => {
-                let span = tracing::info_span!(parent: parent_span, "handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn);
                let shard = timeline_handles
                    .get(tenant_id, timeline_id, ShardSelector::Zero)
-                    .instrument(span.clone()) // sets `shard_id` field
                    .await?;
+                debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
+                let span = tracing::info_span!(parent: &parent_span, "handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug());
                let timer = record_op_start_and_throttle(
                    &shard,
                    metrics::SmgrQueryType::GetRelExists,
@@ -793,11 +792,10 @@ impl PageServerHandler {
                }
            }
            PagestreamFeMessage::Nblocks(req) => {
-                let span = tracing::info_span!(parent: parent_span, "handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn);
                let shard = timeline_handles
                    .get(tenant_id, timeline_id, ShardSelector::Zero)
-                    .instrument(span.clone()) // sets `shard_id` field
                    .await?;
+                let span = tracing::info_span!(parent: &parent_span, "handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug());
                let timer = record_op_start_and_throttle(
                    &shard,
                    metrics::SmgrQueryType::GetRelSize,
@@ -812,11 +810,10 @@ impl PageServerHandler {
                }
            }
            PagestreamFeMessage::DbSize(req) => {
-                let span = tracing::info_span!(parent: parent_span, "handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.hdr.request_lsn);
                let shard = timeline_handles
                    .get(tenant_id, timeline_id, ShardSelector::Zero)
-                    .instrument(span.clone()) // sets `shard_id` field
                    .await?;
+                let span = tracing::info_span!(parent: &parent_span, "handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug());
                let timer = record_op_start_and_throttle(
                    &shard,
                    metrics::SmgrQueryType::GetDbSize,
@@ -831,11 +828,10 @@ impl PageServerHandler {
                }
            }
            PagestreamFeMessage::GetSlruSegment(req) => {
-                let span = tracing::info_span!(parent: parent_span, "handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.hdr.request_lsn);
                let shard = timeline_handles
                    .get(tenant_id, timeline_id, ShardSelector::Zero)
-                    .instrument(span.clone()) // sets `shard_id` field
                    .await?;
+                let span = tracing::info_span!(parent: &parent_span, "handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug());
                let timer = record_op_start_and_throttle(
                    &shard,
                    metrics::SmgrQueryType::GetSlruSegment,
@@ -850,12 +846,20 @@ impl PageServerHandler {
                }
            }
            PagestreamFeMessage::GetPage(req) => {
-                let span = tracing::info_span!(parent: parent_span, "handle_get_page_at_lsn_request_batched", req_lsn = %req.hdr.request_lsn);
+                // avoid a somewhat costly Span::record() by constructing the entire span in one go.
+                macro_rules! mkspan {
+                    (before shard routing) => {{
+                        tracing::info_span!(parent: &parent_span, "handle_get_page_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.hdr.request_lsn)
+                    }};
+                    ($shard_id:expr) => {{
+                        tracing::info_span!(parent: &parent_span, "handle_get_page_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.hdr.request_lsn, shard_id = %$shard_id)
+                    }};
+                }

                macro_rules! respond_error {
-                    ($error:expr) => {{
+                    ($span:expr, $error:expr) => {{
                        let error = BatchedFeMessage::RespondError {
-                            span,
+                            span: $span,
                            error: BatchedPageStreamError {
                                req: req.hdr,
                                err: $error,
@@ -868,27 +872,35 @@ impl PageServerHandler {
                let key = rel_block_to_key(req.rel, req.blkno);
                let shard = match timeline_handles
                    .get(tenant_id, timeline_id, ShardSelector::Page(key))
-                    .instrument(span.clone()) // sets `shard_id` field
                    .await
                {
                    Ok(tl) => tl,
-                    Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => {
-                        // We already know this tenant exists in general, because we resolved it at
-                        // start of connection.  Getting a NotFound here indicates that the shard containing
-                        // the requested page is not present on this node: the client's knowledge of shard->pageserver
-                        // mapping is out of date.
-                        //
-                        // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via
-                        // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration
-                        // and talk to a different pageserver.
-                        return respond_error!(PageStreamError::Reconnect(
-                            "getpage@lsn request routed to wrong shard".into()
-                        ));
-                    }
                    Err(e) => {
-                        return respond_error!(e.into());
+                        let span = mkspan!(before shard routing);
+                        match e {
+                            GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_)) => {
+                                // We already know this tenant exists in general, because we resolved it at
+                                // start of connection.  Getting a NotFound here indicates that the shard containing
+                                // the requested page is not present on this node: the client's knowledge of shard->pageserver
+                                // mapping is out of date.
+                                //
+                                // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via
+                                // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration
+                                // and talk to a different pageserver.
+                                return respond_error!(
+                                    span,
+                                    PageStreamError::Reconnect(
+                                        "getpage@lsn request routed to wrong shard".into()
+                                    )
+                                );
+                            }
+                            e => {
+                                return respond_error!(span, e.into());
+                            }
+                        }
                    }
                };
+                let span = mkspan!(shard.tenant_shard_id.shard_slug());

                let timer = record_op_start_and_throttle(
                    &shard,
@@ -910,7 +922,7 @@ impl PageServerHandler {
                {
                    Ok(lsn) => lsn,
                    Err(e) => {
-                        return respond_error!(e);
+                        return respond_error!(span, e);
                    }
                };
                BatchedFeMessage::GetPage {
@@ -922,11 +934,10 @@ impl PageServerHandler {
            }
            #[cfg(feature = "testing")]
            PagestreamFeMessage::Test(req) => {
-                let span = tracing::info_span!(parent: parent_span, "handle_test_request");
                let shard = timeline_handles
                    .get(tenant_id, timeline_id, ShardSelector::Zero)
-                    .instrument(span.clone()) // sets `shard_id` field
                    .await?;
+                let span = tracing::info_span!(parent: &parent_span, "handle_test_request", shard_id = %shard.tenant_shard_id.shard_slug());
                let timer =
                    record_op_start_and_throttle(&shard, metrics::SmgrQueryType::Test, received_at)
                        .await?;
@@ -1190,6 +1201,29 @@ impl PageServerHandler {
            }
        };

+        // We purposefully don't count flush time into the smgr operaiton timer.
+        //
+        // The reason is that current compute client will not perform protocol processing
+        // if the postgres backend process is doing things other than `->smgr_read()`.
+        // This is especially the case for prefetch.
+        //
+        // If the compute doesn't read from the connection, eventually TCP will backpressure
+        // all the way into our flush call below.
+        //
+        // The timer's underlying metric is used for a storage-internal latency SLO and
+        // we don't want to include latency in it that we can't control.
+        // And as pointed out above, in this case, we don't control the time that flush will take.
+        //
+        // We put each response in the batch onto the wire in a separate pgb_writer.flush()
+        // call, which (all unmeasured) adds syscall overhead but reduces time to first byte
+        // and avoids building up a "giant" contiguous userspace buffer to hold the entire response.
+        // TODO: vectored socket IO would be great, but pgb_writer doesn't support that.
+        //
+        // Since we're flushing multiple times in the loop, but only have access to the per-op
+        // timers inside the loop, we capture the flush start time here and reuse it to finish
+        // each op timer.
+        let flushing_start_time = Instant::now();
+
        // Map handler result to protocol behavior.
        // Some handler errors cause exit from pagestream protocol.
        // Other handler errors are sent back as an error message and we stay in pagestream protocol.
@@ -1238,21 +1272,9 @@ impl PageServerHandler {
                &response_msg.serialize(protocol_version),
            ))?;

-            // We purposefully don't count flush time into the timer.
-            //
-            // The reason is that current compute client will not perform protocol processing
-            // if the postgres backend process is doing things other than `->smgr_read()`.
-            // This is especially the case for prefetch.
-            //
-            // If the compute doesn't read from the connection, eventually TCP will backpressure
-            // all the way into our flush call below.
-            //
-            // The timer's underlying metric is used for a storage-internal latency SLO and
-            // we don't want to include latency in it that we can't control.
-            // And as pointed out above, in this case, we don't control the time that flush will take.
            let flushing_timer = timer.map(|mut timer| {
                timer
-                    .observe_execution_end_flush_start(Instant::now())
+                    .observe_execution_end_flush_start(flushing_start_time)
                    .expect("we are the first caller")
            });

@@ -1280,8 +1302,6 @@ impl PageServerHandler {
                }
                Ok(())
            }
-            // and log the info! line inside the request span
-            .instrument(span.clone())
            .await?;
        }
        Ok(())
@@ -1342,7 +1362,7 @@ impl PageServerHandler {
            .take()
            .expect("implementation error: timeline_handles should not be locked");

-        let request_span = info_span!("request", shard_id = tracing::field::Empty);
+        let request_span = info_span!("request");
        let ((pgb_reader, timeline_handles), result) = match self.pipelining_config.clone() {
            PageServicePipeliningConfig::Pipelined(pipelining_config) => {
                self.handle_pagerequests_pipelined(
@@ -1692,7 +1712,7 @@ impl PageServerHandler {
        // to distinguish a misbehaving client (asking for old LSN) from a storage issue (data missing at a legitimate LSN).
        if request_lsn < **latest_gc_cutoff_lsn && !timeline.is_gc_blocked_by_lsn_lease_deadline() {
            let gc_info = &timeline.gc_info.read().unwrap();
-            if !gc_info.leases.contains_key(&request_lsn) {
+            if !gc_info.lsn_covered_by_lease(request_lsn) {
                return Err(
                    PageStreamError::BadRequest(format!(
                        "tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
@@ -2036,6 +2056,13 @@ impl PageServerHandler {
            .unwrap()
            .get(tenant_id, timeline_id, ShardSelector::Zero)
            .await?;
+        set_tracing_field_shard_id(&timeline);
+
+        if timeline.is_archived() == Some(true) {
+            // TODO after a grace period, turn this log line into a hard error
+            tracing::warn!("timeline {tenant_id}/{timeline_id} is archived, but got basebackup request for it.");
+            //return Err(QueryError::NotFound("timeline is archived".into()))
+        }

        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
        if let Some(lsn) = lsn {
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -48,7 +48,7 @@ use tracing::{debug, trace, warn};
 use utils::bin_ser::DeserializeError;
 use utils::pausable_failpoint;
 use utils::{bin_ser::BeSer, lsn::Lsn};
-use wal_decoder::serialized_batch::SerializedValueBatch;
+use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta};

 /// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached.
 pub const MAX_AUX_FILE_DELTAS: usize = 1024;
@@ -612,11 +612,18 @@ impl Timeline {
        pausable_failpoint!("find-lsn-for-timestamp-pausable");

        let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn();
+        let gc_cutoff_planned = {
+            let gc_info = self.gc_info.read().unwrap();
+            gc_info.min_cutoff()
+        };
+        // Usually the planned cutoff is newer than the cutoff of the last gc run,
+        // but let's be defensive.
+        let gc_cutoff = gc_cutoff_planned.max(*gc_cutoff_lsn_guard);
        // We use this method to figure out the branching LSN for the new branch, but the
        // GC cutoff could be before the branching point and we cannot create a new branch
        // with LSN < `ancestor_lsn`. Thus, pick the maximum of these two to be
        // on the safe side.
-        let min_lsn = std::cmp::max(*gc_cutoff_lsn_guard, self.get_ancestor_lsn());
+        let min_lsn = std::cmp::max(gc_cutoff, self.get_ancestor_lsn());
        let max_lsn = self.get_last_record_lsn();

        // LSNs are always 8-byte aligned. low/mid/high represent the
@@ -1297,6 +1304,26 @@ impl DatadirModification<'_> {
            .is_some_and(|b| b.has_data())
    }

+    /// Returns statistics about the currently pending modifications.
+    pub(crate) fn stats(&self) -> DatadirModificationStats {
+        let mut stats = DatadirModificationStats::default();
+        for (_, _, value) in self.pending_metadata_pages.values().flatten() {
+            match value {
+                Value::Image(_) => stats.metadata_images += 1,
+                Value::WalRecord(r) if r.will_init() => stats.metadata_images += 1,
+                Value::WalRecord(_) => stats.metadata_deltas += 1,
+            }
+        }
+        for valuemeta in self.pending_data_batch.iter().flat_map(|b| &b.metadata) {
+            match valuemeta {
+                ValueMeta::Serialized(s) if s.will_init => stats.data_images += 1,
+                ValueMeta::Serialized(_) => stats.data_deltas += 1,
+                ValueMeta::Observed(_) => {}
+            }
+        }
+        stats
+    }
+
    /// Set the current lsn
    pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
        ensure!(
@@ -2317,6 +2344,15 @@ impl DatadirModification<'_> {
    }
 }

+/// Statistics for a DatadirModification.
+#[derive(Default)]
+pub struct DatadirModificationStats {
+    pub metadata_images: u64,
+    pub metadata_deltas: u64,
+    pub data_images: u64,
+    pub data_deltas: u64,
+}
+
 /// This struct facilitates accessing either a committed key from the timeline at a
 /// specific LSN, or the latest uncommitted key from a pending modification.
 ///
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -328,8 +328,8 @@ pub enum TaskKind {
    // Eviction. One per timeline.
    Eviction,

-    // Ingest housekeeping (flushing ephemeral layers on time threshold or disk pressure)
-    IngestHousekeeping,
+    // Tenant housekeeping (flush idle ephemeral layers, shut down idle walredo, etc.).
+    TenantHousekeeping,

    /// See [`crate::disk_usage_eviction_task`].
    DiskUsageEviction,
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -20,6 +20,7 @@ use chrono::NaiveDateTime;
 use enumset::EnumSet;
 use futures::stream::FuturesUnordered;
 use futures::StreamExt;
+use itertools::Itertools as _;
 use pageserver_api::models;
 use pageserver_api::models::CompactInfoResponse;
 use pageserver_api::models::LsnLease;
@@ -46,6 +47,7 @@ use std::sync::atomic::AtomicBool;
 use std::sync::Weak;
 use std::time::SystemTime;
 use storage_broker::BrokerClientChannel;
+use timeline::compaction::CompactionOutcome;
 use timeline::compaction::GcCompactionQueue;
 use timeline::import_pgdata;
 use timeline::offload::offload_timeline;
@@ -95,7 +97,6 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::deletion_queue::DeletionQueueClient;
 use crate::deletion_queue::DeletionQueueError;
 use crate::import_datadir;
-use crate::is_uninit_mark;
 use crate::l0_flush::L0FlushGlobalState;
 use crate::metrics::CONCURRENT_INITDBS;
 use crate::metrics::INITDB_RUN_TIME;
@@ -1793,11 +1794,7 @@ impl Tenant {
            let entry = entry.context("read timeline dir entry")?;
            let entry_path = entry.path();

-            let purge = if crate::is_temporary(entry_path)
-                // TODO: remove uninit mark code (https://github.com/neondatabase/neon/issues/5718)
-                || is_uninit_mark(entry_path)
-                || crate::is_delete_mark(entry_path)
-            {
+            let purge = if crate::is_temporary(entry_path) {
                true
            } else {
                match TimelineId::try_from(entry_path.file_name()) {
@@ -2426,7 +2423,7 @@ impl Tenant {
        // Make sure the freeze_and_flush reaches remote storage.
        tline.remote_client.wait_completion().await.unwrap();

-        let tl = uninit_tl.finish_creation()?;
+        let tl = uninit_tl.finish_creation().await?;
        // The non-test code would call tl.activate() here.
        tl.set_state(TimelineState::Active);
        Ok(tl)
@@ -2912,10 +2909,10 @@ impl Tenant {
        self: &Arc<Self>,
        cancel: &CancellationToken,
        ctx: &RequestContext,
-    ) -> Result<bool, timeline::CompactionError> {
+    ) -> Result<CompactionOutcome, timeline::CompactionError> {
        // Don't start doing work during shutdown, or when broken, we do not need those in the logs
        if !self.is_active() {
-            return Ok(false);
+            return Ok(CompactionOutcome::Done);
        }

        {
@@ -2929,7 +2926,7 @@ impl Tenant {
            // to AttachedSingle state.
            if !conf.location.may_upload_layers_hint() {
                info!("Skipping compaction in location state {:?}", conf.location);
-                return Ok(false);
+                return Ok(CompactionOutcome::Done);
            }
        }

@@ -2972,7 +2969,7 @@ impl Tenant {
        // Before doing any I/O work, check our circuit breaker
        if self.compaction_circuit_breaker.lock().unwrap().is_broken() {
            info!("Skipping compaction due to previous failures");
-            return Ok(false);
+            return Ok(CompactionOutcome::Done);
        }

        let mut has_pending_task = false;
@@ -2980,10 +2977,10 @@ impl Tenant {
        for (timeline_id, timeline, (can_compact, can_offload)) in &timelines_to_compact_or_offload
        {
            // pending_task_left == None: cannot compact, maybe still pending tasks
-            // pending_task_left == Some(true): compaction task left
-            // pending_task_left == Some(false): no compaction task left
+            // pending_task_left == Some(Pending): compaction task left
+            // pending_task_left == Some(Done): no compaction task left
            let pending_task_left = if *can_compact {
-                let has_pending_l0_compaction_task = timeline
+                let compaction_outcome = timeline
                    .compact(cancel, EnumSet::empty(), ctx)
                    .instrument(info_span!("compact_timeline", %timeline_id))
                    .await
@@ -3001,27 +2998,27 @@ impl Tenant {
                                .fail(&CIRCUIT_BREAKERS_BROKEN, e);
                        }
                    })?;
-                if has_pending_l0_compaction_task {
-                    Some(true)
+                if let CompactionOutcome::Pending = compaction_outcome {
+                    Some(CompactionOutcome::Pending)
                } else {
                    let queue = {
                        let guard = self.scheduled_compaction_tasks.lock().unwrap();
                        guard.get(timeline_id).cloned()
                    };
                    if let Some(queue) = queue {
-                        let has_pending_tasks = queue
+                        let outcome = queue
                            .iteration(cancel, ctx, &self.gc_block, timeline)
                            .await?;
-                        Some(has_pending_tasks)
+                        Some(outcome)
                    } else {
-                        Some(false)
+                        Some(CompactionOutcome::Done)
                    }
                }
            } else {
                None
            };
-            has_pending_task |= pending_task_left.unwrap_or(false);
-            if pending_task_left == Some(false) && *can_offload {
+            has_pending_task |= pending_task_left == Some(CompactionOutcome::Pending);
+            if pending_task_left == Some(CompactionOutcome::Done) && *can_offload {
                pausable_failpoint!("before-timeline-auto-offload");
                match offload_timeline(self, timeline)
                    .instrument(info_span!("offload_timeline", %timeline_id))
@@ -3041,7 +3038,11 @@ impl Tenant {
            .unwrap()
            .success(&CIRCUIT_BREAKERS_UNBROKEN);

-        Ok(has_pending_task)
+        Ok(if has_pending_task {
+            CompactionOutcome::Pending
+        } else {
+            CompactionOutcome::Done
+        })
    }

    /// Cancel scheduled compaction tasks
@@ -3088,32 +3089,28 @@ impl Tenant {
        Ok(rx)
    }

-    // Call through to all timelines to freeze ephemeral layers if needed.  Usually
-    // this happens during ingest: this background housekeeping is for freezing layers
-    // that are open but haven't been written to for some time.
-    async fn ingest_housekeeping(&self) {
-        // Scan through the hashmap and collect a list of all the timelines,
-        // while holding the lock. Then drop the lock and actually perform the
-        // compactions.  We don't want to block everything else while the
-        // compaction runs.
-        let timelines = {
-            self.timelines
-                .lock()
-                .unwrap()
-                .values()
-                .filter_map(|timeline| {
-                    if timeline.is_active() {
-                        Some(timeline.clone())
-                    } else {
-                        None
-                    }
-                })
-                .collect::<Vec<_>>()
-        };
+    /// Performs periodic housekeeping, via the tenant housekeeping background task.
+    async fn housekeeping(&self) {
+        // Call through to all timelines to freeze ephemeral layers as needed. This usually happens
+        // during ingest, but we don't want idle timelines to hold open layers for too long.
+        let timelines = self
+            .timelines
+            .lock()
+            .unwrap()
+            .values()
+            .filter(|tli| tli.is_active())
+            .cloned()
+            .collect_vec();

-        for timeline in &timelines {
+        for timeline in timelines {
            timeline.maybe_freeze_ephemeral_layer().await;
        }
+
+        // Shut down walredo if idle.
+        const WALREDO_IDLE_TIMEOUT: Duration = Duration::from_secs(180);
+        if let Some(ref walredo_mgr) = self.walredo_mgr {
+            walredo_mgr.maybe_quiesce(WALREDO_IDLE_TIMEOUT);
+        }
    }

    pub fn timeline_has_no_attached_children(&self, timeline_id: TimelineId) -> bool {
@@ -4642,22 +4639,26 @@ impl Tenant {

        // check against last actual 'latest_gc_cutoff' first
        let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn();
-        src_timeline
-            .check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn)
-            .context(format!(
-                "invalid branch start lsn: less than latest GC cutoff {}",
-                *latest_gc_cutoff_lsn,
-            ))
-            .map_err(CreateTimelineError::AncestorLsn)?;
-
-        // and then the planned GC cutoff
        {
            let gc_info = src_timeline.gc_info.read().unwrap();
-            let cutoff = gc_info.min_cutoff();
-            if start_lsn < cutoff {
-                return Err(CreateTimelineError::AncestorLsn(anyhow::anyhow!(
-                    "invalid branch start lsn: less than planned GC cutoff {cutoff}"
-                )));
+            let planned_cutoff = gc_info.min_cutoff();
+            if gc_info.lsn_covered_by_lease(start_lsn) {
+                tracing::info!("skipping comparison of {start_lsn} with gc cutoff {} and planned gc cutoff {planned_cutoff} due to lsn lease", *latest_gc_cutoff_lsn);
+            } else {
+                src_timeline
+                    .check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn)
+                    .context(format!(
+                        "invalid branch start lsn: less than latest GC cutoff {}",
+                        *latest_gc_cutoff_lsn,
+                    ))
+                    .map_err(CreateTimelineError::AncestorLsn)?;
+
+                // and then the planned GC cutoff
+                if start_lsn < planned_cutoff {
+                    return Err(CreateTimelineError::AncestorLsn(anyhow::anyhow!(
+                        "invalid branch start lsn: less than planned GC cutoff {planned_cutoff}"
+                    )));
+                }
            }
        }

@@ -4702,7 +4703,7 @@ impl Tenant {
            )
            .await?;

-        let new_timeline = uninitialized_timeline.finish_creation()?;
+        let new_timeline = uninitialized_timeline.finish_creation().await?;

        // Root timeline gets its layers during creation and uploads them along with the metadata.
        // A branch timeline though, when created, can get no writes for some time, hence won't get any layers created.
@@ -4892,10 +4893,11 @@ impl Tenant {
        }

        // this new directory is very temporary, set to remove it immediately after bootstrap, we don't need it
+        let pgdata_path_deferred = pgdata_path.clone();
        scopeguard::defer! {
-            if let Err(e) = fs::remove_dir_all(&pgdata_path) {
+            if let Err(e) = fs::remove_dir_all(&pgdata_path_deferred) {
                // this is unlikely, but we will remove the directory on pageserver restart or another bootstrap call
-                error!("Failed to remove temporary initdb directory '{pgdata_path}': {e}");
+                error!("Failed to remove temporary initdb directory '{pgdata_path_deferred}': {e}");
            }
        }
        if let Some(existing_initdb_timeline_id) = load_existing_initdb {
@@ -4962,7 +4964,7 @@ impl Tenant {
            pgdata_lsn,
            pg_version,
        );
-        let raw_timeline = self
+        let mut raw_timeline = self
            .prepare_new_timeline(
                timeline_id,
                &new_metadata,
@@ -4973,42 +4975,33 @@ impl Tenant {
            .await?;

        let tenant_shard_id = raw_timeline.owning_tenant.tenant_shard_id;
-        let unfinished_timeline = raw_timeline.raw_timeline()?;
-
-        // Flush the new layer files to disk, before we make the timeline as available to
-        // the outside world.
-        //
-        // Flush loop needs to be spawned in order to be able to flush.
-        unfinished_timeline.maybe_spawn_flush_loop();
-
-        import_datadir::import_timeline_from_postgres_datadir(
-            unfinished_timeline,
-            &pgdata_path,
-            pgdata_lsn,
-            ctx,
-        )
-        .await
-        .with_context(|| {
-            format!("Failed to import pgdatadir for timeline {tenant_shard_id}/{timeline_id}")
-        })?;
-
-        fail::fail_point!("before-checkpoint-new-timeline", |_| {
-            Err(CreateTimelineError::Other(anyhow::anyhow!(
-                "failpoint before-checkpoint-new-timeline"
-            )))
-        });
-
-        unfinished_timeline
-            .freeze_and_flush()
-            .await
-            .with_context(|| {
-                format!(
-                    "Failed to flush after pgdatadir import for timeline {tenant_shard_id}/{timeline_id}"
+        raw_timeline
+            .write(|unfinished_timeline| async move {
+                import_datadir::import_timeline_from_postgres_datadir(
+                    &unfinished_timeline,
+                    &pgdata_path,
+                    pgdata_lsn,
+                    ctx,
                )
-            })?;
+                .await
+                .with_context(|| {
+                    format!(
+                        "Failed to import pgdatadir for timeline {tenant_shard_id}/{timeline_id}"
+                    )
+                })?;
+
+                fail::fail_point!("before-checkpoint-new-timeline", |_| {
+                    Err(CreateTimelineError::Other(anyhow::anyhow!(
+                        "failpoint before-checkpoint-new-timeline"
+                    )))
+                });
+
+                Ok(())
+            })
+            .await?;

        // All done!
-        let timeline = raw_timeline.finish_creation()?;
+        let timeline = raw_timeline.finish_creation().await?;

        // Callers are responsible to wait for uploads to complete and for activating the timeline.

@@ -5499,6 +5492,9 @@ pub(crate) mod harness {
                image_layer_creation_check_threshold: Some(
                    tenant_conf.image_layer_creation_check_threshold,
                ),
+                image_creation_preempt_threshold: Some(
+                    tenant_conf.image_creation_preempt_threshold,
+                ),
                lsn_lease_length: Some(tenant_conf.lsn_lease_length),
                lsn_lease_length_for_ts: Some(tenant_conf.lsn_lease_length_for_ts),
                timeline_offloading: Some(tenant_conf.timeline_offloading),
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -357,6 +357,9 @@ pub struct TenantConfOpt {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub image_layer_creation_check_threshold: Option<u8>,

+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub image_creation_preempt_threshold: Option<usize>,
+
    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(with = "humantime_serde")]
    #[serde(default)]
@@ -453,6 +456,9 @@ impl TenantConfOpt {
            image_layer_creation_check_threshold: self
                .image_layer_creation_check_threshold
                .unwrap_or(global_conf.image_layer_creation_check_threshold),
+            image_creation_preempt_threshold: self
+                .image_creation_preempt_threshold
+                .unwrap_or(global_conf.image_creation_preempt_threshold),
            lsn_lease_length: self
                .lsn_lease_length
                .unwrap_or(global_conf.lsn_lease_length),
@@ -504,6 +510,7 @@ impl TenantConfOpt {
            mut lazy_slru_download,
            mut timeline_get_throttle,
            mut image_layer_creation_check_threshold,
+            mut image_creation_preempt_threshold,
            mut lsn_lease_length,
            mut lsn_lease_length_for_ts,
            mut timeline_offloading,
@@ -578,6 +585,9 @@ impl TenantConfOpt {
        patch
            .image_layer_creation_check_threshold
            .apply(&mut image_layer_creation_check_threshold);
+        patch
+            .image_creation_preempt_threshold
+            .apply(&mut image_creation_preempt_threshold);
        patch
            .lsn_lease_length
            .map(|v| humantime::parse_duration(&v))?
@@ -626,6 +636,7 @@ impl TenantConfOpt {
            lazy_slru_download,
            timeline_get_throttle,
            image_layer_creation_check_threshold,
+            image_creation_preempt_threshold,
            lsn_lease_length,
            lsn_lease_length_for_ts,
            timeline_offloading,
@@ -689,6 +700,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
            lazy_slru_download: value.lazy_slru_download,
            timeline_get_throttle: value.timeline_get_throttle,
            image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
+            image_creation_preempt_threshold: value.image_creation_preempt_threshold,
            lsn_lease_length: value.lsn_lease_length.map(humantime),
            lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime),
            timeline_offloading: value.timeline_offloading,
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -437,8 +437,7 @@ impl RemoteTimelineClient {
            .conf
            .remote_storage_config
            .as_ref()
-            .and_then(|r| r.concurrency_limit())
-            .unwrap_or(0);
+            .map_or(0, |r| r.concurrency_limit());
        let mut upload_queue = self.upload_queue.lock().unwrap();
        upload_queue.initialize_with_current_remote_index_part(index_part, inprogress_limit)?;
        self.update_remote_physical_size_gauge(Some(index_part));
@@ -461,8 +460,7 @@ impl RemoteTimelineClient {
            .conf
            .remote_storage_config
            .as_ref()
-            .and_then(|r| r.concurrency_limit())
-            .unwrap_or(0);
+            .map_or(0, |r| r.concurrency_limit());
        let mut upload_queue = self.upload_queue.lock().unwrap();
        upload_queue.initialize_empty_remote(local_metadata, inprogress_limit)?;
        self.update_remote_physical_size_gauge(None);
@@ -484,8 +482,7 @@ impl RemoteTimelineClient {
            .conf
            .remote_storage_config
            .as_ref()
-            .and_then(|r| r.concurrency_limit())
-            .unwrap_or(0);
+            .map_or(0, |r| r.concurrency_limit());

        let mut upload_queue = self.upload_queue.lock().unwrap();
        upload_queue.initialize_with_current_remote_index_part(index_part, inprogress_limit)?;
@@ -520,7 +517,7 @@ impl RemoteTimelineClient {
            if let Ok(queue) = queue_locked.initialized_mut() {
                let blocked_deletions = std::mem::take(&mut queue.blocked_deletions);
                for d in blocked_deletions {
-                    if let Err(e) = self.deletion_queue_client.push_layers_sync(
+                    if let Err(e) = self.deletion_queue_client.push_layers(
                        self.tenant_shard_id,
                        self.timeline_id,
                        self.generation,
@@ -2154,7 +2151,6 @@ impl RemoteTimelineClient {
                                self.generation,
                                delete.layers.clone(),
                            )
-                            .await
                            .map_err(|e| anyhow::anyhow!(e))
                    }
                }
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -673,12 +673,30 @@ impl<'a> TenantDownloader<'a> {
            HeatMapDownload::Modified(m) => m,
        };

-        let heatmap = serde_json::from_slice::<HeatMapTenant>(&heatmap_bytes)?;
-
-        // Save the heatmap: this will be useful on restart, allowing us to reconstruct
-        // layer metadata without having to re-download it.
+        // Heatmap storage location
        let heatmap_path = self.conf.tenant_heatmap_path(tenant_shard_id);

+        let last_heatmap = if last_download.is_none() {
+            match load_heatmap(&heatmap_path, ctx).await {
+                Ok(htm) => htm,
+                Err(e) => {
+                    tracing::warn!("Couldn't load heatmap from {heatmap_path}: {e:?}");
+                    None
+                }
+            }
+        } else {
+            None
+        };
+
+        let last_heatmap_timelines = last_heatmap.as_ref().map(|htm| {
+            htm.timelines
+                .iter()
+                .map(|tl| (tl.timeline_id, tl))
+                .collect::<HashMap<_, _>>()
+        });
+
+        let heatmap = serde_json::from_slice::<HeatMapTenant>(&heatmap_bytes)?;
+
        let temp_path = path_with_suffix_extension(&heatmap_path, TEMP_FILE_SUFFIX);
        let context_msg = format!("write tenant {tenant_shard_id} heatmap to {heatmap_path}");
        let heatmap_path_bg = heatmap_path.clone();
@@ -707,10 +725,17 @@ impl<'a> TenantDownloader<'a> {
            let timeline_state = match timeline_state {
                Some(t) => t,
                None => {
+                    let last_heatmap =
+                        last_heatmap_timelines
+                            .as_ref()
+                            .and_then(|last_heatmap_timelines| {
+                                last_heatmap_timelines.get(&timeline.timeline_id).copied()
+                            });
                    // We have no existing state: need to scan local disk for layers first.
                    let timeline_state = init_timeline_state(
                        self.conf,
                        tenant_shard_id,
+                        last_heatmap,
                        timeline,
                        &self.secondary_state.resident_size_metric,
                    )
@@ -1079,12 +1104,12 @@ impl<'a> TenantDownloader<'a> {
                }
            }

-            if on_disk.metadata.generation_file_size() != on_disk.metadata.generation_file_size() {
+            if on_disk.metadata.generation_file_size() != layer.metadata.generation_file_size() {
                tracing::info!(
                    "Re-downloading layer {} with changed size or generation: {:?}->{:?}",
                    layer.name,
                    on_disk.metadata.generation_file_size(),
-                    on_disk.metadata.generation_file_size()
+                    layer.metadata.generation_file_size()
                );
                return LayerAction::Download;
            }
@@ -1277,6 +1302,7 @@ impl<'a> TenantDownloader<'a> {
 async fn init_timeline_state(
    conf: &'static PageServerConf,
    tenant_shard_id: &TenantShardId,
+    last_heatmap: Option<&HeatMapTimeline>,
    heatmap: &HeatMapTimeline,
    resident_metric: &UIntGauge,
 ) -> SecondaryDetailTimeline {
@@ -1306,6 +1332,13 @@ async fn init_timeline_state(
    let heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> =
        heatmap.layers.iter().map(|l| (&l.name, l)).collect();

+    let last_heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> =
+        if let Some(last_heatmap) = last_heatmap {
+            last_heatmap.layers.iter().map(|l| (&l.name, l)).collect()
+        } else {
+            HashMap::new()
+        };
+
    while let Some(dentry) = dir
        .next_entry()
        .await
@@ -1339,18 +1372,32 @@ async fn init_timeline_state(
        match LayerName::from_str(file_name) {
            Ok(name) => {
                let remote_meta = heatmap_metadata.get(&name);
+                let last_meta = last_heatmap_metadata.get(&name);
+                let mut remove = false;
                match remote_meta {
                    Some(remote_meta) => {
+                        let last_meta_generation_file_size = last_meta
+                            .map(|m| m.metadata.generation_file_size())
+                            .unwrap_or(remote_meta.metadata.generation_file_size());
                        // TODO: checksums for layers (https://github.com/neondatabase/neon/issues/2784)
-                        if local_meta.len() != remote_meta.metadata.file_size {
-                            // This should not happen, because we do crashsafe write-then-rename when downloading
-                            // layers, and layers in remote storage are immutable.  Remove the local file because
-                            // we cannot trust it.
-                            tracing::warn!(
+                        if remote_meta.metadata.generation_file_size()
+                            != last_meta_generation_file_size
+                        {
+                            tracing::info!(
+                                "Removing local layer {name} as on-disk json metadata has different generation or file size from remote: {:?} -> {:?}",
+                                last_meta_generation_file_size,
+                                remote_meta.metadata.generation_file_size()
+                            );
+                            remove = true;
+                        } else if local_meta.len() != remote_meta.metadata.file_size {
+                            // This can happen in the presence of race conditions: the remote and on-disk metadata have changed, but we haven't had
+                            // the chance yet to download the new layer to disk, before the process restarted.
+                            tracing::info!(
                                "Removing local layer {name} with unexpected local size {} != {}",
                                local_meta.len(),
                                remote_meta.metadata.file_size
                            );
+                            remove = true;
                        } else {
                            // We expect the access time to be initialized immediately afterwards, when
                            // the latest heatmap is applied to the state.
@@ -1372,15 +1419,18 @@ async fn init_timeline_state(
                            "Removing secondary local layer {} because it's absent in heatmap",
                            name
                        );
-                        tokio::fs::remove_file(&dentry.path())
-                            .await
-                            .or_else(fs_ext::ignore_not_found)
-                            .fatal_err(&format!(
-                                "Removing layer {}",
-                                dentry.path().to_string_lossy()
-                            ));
+                        remove = true;
                    }
                }
+                if remove {
+                    tokio::fs::remove_file(&dentry.path())
+                        .await
+                        .or_else(fs_ext::ignore_not_found)
+                        .fatal_err(&format!(
+                            "Removing layer {}",
+                            dentry.path().to_string_lossy()
+                        ));
+                }
            }
            Err(_) => {
                // Ignore it.
@@ -1391,3 +1441,18 @@ async fn init_timeline_state(

    detail
 }
+
+/// Loads a json-encoded heatmap file from the provided on-disk path
+async fn load_heatmap(
+    path: &Utf8PathBuf,
+    ctx: &RequestContext,
+) -> Result<Option<HeatMapTenant>, anyhow::Error> {
+    let mut file = match VirtualFile::open(path, ctx).await {
+        Ok(file) => file,
+        Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(None),
+        Err(e) => Err(e)?,
+    };
+    let st = file.read_to_string(ctx).await?;
+    let htm = serde_json::from_str(&st)?;
+    Ok(Some(htm))
+}
--- a/pageserver/src/tenant/secondary/heatmap_uploader.rs
+++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs
@@ -9,13 +9,14 @@ use crate::{
    metrics::SECONDARY_MODE,
    tenant::{
        config::AttachmentMode,
-        mgr::GetTenantError,
-        mgr::TenantManager,
+        mgr::{GetTenantError, TenantManager},
        remote_timeline_client::remote_heatmap_path,
        span::debug_assert_current_span_has_tenant_id,
        tasks::{warn_when_period_overrun, BackgroundLoopKind},
        Tenant,
    },
+    virtual_file::VirtualFile,
+    TEMP_FILE_SUFFIX,
 };

 use futures::Future;
@@ -32,7 +33,10 @@ use super::{
 };
 use tokio_util::sync::CancellationToken;
 use tracing::{info_span, instrument, Instrument};
-use utils::{backoff, completion::Barrier, yielding_loop::yielding_loop};
+use utils::{
+    backoff, completion::Barrier, crashsafe::path_with_suffix_extension,
+    yielding_loop::yielding_loop,
+};

 pub(super) async fn heatmap_uploader_task(
    tenant_manager: Arc<TenantManager>,
@@ -461,6 +465,18 @@ async fn upload_tenant_heatmap(
        }
    }

+    // After a successful upload persist the fresh heatmap to disk.
+    // When restarting, the tenant will read the heatmap from disk
+    // and additively generate a new heatmap (see [`Timeline::generate_heatmap`]).
+    // If the heatmap is stale, the additive generation can lead to keeping previously
+    // evicted timelines on the secondarie's disk.
+    let tenant_shard_id = tenant.get_tenant_shard_id();
+    let heatmap_path = tenant.conf.tenant_heatmap_path(tenant_shard_id);
+    let temp_path = path_with_suffix_extension(&heatmap_path, TEMP_FILE_SUFFIX);
+    if let Err(err) = VirtualFile::crashsafe_overwrite(heatmap_path, temp_path, bytes).await {
+        tracing::warn!("Non fatal IO error writing to disk after heatmap upload: {err}");
+    }
+
    tracing::info!("Successfully uploaded {size} byte heatmap to {path}");

    Ok(UploadHeatmapOutcome::Uploaded(LastUploadState {
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -80,6 +80,16 @@ pub(crate) struct ValueReconstructState {
    pub(crate) img: Option<(Lsn, Bytes)>,
 }

+impl ValueReconstructState {
+    /// Returns the number of page deltas applied to the page image.
+    pub fn num_deltas(&self) -> usize {
+        match self.img {
+            Some(_) => self.records.len(),
+            None => self.records.len() - 1, // omit will_init record
+        }
+    }
+}
+
 #[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
 pub(crate) enum ValueReconstructSituation {
    Complete,
--- a/pageserver/src/tenant/storage_layer/batch_split_writer.rs
+++ b/pageserver/src/tenant/storage_layer/batch_split_writer.rs
@@ -166,6 +166,10 @@ impl BatchLayerWriter {
        // END: catch every error and do the recovery in the above section
        Ok(generated_layers)
    }
+
+    pub fn pending_layer_num(&self) -> usize {
+        self.generated_layer_writers.len()
+    }
 }

 /// An image writer that takes images and produces multiple image layers.
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -340,7 +340,7 @@ impl Layer {
    /// Download the layer if evicted.
    ///
    /// Will not error when the layer is already downloaded.
-    pub(crate) async fn download(&self) -> anyhow::Result<()> {
+    pub(crate) async fn download(&self) -> Result<(), DownloadError> {
        self.0.get_or_maybe_download(true, None).await?;
        Ok(())
    }
@@ -701,13 +701,7 @@ impl Drop for LayerInner {
        if let Some(timeline) = timeline.as_ref() {
            // Only need to decrement metrics if the timeline still exists: otherwise
            // it will have already de-registered these metrics via TimelineMetrics::shutdown
-            if self.desc.is_delta() {
-                timeline.metrics.layer_count_delta.dec();
-                timeline.metrics.layer_size_delta.sub(self.desc.file_size);
-            } else {
-                timeline.metrics.layer_count_image.dec();
-                timeline.metrics.layer_size_image.sub(self.desc.file_size);
-            }
+            timeline.metrics.dec_layer(&self.desc);

            if matches!(self.access_stats.visibility(), LayerVisibilityHint::Visible) {
                debug_assert!(
@@ -817,13 +811,7 @@ impl LayerInner {
        };

        // This object acts as a RAII guard on these metrics: increment on construction
-        if desc.is_delta() {
-            timeline.metrics.layer_count_delta.inc();
-            timeline.metrics.layer_size_delta.add(desc.file_size);
-        } else {
-            timeline.metrics.layer_count_image.inc();
-            timeline.metrics.layer_size_image.add(desc.file_size);
-        }
+        timeline.metrics.inc_layer(&desc);

        // New layers are visible by default. This metric is later updated on drop or in set_visibility
        timeline
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -1,52 +1,80 @@
-//! This module contains functions to serve per-tenant background processes,
-//! such as compaction and GC
+//! This module contains per-tenant background processes, e.g. compaction and GC.

-use std::ops::ControlFlow;
-use std::str::FromStr;
-use std::sync::Arc;
+use std::cmp::max;
+use std::future::Future;
+use std::ops::{ControlFlow, RangeInclusive};
+use std::pin::pin;
+use std::sync::{Arc, Mutex};
 use std::time::{Duration, Instant};

-use crate::context::{DownloadBehavior, RequestContext};
-use crate::metrics::TENANT_TASK_EVENTS;
-use crate::task_mgr;
-use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
-use crate::tenant::throttle::Stats;
-use crate::tenant::timeline::CompactionError;
-use crate::tenant::{Tenant, TenantState};
+use once_cell::sync::Lazy;
 use rand::Rng;
+use scopeguard::defer;
+use tokio::sync::{Semaphore, SemaphorePermit};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
-use utils::{backoff, completion, pausable_failpoint};

-static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy<tokio::sync::Semaphore> =
-    once_cell::sync::Lazy::new(|| {
-        let total_threads = task_mgr::TOKIO_WORKER_THREADS.get();
-        let permits = usize::max(
-            1,
-            // while a lot of the work is done on spawn_blocking, we still do
-            // repartitioning in the async context. this should give leave us some workers
-            // unblocked to be blocked on other work, hopefully easing any outside visible
-            // effects of restarts.
-            //
-            // 6/8 is a guess; previously we ran with unlimited 8 and more from
-            // spawn_blocking.
-            (total_threads * 3).checked_div(4).unwrap_or(0),
-        );
-        assert_ne!(permits, 0, "we will not be adding in permits later");
-        assert!(
-            permits < total_threads,
-            "need threads avail for shorter work"
-        );
-        tokio::sync::Semaphore::new(permits)
-    });
+use crate::context::{DownloadBehavior, RequestContext};
+use crate::metrics::{BackgroundLoopSemaphoreMetricsRecorder, TENANT_TASK_EVENTS};
+use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME, TOKIO_WORKER_THREADS};
+use crate::tenant::throttle::Stats;
+use crate::tenant::timeline::compaction::CompactionOutcome;
+use crate::tenant::timeline::CompactionError;
+use crate::tenant::{Tenant, TenantState};
+use pageserver_api::config::tenant_conf_defaults::DEFAULT_COMPACTION_PERIOD;
+use utils::completion::Barrier;
+use utils::rate_limit::RateLimit;
+use utils::{backoff, pausable_failpoint};

-#[derive(Debug, PartialEq, Eq, Clone, Copy, strum_macros::IntoStaticStr, enum_map::Enum)]
+/// Semaphore limiting concurrent background tasks (across all tenants).
+///
+/// We use 3/4 Tokio threads, to avoid blocking all threads in case we do any CPU-heavy work.
+static CONCURRENT_BACKGROUND_TASKS: Lazy<Semaphore> = Lazy::new(|| {
+    let total_threads = TOKIO_WORKER_THREADS.get();
+    let permits = max(1, (total_threads * 3).checked_div(4).unwrap_or(0));
+    assert_ne!(permits, 0, "we will not be adding in permits later");
+    assert!(permits < total_threads, "need threads for other work");
+    Semaphore::new(permits)
+});
+
+/// Semaphore limiting concurrent compaction tasks (across all tenants). This is disabled by
+/// default, see `use_compaction_semaphore`.
+///
+/// We use 3/4 Tokio threads, to avoid blocking all threads in case we do any CPU-heavy work.
+///
+/// This is a separate semaphore from background tasks, because L0 compaction needs to be responsive
+/// to avoid high read amp during heavy write workloads.
+///
+/// TODO: split image compaction and L0 compaction, and move image compaction to background tasks.
+/// Only L0 compaction needs to be responsive, and it shouldn't block on image compaction.
+static CONCURRENT_COMPACTION_TASKS: Lazy<Semaphore> = Lazy::new(|| {
+    let total_threads = TOKIO_WORKER_THREADS.get();
+    let permits = max(1, (total_threads * 3).checked_div(4).unwrap_or(0));
+    assert_ne!(permits, 0, "we will not be adding in permits later");
+    assert!(permits < total_threads, "need threads for other work");
+    Semaphore::new(permits)
+});
+
+/// Background jobs.
+///
+/// NB: not all of these acquire a CONCURRENT_BACKGROUND_TASKS semaphore permit, only the ones that
+/// do any significant IO.
+#[derive(
+    Debug,
+    PartialEq,
+    Eq,
+    Clone,
+    Copy,
+    strum_macros::IntoStaticStr,
+    strum_macros::Display,
+    enum_map::Enum,
+)]
 #[strum(serialize_all = "snake_case")]
 pub(crate) enum BackgroundLoopKind {
    Compaction,
    Gc,
    Eviction,
-    IngestHouseKeeping,
+    TenantHouseKeeping,
    ConsumptionMetricsCollectMetrics,
    ConsumptionMetricsSyntheticSizeWorker,
    InitialLogicalSizeCalculation,
@@ -54,36 +82,56 @@ pub(crate) enum BackgroundLoopKind {
    SecondaryDownload,
 }

-impl BackgroundLoopKind {
-    fn as_static_str(&self) -> &'static str {
-        self.into()
-    }
+pub struct BackgroundLoopSemaphorePermit<'a> {
+    _permit: SemaphorePermit<'static>,
+    _recorder: BackgroundLoopSemaphoreMetricsRecorder<'a>,
 }

-/// Cancellation safe.
-pub(crate) async fn concurrent_background_tasks_rate_limit_permit(
+/// Acquires a semaphore permit, to limit concurrent background jobs.
+pub(crate) async fn acquire_concurrency_permit(
    loop_kind: BackgroundLoopKind,
+    use_compaction_semaphore: bool,
    _ctx: &RequestContext,
-) -> tokio::sync::SemaphorePermit<'static> {
-    let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE.measure_acquisition(loop_kind);
+) -> BackgroundLoopSemaphorePermit<'static> {
+    // TODO: use a lower threshold and remove the pacer once we resolve some blockage.
+    const WARN_THRESHOLD: Duration = Duration::from_secs(600);
+    static WARN_PACER: Lazy<Mutex<RateLimit>> =
+        Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
+
+    let mut recorder = crate::metrics::BACKGROUND_LOOP_SEMAPHORE.record(loop_kind);

    if loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation {
        pausable_failpoint!("initial-size-calculation-permit-pause");
    }

    // TODO: assert that we run on BACKGROUND_RUNTIME; requires tokio_unstable Handle::id();
-    match CONCURRENT_BACKGROUND_TASKS.acquire().await {
-        Ok(permit) => permit,
-        Err(_closed) => unreachable!("we never close the semaphore"),
+    let permit = if loop_kind == BackgroundLoopKind::Compaction && use_compaction_semaphore {
+        CONCURRENT_COMPACTION_TASKS.acquire().await
+    } else {
+        assert!(!use_compaction_semaphore);
+        CONCURRENT_BACKGROUND_TASKS.acquire().await
+    }
+    .expect("should never close");
+
+    let waited = recorder.acquired();
+    if waited >= WARN_THRESHOLD {
+        let waited = waited.as_secs_f64();
+        WARN_PACER
+            .lock()
+            .unwrap()
+            .call(|| warn!("{loop_kind} task waited {waited:.3}s for semaphore permit"));
+    }
+
+    BackgroundLoopSemaphorePermit {
+        _permit: permit,
+        _recorder: recorder,
    }
 }

-/// Start per tenant background loops: compaction and gc.
-pub fn start_background_loops(
-    tenant: &Arc<Tenant>,
-    background_jobs_can_start: Option<&completion::Barrier>,
-) {
+/// Start per tenant background loops: compaction, GC, and ingest housekeeping.
+pub fn start_background_loops(tenant: &Arc<Tenant>, can_start: Option<&Barrier>) {
    let tenant_shard_id = tenant.tenant_shard_id;
+
    task_mgr::spawn(
        BACKGROUND_RUNTIME.handle(),
        TaskKind::Compaction,
@@ -92,13 +140,15 @@ pub fn start_background_loops(
        &format!("compactor for tenant {tenant_shard_id}"),
        {
            let tenant = Arc::clone(tenant);
-            let background_jobs_can_start = background_jobs_can_start.cloned();
+            let can_start = can_start.cloned();
            async move {
-                let cancel = task_mgr::shutdown_token();
+                let cancel = task_mgr::shutdown_token(); // NB: must be in async context
                tokio::select! {
-                    _ = cancel.cancelled() => { return Ok(()) },
-                    _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
+                    _ = cancel.cancelled() => return Ok(()),
+                    _ = Barrier::maybe_wait(can_start) => {}
                };
+                TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
+                defer!(TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc());
                compaction_loop(tenant, cancel)
                    // If you rename this span, change the RUST_LOG env variable in test_runner/performance/test_branch_creation.py
                    .instrument(info_span!("compaction_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
@@ -107,6 +157,7 @@ pub fn start_background_loops(
            }
        },
    );
+
    task_mgr::spawn(
        BACKGROUND_RUNTIME.handle(),
        TaskKind::GarbageCollector,
@@ -115,13 +166,15 @@ pub fn start_background_loops(
        &format!("garbage collector for tenant {tenant_shard_id}"),
        {
            let tenant = Arc::clone(tenant);
-            let background_jobs_can_start = background_jobs_can_start.cloned();
+            let can_start = can_start.cloned();
            async move {
-                let cancel = task_mgr::shutdown_token();
+                let cancel = task_mgr::shutdown_token(); // NB: must be in async context
                tokio::select! {
-                    _ = cancel.cancelled() => { return Ok(()) },
-                    _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
+                    _ = cancel.cancelled() => return Ok(()),
+                    _ = Barrier::maybe_wait(can_start) => {}
                };
+                TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
+                defer!(TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc());
                gc_loop(tenant, cancel)
                    .instrument(info_span!("gc_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
                    .await;
@@ -132,21 +185,23 @@ pub fn start_background_loops(

    task_mgr::spawn(
        BACKGROUND_RUNTIME.handle(),
-        TaskKind::IngestHousekeeping,
+        TaskKind::TenantHousekeeping,
        tenant_shard_id,
        None,
-        &format!("ingest housekeeping for tenant {tenant_shard_id}"),
+        &format!("housekeeping for tenant {tenant_shard_id}"),
        {
            let tenant = Arc::clone(tenant);
-            let background_jobs_can_start = background_jobs_can_start.cloned();
+            let can_start = can_start.cloned();
            async move {
-                let cancel = task_mgr::shutdown_token();
+                let cancel = task_mgr::shutdown_token(); // NB: must be in async context
                tokio::select! {
-                    _ = cancel.cancelled() => { return Ok(()) },
-                    _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
+                    _ = cancel.cancelled() => return Ok(()),
+                    _ = Barrier::maybe_wait(can_start) => {}
                };
-                ingest_housekeeping_loop(tenant, cancel)
-                    .instrument(info_span!("ingest_housekeeping_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
+                TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
+                defer!(TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc());
+                tenant_housekeeping_loop(tenant, cancel)
+                    .instrument(info_span!("tenant_housekeeping_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
                    .await;
                Ok(())
            }
@@ -154,372 +209,293 @@ pub fn start_background_loops(
    );
 }

-///
-/// Compaction task's main loop
-///
+/// Compaction task's main loop.
 async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
    const MAX_BACKOFF_SECS: f64 = 300.0;
-    // How many errors we have seen consequtively
-    let mut error_run_count = 0;

-    TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
-    async {
-        let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
-        let mut first = true;
-        loop {
-            tokio::select! {
-                _ = cancel.cancelled() => {
-                    return;
-                },
-                tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result {
-                    ControlFlow::Break(()) => return,
-                    ControlFlow::Continue(()) => (),
-                },
-            }
+    let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
+    let mut first = true;
+    let mut error_run = 0; // consecutive errors

-            let period = tenant.get_compaction_period();
+    loop {
+        if wait_for_active_tenant(&tenant, &cancel).await.is_break() {
+            return;
+        }

-            // TODO: we shouldn't need to await to find tenant and this could be moved outside of
-            // loop, #3501. There are also additional "allowed_errors" in tests.
-            if first {
-                first = false;
-                if random_init_delay(period, &cancel).await.is_err() {
-                    break;
-                }
-            }
+        let period = tenant.get_compaction_period();

-            let sleep_duration;
-            if period == Duration::ZERO {
-                #[cfg(not(feature = "testing"))]
-                info!("automatic compaction is disabled");
-                // check again in 10 seconds, in case it's been enabled again.
-                sleep_duration = Duration::from_secs(10)
-            } else {
-                let iteration = Iteration {
-                    started_at: Instant::now(),
-                    period,
-                    kind: BackgroundLoopKind::Compaction,
-                };
-
-                // Run compaction
-                let IterationResult { output, elapsed } = iteration
-                    .run(tenant.compaction_iteration(&cancel, &ctx))
-                    .await;
-                match output {
-                    Ok(has_pending_task) => {
-                        error_run_count = 0;
-                        // schedule the next compaction immediately in case there is a pending compaction task
-                        sleep_duration = if has_pending_task {
-                            Duration::ZERO
-                        } else {
-                            period
-                        };
-                    }
-                    Err(e) => {
-                        let wait_duration = backoff::exponential_backoff_duration_seconds(
-                            error_run_count + 1,
-                            1.0,
-                            MAX_BACKOFF_SECS,
-                        );
-                        error_run_count += 1;
-                        let wait_duration = Duration::from_secs_f64(wait_duration);
-                        log_compaction_error(
-                            &e,
-                            error_run_count,
-                            &wait_duration,
-                            cancel.is_cancelled(),
-                        );
-                        sleep_duration = wait_duration;
-                    }
-                }
-
-                // the duration is recorded by performance tests by enabling debug in this function
-                tracing::debug!(
-                    elapsed_ms = elapsed.as_millis(),
-                    "compaction iteration complete"
-                );
-            };
-
-            // Perhaps we did no work and the walredo process has been idle for some time:
-            // give it a chance to shut down to avoid leaving walredo process running indefinitely.
-            // TODO: move this to a separate task (housekeeping loop) that isn't affected by the back-off,
-            // so we get some upper bound guarantee on when walredo quiesce / this throttling reporting here happens.
-            if let Some(walredo_mgr) = &tenant.walredo_mgr {
-                walredo_mgr.maybe_quiesce(period * 10);
-            }
-
-            // Sleep
-            if tokio::time::timeout(sleep_duration, cancel.cancelled())
-                .await
-                .is_ok()
-            {
+        // TODO: we shouldn't need to await to find tenant and this could be moved outside of
+        // loop, #3501. There are also additional "allowed_errors" in tests.
+        if first {
+            first = false;
+            if sleep_random(period, &cancel).await.is_err() {
                break;
            }
        }
+
+        let sleep_duration;
+        if period == Duration::ZERO {
+            #[cfg(not(feature = "testing"))]
+            info!("automatic compaction is disabled");
+            // check again in 10 seconds, in case it's been enabled again.
+            sleep_duration = Duration::from_secs(10)
+        } else {
+            let iteration = Iteration {
+                started_at: Instant::now(),
+                period,
+                kind: BackgroundLoopKind::Compaction,
+            };
+
+            // Run compaction
+            let IterationResult { output, elapsed } = iteration
+                .run(tenant.compaction_iteration(&cancel, &ctx))
+                .await;
+            match output {
+                Ok(outcome) => {
+                    error_run = 0;
+                    // schedule the next compaction immediately in case there is a pending compaction task
+                    sleep_duration = if let CompactionOutcome::Pending = outcome {
+                        Duration::from_secs(1)
+                    } else {
+                        period
+                    };
+                }
+                Err(err) => {
+                    let wait_duration = backoff::exponential_backoff_duration_seconds(
+                        error_run + 1,
+                        1.0,
+                        MAX_BACKOFF_SECS,
+                    );
+                    error_run += 1;
+                    let wait_duration = Duration::from_secs_f64(wait_duration);
+                    log_compaction_error(&err, error_run, &wait_duration, cancel.is_cancelled());
+                    sleep_duration = wait_duration;
+                }
+            }
+
+            // the duration is recorded by performance tests by enabling debug in this function
+            debug!(
+                elapsed_ms = elapsed.as_millis(),
+                "compaction iteration complete"
+            );
+        };
+
+        // Sleep
+        if tokio::time::timeout(sleep_duration, cancel.cancelled())
+            .await
+            .is_ok()
+        {
+            break;
+        }
    }
-    .await;
-    TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
 }

 fn log_compaction_error(
-    e: &CompactionError,
-    error_run_count: u32,
-    sleep_duration: &std::time::Duration,
+    err: &CompactionError,
+    error_count: u32,
+    sleep_duration: &Duration,
    task_cancelled: bool,
 ) {
    use crate::tenant::upload_queue::NotInitialized;
    use crate::tenant::PageReconstructError;
    use CompactionError::*;

-    enum LooksLike {
-        Info,
-        Error,
-    }
+    let level = match err {
+        ShuttingDown => return,
+        Offload(_) => Level::ERROR,
+        _ if task_cancelled => Level::INFO,
+        Other(err) => {
+            let root_cause = err.root_cause();

-    let decision = match e {
-        ShuttingDown => None,
-        Offload(_) => Some(LooksLike::Error),
-        _ if task_cancelled => Some(LooksLike::Info),
-        Other(e) => {
-            let root_cause = e.root_cause();
-
-            let is_stopping = {
-                let upload_queue = root_cause
-                    .downcast_ref::<NotInitialized>()
-                    .is_some_and(|e| e.is_stopping());
-
-                let timeline = root_cause
-                    .downcast_ref::<PageReconstructError>()
-                    .is_some_and(|e| e.is_stopping());
-
-                upload_queue || timeline
-            };
+            let upload_queue = root_cause
+                .downcast_ref::<NotInitialized>()
+                .is_some_and(|e| e.is_stopping());
+            let timeline = root_cause
+                .downcast_ref::<PageReconstructError>()
+                .is_some_and(|e| e.is_stopping());
+            let is_stopping = upload_queue || timeline;

            if is_stopping {
-                Some(LooksLike::Info)
+                Level::INFO
            } else {
-                Some(LooksLike::Error)
+                Level::ERROR
            }
        }
    };

-    match decision {
-        Some(LooksLike::Info) => info!(
-            "Compaction failed {error_run_count} times, retrying in {sleep_duration:?}: {e:#}",
-        ),
-        Some(LooksLike::Error) => error!(
-            "Compaction failed {error_run_count} times, retrying in {sleep_duration:?}: {e:?}",
-        ),
-        None => {}
+    match level {
+        Level::ERROR => {
+            error!("Compaction failed {error_count} times, retrying in {sleep_duration:?}: {err:#}")
+        }
+        Level::INFO => {
+            info!("Compaction failed {error_count} times, retrying in {sleep_duration:?}: {err:#}")
+        }
+        level => unimplemented!("unexpected level {level:?}"),
    }
 }

-///
-/// GC task's main loop
-///
+/// GC task's main loop.
 async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
    const MAX_BACKOFF_SECS: f64 = 300.0;
-    // How many errors we have seen consequtively
-    let mut error_run_count = 0;
+    let mut error_run = 0; // consecutive errors

-    TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
-    async {
-        // GC might require downloading, to find the cutoff LSN that corresponds to the
-        // cutoff specified as time.
-        let ctx =
-            RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
+    // GC might require downloading, to find the cutoff LSN that corresponds to the
+    // cutoff specified as time.
+    let ctx = RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
+    let mut first = true;

-        let mut first = true;
-        loop {
-            tokio::select! {
-                _ = cancel.cancelled() => {
-                    return;
-                },
-                tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result {
-                    ControlFlow::Break(()) => return,
-                    ControlFlow::Continue(()) => (),
-                },
-            }
+    loop {
+        if wait_for_active_tenant(&tenant, &cancel).await.is_break() {
+            return;
+        }

-            let period = tenant.get_gc_period();
+        let period = tenant.get_gc_period();

-            if first {
-                first = false;
-
-                let delays = async {
-                    random_init_delay(period, &cancel).await?;
-                    Ok::<_, Cancelled>(())
-                };
-
-                if delays.await.is_err() {
-                    break;
-                }
-            }
-
-            let gc_horizon = tenant.get_gc_horizon();
-            let sleep_duration;
-            if period == Duration::ZERO || gc_horizon == 0 {
-                #[cfg(not(feature = "testing"))]
-                info!("automatic GC is disabled");
-                // check again in 10 seconds, in case it's been enabled again.
-                sleep_duration = Duration::from_secs(10);
-            } else {
-                let iteration = Iteration {
-                    started_at: Instant::now(),
-                    period,
-                    kind: BackgroundLoopKind::Gc,
-                };
-                // Run gc
-                let IterationResult { output, elapsed: _ } =
-                    iteration.run(tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &cancel, &ctx))
-                    .await;
-                match output {
-                    Ok(_) => {
-                        error_run_count = 0;
-                        sleep_duration = period;
-                    }
-                    Err(crate::tenant::GcError::TenantCancelled) => {
-                        return;
-                    }
-                    Err(e) => {
-                        let wait_duration = backoff::exponential_backoff_duration_seconds(
-                            error_run_count + 1,
-                            1.0,
-                            MAX_BACKOFF_SECS,
-                        );
-                        error_run_count += 1;
-                        let wait_duration = Duration::from_secs_f64(wait_duration);
-
-                        if matches!(e, crate::tenant::GcError::TimelineCancelled) {
-                            // Timeline was cancelled during gc. We might either be in an event
-                            // that affects the entire tenant (tenant deletion, pageserver shutdown),
-                            // or in one that affects the timeline only (timeline deletion).
-                            // Therefore, don't exit the loop.
-                            info!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}");
-                        } else {
-                            error!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}");
-                        }
-
-                        sleep_duration = wait_duration;
-                    }
-                }
-            };
-
-            if tokio::time::timeout(sleep_duration, cancel.cancelled())
-                .await
-                .is_ok()
-            {
+        if first {
+            first = false;
+            if sleep_random(period, &cancel).await.is_err() {
                break;
            }
        }
-    }
-    .await;
-    TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
-}
-
-async fn ingest_housekeeping_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
-    TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
-    async {
-    let mut last_throttle_flag_reset_at = Instant::now();
-        loop {
-            tokio::select! {
-                _ = cancel.cancelled() => {
-                    return;
-                },
-                tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result {
-                    ControlFlow::Break(()) => return,
-                    ControlFlow::Continue(()) => (),
-                },
-            }
-
-            // We run ingest housekeeping with the same frequency as compaction: it is not worth
-            // having a distinct setting.  But we don't run it in the same task, because compaction
-            // blocks on acquiring the background job semaphore.
-            let period = tenant.get_compaction_period();
-
-            // If compaction period is set to zero (to disable it), then we will use a reasonable default
-            let period = if period == Duration::ZERO {
-                humantime::Duration::from_str(
-                    pageserver_api::config::tenant_conf_defaults::DEFAULT_COMPACTION_PERIOD,
-                )
-                .unwrap()
-                .into()
-            } else {
-                period
-            };
-
-            // Jitter the period by +/- 5%
-            let period =
-                rand::thread_rng().gen_range((period * (95)) / 100..(period * (105)) / 100);
-
-            // Always sleep first: we do not need to do ingest housekeeping early in the lifetime of
-            // a tenant, since it won't have started writing any ephemeral files yet.
-            if tokio::time::timeout(period, cancel.cancelled())
-                .await
-                .is_ok()
-            {
-                break;
-            }

+        let gc_horizon = tenant.get_gc_horizon();
+        let sleep_duration;
+        if period == Duration::ZERO || gc_horizon == 0 {
+            #[cfg(not(feature = "testing"))]
+            info!("automatic GC is disabled");
+            // check again in 10 seconds, in case it's been enabled again.
+            sleep_duration = Duration::from_secs(10);
+        } else {
            let iteration = Iteration {
                started_at: Instant::now(),
                period,
-                kind: BackgroundLoopKind::IngestHouseKeeping,
+                kind: BackgroundLoopKind::Gc,
            };
-            iteration.run(tenant.ingest_housekeeping()).await;
-
-            // TODO: rename the background loop kind to something more generic, like, tenant housekeeping.
-            // Or just spawn another background loop for this throttle, it's not like it's super costly.
-            info_span!(parent: None, "pagestream_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| {
-                let now = Instant::now();
-                let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now);
-                let Stats { count_accounted_start, count_accounted_finish, count_throttled, sum_throttled_usecs} = tenant.pagestream_throttle.reset_stats();
-                if count_throttled == 0 {
+            // Run gc
+            let IterationResult { output, elapsed: _ } = iteration
+                .run(tenant.gc_iteration(
+                    None,
+                    gc_horizon,
+                    tenant.get_pitr_interval(),
+                    &cancel,
+                    &ctx,
+                ))
+                .await;
+            match output {
+                Ok(_) => {
+                    error_run = 0;
+                    sleep_duration = period;
+                }
+                Err(crate::tenant::GcError::TenantCancelled) => {
                    return;
                }
-                let allowed_rps = tenant.pagestream_throttle.steady_rps();
-                let delta = now - prev;
-                info!(
-                    n_seconds=%format_args!("{:.3}", delta.as_secs_f64()),
-                    count_accounted = count_accounted_finish,  // don't break existing log scraping
-                    count_throttled,
-                    sum_throttled_usecs,
-                    count_accounted_start, // log after pre-existing fields to not break existing log scraping
-                    allowed_rps=%format_args!("{allowed_rps:.0}"),
-                    "shard was throttled in the last n_seconds"
-                );
-            });
-        }
-    }
-    .await;
-    TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
-}
+                Err(e) => {
+                    let wait_duration = backoff::exponential_backoff_duration_seconds(
+                        error_run + 1,
+                        1.0,
+                        MAX_BACKOFF_SECS,
+                    );
+                    error_run += 1;
+                    let wait_duration = Duration::from_secs_f64(wait_duration);

-async fn wait_for_active_tenant(tenant: &Arc<Tenant>) -> ControlFlow<()> {
-    // if the tenant has a proper status already, no need to wait for anything
-    if tenant.current_state() == TenantState::Active {
-        ControlFlow::Continue(())
-    } else {
-        let mut tenant_state_updates = tenant.subscribe_for_state_updates();
-        loop {
-            match tenant_state_updates.changed().await {
-                Ok(()) => {
-                    let new_state = &*tenant_state_updates.borrow();
-                    match new_state {
-                        TenantState::Active => {
-                            debug!("Tenant state changed to active, continuing the task loop");
-                            return ControlFlow::Continue(());
-                        }
-                        state => {
-                            debug!("Not running the task loop, tenant is not active: {state:?}");
-                            continue;
-                        }
+                    if matches!(e, crate::tenant::GcError::TimelineCancelled) {
+                        // Timeline was cancelled during gc. We might either be in an event
+                        // that affects the entire tenant (tenant deletion, pageserver shutdown),
+                        // or in one that affects the timeline only (timeline deletion).
+                        // Therefore, don't exit the loop.
+                        info!("Gc failed {error_run} times, retrying in {wait_duration:?}: {e:?}");
+                    } else {
+                        error!("Gc failed {error_run} times, retrying in {wait_duration:?}: {e:?}");
                    }
-                }
-                Err(_sender_dropped_error) => {
-                    return ControlFlow::Break(());
+
+                    sleep_duration = wait_duration;
                }
            }
+        };
+
+        if tokio::time::timeout(sleep_duration, cancel.cancelled())
+            .await
+            .is_ok()
+        {
+            break;
+        }
+    }
+}
+
+/// Tenant housekeeping's main loop.
+async fn tenant_housekeeping_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
+    let mut last_throttle_flag_reset_at = Instant::now();
+    loop {
+        if wait_for_active_tenant(&tenant, &cancel).await.is_break() {
+            return;
+        }
+
+        // Use the same period as compaction; it's not worth a separate setting. But if it's set to
+        // zero (to disable compaction), then use a reasonable default. Jitter it by 5%.
+        let period = match tenant.get_compaction_period() {
+            Duration::ZERO => humantime::parse_duration(DEFAULT_COMPACTION_PERIOD).unwrap(),
+            period => period,
+        };
+
+        let Ok(period) = sleep_jitter(period, period * 5 / 100, &cancel).await else {
+            break;
+        };
+
+        // Do tenant housekeeping.
+        let iteration = Iteration {
+            started_at: Instant::now(),
+            period,
+            kind: BackgroundLoopKind::TenantHouseKeeping,
+        };
+        iteration.run(tenant.housekeeping()).await;
+
+        // Log any getpage throttling.
+        info_span!(parent: None, "pagestream_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| {
+            let now = Instant::now();
+            let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now);
+            let Stats { count_accounted_start, count_accounted_finish, count_throttled, sum_throttled_usecs} = tenant.pagestream_throttle.reset_stats();
+            if count_throttled == 0 {
+                return;
+            }
+            let allowed_rps = tenant.pagestream_throttle.steady_rps();
+            let delta = now - prev;
+            info!(
+                n_seconds=%format_args!("{:.3}", delta.as_secs_f64()),
+                count_accounted = count_accounted_finish,  // don't break existing log scraping
+                count_throttled,
+                sum_throttled_usecs,
+                count_accounted_start, // log after pre-existing fields to not break existing log scraping
+                allowed_rps=%format_args!("{allowed_rps:.0}"),
+                "shard was throttled in the last n_seconds"
+            );
+        });
+    }
+}
+
+/// Waits until the tenant becomes active, or returns `ControlFlow::Break()` to shut down.
+async fn wait_for_active_tenant(
+    tenant: &Arc<Tenant>,
+    cancel: &CancellationToken,
+) -> ControlFlow<()> {
+    if tenant.current_state() == TenantState::Active {
+        return ControlFlow::Continue(());
+    }
+
+    let mut update_rx = tenant.subscribe_for_state_updates();
+    loop {
+        tokio::select! {
+            _ = cancel.cancelled() => return ControlFlow::Break(()),
+            result = update_rx.changed() => if result.is_err() {
+                return ControlFlow::Break(());
+            }
+        }
+
+        match &*update_rx.borrow() {
+            TenantState::Active => {
+                debug!("Tenant state changed to active, continuing the task loop");
+                return ControlFlow::Continue(());
+            }
+            state => debug!("Not running the task loop, tenant is not active: {state:?}"),
        }
    }
 }
@@ -528,26 +504,41 @@ async fn wait_for_active_tenant(tenant: &Arc<Tenant>) -> ControlFlow<()> {
 #[error("cancelled")]
 pub(crate) struct Cancelled;

-/// Provide a random delay for background task initialization.
+/// Sleeps for a random interval up to the given max value.
 ///
 /// This delay prevents a thundering herd of background tasks and will likely keep them running on
 /// different periods for more stable load.
-pub(crate) async fn random_init_delay(
-    period: Duration,
+pub(crate) async fn sleep_random(
+    max: Duration,
    cancel: &CancellationToken,
-) -> Result<(), Cancelled> {
-    if period == Duration::ZERO {
-        return Ok(());
-    }
+) -> Result<Duration, Cancelled> {
+    sleep_random_range(Duration::ZERO..=max, cancel).await
+}

-    let d = {
-        let mut rng = rand::thread_rng();
-        rng.gen_range(Duration::ZERO..=period)
-    };
-    match tokio::time::timeout(d, cancel.cancelled()).await {
-        Ok(_) => Err(Cancelled),
-        Err(_) => Ok(()),
+/// Sleeps for a random interval in the given range. Returns the duration.
+pub(crate) async fn sleep_random_range(
+    interval: RangeInclusive<Duration>,
+    cancel: &CancellationToken,
+) -> Result<Duration, Cancelled> {
+    let delay = rand::thread_rng().gen_range(interval);
+    if delay == Duration::ZERO {
+        return Ok(delay);
    }
+    tokio::select! {
+        _ = cancel.cancelled() => Err(Cancelled),
+        _ = tokio::time::sleep(delay) => Ok(delay),
+    }
+}
+
+/// Sleeps for an interval with a random jitter.
+pub(crate) async fn sleep_jitter(
+    duration: Duration,
+    jitter: Duration,
+    cancel: &CancellationToken,
+) -> Result<Duration, Cancelled> {
+    let from = duration.saturating_sub(jitter);
+    let to = duration.saturating_add(jitter);
+    sleep_random_range(from..=to, cancel).await
 }

 struct Iteration {
@@ -563,42 +554,25 @@ struct IterationResult<O> {

 impl Iteration {
    #[instrument(skip_all)]
-    pub(crate) async fn run<Fut, O>(self, fut: Fut) -> IterationResult<O>
-    where
-        Fut: std::future::Future<Output = O>,
-    {
-        let Self {
-            started_at,
-            period,
-            kind,
-        } = self;
-
-        let mut fut = std::pin::pin!(fut);
+    pub(crate) async fn run<F: Future<Output = O>, O>(self, fut: F) -> IterationResult<O> {
+        let mut fut = pin!(fut);

        // Wrap `fut` into a future that logs a message every `period` so that we get a
        // very obvious breadcrumb in the logs _while_ a slow iteration is happening.
-        let liveness_logger = async move {
-            loop {
-                match tokio::time::timeout(period, &mut fut).await {
-                    Ok(x) => return x,
-                    Err(_) => {
-                        // info level as per the same rationale why warn_when_period_overrun is info
-                        // =>  https://github.com/neondatabase/neon/pull/5724
-                        info!("still running");
-                    }
-                }
+        let output = loop {
+            match tokio::time::timeout(self.period, &mut fut).await {
+                Ok(r) => break r,
+                Err(_) => info!("still running"),
            }
        };
-
-        let output = liveness_logger.await;
-
-        let elapsed = started_at.elapsed();
-        warn_when_period_overrun(elapsed, period, kind);
+        let elapsed = self.started_at.elapsed();
+        warn_when_period_overrun(elapsed, self.period, self.kind);

        IterationResult { output, elapsed }
    }
 }
-/// Attention: the `task` and `period` beocme labels of a pageserver-wide prometheus metric.
+
+// NB: the `task` and `period` are used for metrics labels.
 pub(crate) fn warn_when_period_overrun(
    elapsed: Duration,
    period: Duration,
@@ -616,7 +590,7 @@ pub(crate) fn warn_when_period_overrun(
            "task iteration took longer than the configured period"
        );
        crate::metrics::BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT
-            .with_label_values(&[task.as_static_str(), &format!("{}", period.as_secs())])
+            .with_label_values(&[task.into(), &format!("{}", period.as_secs())])
            .inc();
    }
 }
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -18,6 +18,7 @@ use arc_swap::{ArcSwap, ArcSwapOption};
 use bytes::Bytes;
 use camino::Utf8Path;
 use chrono::{DateTime, Utc};
+use compaction::CompactionOutcome;
 use enumset::EnumSet;
 use fail::fail_point;
 use futures::{stream::FuturesUnordered, StreamExt};
@@ -51,6 +52,8 @@ use tokio::{
 };
 use tokio_util::sync::CancellationToken;
 use tracing::*;
+use utils::critical;
+use utils::rate_limit::RateLimit;
 use utils::{
    fs_ext,
    guard_arc_swap::GuardArcSwap,
@@ -115,7 +118,7 @@ use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL;

 use crate::config::PageServerConf;
 use crate::keyspace::{KeyPartitioning, KeySpace};
-use crate::metrics::TimelineMetrics;
+use crate::metrics::{TimelineMetrics, DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_GLOBAL};
 use crate::pgdatadir_mapping::CalculateLogicalSizeError;
 use crate::tenant::config::TenantConfOpt;
 use pageserver_api::reltag::RelTag;
@@ -188,6 +191,19 @@ pub enum ImageLayerCreationMode {
    Initial,
 }

+#[derive(Clone, Debug, Default)]
+pub enum LastImageLayerCreationStatus {
+    Incomplete {
+        /// The last key of the partition (exclusive) that was processed in the last
+        /// image layer creation attempt. We will continue from this key in the next
+        /// attempt.
+        last_key: Key,
+    },
+    Complete,
+    #[default]
+    Initial,
+}
+
 impl std::fmt::Display for ImageLayerCreationMode {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{:?}", self)
@@ -340,10 +356,14 @@ pub struct Timeline {
    // Needed to ensure that we can't create a branch at a point that was already garbage collected
    pub latest_gc_cutoff_lsn: Rcu<Lsn>,

+    pub(crate) gc_compaction_layer_update_lock: tokio::sync::RwLock<()>,
+
    // List of child timelines and their branch points. This is needed to avoid
    // garbage collecting data that is still needed by the child timelines.
    pub(crate) gc_info: std::sync::RwLock<GcInfo>,

+    pub(crate) last_image_layer_creation_status: ArcSwap<LastImageLayerCreationStatus>,
+
    // It may change across major versions so for simplicity
    // keep it after running initdb for a timeline.
    // It is needed in checks when we want to error on some operations
@@ -512,6 +532,9 @@ impl GcInfo {
    pub(super) fn remove_child_offloaded(&mut self, child_id: TimelineId) -> bool {
        self.remove_child_maybe_offloaded(child_id, MaybeOffloaded::Yes)
    }
+    pub(crate) fn lsn_covered_by_lease(&self, lsn: Lsn) -> bool {
+        self.leases.contains_key(&lsn)
+    }
 }

 /// The `GcInfo` component describing which Lsns need to be retained.  Functionally, this
@@ -933,9 +956,16 @@ pub(crate) enum ShutdownMode {
    Hard,
 }

-struct ImageLayerCreationOutcome {
-    unfinished_image_layer: Option<ImageLayerWriter>,
-    next_start_key: Key,
+enum ImageLayerCreationOutcome {
+    /// We generated an image layer
+    Generated {
+        unfinished_image_layer: ImageLayerWriter,
+    },
+    /// The key range is empty
+    Empty,
+    /// (Only used in metadata image layer creation), after reading the metadata keys, we decide to skip
+    /// the image layer creation.
+    Skip,
 }

 /// Public interface functions
@@ -1044,7 +1074,7 @@ impl Timeline {
    }

    pub(crate) const MAX_GET_VECTORED_KEYS: u64 = 32;
-    pub(crate) const VEC_GET_LAYERS_VISITED_WARN_THRESH: f64 = 512.0;
+    pub(crate) const LAYERS_VISITED_WARN_THRESHOLD: u32 = 100;

    /// Look up multiple page versions at a given LSN
    ///
@@ -1194,6 +1224,7 @@ impl Timeline {
                            return (key, Err(err));
                        }
                    };
+                    DELTAS_PER_READ_GLOBAL.observe(converted.num_deltas() as f64);

                    // The walredo module expects the records to be descending in terms of Lsn.
                    // And we submit the IOs in that order, so, there shuold be no need to sort here.
@@ -1221,25 +1252,28 @@ impl Timeline {
        // (this is a requirement, not a bug). Skip updating the metric in these cases
        // to avoid infinite results.
        if !results.is_empty() {
-            let avg = layers_visited as f64 / results.len() as f64;
-            if avg >= Self::VEC_GET_LAYERS_VISITED_WARN_THRESH {
-                use utils::rate_limit::RateLimit;
-                static LOGGED: Lazy<Mutex<RateLimit>> =
+            // Record the total number of layers visited towards each key in the batch. While some
+            // layers may not intersect with a given read, and the cost of layer visits are
+            // amortized across the batch, each visited layer contributes directly to the observed
+            // latency for every read in the batch, which is what we care about.
+            if layers_visited >= Self::LAYERS_VISITED_WARN_THRESHOLD {
+                static LOG_PACER: Lazy<Mutex<RateLimit>> =
                    Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(60))));
-                let mut rate_limit = LOGGED.lock().unwrap();
-                rate_limit.call(|| {
+                LOG_PACER.lock().unwrap().call(|| {
+                    let num_keys = keyspace.total_raw_size();
+                    let num_pages = results.len();
                    tracing::info!(
                      shard_id = %self.tenant_shard_id.shard_slug(),
                      lsn = %lsn,
-                      "Vectored read for {} visited {} layers on average per key and {} in total. {}/{} pages were returned",
-                      keyspace, avg, layers_visited, results.len(), keyspace.total_raw_size());
+                      "Vectored read for {keyspace} visited {layers_visited} layers. Returned {num_pages}/{num_keys} pages.",
+                    );
                });
            }

-            // Note that this is an approximation. Tracking the exact number of layers visited
-            // per key requires virtually unbounded memory usage and is inefficient
-            // (i.e. segment tree tracking each range queried from a layer)
-            crate::metrics::VEC_READ_NUM_LAYERS_VISITED.observe(avg);
+            for _ in &results {
+                self.metrics.layers_per_read.observe(layers_visited as f64);
+                LAYERS_PER_READ_GLOBAL.observe(layers_visited as f64);
+            }
        }

        Ok(results)
@@ -1655,7 +1689,7 @@ impl Timeline {
        cancel: &CancellationToken,
        flags: EnumSet<CompactFlags>,
        ctx: &RequestContext,
-    ) -> Result<bool, CompactionError> {
+    ) -> Result<CompactionOutcome, CompactionError> {
        self.compact_with_options(
            cancel,
            CompactOptions {
@@ -1677,15 +1711,16 @@ impl Timeline {
        cancel: &CancellationToken,
        options: CompactOptions,
        ctx: &RequestContext,
-    ) -> Result<bool, CompactionError> {
+    ) -> Result<CompactionOutcome, CompactionError> {
        // most likely the cancellation token is from background task, but in tests it could be the
        // request task as well.

        let prepare = async move {
            let guard = self.compaction_lock.lock().await;

-            let permit = super::tasks::concurrent_background_tasks_rate_limit_permit(
+            let permit = super::tasks::acquire_concurrency_permit(
                BackgroundLoopKind::Compaction,
+                self.conf.use_compaction_semaphore,
                ctx,
            )
            .await;
@@ -1697,8 +1732,8 @@ impl Timeline {
        // compaction task goes over it's period (20s) which is quite often in production.
        let (_guard, _permit) = tokio::select! {
            tuple = prepare => { tuple },
-            _ = self.cancel.cancelled() => return Ok(false),
-            _ = cancel.cancelled() => return Ok(false),
+            _ = self.cancel.cancelled() => return Ok(CompactionOutcome::Done),
+            _ = cancel.cancelled() => return Ok(CompactionOutcome::Done),
        };

        let last_record_lsn = self.get_last_record_lsn();
@@ -1706,13 +1741,13 @@ impl Timeline {
        // Last record Lsn could be zero in case the timeline was just created
        if !last_record_lsn.is_valid() {
            warn!("Skipping compaction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}");
-            return Ok(false);
+            return Ok(CompactionOutcome::Done);
        }

        let result = match self.get_compaction_algorithm_settings().kind {
            CompactionAlgorithm::Tiered => {
                self.compact_tiered(cancel, ctx).await?;
-                Ok(false)
+                Ok(CompactionOutcome::Done)
            }
            CompactionAlgorithm::Legacy => self.compact_legacy(cancel, options, ctx).await,
        };
@@ -1811,7 +1846,7 @@ impl Timeline {
        self.last_record_lsn.shutdown();

        if let ShutdownMode::FreezeAndFlush = mode {
-            if let Some((open, frozen)) = self
+            let do_flush = if let Some((open, frozen)) = self
                .layers
                .read()
                .await
@@ -1820,43 +1855,54 @@ impl Timeline {
                .ok()
                .filter(|(open, frozen)| *open || *frozen > 0)
            {
-                tracing::info!(?open, frozen, "flushing and freezing on shutdown");
+                if self.remote_client.is_archived() == Some(true) {
+                    // No point flushing on shutdown for an archived timeline: it is not important
+                    // to have it nice and fresh after our restart, and trying to flush here might
+                    // race with trying to offload it (which also stops the flush loop)
+                    false
+                } else {
+                    tracing::info!(?open, frozen, "flushing and freezing on shutdown");
+                    true
+                }
            } else {
-                // this is double-shutdown, ignore it
-            }
+                // this is double-shutdown, it'll be a no-op
+                true
+            };

            // we shut down walreceiver above, so, we won't add anything more
            // to the InMemoryLayer; freeze it and wait for all frozen layers
            // to reach the disk & upload queue, then shut the upload queue and
            // wait for it to drain.
-            match self.freeze_and_flush().await {
-                Ok(_) => {
-                    // drain the upload queue
-                    // if we did not wait for completion here, it might be our shutdown process
-                    // didn't wait for remote uploads to complete at all, as new tasks can forever
-                    // be spawned.
-                    //
-                    // what is problematic is the shutting down of RemoteTimelineClient, because
-                    // obviously it does not make sense to stop while we wait for it, but what
-                    // about corner cases like s3 suddenly hanging up?
-                    self.remote_client.shutdown().await;
+            if do_flush {
+                match self.freeze_and_flush().await {
+                    Ok(_) => {
+                        // drain the upload queue
+                        // if we did not wait for completion here, it might be our shutdown process
+                        // didn't wait for remote uploads to complete at all, as new tasks can forever
+                        // be spawned.
+                        //
+                        // what is problematic is the shutting down of RemoteTimelineClient, because
+                        // obviously it does not make sense to stop while we wait for it, but what
+                        // about corner cases like s3 suddenly hanging up?
+                        self.remote_client.shutdown().await;
+                    }
+                    Err(FlushLayerError::Cancelled) => {
+                        // this is likely the second shutdown, ignore silently.
+                        // TODO: this can be removed once https://github.com/neondatabase/neon/issues/5080
+                        debug_assert!(self.cancel.is_cancelled());
+                    }
+                    Err(e) => {
+                        // Non-fatal.  Shutdown is infallible.  Failures to flush just mean that
+                        // we have some extra WAL replay to do next time the timeline starts.
+                        warn!("failed to freeze and flush: {e:#}");
+                    }
                }
-                Err(FlushLayerError::Cancelled) => {
-                    // this is likely the second shutdown, ignore silently.
-                    // TODO: this can be removed once https://github.com/neondatabase/neon/issues/5080
-                    debug_assert!(self.cancel.is_cancelled());
-                }
-                Err(e) => {
-                    // Non-fatal.  Shutdown is infallible.  Failures to flush just mean that
-                    // we have some extra WAL replay to do next time the timeline starts.
-                    warn!("failed to freeze and flush: {e:#}");
-                }
-            }

-            // `self.remote_client.shutdown().await` above should have already flushed everything from the queue, but
-            // we also do a final check here to ensure that the queue is empty.
-            if !self.remote_client.no_pending_work() {
-                warn!("still have pending work in remote upload queue, but continuing shutting down anyways");
+                // `self.remote_client.shutdown().await` above should have already flushed everything from the queue, but
+                // we also do a final check here to ensure that the queue is empty.
+                if !self.remote_client.no_pending_work() {
+                    warn!("still have pending work in remote upload queue, but continuing shutting down anyways");
+                }
            }
        }

@@ -2021,8 +2067,16 @@ impl Timeline {
    pub(crate) async fn download_layer(
        &self,
        layer_file_name: &LayerName,
-    ) -> anyhow::Result<Option<bool>> {
-        let Some(layer) = self.find_layer(layer_file_name).await? else {
+    ) -> Result<Option<bool>, super::storage_layer::layer::DownloadError> {
+        let Some(layer) = self
+            .find_layer(layer_file_name)
+            .await
+            .map_err(|e| match e {
+                layer_manager::Shutdown => {
+                    super::storage_layer::layer::DownloadError::TimelineShutdown
+                }
+            })?
+        else {
            return Ok(None);
        };

@@ -2323,6 +2377,18 @@ impl Timeline {
            )
    }

+    fn get_image_creation_preempt_threshold(&self) -> usize {
+        let tenant_conf = self.tenant_conf.load();
+        tenant_conf
+            .tenant_conf
+            .image_creation_preempt_threshold
+            .unwrap_or(
+                self.conf
+                    .default_tenant_conf
+                    .image_creation_preempt_threshold,
+            )
+    }
+
    /// Resolve the effective WAL receiver protocol to use for this tenant.
    ///
    /// Priority order is:
@@ -2432,6 +2498,7 @@ impl Timeline {
                shard_identity,
                pg_version,
                layers: Default::default(),
+                gc_compaction_layer_update_lock: tokio::sync::RwLock::new(()),

                walredo_mgr,
                walreceiver: Mutex::new(None),
@@ -2472,6 +2539,10 @@ impl Timeline {

                gc_info: std::sync::RwLock::new(GcInfo::default()),

+                last_image_layer_creation_status: ArcSwap::new(Arc::new(
+                    LastImageLayerCreationStatus::default(),
+                )),
+
                latest_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()),
                initdb_lsn: metadata.initdb_lsn(),

@@ -2562,7 +2633,7 @@ impl Timeline {
                return;
            }
            FlushLoopState::Exited => {
-                warn!(
+                info!(
                    "ignoring attempt to restart exited flush_loop {}/{}",
                    self.tenant_shard_id, self.timeline_id
                );
@@ -2986,8 +3057,9 @@ impl Timeline {
            let self_ref = &self;
            let skip_concurrency_limiter = &skip_concurrency_limiter;
            async move {
-                let wait_for_permit = super::tasks::concurrent_background_tasks_rate_limit_permit(
+                let wait_for_permit = super::tasks::acquire_concurrency_permit(
                    BackgroundLoopKind::InitialLogicalSizeCalculation,
+                    false,
                    background_ctx,
                );

@@ -3468,6 +3540,16 @@ impl Timeline {
        let mut completed_keyspace = KeySpace::default();
        let mut image_covered_keyspace = KeySpaceRandomAccum::new();

+        // Prevent GC from progressing while visiting the current timeline.
+        // If we are GC-ing because a new image layer was added while traversing
+        // the timeline, then it will remove layers that are required for fulfilling
+        // the current get request (read-path cannot "look back" and notice the new
+        // image layer).
+        let _gc_cutoff_holder = timeline.get_latest_gc_cutoff_lsn();
+
+        // See `compaction::compact_with_gc` for why we need this.
+        let _guard = timeline.gc_compaction_layer_update_lock.read().await;
+
        loop {
            if cancel.is_cancelled() {
                return Err(GetVectoredError::Cancelled);
@@ -3696,7 +3778,7 @@ impl Timeline {
            let mut guard = self.layers.write().await;
            guard
                .open_mut()?
-                .try_freeze_in_memory_layer(at, &self.last_freeze_at, write_lock)
+                .try_freeze_in_memory_layer(at, &self.last_freeze_at, write_lock, &self.metrics)
                .await
        };

@@ -4005,15 +4087,20 @@ impl Timeline {
            }

            let mut layers_to_upload = Vec::new();
-            layers_to_upload.extend(
-                self.create_image_layers(
+            let (generated_image_layers, is_complete) = self
+                .create_image_layers(
                    &partitions,
                    self.initdb_lsn,
                    ImageLayerCreationMode::Initial,
                    ctx,
+                    LastImageLayerCreationStatus::Initial,
                )
-                .await?,
+                .await?;
+            debug_assert!(
+                matches!(is_complete, LastImageLayerCreationStatus::Complete),
+                "init image generation mode must fully cover the keyspace"
            );
+            layers_to_upload.extend(generated_image_layers);

            (layers_to_upload, None)
        } else {
@@ -4270,7 +4357,7 @@ impl Timeline {
        Ok(result)
    }

-    // Is it time to create a new image layer for the given partition?
+    // Is it time to create a new image layer for the given partition? True if we want to generate.
    async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool {
        let threshold = self.get_image_creation_threshold();

@@ -4333,7 +4420,6 @@ impl Timeline {
        lsn: Lsn,
        ctx: &RequestContext,
        img_range: Range<Key>,
-        start: Key,
        io_concurrency: IoConcurrency,
    ) -> Result<ImageLayerCreationOutcome, CreateImageLayersError> {
        let mut wrote_keys = false;
@@ -4421,26 +4507,23 @@ impl Timeline {
                    lsn
                },
            );
-            Ok(ImageLayerCreationOutcome {
-                unfinished_image_layer: Some(image_layer_writer),
-                next_start_key: img_range.end,
+            Ok(ImageLayerCreationOutcome::Generated {
+                unfinished_image_layer: image_layer_writer,
            })
        } else {
-            // Special case: the image layer may be empty if this is a sharded tenant and the
-            // partition does not cover any keys owned by this shard.  In this case, to ensure
-            // we don't leave gaps between image layers, leave `start` where it is, so that the next
-            // layer we write will cover the key range that we just scanned.
            tracing::debug!("no data in range {}-{}", img_range.start, img_range.end);
-            Ok(ImageLayerCreationOutcome {
-                unfinished_image_layer: None,
-                next_start_key: start,
-            })
+            Ok(ImageLayerCreationOutcome::Empty)
        }
    }

    /// Create an image layer for metadata keys. This function produces one image layer for all metadata
    /// keys for now. Because metadata keys cannot exceed basebackup size limit, the image layer for it
    /// would not be too large to fit in a single image layer.
+    ///
+    /// Creating image layers for metadata keys are different from relational keys. Firstly, instead of
+    /// iterating each key and get an image for each of them, we do a `vectored_get` scan over the sparse
+    /// keyspace to get all images in one run. Secondly, we use a different image layer generation metrics
+    /// for metadata keys than relational keys, which is the number of delta files visited during the scan.
    #[allow(clippy::too_many_arguments)]
    async fn create_image_layer_for_metadata_keys(
        self: &Arc<Self>,
@@ -4450,12 +4533,13 @@ impl Timeline {
        ctx: &RequestContext,
        img_range: Range<Key>,
        mode: ImageLayerCreationMode,
-        start: Key,
        io_concurrency: IoConcurrency,
    ) -> Result<ImageLayerCreationOutcome, CreateImageLayersError> {
        // Metadata keys image layer creation.
        let mut reconstruct_state = ValuesReconstructState::new(io_concurrency);
        let begin = Instant::now();
+        // Directly use `get_vectored_impl` to skip the max_vectored_read_key limit check. Note that the keyspace should
+        // not contain too many keys, otherwise this takes a lot of memory.
        let data = self
            .get_vectored_impl(partition.clone(), lsn, &mut reconstruct_state, ctx)
            .await?;
@@ -4480,10 +4564,7 @@ impl Timeline {
        );

        if !trigger_generation && mode == ImageLayerCreationMode::Try {
-            return Ok(ImageLayerCreationOutcome {
-                unfinished_image_layer: None,
-                next_start_key: img_range.end,
-            });
+            return Ok(ImageLayerCreationOutcome::Skip);
        }
        if self.cancel.is_cancelled() {
            return Err(CreateImageLayersError::Cancelled);
@@ -4514,20 +4595,12 @@ impl Timeline {
                    lsn
                }
            );
-            Ok(ImageLayerCreationOutcome {
-                unfinished_image_layer: Some(image_layer_writer),
-                next_start_key: img_range.end,
+            Ok(ImageLayerCreationOutcome::Generated {
+                unfinished_image_layer: image_layer_writer,
            })
        } else {
-            // Special case: the image layer may be empty if this is a sharded tenant and the
-            // partition does not cover any keys owned by this shard. In this case, to ensure
-            // we don't leave gaps between image layers, leave `start` where it is, so that the next
-            // layer we write will cover the key range that we just scanned.
            tracing::debug!("no data in range {}-{}", img_range.start, img_range.end);
-            Ok(ImageLayerCreationOutcome {
-                unfinished_image_layer: None,
-                next_start_key: start,
-            })
+            Ok(ImageLayerCreationOutcome::Empty)
        }
    }

@@ -4583,6 +4656,8 @@ impl Timeline {
        decision
    }

+    /// Returns the image layers generated and an enum indicating whether the process is fully completed.
+    /// true = we have generate all image layers, false = we preempt the process for L0 compaction.
    #[tracing::instrument(skip_all, fields(%lsn, %mode))]
    async fn create_image_layers(
        self: &Arc<Timeline>,
@@ -4590,9 +4665,15 @@ impl Timeline {
        lsn: Lsn,
        mode: ImageLayerCreationMode,
        ctx: &RequestContext,
-    ) -> Result<Vec<ResidentLayer>, CreateImageLayersError> {
+        last_status: LastImageLayerCreationStatus,
+    ) -> Result<(Vec<ResidentLayer>, LastImageLayerCreationStatus), CreateImageLayersError> {
        let timer = self.metrics.create_images_time_histo.start_timer();

+        if partitioning.parts.is_empty() {
+            warn!("no partitions to create image layers for");
+            return Ok((vec![], LastImageLayerCreationStatus::Complete));
+        }
+
        // We need to avoid holes between generated image layers.
        // Otherwise LayerMap::image_layer_exists will return false if key range of some layer is covered by more than one
        // image layer with hole between them. In this case such layer can not be utilized by GC.
@@ -4604,15 +4685,65 @@ impl Timeline {
        // image layers  <100000000..100000099> and <200000000..200000199> are not completely covering it.
        let mut start = Key::MIN;

-        let check_for_image_layers = self.should_check_if_image_layers_required(lsn);
+        let check_for_image_layers =
+            if let LastImageLayerCreationStatus::Incomplete { last_key } = last_status {
+                info!(
+                    "resuming image layer creation: last_status=incomplete, continue from {}",
+                    last_key
+                );
+                true
+            } else {
+                self.should_check_if_image_layers_required(lsn)
+            };

        let mut batch_image_writer = BatchLayerWriter::new(self.conf).await?;

-        for partition in partitioning.parts.iter() {
+        let mut all_generated = true;
+
+        let mut partition_processed = 0;
+        let mut total_partitions = partitioning.parts.len();
+        let mut last_partition_processed = None;
+        let mut partition_parts = partitioning.parts.clone();
+
+        if let LastImageLayerCreationStatus::Incomplete { last_key } = last_status {
+            // We need to skip the partitions that have already been processed.
+            let mut found = false;
+            for (i, partition) in partition_parts.iter().enumerate() {
+                if last_key <= partition.end().unwrap() {
+                    // ```plain
+                    // |------|--------|----------|------|
+                    //              ^last_key
+                    //                    ^start from this partition
+                    // ```
+                    // Why `i+1` instead of `i`?
+                    // It is possible that the user did some writes after the previous image layer creation attempt so that
+                    // a relation grows in size, and the last_key is now in the middle of the partition. In this case, we
+                    // still want to skip this partition, so that we can make progress and avoid generating image layers over
+                    // the same partition. Doing a mod to ensure we don't end up with an empty vec.
+                    if i + 1 >= total_partitions {
+                        // In general, this case should not happen -- if last_key is on the last partition, the previous
+                        // iteration of image layer creation should return a complete status.
+                        break; // with found=false
+                    }
+                    partition_parts = partition_parts.split_off(i + 1); // Remove the first i + 1 elements
+                    total_partitions = partition_parts.len();
+                    // Update the start key to the partition start.
+                    start = partition_parts[0].start().unwrap();
+                    found = true;
+                    break;
+                }
+            }
+            if !found {
+                // Last key is within the last partition, or larger than all partitions.
+                return Ok((vec![], LastImageLayerCreationStatus::Complete));
+            }
+        }
+
+        for partition in partition_parts.iter() {
            if self.cancel.is_cancelled() {
                return Err(CreateImageLayersError::Cancelled);
            }
-
+            partition_processed += 1;
            let img_range = start..partition.ranges.last().unwrap().end;
            let compact_metadata = partition.overlaps(&Key::metadata_key_range());
            if compact_metadata {
@@ -4647,6 +4778,8 @@ impl Timeline {
                    lsn_range: PersistentLayerDesc::image_layer_lsn_range(lsn),
                    is_delta: false,
                }) {
+                    // TODO: this can be processed with the BatchLayerWriter::finish_with_discard
+                    // in the future.
                    tracing::info!(
                        "Skipping image layer at {lsn} {}..{}, already exists",
                        img_range.start,
@@ -4680,17 +4813,13 @@ impl Timeline {
                    .map_err(|_| CreateImageLayersError::Cancelled)?,
            );

-            let ImageLayerCreationOutcome {
-                unfinished_image_layer,
-                next_start_key,
-            } = if !compact_metadata {
+            let outcome = if !compact_metadata {
                self.create_image_layer_for_rel_blocks(
                    partition,
                    image_layer_writer,
                    lsn,
                    ctx,
                    img_range.clone(),
-                    start,
                    io_concurrency,
                )
                .await?
@@ -4702,18 +4831,58 @@ impl Timeline {
                    ctx,
                    img_range.clone(),
                    mode,
-                    start,
                    io_concurrency,
                )
                .await?
            };
-            start = next_start_key;
-            if let Some(unfinished_image_layer) = unfinished_image_layer {
-                batch_image_writer.add_unfinished_image_writer(
+            match outcome {
+                ImageLayerCreationOutcome::Empty => {
+                    // No data in this partition, so we don't need to create an image layer (for now).
+                    // The next image layer should cover this key range, so we don't advance the `start`
+                    // key.
+                }
+                ImageLayerCreationOutcome::Generated {
                    unfinished_image_layer,
-                    img_range,
-                    lsn,
-                );
+                } => {
+                    batch_image_writer.add_unfinished_image_writer(
+                        unfinished_image_layer,
+                        img_range.clone(),
+                        lsn,
+                    );
+                    // The next image layer should be generated right after this one.
+                    start = img_range.end;
+                }
+                ImageLayerCreationOutcome::Skip => {
+                    // We don't need to create an image layer for this partition.
+                    // The next image layer should NOT cover this range, otherwise
+                    // the keyspace becomes empty (reads don't go past image layers).
+                    start = img_range.end;
+                }
+            }
+
+            if let ImageLayerCreationMode::Try = mode {
+                // We have at least made some progress
+                if batch_image_writer.pending_layer_num() >= 1 {
+                    // The `Try` mode is currently only used on the compaction path. We want to avoid
+                    // image layer generation taking too long time and blocking L0 compaction. So in this
+                    // mode, we also inspect the current number of L0 layers and skip image layer generation
+                    // if there are too many of them.
+                    let num_of_l0_layers = {
+                        let layers = self.layers.read().await;
+                        layers.layer_map()?.level0_deltas().len()
+                    };
+                    let image_preempt_threshold = self.get_image_creation_preempt_threshold()
+                        * self.get_compaction_threshold();
+                    if image_preempt_threshold != 0 && num_of_l0_layers >= image_preempt_threshold {
+                        tracing::info!(
+                        "preempt image layer generation at {lsn} when processing partition {}..{}: too many L0 layers {}",
+                        partition.start().unwrap(), partition.end().unwrap(), num_of_l0_layers
+                    );
+                        last_partition_processed = Some(partition.clone());
+                        all_generated = false;
+                        break;
+                    }
+                }
            }
        }

@@ -4728,14 +4897,42 @@ impl Timeline {
            .open_mut()?
            .track_new_image_layers(&image_layers, &self.metrics);
        drop_wlock(guard);
-        timer.stop_and_record();
+        let duration = timer.stop_and_record();

        // Creating image layers may have caused some previously visible layers to be covered
        if !image_layers.is_empty() {
            self.update_layer_visibility().await?;
        }

-        Ok(image_layers)
+        let total_layer_size = image_layers
+            .iter()
+            .map(|l| l.metadata().file_size)
+            .sum::<u64>();
+
+        info!(
+            "created {} image layers ({} bytes) in {}s, processed {} out of {} partitions",
+            image_layers.len(),
+            total_layer_size,
+            duration.as_secs_f64(),
+            partition_processed,
+            total_partitions
+        );
+
+        Ok((
+            image_layers,
+            if all_generated {
+                LastImageLayerCreationStatus::Complete
+            } else {
+                LastImageLayerCreationStatus::Incomplete {
+                    last_key: if let Some(last_partition_processed) = last_partition_processed {
+                        last_partition_processed.end().unwrap_or(Key::MIN)
+                    } else {
+                        // This branch should be unreachable, but in case it happens, we can just return the start key.
+                        Key::MIN
+                    },
+                }
+            },
+        ))
    }

    /// Wait until the background initial logical size calculation is complete, or
@@ -5613,10 +5810,11 @@ impl Timeline {
                let img = match res {
                    Ok(img) => img,
                    Err(walredo::Error::Cancelled) => return Err(PageReconstructError::Cancelled),
-                    Err(walredo::Error::Other(e)) => {
+                    Err(walredo::Error::Other(err)) => {
+                        critical!("walredo failure during page reconstruction: {err:?}");
                        return Err(PageReconstructError::WalRedo(
-                            e.context("reconstruct a page image"),
-                        ))
+                            err.context("reconstruct a page image"),
+                        ));
                    }
                };
                Ok(img)
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -10,8 +10,8 @@ use std::sync::Arc;

 use super::layer_manager::LayerManager;
 use super::{
-    CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, ImageLayerCreationMode,
-    RecordedDuration, Timeline,
+    CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, GetVectoredError,
+    ImageLayerCreationMode, LastImageLayerCreationStatus, RecordedDuration, Timeline,
 };

 use anyhow::{anyhow, bail, Context};
@@ -26,6 +26,7 @@ use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId};
 use serde::Serialize;
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, info, info_span, trace, warn, Instrument};
+use utils::critical;
 use utils::id::TimelineId;

 use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
@@ -33,6 +34,7 @@ use crate::page_cache;
 use crate::statvfs::Statvfs;
 use crate::tenant::checks::check_valid_layermap;
 use crate::tenant::gc_block::GcBlock;
+use crate::tenant::layer_map::LayerMap;
 use crate::tenant::remote_timeline_client::WaitCompletionError;
 use crate::tenant::storage_layer::batch_split_writer::{
    BatchWriterResult, SplitDeltaLayerWriter, SplitImageLayerWriter,
@@ -262,13 +264,13 @@ impl GcCompactionQueue {
        ctx: &RequestContext,
        gc_block: &GcBlock,
        timeline: &Arc<Timeline>,
-    ) -> Result<bool, CompactionError> {
+    ) -> Result<CompactionOutcome, CompactionError> {
        let _one_op_at_a_time_guard = self.consumer_lock.lock().await;
        let has_pending_tasks;
        let (id, item) = {
            let mut guard = self.inner.lock().unwrap();
            let Some((id, item)) = guard.queued.pop_front() else {
-                return Ok(false);
+                return Ok(CompactionOutcome::Done);
            };
            guard.running = Some((id, item.clone()));
            has_pending_tasks = !guard.queued.is_empty();
@@ -323,7 +325,11 @@ impl GcCompactionQueue {
            let mut guard = self.inner.lock().unwrap();
            guard.running = None;
        }
-        Ok(has_pending_tasks)
+        Ok(if has_pending_tasks {
+            CompactionOutcome::Pending
+        } else {
+            CompactionOutcome::Done
+        })
    }

    #[allow(clippy::type_complexity)]
@@ -434,6 +440,11 @@ impl KeyHistoryRetention {
        if dry_run {
            return true;
        }
+        if LayerMap::is_l0(&key.key_range, key.is_delta) {
+            // gc-compaction should not produce L0 deltas, otherwise it will break the layer order.
+            // We should ignore such layers.
+            return true;
+        }
        let layer_generation;
        {
            let guard = tline.layers.read().await;
@@ -589,6 +600,17 @@ impl CompactionStatistics {
    }
 }

+#[derive(Default, Debug, Clone, Copy, PartialEq, Eq)]
+pub enum CompactionOutcome {
+    #[default]
+    /// No layers need to be compacted after this round. Compaction doesn't need
+    /// to be immediately scheduled.
+    Done,
+    /// Still has pending layers to be compacted after this round. Ideally, the scheduler
+    /// should immediately schedule another compaction.
+    Pending,
+}
+
 impl Timeline {
    /// TODO: cancellation
    ///
@@ -598,7 +620,7 @@ impl Timeline {
        cancel: &CancellationToken,
        options: CompactOptions,
        ctx: &RequestContext,
-    ) -> Result<bool, CompactionError> {
+    ) -> Result<CompactionOutcome, CompactionError> {
        if options
            .flags
            .contains(CompactFlags::EnhancedGcBottomMostCompaction)
@@ -606,7 +628,7 @@ impl Timeline {
            self.compact_with_gc(cancel, options, ctx)
                .await
                .map_err(CompactionError::Other)?;
-            return Ok(false);
+            return Ok(CompactionOutcome::Done);
        }

        if options.flags.contains(CompactFlags::DryRun) {
@@ -624,7 +646,13 @@ impl Timeline {

        // High level strategy for compaction / image creation:
        //
-        // 1. First, calculate the desired "partitioning" of the
+        // 1. First, do a L0 compaction to ensure we move the L0
+        // layers into the historic layer map get flat levels of
+        // layers. If we did not compact all L0 layers, we will
+        // prioritize compacting the timeline again and not do
+        // any of the compactions below.
+        //
+        // 2. Then, calculate the desired "partitioning" of the
        // currently in-use key space. The goal is to partition the
        // key space into roughly fixed-size chunks, but also take into
        // account any existing image layers, and try to align the
@@ -638,7 +666,7 @@ impl Timeline {
        // identify a relation. This is just an optimization,
        // though.
        //
-        // 2. Once we know the partitioning, for each partition,
+        // 3. Once we know the partitioning, for each partition,
        // decide if it's time to create a new image layer. The
        // criteria is: there has been too much "churn" since the last
        // image layer? The "churn" is fuzzy concept, it's a
@@ -646,15 +674,8 @@ impl Timeline {
        // total in the delta file. Or perhaps: if creating an image
        // file would allow to delete some older files.
        //
-        // 3. After that, we compact all level0 delta files if there
-        // are too many of them.  While compacting, we also garbage
-        // collect any page versions that are no longer needed because
-        // of the new image layers we created in step 2.
-        //
-        // TODO: This high level strategy hasn't been implemented yet.
-        // Below are functions compact_level0() and create_image_layers()
-        // but they are a bit ad hoc and don't quite work like it's explained
-        // above. Rewrite it.
+        // 4. In the end, if the tenant gets auto-sharded, we will run
+        // a shard-ancestor compaction.

        // Is the timeline being deleted?
        if self.is_stopping() {
@@ -666,10 +687,32 @@ impl Timeline {

        // Define partitioning schema if needed

-        // FIXME: the match should only cover repartitioning, not the next steps
-        let (partition_count, has_pending_tasks) = match self
+        // 1. L0 Compact
+        let l0_compaction_outcome = {
+            let timer = self.metrics.compact_time_histo.start_timer();
+            let l0_compaction_outcome = self
+                .compact_level0(
+                    target_file_size,
+                    options.flags.contains(CompactFlags::ForceL0Compaction),
+                    ctx,
+                )
+                .await?;
+            timer.stop_and_record();
+            l0_compaction_outcome
+        };
+
+        if let CompactionOutcome::Pending = l0_compaction_outcome {
+            // Yield and do not do any other kind of compaction. True means
+            // that we have pending L0 compaction tasks and the compaction scheduler
+            // will prioritize compacting this tenant/timeline again.
+            info!("skipping image layer generation and shard ancestor compaction due to L0 compaction did not include all layers.");
+            return Ok(CompactionOutcome::Pending);
+        }
+
+        // 2. Repartition and create image layers if necessary
+        let partition_count = match self
            .repartition(
-                self.get_last_record_lsn(),
+                self.get_last_record_lsn(), // TODO: use L0-L1 boundary
                self.get_compaction_target_size(),
                options.flags,
                ctx,
@@ -682,46 +725,50 @@ impl Timeline {
                    .access_stats_behavior(AccessStatsBehavior::Skip)
                    .build();

-                // 2. Compact
-                let timer = self.metrics.compact_time_histo.start_timer();
-                let fully_compacted = self
-                    .compact_level0(
-                        target_file_size,
-                        options.flags.contains(CompactFlags::ForceL0Compaction),
-                        ctx,
-                    )
-                    .await?;
-                timer.stop_and_record();
-
                let mut partitioning = dense_partitioning;
                partitioning
                    .parts
                    .extend(sparse_partitioning.into_dense().parts);

-                // 3. Create new image layers for partitions that have been modified
-                // "enough". Skip image layer creation if L0 compaction cannot keep up.
-                if fully_compacted {
-                    let image_layers = self
-                        .create_image_layers(
-                            &partitioning,
-                            lsn,
-                            if options
-                                .flags
-                                .contains(CompactFlags::ForceImageLayerCreation)
-                            {
-                                ImageLayerCreationMode::Force
-                            } else {
-                                ImageLayerCreationMode::Try
-                            },
-                            &image_ctx,
-                        )
-                        .await?;
+                // 3. Create new image layers for partitions that have been modified "enough".
+                let (image_layers, outcome) = self
+                    .create_image_layers(
+                        &partitioning,
+                        lsn,
+                        if options
+                            .flags
+                            .contains(CompactFlags::ForceImageLayerCreation)
+                        {
+                            ImageLayerCreationMode::Force
+                        } else {
+                            ImageLayerCreationMode::Try
+                        },
+                        &image_ctx,
+                        self.last_image_layer_creation_status
+                            .load()
+                            .as_ref()
+                            .clone(),
+                    )
+                    .await
+                    .inspect_err(|err| {
+                        if let CreateImageLayersError::GetVectoredError(
+                            GetVectoredError::MissingKey(_),
+                        ) = err
+                        {
+                            critical!("missing key during compaction: {err:?}");
+                        }
+                    })?;

-                    self.upload_new_image_layers(image_layers)?;
-                } else {
-                    info!("skipping image layer generation due to L0 compaction did not include all layers.");
+                self.last_image_layer_creation_status
+                    .store(Arc::new(outcome.clone()));
+
+                self.upload_new_image_layers(image_layers)?;
+                if let LastImageLayerCreationStatus::Incomplete { .. } = outcome {
+                    // Yield and do not do any other kind of compaction.
+                    info!("skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction).");
+                    return Ok(CompactionOutcome::Pending);
                }
-                (partitioning.parts.len(), !fully_compacted)
+                partitioning.parts.len()
            }
            Err(err) => {
                // no partitioning? This is normal, if the timeline was just created
@@ -733,10 +780,12 @@ impl Timeline {
                if !self.cancel.is_cancelled() && !err.is_cancelled() {
                    tracing::error!("could not compact, repartitioning keyspace failed: {err:?}");
                }
-                (1, false)
+                1
            }
        };

+        // 4. Shard ancestor compaction
+
        if self.shard_identity.count >= ShardCount::new(2) {
            // Limit the number of layer rewrites to the number of partitions: this means its
            // runtime should be comparable to a full round of image layer creations, rather than
@@ -746,7 +795,7 @@ impl Timeline {
            self.compact_shard_ancestors(rewrite_max, ctx).await?;
        }

-        Ok(has_pending_tasks)
+        Ok(CompactionOutcome::Done)
    }

    /// Check for layers that are elegible to be rewritten:
@@ -1003,11 +1052,11 @@ impl Timeline {
        target_file_size: u64,
        force_compaction_ignore_threshold: bool,
        ctx: &RequestContext,
-    ) -> Result<bool, CompactionError> {
+    ) -> Result<CompactionOutcome, CompactionError> {
        let CompactLevel0Phase1Result {
            new_layers,
            deltas_to_compact,
-            fully_compacted,
+            outcome,
        } = {
            let phase1_span = info_span!("compact_level0_phase1");
            let ctx = ctx.attached_child();
@@ -1036,12 +1085,12 @@ impl Timeline {

        if new_layers.is_empty() && deltas_to_compact.is_empty() {
            // nothing to do
-            return Ok(true);
+            return Ok(CompactionOutcome::Done);
        }

        self.finish_compact_batch(&new_layers, &Vec::new(), &deltas_to_compact)
            .await?;
-        Ok(fully_compacted)
+        Ok(outcome)
    }

    /// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment.
@@ -1112,10 +1161,16 @@ impl Timeline {
        // Accumulate the size of layers in `deltas_to_compact`
        let mut deltas_to_compact_bytes = 0;

-        // Under normal circumstances, we will accumulate up to compaction_interval L0s of size
+        // Under normal circumstances, we will accumulate up to compaction_upper_limit L0s of size
        // checkpoint_distance each.  To avoid edge cases using extra system resources, bound our
        // work in this function to only operate on this much delta data at once.
-        let delta_size_limit = self.get_compaction_upper_limit() as u64
+        //
+        // In general, compaction_threshold should be <= compaction_upper_limit, but in case that
+        // the constraint is not respected, we use the larger of the two.
+        let delta_size_limit = std::cmp::max(
+            self.get_compaction_upper_limit(),
+            self.get_compaction_threshold(),
+        ) as u64
            * std::cmp::max(self.get_checkpoint_distance(), DEFAULT_CHECKPOINT_DISTANCE);

        let mut fully_compacted = true;
@@ -1490,11 +1545,9 @@ impl Timeline {
                    .await
                    .map_err(CompactionError::Other)?;
            } else {
-                let shard = self.shard_identity.shard_index();
                let owner = self.shard_identity.get_shard_number(&key);
-                if cfg!(debug_assertions) {
-                    panic!("key {key} does not belong on shard {shard}, owned by {owner}");
-                }
+
+                // This happens after a shard split, when we're compacting an L0 created by our parent shard
                debug!("dropping key {key} during compaction (it belongs on shard {owner})");
            }

@@ -1579,7 +1632,11 @@ impl Timeline {
                .into_iter()
                .map(|x| x.drop_eviction_guard())
                .collect::<Vec<_>>(),
-            fully_compacted,
+            outcome: if fully_compacted {
+                CompactionOutcome::Done
+            } else {
+                CompactionOutcome::Pending
+            },
        })
    }
 }
@@ -1590,7 +1647,7 @@ struct CompactLevel0Phase1Result {
    deltas_to_compact: Vec<Layer>,
    // Whether we have included all L0 layers, or selected only part of them due to the
    // L0 compaction size limit.
-    fully_compacted: bool,
+    outcome: CompactionOutcome,
 }

 #[derive(Default)]
@@ -2906,10 +2963,45 @@ impl Timeline {
        // Between the sanity check and this compaction update, there could be new layers being flushed, but it should be fine because we only
        // operate on L1 layers.
        {
+            // Gc-compaction will rewrite the history of a key. This could happen in two ways:
+            //
+            // 1. We create an image layer to replace all the deltas below the compact LSN. In this case, assume
+            // we have 2 delta layers A and B, both below the compact LSN. We create an image layer I to replace
+            // A and B at the compact LSN. If the read path finishes reading A, yields, and now we update the layer
+            // map, the read path then cannot find any keys below A, reporting a missing key error, while the key
+            // now gets stored in I at the compact LSN.
+            //
+            // ---------------                                       ---------------
+            //   delta1@LSN20                                         image1@LSN20
+            // ---------------  (read path collects delta@LSN20,  => ---------------  (read path cannot find anything
+            //   delta1@LSN10    yields)                                               below LSN 20)
+            // ---------------
+            //
+            // 2. We create a delta layer to replace all the deltas below the compact LSN, and in the delta layers,
+            // we combines the history of a key into a single image. For example, we have deltas at LSN 1, 2, 3, 4,
+            // Assume one delta layer contains LSN 1, 2, 3 and the other contains LSN 4.
+            //
+            // We let gc-compaction combine delta 2, 3, 4 into an image at LSN 4, which produces a delta layer that
+            // contains the delta at LSN 1, the image at LSN 4. If the read path finishes reading the original delta
+            // layer containing 4, yields, and we update the layer map to put the delta layer.
+            //
+            // ---------------                                      ---------------
+            //   delta1@LSN4                                          image1@LSN4
+            // ---------------  (read path collects delta@LSN4,  => ---------------  (read path collects LSN4 and LSN1,
+            //  delta1@LSN1-3    yields)                              delta1@LSN1     which is an invalid history)
+            // ---------------                                      ---------------
+            //
+            // Therefore, the gc-compaction layer update operation should wait for all ongoing reads, block all pending reads,
+            // and only allow reads to continue after the update is finished.
+
+            let update_guard = self.gc_compaction_layer_update_lock.write().await;
+            // Acquiring the update guard ensures current read operations end and new read operations are blocked.
+            // TODO: can we use `latest_gc_cutoff` Rcu to achieve the same effect?
            let mut guard = self.layers.write().await;
            guard
                .open_mut()?
-                .finish_gc_compaction(&layer_selection, &compact_to, &self.metrics)
+                .finish_gc_compaction(&layer_selection, &compact_to, &self.metrics);
+            drop(update_guard); // Allow new reads to start ONLY after we finished updating the layer map.
        };

        // Schedule an index-only upload to update the `latest_gc_cutoff` in the index_part.json.
@@ -3186,11 +3278,7 @@ impl TimelineAdaptor {
            ranges: self.get_keyspace(key_range, lsn, ctx).await?,
        };
        // TODO set proper (stateful) start. The create_image_layer_for_rel_blocks function mostly
-        let start = Key::MIN;
-        let ImageLayerCreationOutcome {
-            unfinished_image_layer,
-            next_start_key: _,
-        } = self
+        let outcome = self
            .timeline
            .create_image_layer_for_rel_blocks(
                &keyspace,
@@ -3198,13 +3286,15 @@ impl TimelineAdaptor {
                lsn,
                ctx,
                key_range.clone(),
-                start,
                IoConcurrency::sequential(),
            )
            .await?;

-        if let Some(image_layer_writer) = unfinished_image_layer {
-            let (desc, path) = image_layer_writer.finish(ctx).await?;
+        if let ImageLayerCreationOutcome::Generated {
+            unfinished_image_layer,
+        } = outcome
+        {
+            let (desc, path) = unfinished_image_layer.finish(ctx).await?;
            let image_layer =
                Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)?;
            self.new_images.push(image_layer);
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -341,6 +341,13 @@ impl DeleteTimelineFlow {
        let tenant_shard_id = timeline.tenant_shard_id();
        let timeline_id = timeline.timeline_id();

+        // Take a tenant gate guard, because timeline deletion needs access to the tenant to update its manifest.
+        let Ok(tenant_guard) = tenant.gate.enter() else {
+            // It is safe to simply skip here, because we only schedule background work once the timeline is durably marked for deletion.
+            info!("Tenant is shutting down, timeline deletion will be resumed when it next starts");
+            return;
+        };
+
        task_mgr::spawn(
            task_mgr::BACKGROUND_RUNTIME.handle(),
            TaskKind::TimelineDeletionWorker,
@@ -348,6 +355,8 @@ impl DeleteTimelineFlow {
            Some(timeline_id),
            "timeline_delete",
            async move {
+                let _guard = tenant_guard;
+
                if let Err(err) = Self::background(guard, conf, &tenant, &timeline, remote_client).await {
                    // Only log as an error if it's not a cancellation.
                    if matches!(err, DeleteTimelineError::Cancelled) {
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -30,8 +30,11 @@ use crate::{
    pgdatadir_mapping::CollectKeySpaceError,
    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
    tenant::{
-        size::CalculateSyntheticSizeError, storage_layer::LayerVisibilityHint,
-        tasks::BackgroundLoopKind, timeline::EvictionError, LogicalSizeCalculationCause, Tenant,
+        size::CalculateSyntheticSizeError,
+        storage_layer::LayerVisibilityHint,
+        tasks::{sleep_random, BackgroundLoopKind, BackgroundLoopSemaphorePermit},
+        timeline::EvictionError,
+        LogicalSizeCalculationCause, Tenant,
    },
 };

@@ -80,8 +83,6 @@ impl Timeline {

    #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))]
    async fn eviction_task(self: Arc<Self>, tenant: Arc<Tenant>) {
-        use crate::tenant::tasks::random_init_delay;
-
        // acquire the gate guard only once within a useful span
        let Ok(guard) = self.gate.enter() else {
            return;
@@ -94,7 +95,7 @@ impl Timeline {
                EvictionPolicy::OnlyImitiate(lat) => lat.period,
                EvictionPolicy::NoEviction => Duration::from_secs(10),
            };
-            if random_init_delay(period, &self.cancel).await.is_err() {
+            if sleep_random(period, &self.cancel).await.is_err() {
                return;
            }
        }
@@ -330,9 +331,10 @@ impl Timeline {
        &self,
        cancel: &CancellationToken,
        ctx: &RequestContext,
-    ) -> ControlFlow<(), tokio::sync::SemaphorePermit<'static>> {
-        let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit(
+    ) -> ControlFlow<(), BackgroundLoopSemaphorePermit<'static>> {
+        let acquire_permit = crate::tenant::tasks::acquire_concurrency_permit(
            BackgroundLoopKind::Eviction,
+            false,
            ctx,
        );

@@ -374,7 +376,7 @@ impl Timeline {
        p: &EvictionPolicyLayerAccessThreshold,
        cancel: &CancellationToken,
        gate: &GateGuard,
-        permit: tokio::sync::SemaphorePermit<'static>,
+        permit: BackgroundLoopSemaphorePermit<'static>,
        ctx: &RequestContext,
    ) -> ControlFlow<()> {
        if !self.tenant_shard_id.is_shard_zero() {
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -91,6 +91,7 @@ impl LayerManager {
                layer_map,
                layer_fmgr: LayerFileManager(hashmap),
            }) => {
+                // NB: no need to decrement layer metrics; metrics are removed on timeline shutdown.
                let open = layer_map.open_layer.take();
                let frozen = layer_map.frozen_layers.len();
                let taken_writer_state = writer_state.take();
@@ -234,6 +235,7 @@ impl OpenLayerManager {
        lsn: Lsn,
        last_freeze_at: &AtomicLsn,
        write_lock: &mut tokio::sync::MutexGuard<'_, Option<TimelineWriterState>>,
+        metrics: &TimelineMetrics,
    ) -> bool {
        let Lsn(last_record_lsn) = lsn;
        let end_lsn = Lsn(last_record_lsn + 1);
@@ -242,6 +244,11 @@ impl OpenLayerManager {
            let open_layer_rc = Arc::clone(open_layer);
            open_layer.freeze(end_lsn).await;

+            // Increment the frozen layer metrics. This is decremented in `finish_flush_l0_layer()`.
+            // TODO: It would be nicer to do this via `InMemoryLayer::drop()`, but it requires a
+            // reference to the timeline metrics. Other methods use a metrics borrow as well.
+            metrics.inc_frozen_layer(open_layer);
+
            // The layer is no longer open, update the layer map to reflect this.
            // We will replace it with on-disk historics below.
            self.layer_map.frozen_layers.push_back(open_layer_rc);
@@ -298,6 +305,7 @@ impl OpenLayerManager {
            .frozen_layers
            .pop_front()
            .expect("there must be a inmem layer to flush");
+        metrics.dec_frozen_layer(&inmem);

        // Only one task may call this function at a time (for this
        // timeline). If two tasks tried to flush the same frozen
--- a/pageserver/src/tenant/timeline/uninit.rs
+++ b/pageserver/src/tenant/timeline/uninit.rs
@@ -1,4 +1,4 @@
-use std::{collections::hash_map::Entry, fs, sync::Arc};
+use std::{collections::hash_map::Entry, fs, future::Future, sync::Arc};

 use anyhow::Context;
 use camino::Utf8PathBuf;
@@ -8,7 +8,8 @@ use utils::{fs_ext, id::TimelineId, lsn::Lsn, sync::gate::GateGuard};
 use crate::{
    context::RequestContext,
    import_datadir,
-    tenant::{CreateTimelineIdempotency, Tenant, TimelineOrOffloaded},
+    span::debug_assert_current_span_has_tenant_and_timeline_id,
+    tenant::{CreateTimelineError, CreateTimelineIdempotency, Tenant, TimelineOrOffloaded},
 };

 use super::Timeline;
@@ -24,6 +25,9 @@ pub struct UninitializedTimeline<'t> {
    pub(crate) owning_tenant: &'t Tenant,
    timeline_id: TimelineId,
    raw_timeline: Option<(Arc<Timeline>, TimelineCreateGuard)>,
+    /// Whether we spawned the inner Timeline's tasks such that we must later shut it down
+    /// if aborting the timeline creation
+    needs_shutdown: bool,
 }

 impl<'t> UninitializedTimeline<'t> {
@@ -36,6 +40,50 @@ impl<'t> UninitializedTimeline<'t> {
            owning_tenant,
            timeline_id,
            raw_timeline,
+            needs_shutdown: false,
+        }
+    }
+
+    /// When writing data to this timeline during creation, use this wrapper: it will take care of
+    /// setup of Timeline tasks required for I/O (flush loop) and making sure they are torn down
+    /// later.
+    pub(crate) async fn write<F, Fut>(&mut self, f: F) -> anyhow::Result<()>
+    where
+        F: FnOnce(Arc<Timeline>) -> Fut,
+        Fut: Future<Output = Result<(), CreateTimelineError>>,
+    {
+        debug_assert_current_span_has_tenant_and_timeline_id();
+
+        // Remember that we did I/O (spawned the flush loop), so that we can check we shut it down on drop
+        self.needs_shutdown = true;
+
+        let timeline = self.raw_timeline()?;
+
+        // Spawn flush loop so that the Timeline is ready to accept writes
+        timeline.maybe_spawn_flush_loop();
+
+        // Invoke the provided function, which will write some data into the new timeline
+        if let Err(e) = f(timeline.clone()).await {
+            self.abort().await;
+            return Err(e.into());
+        }
+
+        // Flush the underlying timeline's ephemeral layers to disk
+        if let Err(e) = timeline
+            .freeze_and_flush()
+            .await
+            .context("Failed to flush after timeline creation writes")
+        {
+            self.abort().await;
+            return Err(e);
+        }
+
+        Ok(())
+    }
+
+    pub(crate) async fn abort(&self) {
+        if let Some((raw_timeline, _)) = self.raw_timeline.as_ref() {
+            raw_timeline.shutdown(super::ShutdownMode::Hard).await;
        }
    }

@@ -44,11 +92,13 @@ impl<'t> UninitializedTimeline<'t> {
    /// This function launches the flush loop if not already done.
    ///
    /// The caller is responsible for activating the timeline (function `.activate()`).
-    pub(crate) fn finish_creation(mut self) -> anyhow::Result<Arc<Timeline>> {
+    pub(crate) async fn finish_creation(mut self) -> anyhow::Result<Arc<Timeline>> {
        let timeline_id = self.timeline_id;
        let tenant_shard_id = self.owning_tenant.tenant_shard_id;

        if self.raw_timeline.is_none() {
+            self.abort().await;
+
            return Err(anyhow::anyhow!(
                "No timeline for initialization found for {tenant_shard_id}/{timeline_id}"
            ));
@@ -62,16 +112,25 @@ impl<'t> UninitializedTimeline<'t> {
            .0
            .get_disk_consistent_lsn();

-        anyhow::ensure!(
-            new_disk_consistent_lsn.is_valid(),
-            "new timeline {tenant_shard_id}/{timeline_id} has invalid disk_consistent_lsn"
-        );
+        if !new_disk_consistent_lsn.is_valid() {
+            self.abort().await;
+
+            return Err(anyhow::anyhow!(
+                "new timeline {tenant_shard_id}/{timeline_id} has invalid disk_consistent_lsn"
+            ));
+        }

        let mut timelines = self.owning_tenant.timelines.lock().unwrap();
        match timelines.entry(timeline_id) {
-            Entry::Occupied(_) => anyhow::bail!(
+            Entry::Occupied(_) => {
+                // Unexpected, bug in the caller.  Tenant is responsible for preventing concurrent creation of the same timeline.
+                //
+                // We do not call Self::abort here.  Because we don't cleanly shut down our Timeline, [`Self::drop`] should
+                // skip trying to delete the timeline directory too.
+                anyhow::bail!(
                "Found freshly initialized timeline {tenant_shard_id}/{timeline_id} in the tenant map"
-            ),
+                )
+            }
            Entry::Vacant(v) => {
                // after taking here should be no fallible operations, because the drop guard will not
                // cleanup after and would block for example the tenant deletion
@@ -93,36 +152,31 @@ impl<'t> UninitializedTimeline<'t> {

    /// Prepares timeline data by loading it from the basebackup archive.
    pub(crate) async fn import_basebackup_from_tar(
-        self,
+        mut self,
        tenant: Arc<Tenant>,
        copyin_read: &mut (impl tokio::io::AsyncRead + Send + Sync + Unpin),
        base_lsn: Lsn,
        broker_client: storage_broker::BrokerClientChannel,
        ctx: &RequestContext,
    ) -> anyhow::Result<Arc<Timeline>> {
-        let raw_timeline = self.raw_timeline()?;
+        self.write(|raw_timeline| async move {
+            import_datadir::import_basebackup_from_tar(&raw_timeline, copyin_read, base_lsn, ctx)
+                .await
+                .context("Failed to import basebackup")
+                .map_err(CreateTimelineError::Other)?;

-        import_datadir::import_basebackup_from_tar(raw_timeline, copyin_read, base_lsn, ctx)
-            .await
-            .context("Failed to import basebackup")?;
+            fail::fail_point!("before-checkpoint-new-timeline", |_| {
+                Err(CreateTimelineError::Other(anyhow::anyhow!(
+                    "failpoint before-checkpoint-new-timeline"
+                )))
+            });

-        // Flush the new layer files to disk, before we make the timeline as available to
-        // the outside world.
-        //
-        // Flush loop needs to be spawned in order to be able to flush.
-        raw_timeline.maybe_spawn_flush_loop();
-
-        fail::fail_point!("before-checkpoint-new-timeline", |_| {
-            anyhow::bail!("failpoint before-checkpoint-new-timeline");
-        });
-
-        raw_timeline
-            .freeze_and_flush()
-            .await
-            .context("Failed to flush after basebackup import")?;
+            Ok(())
+        })
+        .await?;

        // All the data has been imported. Insert the Timeline into the tenant's timelines map
-        let tl = self.finish_creation()?;
+        let tl = self.finish_creation().await?;
        tl.activate(tenant, broker_client, None, ctx);
        Ok(tl)
    }
@@ -143,12 +197,19 @@ impl<'t> UninitializedTimeline<'t> {

 impl Drop for UninitializedTimeline<'_> {
    fn drop(&mut self) {
-        if let Some((_, create_guard)) = self.raw_timeline.take() {
+        if let Some((timeline, create_guard)) = self.raw_timeline.take() {
            let _entered = info_span!("drop_uninitialized_timeline", tenant_id = %self.owning_tenant.tenant_shard_id.tenant_id, shard_id = %self.owning_tenant.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id).entered();
-            // This is unusual, but can happen harmlessly if the pageserver is stopped while
-            // creating a timeline.
-            info!("Timeline got dropped without initializing, cleaning its files");
-            cleanup_timeline_directory(create_guard);
+            if self.needs_shutdown && !timeline.gate.close_complete() {
+                // This should not happen: caller should call [`Self::abort`] on failures
+                tracing::warn!(
+                    "Timeline not shut down after initialization failure, cannot clean up files"
+                );
+            } else {
+                // This is unusual, but can happen harmlessly if the pageserver is stopped while
+                // creating a timeline.
+                info!("Timeline got dropped without initializing, cleaning its files");
+                cleanup_timeline_directory(create_guard);
+            }
        }
    }
 }
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -39,7 +39,7 @@ use crate::{
 use postgres_backend::is_expected_io_error;
 use postgres_connection::PgConnectionConfig;
 use postgres_ffi::waldecoder::WalStreamDecoder;
-use utils::{id::NodeId, lsn::Lsn, postgres_client::PostgresClientProtocol};
+use utils::{critical, id::NodeId, lsn::Lsn, postgres_client::PostgresClientProtocol};
 use utils::{pageserver_feedback::PageserverFeedback, sync::gate::GateError};

 /// Status of the connection.
@@ -355,6 +355,19 @@ pub(super) async fn handle_walreceiver_connection(
                // advances it to its end LSN. 0 is just an initialization placeholder.
                let mut modification = timeline.begin_modification(Lsn(0));

+                async fn commit(
+                    modification: &mut DatadirModification<'_>,
+                    ctx: &RequestContext,
+                    uncommitted: &mut u64,
+                ) -> anyhow::Result<()> {
+                    let stats = modification.stats();
+                    modification.commit(ctx).await?;
+                    WAL_INGEST.records_committed.inc_by(*uncommitted);
+                    WAL_INGEST.inc_values_committed(&stats);
+                    *uncommitted = 0;
+                    Ok(())
+                }
+
                if !records.is_empty() {
                    timeline
                        .metrics
@@ -366,8 +379,7 @@ pub(super) async fn handle_walreceiver_connection(
                    if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes)
                        && uncommitted_records > 0
                    {
-                        modification.commit(&ctx).await?;
-                        uncommitted_records = 0;
+                        commit(&mut modification, &ctx, &mut uncommitted_records).await?;
                    }

                    let local_next_record_lsn = interpreted.next_record_lsn;
@@ -381,6 +393,13 @@ pub(super) async fn handle_walreceiver_connection(
                        .await
                        .with_context(|| {
                            format!("could not ingest record at {local_next_record_lsn}")
+                        })
+                        .inspect_err(|err| {
+                            // TODO: we can't differentiate cancellation errors with
+                            // anyhow::Error, so just ignore it if we're cancelled.
+                            if !cancellation.is_cancelled() {
+                                critical!("{err:?}")
+                            }
                        })?;

                    uncommitted_records += 1;
@@ -396,8 +415,7 @@ pub(super) async fn handle_walreceiver_connection(
                        || modification.approx_pending_bytes()
                            > DatadirModification::MAX_PENDING_BYTES
                    {
-                        modification.commit(&ctx).await?;
-                        uncommitted_records = 0;
+                        commit(&mut modification, &ctx, &mut uncommitted_records).await?;
                    }
                }

@@ -415,7 +433,7 @@ pub(super) async fn handle_walreceiver_connection(

                if uncommitted_records > 0 || needs_last_record_lsn_advance {
                    // Commit any uncommitted records
-                    modification.commit(&ctx).await?;
+                    commit(&mut modification, &ctx, &mut uncommitted_records).await?;
                }

                if !caught_up && streaming_lsn >= end_of_wal {
@@ -442,10 +460,12 @@ pub(super) async fn handle_walreceiver_connection(
                    filtered: &mut u64,
                    ctx: &RequestContext,
                ) -> anyhow::Result<()> {
+                    let stats = modification.stats();
+                    modification.commit(ctx).await?;
                    WAL_INGEST
                        .records_committed
                        .inc_by(*uncommitted - *filtered);
-                    modification.commit(ctx).await?;
+                    WAL_INGEST.inc_values_committed(&stats);
                    *uncommitted = 0;
                    *filtered = 0;
                    Ok(())
@@ -507,6 +527,13 @@ pub(super) async fn handle_walreceiver_connection(
                            .await
                            .with_context(|| {
                                format!("could not ingest record at {next_record_lsn}")
+                            })
+                            .inspect_err(|err| {
+                                // TODO: we can't differentiate cancellation errors with
+                                // anyhow::Error, so just ignore it if we're cancelled.
+                                if !cancellation.is_cancelled() {
+                                    critical!("{err:?}")
+                                }
                            })?;
                        if !ingested {
                            tracing::debug!("ingest: filtered out record @ LSN {next_record_lsn}");
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -234,6 +234,19 @@ impl VirtualFile {
    ) -> (FullSlice<Buf>, Result<usize, Error>) {
        self.inner.write_all(buf, ctx).await
    }
+
+    async fn read_to_end(&mut self, buf: &mut Vec<u8>, ctx: &RequestContext) -> Result<(), Error> {
+        self.inner.read_to_end(buf, ctx).await
+    }
+
+    pub(crate) async fn read_to_string(
+        &mut self,
+        ctx: &RequestContext,
+    ) -> Result<String, anyhow::Error> {
+        let mut buf = Vec::new();
+        self.read_to_end(&mut buf, ctx).await?;
+        Ok(String::from_utf8(buf)?)
+    }
 }

 /// Indicates whether to enable fsync, fdatasync, or O_SYNC/O_DSYNC when writing
@@ -993,6 +1006,24 @@ impl VirtualFileInner {
            (buf, result)
        })
    }
+
+    async fn read_to_end(&mut self, buf: &mut Vec<u8>, ctx: &RequestContext) -> Result<(), Error> {
+        let mut tmp = vec![0; 128];
+        loop {
+            let slice = tmp.slice(..128);
+            let (slice, res) = self.read_at(slice, self.pos, ctx).await;
+            match res {
+                Ok(0) => return Ok(()),
+                Ok(n) => {
+                    self.pos += n as u64;
+                    buf.extend_from_slice(&slice[..n]);
+                }
+                Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
+                Err(e) => return Err(e),
+            }
+            tmp = slice.into_inner();
+        }
+    }
 }

 // Adapted from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135
@@ -1237,10 +1268,6 @@ impl VirtualFile {
    ) -> Result<crate::tenant::block_io::BlockLease<'_>, std::io::Error> {
        self.inner.read_blk(blknum, ctx).await
    }
-
-    async fn read_to_end(&mut self, buf: &mut Vec<u8>, ctx: &RequestContext) -> Result<(), Error> {
-        self.inner.read_to_end(buf, ctx).await
-    }
 }

 #[cfg(test)]
@@ -1260,24 +1287,6 @@ impl VirtualFileInner {
            slice.into_inner(),
        ))
    }
-
-    async fn read_to_end(&mut self, buf: &mut Vec<u8>, ctx: &RequestContext) -> Result<(), Error> {
-        let mut tmp = vec![0; 128];
-        loop {
-            let slice = tmp.slice(..128);
-            let (slice, res) = self.read_at(slice, self.pos, ctx).await;
-            match res {
-                Ok(0) => return Ok(()),
-                Ok(n) => {
-                    self.pos += n as u64;
-                    buf.extend_from_slice(&slice[..n]);
-                }
-                Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
-                Err(e) => return Err(e),
-            }
-            tmp = slice.into_inner();
-        }
-    }
 }

 impl Drop for VirtualFileInner {
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -28,17 +28,9 @@ use std::time::Duration;
 use std::time::Instant;
 use std::time::SystemTime;

-use pageserver_api::shard::ShardIdentity;
-use postgres_ffi::fsm_logical_to_physical;
-use postgres_ffi::walrecord::*;
-use postgres_ffi::{dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, TimestampTz};
-use wal_decoder::models::*;
-
 use anyhow::{bail, Result};
 use bytes::{Buf, Bytes};
 use tracing::*;
-use utils::failpoint_support;
-use utils::rate_limit::RateLimit;

 use crate::context::RequestContext;
 use crate::metrics::WAL_INGEST;
@@ -50,11 +42,18 @@ use crate::ZERO_PAGE;
 use pageserver_api::key::rel_block_to_key;
 use pageserver_api::record::NeonWalRecord;
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
+use pageserver_api::shard::ShardIdentity;
+use postgres_ffi::fsm_logical_to_physical;
 use postgres_ffi::pg_constants;
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
+use postgres_ffi::walrecord::*;
 use postgres_ffi::TransactionId;
+use postgres_ffi::{dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, TimestampTz};
 use utils::bin_ser::SerializeError;
 use utils::lsn::Lsn;
+use utils::rate_limit::RateLimit;
+use utils::{critical, failpoint_support};
+use wal_decoder::models::*;

 enum_pgversion! {CheckPoint, pgv::CheckPoint}

@@ -327,93 +326,75 @@ impl WalIngest {
        let mut new_vm_blk = new_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK);
        let mut old_vm_blk = old_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK);

-        // Sometimes, Postgres seems to create heap WAL records with the
-        // ALL_VISIBLE_CLEARED flag set, even though the bit in the VM page is
-        // not set. In fact, it's possible that the VM page does not exist at all.
-        // In that case, we don't want to store a record to clear the VM bit;
-        // replaying it would fail to find the previous image of the page, because
-        // it doesn't exist. So check if the VM page(s) exist, and skip the WAL
-        // record if it doesn't.
-        //
-        // TODO: analyze the metrics and tighten this up accordingly. This logic
-        // implicitly assumes that VM pages see explicit WAL writes before
-        // implicit ClearVmBits, and will otherwise silently drop updates.
+        // VM bits can only be cleared on the shard(s) owning the VM relation, and must be within
+        // its view of the VM relation size. Out of caution, error instead of failing WAL ingestion,
+        // as there has historically been cases where PostgreSQL has cleared spurious VM pages. See:
+        // https://github.com/neondatabase/neon/pull/10634.
        let Some(vm_size) = get_relsize(modification, vm_rel, ctx).await? else {
-            WAL_INGEST
-                .clear_vm_bits_unknown
-                .with_label_values(&["relation"])
-                .inc();
+            critical!("clear_vm_bits for unknown VM relation {vm_rel}");
            return Ok(());
        };
        if let Some(blknum) = new_vm_blk {
            if blknum >= vm_size {
-                WAL_INGEST
-                    .clear_vm_bits_unknown
-                    .with_label_values(&["new_page"])
-                    .inc();
+                critical!("new_vm_blk {blknum} not in {vm_rel} of size {vm_size}");
                new_vm_blk = None;
            }
        }
        if let Some(blknum) = old_vm_blk {
            if blknum >= vm_size {
-                WAL_INGEST
-                    .clear_vm_bits_unknown
-                    .with_label_values(&["old_page"])
-                    .inc();
+                critical!("old_vm_blk {blknum} not in {vm_rel} of size {vm_size}");
                old_vm_blk = None;
            }
        }

-        if new_vm_blk.is_some() || old_vm_blk.is_some() {
-            if new_vm_blk == old_vm_blk {
-                // An UPDATE record that needs to clear the bits for both old and the
-                // new page, both of which reside on the same VM page.
+        if new_vm_blk.is_none() && old_vm_blk.is_none() {
+            return Ok(());
+        } else if new_vm_blk == old_vm_blk {
+            // An UPDATE record that needs to clear the bits for both old and the new page, both of
+            // which reside on the same VM page.
+            self.put_rel_wal_record(
+                modification,
+                vm_rel,
+                new_vm_blk.unwrap(),
+                NeonWalRecord::ClearVisibilityMapFlags {
+                    new_heap_blkno,
+                    old_heap_blkno,
+                    flags,
+                },
+                ctx,
+            )
+            .await?;
+        } else {
+            // Clear VM bits for one heap page, or for two pages that reside on different VM pages.
+            if let Some(new_vm_blk) = new_vm_blk {
                self.put_rel_wal_record(
                    modification,
                    vm_rel,
-                    new_vm_blk.unwrap(),
+                    new_vm_blk,
                    NeonWalRecord::ClearVisibilityMapFlags {
                        new_heap_blkno,
+                        old_heap_blkno: None,
+                        flags,
+                    },
+                    ctx,
+                )
+                .await?;
+            }
+            if let Some(old_vm_blk) = old_vm_blk {
+                self.put_rel_wal_record(
+                    modification,
+                    vm_rel,
+                    old_vm_blk,
+                    NeonWalRecord::ClearVisibilityMapFlags {
+                        new_heap_blkno: None,
                        old_heap_blkno,
                        flags,
                    },
                    ctx,
                )
                .await?;
-            } else {
-                // Clear VM bits for one heap page, or for two pages that reside on
-                // different VM pages.
-                if let Some(new_vm_blk) = new_vm_blk {
-                    self.put_rel_wal_record(
-                        modification,
-                        vm_rel,
-                        new_vm_blk,
-                        NeonWalRecord::ClearVisibilityMapFlags {
-                            new_heap_blkno,
-                            old_heap_blkno: None,
-                            flags,
-                        },
-                        ctx,
-                    )
-                    .await?;
-                }
-                if let Some(old_vm_blk) = old_vm_blk {
-                    self.put_rel_wal_record(
-                        modification,
-                        vm_rel,
-                        old_vm_blk,
-                        NeonWalRecord::ClearVisibilityMapFlags {
-                            new_heap_blkno: None,
-                            old_heap_blkno,
-                            flags,
-                        },
-                        ctx,
-                    )
-                    .await?;
-                }
            }
        }
-
        Ok(())
    }

--- a/pageserver/src/walredo/process.rs
+++ b/pageserver/src/walredo/process.rs
@@ -79,6 +79,14 @@ impl WalRedoProcess {
            .env_clear()
            .env("LD_LIBRARY_PATH", &pg_lib_dir_path)
            .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
+            .env(
+                "ASAN_OPTIONS",
+                std::env::var("ASAN_OPTIONS").unwrap_or_default(),
+            )
+            .env(
+                "UBSAN_OPTIONS",
+                std::env::var("UBSAN_OPTIONS").unwrap_or_default(),
+            )
            // NB: The redo process is not trusted after we sent it the first
            // walredo work. Before that, it is trusted. Specifically, we trust
            // it to
--- a/Show More
+++ b/Show More