Merge branch 'main' into yuchen/direct-io-reads-in-ci-by-default

This commit is contained in:
Yuchen Liang
2024-11-11 16:26:01 -05:00
committed by GitHub
37 changed files with 1066 additions and 389 deletions

View File

@@ -20,3 +20,4 @@ config-variables:
- REMOTE_STORAGE_AZURE_REGION
- SLACK_UPCOMING_RELEASE_CHANNEL_ID
- DEV_AWS_OIDC_ROLE_ARN
- BENCHMARK_INGEST_TARGET_PROJECTID

.github/workflows/ingest_benchmark.yml — vendored normal file, 372 lines added
View File

@@ -0,0 +1,372 @@
name: Benchmarking
on:
# uncomment to run on push for debugging your PR
# push:
# branches: [ your branch ]
schedule:
# * is a special character in YAML so you have to quote this string
# ┌───────────── minute (0 - 59)
# │ ┌───────────── hour (0 - 23)
# │ │ ┌───────────── day of the month (1 - 31)
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
- cron: '0 9 * * *' # run once a day, timezone is UTC
workflow_dispatch: # adds ability to run this manually
defaults:
run:
shell: bash -euxo pipefail {0}
concurrency:
# Allow only one workflow globally because we need dedicated resources which only exist once
group: ingest-bench-workflow
cancel-in-progress: true
jobs:
ingest:
strategy:
matrix:
target_project: [new_empty_project, large_existing_project]
permissions:
contents: write
statuses: write
id-token: write # aws-actions/configure-aws-credentials
env:
PG_CONFIG: /tmp/neon/pg_install/v16/bin/pg_config
PSQL: /tmp/neon/pg_install/v16/bin/psql
PG_16_LIB_PATH: /tmp/neon/pg_install/v16/lib
PGCOPYDB: /pgcopydb/bin/pgcopydb
PGCOPYDB_LIB_PATH: /pgcopydb/lib
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: neondatabase/build-tools:pinned-bookworm
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init
timeout-minutes: 1440
steps:
- uses: actions/checkout@v4
- name: Configure AWS credentials # necessary to download artefacts
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 18000 # 5 hours is currently the maximum session duration associated with the IAM role
- name: Download Neon artifact
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
path: /tmp/neon/
prefix: latest
- name: Create Neon Project
if: ${{ matrix.target_project == 'new_empty_project' }}
id: create-neon-project-ingest-target
uses: ./.github/actions/neon-project-create
with:
region_id: aws-us-east-2
postgres_version: 16
compute_units: '[7, 7]' # we want to test a large compute here to avoid a compute-side bottleneck
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
- name: Initialize Neon project and retrieve current backpressure seconds
if: ${{ matrix.target_project == 'new_empty_project' }}
env:
NEW_PROJECT_CONNSTR: ${{ steps.create-neon-project-ingest-target.outputs.dsn }}
NEW_PROJECT_ID: ${{ steps.create-neon-project-ingest-target.outputs.project_id }}
run: |
echo "Initializing Neon project with project_id: ${NEW_PROJECT_ID}"
export LD_LIBRARY_PATH=${PG_16_LIB_PATH}
${PSQL} "${NEW_PROJECT_CONNSTR}" -c "CREATE EXTENSION IF NOT EXISTS neon; CREATE EXTENSION IF NOT EXISTS neon_utils;"
BACKPRESSURE_TIME_BEFORE_INGEST=$(${PSQL} "${NEW_PROJECT_CONNSTR}" -t -c "select backpressure_throttling_time()/1000000;")
echo "BACKPRESSURE_TIME_BEFORE_INGEST=${BACKPRESSURE_TIME_BEFORE_INGEST}" >> $GITHUB_ENV
echo "NEW_PROJECT_CONNSTR=${NEW_PROJECT_CONNSTR}" >> $GITHUB_ENV
- name: Create Neon Branch for large tenant
if: ${{ matrix.target_project == 'large_existing_project' }}
id: create-neon-branch-ingest-target
uses: ./.github/actions/neon-branch-create
with:
project_id: ${{ vars.BENCHMARK_INGEST_TARGET_PROJECTID }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
- name: Initialize Neon branch and retrieve current backpressure seconds
if: ${{ matrix.target_project == 'large_existing_project' }}
env:
NEW_PROJECT_CONNSTR: ${{ steps.create-neon-branch-ingest-target.outputs.dsn }}
NEW_BRANCH_ID: ${{ steps.create-neon-branch-ingest-target.outputs.branch_id }}
run: |
echo "Initializing Neon branch with branch_id: ${NEW_BRANCH_ID}"
export LD_LIBRARY_PATH=${PG_16_LIB_PATH}
# Extract the part before the database name
base_connstr="${NEW_PROJECT_CONNSTR%/*}"
# Extract the query parameters (if any) after the database name
query_params="${NEW_PROJECT_CONNSTR#*\?}"
# Reconstruct the new connection string
if [ "$query_params" != "$NEW_PROJECT_CONNSTR" ]; then
new_connstr="${base_connstr}/neondb?${query_params}"
else
new_connstr="${base_connstr}/neondb"
fi
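# e.g. (hypothetical connstr): postgres://user@host/neondb?sslmode=require
#   -> base_connstr=postgres://user@host, query_params=sslmode=require,
#      new_connstr=postgres://user@host/neondb?sslmode=require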
${PSQL} "${new_connstr}" -c "drop database ludicrous;"
${PSQL} "${new_connstr}" -c "CREATE DATABASE ludicrous;"
if [ "$query_params" != "$NEW_PROJECT_CONNSTR" ]; then
NEW_PROJECT_CONNSTR="${base_connstr}/ludicrous?${query_params}"
else
NEW_PROJECT_CONNSTR="${base_connstr}/ludicrous"
fi
${PSQL} "${NEW_PROJECT_CONNSTR}" -c "CREATE EXTENSION IF NOT EXISTS neon; CREATE EXTENSION IF NOT EXISTS neon_utils;"
BACKPRESSURE_TIME_BEFORE_INGEST=$(${PSQL} "${NEW_PROJECT_CONNSTR}" -t -c "select backpressure_throttling_time()/1000000;")
echo "BACKPRESSURE_TIME_BEFORE_INGEST=${BACKPRESSURE_TIME_BEFORE_INGEST}" >> $GITHUB_ENV
echo "NEW_PROJECT_CONNSTR=${NEW_PROJECT_CONNSTR}" >> $GITHUB_ENV
- name: Create pgcopydb filter file
run: |
cat << EOF > /tmp/pgcopydb_filter.txt
[include-only-table]
public.events
public.emails
public.email_transmissions
public.payments
public.editions
public.edition_modules
public.sp_content
public.email_broadcasts
public.user_collections
public.devices
public.user_accounts
public.lessons
public.lesson_users
public.payment_methods
public.orders
public.course_emails
public.modules
public.users
public.module_users
public.courses
public.payment_gateway_keys
public.accounts
public.roles
public.payment_gateways
public.management
public.event_names
EOF
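# the heredoc above writes pgcopydb's INI-style filter file; its
# [include-only-table] section restricts the clone to the listed tables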
- name: Invoke pgcopydb
env:
BENCHMARK_INGEST_SOURCE_CONNSTR: ${{ secrets.BENCHMARK_INGEST_SOURCE_CONNSTR }}
run: |
export LD_LIBRARY_PATH=${PGCOPYDB_LIB_PATH}:${PG_16_LIB_PATH}
export PGCOPYDB_SOURCE_PGURI="${BENCHMARK_INGEST_SOURCE_CONNSTR}"
export PGCOPYDB_TARGET_PGURI="${NEW_PROJECT_CONNSTR}"
export PGOPTIONS="-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7"
${PG_CONFIG} --bindir
${PGCOPYDB} --version
${PGCOPYDB} clone --skip-vacuum --no-owner --no-acl --skip-db-properties --table-jobs 4 \
--index-jobs 4 --restore-jobs 4 --split-tables-larger-than 10GB --skip-extensions \
--use-copy-binary --filters /tmp/pgcopydb_filter.txt 2>&1 | tee /tmp/pgcopydb_${{ matrix.target_project }}.log
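# (--split-tables-larger-than 10GB lets big tables be copied by several COPY
# processes, as the sample log below shows; --skip-extensions keeps the
# extensions created in the init step instead of re-creating them)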
# create dummy pgcopydb log to test parsing
# - name: create dummy log for parser test
# run: |
# cat << EOF > /tmp/pgcopydb_${{ matrix.target_project }}.log
# 2024-11-04 18:00:53.433 500861 INFO main.c:136 Running pgcopydb version 0.17.10.g8361a93 from "/usr/lib/postgresql/17/bin/pgcopydb"
# 2024-11-04 18:00:53.434 500861 INFO cli_common.c:1225 [SOURCE] Copying database from "postgres://neondb_owner@ep-bitter-shape-w2c1ir0a.us-east-2.aws.neon.build/neondb?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60"
# 2024-11-04 18:00:53.434 500861 INFO cli_common.c:1226 [TARGET] Copying database into "postgres://neondb_owner@ep-icy-union-w25qd5pj.us-east-2.aws.neon.build/ludicrous?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60"
# 2024-11-04 18:00:53.442 500861 INFO copydb.c:105 Using work dir "/tmp/pgcopydb"
# 2024-11-04 18:00:53.541 500861 INFO snapshot.c:107 Exported snapshot "00000008-00000033-1" from the source database
# 2024-11-04 18:00:53.556 500865 INFO cli_clone_follow.c:543 STEP 1: fetch source database tables, indexes, and sequences
# 2024-11-04 18:00:54.570 500865 INFO copydb_schema.c:716 Splitting source candidate tables larger than 10 GB
# 2024-11-04 18:00:54.570 500865 INFO copydb_schema.c:829 Table public.events is 96 GB large which is larger than --split-tables-larger-than 10 GB, and does not have a unique column of type integer: splitting by CTID
# 2024-11-04 18:01:05.538 500865 INFO copydb_schema.c:905 Table public.events is 96 GB large, 10 COPY processes will be used, partitioning on ctid.
# 2024-11-04 18:01:05.564 500865 INFO copydb_schema.c:905 Table public.email_transmissions is 27 GB large, 4 COPY processes will be used, partitioning on id.
# 2024-11-04 18:01:05.584 500865 INFO copydb_schema.c:905 Table public.lessons is 25 GB large, 4 COPY processes will be used, partitioning on id.
# 2024-11-04 18:01:05.605 500865 INFO copydb_schema.c:905 Table public.lesson_users is 16 GB large, 3 COPY processes will be used, partitioning on id.
# 2024-11-04 18:01:05.605 500865 INFO copydb_schema.c:761 Fetched information for 26 tables (including 4 tables split in 21 partitions total), with an estimated total of 907 million tuples and 175 GB on-disk
# 2024-11-04 18:01:05.687 500865 INFO copydb_schema.c:968 Fetched information for 57 indexes (supporting 25 constraints)
# 2024-11-04 18:01:05.753 500865 INFO sequences.c:78 Fetching information for 24 sequences
# 2024-11-04 18:01:05.903 500865 INFO copydb_schema.c:1122 Fetched information for 4 extensions
# 2024-11-04 18:01:06.178 500865 INFO copydb_schema.c:1538 Found 0 indexes (supporting 0 constraints) in the target database
# 2024-11-04 18:01:06.184 500865 INFO cli_clone_follow.c:584 STEP 2: dump the source database schema (pre/post data)
# 2024-11-04 18:01:06.186 500865 INFO pgcmd.c:468 /usr/lib/postgresql/16/bin/pg_dump -Fc --snapshot 00000008-00000033-1 --section=pre-data --section=post-data --file /tmp/pgcopydb/schema/schema.dump 'postgres://neondb_owner@ep-bitter-shape-w2c1ir0a.us-east-2.aws.neon.build/neondb?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60'
# 2024-11-04 18:01:06.952 500865 INFO cli_clone_follow.c:592 STEP 3: restore the pre-data section to the target database
# 2024-11-04 18:01:07.004 500865 INFO pgcmd.c:1001 /usr/lib/postgresql/16/bin/pg_restore --dbname 'postgres://neondb_owner@ep-icy-union-w25qd5pj.us-east-2.aws.neon.build/ludicrous?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60' --section pre-data --jobs 4 --no-owner --no-acl --use-list /tmp/pgcopydb/schema/pre-filtered.list /tmp/pgcopydb/schema/schema.dump
# 2024-11-04 18:01:07.438 500874 INFO table-data.c:656 STEP 4: starting 4 table-data COPY processes
# 2024-11-04 18:01:07.451 500877 INFO vacuum.c:139 STEP 8: skipping VACUUM jobs per --skip-vacuum
# 2024-11-04 18:01:07.457 500875 INFO indexes.c:182 STEP 6: starting 4 CREATE INDEX processes
# 2024-11-04 18:01:07.457 500875 INFO indexes.c:183 STEP 7: constraints are built by the CREATE INDEX processes
# 2024-11-04 18:01:07.507 500865 INFO blobs.c:74 Skipping large objects: none found.
# 2024-11-04 18:01:07.509 500865 INFO sequences.c:194 STEP 9: reset sequences values
# 2024-11-04 18:01:07.510 500886 INFO sequences.c:290 Set sequences values on the target database
# 2024-11-04 20:49:00.587 500865 INFO cli_clone_follow.c:608 STEP 10: restore the post-data section to the target database
# 2024-11-04 20:49:00.600 500865 INFO pgcmd.c:1001 /usr/lib/postgresql/16/bin/pg_restore --dbname 'postgres://neondb_owner@ep-icy-union-w25qd5pj.us-east-2.aws.neon.build/ludicrous?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60' --section post-data --jobs 4 --no-owner --no-acl --use-list /tmp/pgcopydb/schema/post-filtered.list /tmp/pgcopydb/schema/schema.dump
# 2024-11-05 10:50:58.508 500865 INFO cli_clone_follow.c:639 All steps are now done, 16h49m elapsed
# 2024-11-05 10:50:58.508 500865 INFO summary.c:3155 Printing summary for 26 tables and 57 indexes
# OID | Schema | Name | Parts | copy duration | transmitted bytes | indexes | create index duration
# ------+--------+----------------------+-------+---------------+-------------------+---------+----------------------
# 24654 | public | events | 10 | 1d11h | 878 GB | 1 | 1h41m
# 24623 | public | email_transmissions | 4 | 4h46m | 99 GB | 3 | 2h04m
# 24665 | public | lessons | 4 | 4h42m | 161 GB | 4 | 1m11s
# 24661 | public | lesson_users | 3 | 2h46m | 49 GB | 3 | 39m35s
# 24631 | public | emails | 1 | 34m07s | 10 GB | 2 | 17s
# 24739 | public | payments | 1 | 5m47s | 1848 MB | 4 | 4m40s
# 24681 | public | module_users | 1 | 4m57s | 1610 MB | 3 | 1m50s
# 24694 | public | orders | 1 | 2m50s | 835 MB | 3 | 1m05s
# 24597 | public | devices | 1 | 1m45s | 498 MB | 2 | 40s
# 24723 | public | payment_methods | 1 | 1m24s | 548 MB | 2 | 31s
# 24765 | public | user_collections | 1 | 2m17s | 1005 MB | 2 | 968ms
# 24774 | public | users | 1 | 52s | 291 MB | 4 | 27s
# 24760 | public | user_accounts | 1 | 16s | 172 MB | 3 | 16s
# 24606 | public | edition_modules | 1 | 8s983 | 46 MB | 3 | 4s749
# 24583 | public | course_emails | 1 | 8s526 | 26 MB | 2 | 996ms
# 24685 | public | modules | 1 | 1s592 | 21 MB | 3 | 1s696
# 24610 | public | editions | 1 | 2s199 | 7483 kB | 2 | 1s032
# 24755 | public | sp_content | 1 | 1s555 | 4177 kB | 0 | 0ms
# 24619 | public | email_broadcasts | 1 | 744ms | 2645 kB | 2 | 677ms
# 24590 | public | courses | 1 | 387ms | 1540 kB | 2 | 367ms
# 24704 | public | payment_gateway_keys | 1 | 1s972 | 164 kB | 2 | 27ms
# 24576 | public | accounts | 1 | 58ms | 24 kB | 1 | 14ms
# 24647 | public | event_names | 1 | 32ms | 397 B | 1 | 8ms
# 24716 | public | payment_gateways | 1 | 1s675 | 117 B | 1 | 11ms
# 24748 | public | roles | 1 | 71ms | 173 B | 1 | 8ms
# 24676 | public | management | 1 | 33ms | 40 B | 1 | 19ms
# Step Connection Duration Transfer Concurrency
# -------------------------------------------------- ---------- ---------- ---------- ------------
# Catalog Queries (table ordering, filtering, etc) source 12s 1
# Dump Schema source 765ms 1
# Prepare Schema target 466ms 1
# COPY, INDEX, CONSTRAINTS, VACUUM (wall clock) both 2h47m 12
# COPY (cumulative) both 7h46m 1225 GB 4
# CREATE INDEX (cumulative) target 4h36m 4
# CONSTRAINTS (cumulative) target 8s493 4
# VACUUM (cumulative) target 0ms 4
# Reset Sequences both 60ms 1
# Large Objects (cumulative) (null) 0ms 0
# Finalize Schema both 14h01m 4
# -------------------------------------------------- ---------- ---------- ---------- ------------
# Total Wall Clock Duration both 16h49m 20
# EOF
- name: Show table sizes and retrieve current backpressure seconds
run: |
export LD_LIBRARY_PATH=${PG_16_LIB_PATH}
${PSQL} "${NEW_PROJECT_CONNSTR}" -c "\dt+"
BACKPRESSURE_TIME_AFTER_INGEST=$(${PSQL} "${NEW_PROJECT_CONNSTR}" -t -c "select backpressure_throttling_time()/1000000;")
echo "BACKPRESSURE_TIME_AFTER_INGEST=${BACKPRESSURE_TIME_AFTER_INGEST}" >> $GITHUB_ENV
- name: Parse pgcopydb log and report performance metrics
env:
PERF_TEST_RESULT_CONNSTR: ${{ secrets.PERF_TEST_RESULT_CONNSTR }}
run: |
export LD_LIBRARY_PATH=${PG_16_LIB_PATH}
# Define the log file path
LOG_FILE="/tmp/pgcopydb_${{ matrix.target_project }}.log"
# Get the current git commit hash
git config --global --add safe.directory /__w/neon/neon
COMMIT_HASH=$(git rev-parse --short HEAD)
# Define the platform and test suite
PLATFORM="pg16-${{ matrix.target_project }}-us-east-2-staging"
SUIT="pgcopydb_ingest_bench"
# Function to convert time (e.g., "2h47m", "4h36m", "118ms", "8s493") to seconds
convert_to_seconds() {
local duration=$1
local total_seconds=0
# Check for hours (h)
if [[ "$duration" =~ ([0-9]+)h ]]; then
total_seconds=$((total_seconds + ${BASH_REMATCH[1]#0} * 3600))
fi
# Check for seconds (s)
if [[ "$duration" =~ ([0-9]+)s ]]; then
total_seconds=$((total_seconds + ${BASH_REMATCH[1]#0}))
fi
# Check for milliseconds (ms) (if applicable)
if [[ "$duration" =~ ([0-9]+)ms ]]; then
total_seconds=$((total_seconds + ${BASH_REMATCH[1]#0} / 1000))
duration=${duration/${BASH_REMATCH[0]}/} # need to remove it to avoid double counting with m
fi
# Check for minutes (m) - must be checked after ms because m is contained in ms
if [[ "$duration" =~ ([0-9]+)m ]]; then
total_seconds=$((total_seconds + ${BASH_REMATCH[1]#0} * 60))
fi
echo $total_seconds
}
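# worked example (illustrative): convert_to_seconds "2h47m" -> 2*3600 + 47*60 = 10020;
# sub-second values like "118ms" floor to 0 because of the integer division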
# Calculate the backpressure difference in seconds
BACKPRESSURE_TIME_DIFF=$(awk "BEGIN {print $BACKPRESSURE_TIME_AFTER_INGEST - $BACKPRESSURE_TIME_BEFORE_INGEST}")
# Insert the backpressure time difference into the performance database
if [ -n "$BACKPRESSURE_TIME_DIFF" ]; then
PSQL_CMD="${PSQL} \"${PERF_TEST_RESULT_CONNSTR}\" -c \"
INSERT INTO public.perf_test_results (suit, revision, platform, metric_name, metric_value, metric_unit, metric_report_type, recorded_at_timestamp)
VALUES ('${SUIT}', '${COMMIT_HASH}', '${PLATFORM}', 'backpressure_time', ${BACKPRESSURE_TIME_DIFF}, 'seconds', 'lower_is_better', now());
\""
echo "Inserting backpressure time difference: ${BACKPRESSURE_TIME_DIFF} seconds"
eval $PSQL_CMD
fi
# Extract and process log lines
while IFS= read -r line; do
METRIC_NAME=""
# Match each desired line and extract the relevant information
if [[ "$line" =~ COPY,\ INDEX,\ CONSTRAINTS,\ VACUUM.* ]]; then
METRIC_NAME="COPY, INDEX, CONSTRAINTS, VACUUM (wall clock)"
elif [[ "$line" =~ COPY\ \(cumulative\).* ]]; then
METRIC_NAME="COPY (cumulative)"
elif [[ "$line" =~ CREATE\ INDEX\ \(cumulative\).* ]]; then
METRIC_NAME="CREATE INDEX (cumulative)"
elif [[ "$line" =~ CONSTRAINTS\ \(cumulative\).* ]]; then
METRIC_NAME="CONSTRAINTS (cumulative)"
elif [[ "$line" =~ Finalize\ Schema.* ]]; then
METRIC_NAME="Finalize Schema"
elif [[ "$line" =~ Total\ Wall\ Clock\ Duration.* ]]; then
METRIC_NAME="Total Wall Clock Duration"
fi
# If a metric was matched, insert it into the performance database
if [ -n "$METRIC_NAME" ]; then
DURATION=$(echo "$line" | grep -oP '\d+h\d+m|\d+s|\d+ms|\d{1,2}h\d{1,2}m|\d+\.\d+s' | head -n 1)
METRIC_VALUE=$(convert_to_seconds "$DURATION")
PSQL_CMD="${PSQL} \"${PERF_TEST_RESULT_CONNSTR}\" -c \"
INSERT INTO public.perf_test_results (suit, revision, platform, metric_name, metric_value, metric_unit, metric_report_type, recorded_at_timestamp)
VALUES ('${SUIT}', '${COMMIT_HASH}', '${PLATFORM}', '${METRIC_NAME}', ${METRIC_VALUE}, 'seconds', 'lower_is_better', now());
\""
echo "Inserting ${METRIC_NAME} with value ${METRIC_VALUE} seconds"
eval $PSQL_CMD
fi
done < "$LOG_FILE"
- name: Delete Neon Project
if: ${{ always() && matrix.target_project == 'new_empty_project' }}
uses: ./.github/actions/neon-project-delete
with:
project_id: ${{ steps.create-neon-project-ingest-target.outputs.project_id }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
- name: Delete Neon Branch for large tenant
if: ${{ always() && matrix.target_project == 'large_existing_project' }}
uses: ./.github/actions/neon-branch-delete
with:
project_id: ${{ vars.BENCHMARK_INGEST_TARGET_PROJECTID }}
branch_id: ${{ steps.create-neon-branch-ingest-target.outputs.branch_id }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}

View File

@@ -0,0 +1,29 @@
name: Report Workflow Stats Batch
on:
schedule:
- cron: '*/15 * * * *'
- cron: '25 0 * * *'
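# the 15-minute schedule keeps stats fresh; the daily 00:25 run (matched by the
# `if:` condition below) re-exports a 24-hour window to backfill any gaps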
jobs:
gh-workflow-stats-batch:
name: GitHub Workflow Stats Batch
runs-on: ubuntu-22.04
permissions:
actions: read
steps:
- name: Export Workflow Run for the past 2 hours
uses: neondatabase/gh-workflow-stats-action@v0.2.1
with:
db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }}
db_table: "gh_workflow_stats_batch_neon"
gh_token: ${{ secrets.GITHUB_TOKEN }}
duration: '2h'
- name: Export Workflow Run for the past 24 hours
if: github.event.schedule == '25 0 * * *'
uses: neondatabase/gh-workflow-stats-action@v0.2.1
with:
db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }}
db_table: "gh_workflow_stats_batch_neon"
gh_token: ${{ secrets.GITHUB_TOKEN }}
duration: '24h'

View File

@@ -1475,6 +1475,8 @@ RUN mkdir -p /etc/local_proxy && chown postgres:postgres /etc/local_proxy
COPY --from=postgres-exporter /bin/postgres_exporter /bin/postgres_exporter
COPY --from=sql-exporter /bin/sql_exporter /bin/sql_exporter
COPY --chown=postgres compute/etc/postgres_exporter.yml /etc/postgres_exporter.yml
COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter.yml /etc/sql_exporter.yml
COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector.yml /etc/neon_collector.yml
COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml

View File

@@ -26,7 +26,7 @@ commands:
- name: postgres-exporter
user: nobody
sysvInitAction: respawn
shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter'
shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter --config.file=/etc/postgres_exporter.yml'
- name: sql-exporter
user: nobody
sysvInitAction: respawn

View File

@@ -26,7 +26,7 @@ commands:
- name: postgres-exporter
user: nobody
sysvInitAction: respawn
shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter'
shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter --config.file=/etc/postgres_exporter.yml'
- name: sql-exporter
user: nobody
sysvInitAction: respawn

View File

@@ -74,10 +74,17 @@ pub fn write_postgres_conf(
}
// Locales
writeln!(file, "lc_messages='C.UTF-8'")?;
writeln!(file, "lc_monetary='C.UTF-8'")?;
writeln!(file, "lc_time='C.UTF-8'")?;
writeln!(file, "lc_numeric='C.UTF-8'")?;
if cfg!(target_os = "macos") {
writeln!(file, "lc_messages='C'")?;
writeln!(file, "lc_monetary='C'")?;
writeln!(file, "lc_time='C'")?;
writeln!(file, "lc_numeric='C'")?;
} else {
writeln!(file, "lc_messages='C.UTF-8'")?;
writeln!(file, "lc_monetary='C.UTF-8'")?;
writeln!(file, "lc_time='C.UTF-8'")?;
writeln!(file, "lc_numeric='C.UTF-8'")?;
}
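// presumably because macOS ships no C.UTF-8 locale, plain "C" is the safe choice there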
match spec.mode {
ComputeMode::Primary => {}

View File

@@ -277,7 +277,11 @@ pub mod defaults {
pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s";
pub const DEFAULT_SUPERUSER: &str = "cloud_admin";
pub const DEFAULT_LOCALE: &str = "C.UTF-8";
pub const DEFAULT_LOCALE: &str = if cfg!(target_os = "macos") {
"C"
} else {
"C.UTF-8"
};
pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192;
pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100;

View File

@@ -35,6 +35,15 @@ pub fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
!(a.end <= b.start || b.end <= a.start)
}
/// Whether `a` fully contains `b`; for example:
/// ```plain
/// | a |
/// | b |
/// ```
pub fn fully_contains<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
a.start <= b.start && a.end >= b.end
}
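// Behavior sketch (illustrative values):
//   fully_contains(&(0..10), &(2..4))  == true
//   fully_contains(&(0..10), &(8..12)) == false // b extends past a.end
//   overlaps_with(&(0..10), &(8..12))  == true  // yet the two ranges do overlap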
pub fn union_to_keyspace<K: Ord>(a: &mut CompactionKeySpace<K>, b: CompactionKeySpace<K>) {
let x = std::mem::take(a);
let mut all_ranges_iter = [x.into_iter(), b.into_iter()]

View File

@@ -4786,12 +4786,6 @@ async fn run_initdb(
.args(["--username", &conf.superuser])
.args(["--encoding", "utf8"])
.args(["--locale", &conf.locale])
.args(["--lc-collate", &conf.locale])
.args(["--lc-ctype", &conf.locale])
.args(["--lc-messages", &conf.locale])
.args(["--lc-monetary", &conf.locale])
.args(["--lc-numeric", &conf.locale])
.args(["--lc-time", &conf.locale])
.arg("--no-instructions")
.arg("--no-sync")
.env_clear()
@@ -9229,6 +9223,23 @@ mod tests {
Ok(())
}
fn sort_layer_key(k1: &PersistentLayerKey, k2: &PersistentLayerKey) -> std::cmp::Ordering {
(
k1.is_delta,
k1.key_range.start,
k1.key_range.end,
k1.lsn_range.start,
k1.lsn_range.end,
)
.cmp(&(
k2.is_delta,
k2.key_range.start,
k2.key_range.end,
k2.lsn_range.start,
k2.lsn_range.end,
))
}
async fn inspect_and_sort(
tline: &Arc<Timeline>,
filter: Option<std::ops::Range<Key>>,
@@ -9237,25 +9248,30 @@ mod tests {
if let Some(filter) = filter {
all_layers.retain(|layer| overlaps_with(&layer.key_range, &filter));
}
all_layers.sort_by(|k1, k2| {
(
k1.is_delta,
k1.key_range.start,
k1.key_range.end,
k1.lsn_range.start,
k1.lsn_range.end,
)
.cmp(&(
k2.is_delta,
k2.key_range.start,
k2.key_range.end,
k2.lsn_range.start,
k2.lsn_range.end,
))
});
all_layers.sort_by(sort_layer_key);
all_layers
}
#[cfg(feature = "testing")]
fn check_layer_map_key_eq(
mut left: Vec<PersistentLayerKey>,
mut right: Vec<PersistentLayerKey>,
) {
left.sort_by(sort_layer_key);
right.sort_by(sort_layer_key);
if left != right {
eprintln!("---LEFT---");
for left in left.iter() {
eprintln!("{}", left);
}
eprintln!("---RIGHT---");
for right in right.iter() {
eprintln!("{}", right);
}
assert_eq!(left, right);
}
}
#[cfg(feature = "testing")]
#[tokio::test]
async fn test_simple_partial_bottom_most_compaction() -> anyhow::Result<()> {
@@ -9348,127 +9364,206 @@ mod tests {
let cancel = CancellationToken::new();
// Do a partial compaction on key range 0..4, we should generate an image layer; no other layers
// can be removed because they might be used for other key ranges.
// Do a partial compaction on key range 0..2
tline
.partial_compact_with_gc(Some(get_key(0)..get_key(4)), &cancel, EnumSet::new(), &ctx)
.partial_compact_with_gc(get_key(0)..get_key(2), &cancel, EnumSet::new(), &ctx)
.await
.unwrap();
let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
assert_eq!(
check_layer_map_key_eq(
all_layers,
vec![
// newly-generated image layer for the partial compaction range 0-2
PersistentLayerKey {
key_range: get_key(0)..get_key(4),
key_range: get_key(0)..get_key(2),
lsn_range: Lsn(0x20)..Lsn(0x21),
is_delta: false
is_delta: false,
},
PersistentLayerKey {
key_range: get_key(0)..get_key(10),
lsn_range: Lsn(0x10)..Lsn(0x11),
is_delta: false
is_delta: false,
},
// delta1 is split and the second part is rewritten
PersistentLayerKey {
key_range: get_key(1)..get_key(4),
key_range: get_key(2)..get_key(4),
lsn_range: Lsn(0x20)..Lsn(0x48),
is_delta: true
is_delta: true,
},
PersistentLayerKey {
key_range: get_key(5)..get_key(7),
lsn_range: Lsn(0x20)..Lsn(0x48),
is_delta: true
is_delta: true,
},
PersistentLayerKey {
key_range: get_key(8)..get_key(10),
lsn_range: Lsn(0x48)..Lsn(0x50),
is_delta: true
}
]
is_delta: true,
},
],
);
// Do a partial compaction on key range 4..10
// Do a partial compaction on key range 2..4
tline
.partial_compact_with_gc(Some(get_key(4)..get_key(10)), &cancel, EnumSet::new(), &ctx)
.partial_compact_with_gc(get_key(2)..get_key(4), &cancel, EnumSet::new(), &ctx)
.await
.unwrap();
let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
assert_eq!(
check_layer_map_key_eq(
all_layers,
vec![
PersistentLayerKey {
key_range: get_key(0)..get_key(4),
key_range: get_key(0)..get_key(2),
lsn_range: Lsn(0x20)..Lsn(0x21),
is_delta: false
is_delta: false,
},
PersistentLayerKey {
// if (in the future) GC kicks in, this layer will be removed
key_range: get_key(0)..get_key(10),
lsn_range: Lsn(0x10)..Lsn(0x11),
is_delta: false
is_delta: false,
},
// image layer generated for the compaction range 2-4
PersistentLayerKey {
key_range: get_key(4)..get_key(10),
key_range: get_key(2)..get_key(4),
lsn_range: Lsn(0x20)..Lsn(0x21),
is_delta: false
is_delta: false,
},
// we have key2/key3 above the retain_lsn, so we still need this delta layer
PersistentLayerKey {
key_range: get_key(1)..get_key(4),
key_range: get_key(2)..get_key(4),
lsn_range: Lsn(0x20)..Lsn(0x48),
is_delta: true
is_delta: true,
},
PersistentLayerKey {
key_range: get_key(5)..get_key(7),
lsn_range: Lsn(0x20)..Lsn(0x48),
is_delta: true
is_delta: true,
},
PersistentLayerKey {
key_range: get_key(8)..get_key(10),
lsn_range: Lsn(0x48)..Lsn(0x50),
is_delta: true
}
]
is_delta: true,
},
],
);
// Do a partial compaction on key range 4..9
tline
.partial_compact_with_gc(get_key(4)..get_key(9), &cancel, EnumSet::new(), &ctx)
.await
.unwrap();
let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
check_layer_map_key_eq(
all_layers,
vec![
PersistentLayerKey {
key_range: get_key(0)..get_key(2),
lsn_range: Lsn(0x20)..Lsn(0x21),
is_delta: false,
},
PersistentLayerKey {
key_range: get_key(0)..get_key(10),
lsn_range: Lsn(0x10)..Lsn(0x11),
is_delta: false,
},
PersistentLayerKey {
key_range: get_key(2)..get_key(4),
lsn_range: Lsn(0x20)..Lsn(0x21),
is_delta: false,
},
PersistentLayerKey {
key_range: get_key(2)..get_key(4),
lsn_range: Lsn(0x20)..Lsn(0x48),
is_delta: true,
},
// image layer generated for this compaction range
PersistentLayerKey {
key_range: get_key(4)..get_key(9),
lsn_range: Lsn(0x20)..Lsn(0x21),
is_delta: false,
},
PersistentLayerKey {
key_range: get_key(8)..get_key(10),
lsn_range: Lsn(0x48)..Lsn(0x50),
is_delta: true,
},
],
);
// Do a partial compaction on key range 9..10
tline
.partial_compact_with_gc(get_key(9)..get_key(10), &cancel, EnumSet::new(), &ctx)
.await
.unwrap();
let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
check_layer_map_key_eq(
all_layers,
vec![
PersistentLayerKey {
key_range: get_key(0)..get_key(2),
lsn_range: Lsn(0x20)..Lsn(0x21),
is_delta: false,
},
PersistentLayerKey {
key_range: get_key(0)..get_key(10),
lsn_range: Lsn(0x10)..Lsn(0x11),
is_delta: false,
},
PersistentLayerKey {
key_range: get_key(2)..get_key(4),
lsn_range: Lsn(0x20)..Lsn(0x21),
is_delta: false,
},
PersistentLayerKey {
key_range: get_key(2)..get_key(4),
lsn_range: Lsn(0x20)..Lsn(0x48),
is_delta: true,
},
PersistentLayerKey {
key_range: get_key(4)..get_key(9),
lsn_range: Lsn(0x20)..Lsn(0x21),
is_delta: false,
},
// image layer generated for the compaction range
PersistentLayerKey {
key_range: get_key(9)..get_key(10),
lsn_range: Lsn(0x20)..Lsn(0x21),
is_delta: false,
},
PersistentLayerKey {
key_range: get_key(8)..get_key(10),
lsn_range: Lsn(0x48)..Lsn(0x50),
is_delta: true,
},
],
);
// Do a partial compaction on key range 0..10, all image layers below LSN 20 can be replaced with new ones.
tline
.partial_compact_with_gc(Some(get_key(0)..get_key(10)), &cancel, EnumSet::new(), &ctx)
.partial_compact_with_gc(get_key(0)..get_key(10), &cancel, EnumSet::new(), &ctx)
.await
.unwrap();
let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
assert_eq!(
check_layer_map_key_eq(
all_layers,
vec![
PersistentLayerKey {
key_range: get_key(0)..get_key(4),
lsn_range: Lsn(0x20)..Lsn(0x21),
is_delta: false
},
// aha, we removed all unnecessary image/delta layers and got a very clean layer map!
PersistentLayerKey {
key_range: get_key(0)..get_key(10),
lsn_range: Lsn(0x20)..Lsn(0x21),
is_delta: false
is_delta: false,
},
PersistentLayerKey {
key_range: get_key(4)..get_key(10),
lsn_range: Lsn(0x20)..Lsn(0x21),
is_delta: false
},
PersistentLayerKey {
key_range: get_key(1)..get_key(4),
key_range: get_key(2)..get_key(4),
lsn_range: Lsn(0x20)..Lsn(0x48),
is_delta: true
},
PersistentLayerKey {
key_range: get_key(5)..get_key(7),
lsn_range: Lsn(0x20)..Lsn(0x48),
is_delta: true
is_delta: true,
},
PersistentLayerKey {
key_range: get_key(8)..get_key(10),
lsn_range: Lsn(0x48)..Lsn(0x50),
is_delta: true
}
]
is_delta: true,
},
],
);
Ok(())

View File

@@ -653,6 +653,10 @@ impl DeltaLayerWriter {
})
}
pub fn is_empty(&self) -> bool {
self.inner.as_ref().unwrap().num_keys == 0
}
///
/// Append a key-value pair to the file.
///

View File

@@ -1,4 +1,4 @@
use std::ops::Range;
use std::{ops::Range, sync::Arc};
use anyhow::bail;
use pageserver_api::{
@@ -9,7 +9,10 @@ use utils::lsn::Lsn;
use pageserver_api::value::Value;
use super::merge_iterator::MergeIterator;
use super::{
merge_iterator::{MergeIterator, MergeIteratorItem},
PersistentLayerKey,
};
/// A filter iterator over merge iterators (which can easily be extended to other types of iterators).
///
@@ -48,10 +51,10 @@ impl<'a> FilterIterator<'a> {
})
}
pub async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
while let Some(item) = self.inner.next().await? {
async fn next_inner<R: MergeIteratorItem>(&mut self) -> anyhow::Result<Option<R>> {
while let Some(item) = self.inner.next_inner::<R>().await? {
while self.current_filter_idx < self.retain_key_filters.len()
&& item.0 >= self.retain_key_filters[self.current_filter_idx].end
&& item.key_lsn_value().0 >= self.retain_key_filters[self.current_filter_idx].end
{
// [filter region] [filter region] [filter region]
// ^ item
@@ -68,7 +71,7 @@ impl<'a> FilterIterator<'a> {
// ^ current filter (nothing)
return Ok(None);
}
if self.retain_key_filters[self.current_filter_idx].contains(&item.0) {
if self.retain_key_filters[self.current_filter_idx].contains(&item.key_lsn_value().0) {
// [filter region] [filter region] [filter region]
// ^ item
// ^ current filter
@@ -81,6 +84,16 @@ impl<'a> FilterIterator<'a> {
}
Ok(None)
}
pub async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
self.next_inner().await
}
pub async fn next_with_trace(
&mut self,
) -> anyhow::Result<Option<((Key, Lsn, Value), Arc<PersistentLayerKey>)>> {
self.next_inner().await
}
}
#[cfg(test)]

View File

@@ -1,6 +1,7 @@
use std::{
cmp::Ordering,
collections::{binary_heap, BinaryHeap},
sync::Arc,
};
use anyhow::bail;
@@ -13,10 +14,11 @@ use pageserver_api::value::Value;
use super::{
delta_layer::{DeltaLayerInner, DeltaLayerIterator},
image_layer::{ImageLayerInner, ImageLayerIterator},
PersistentLayerDesc, PersistentLayerKey,
};
#[derive(Clone, Copy)]
enum LayerRef<'a> {
pub(crate) enum LayerRef<'a> {
Image(&'a ImageLayerInner),
Delta(&'a DeltaLayerInner),
}
@@ -62,18 +64,20 @@ impl LayerIterRef<'_> {
/// 1. Unified iterator for image and delta layers.
/// 2. `Ord` for use in [`MergeIterator::heap`] (for the k-merge).
/// 3. Lazy creation of the real delta/image iterator.
enum IteratorWrapper<'a> {
pub(crate) enum IteratorWrapper<'a> {
NotLoaded {
ctx: &'a RequestContext,
first_key_lower_bound: (Key, Lsn),
layer: LayerRef<'a>,
source_desc: Arc<PersistentLayerKey>,
},
Loaded {
iter: PeekableLayerIterRef<'a>,
source_desc: Arc<PersistentLayerKey>,
},
}
struct PeekableLayerIterRef<'a> {
pub(crate) struct PeekableLayerIterRef<'a> {
iter: LayerIterRef<'a>,
peeked: Option<(Key, Lsn, Value)>, // None == end
}
@@ -151,6 +155,12 @@ impl<'a> IteratorWrapper<'a> {
layer: LayerRef::Image(image_layer),
first_key_lower_bound: (image_layer.key_range().start, image_layer.lsn()),
ctx,
source_desc: PersistentLayerKey {
key_range: image_layer.key_range().clone(),
lsn_range: PersistentLayerDesc::image_layer_lsn_range(image_layer.lsn()),
is_delta: false,
}
.into(),
}
}
@@ -162,12 +172,18 @@ impl<'a> IteratorWrapper<'a> {
layer: LayerRef::Delta(delta_layer),
first_key_lower_bound: (delta_layer.key_range().start, delta_layer.lsn_range().start),
ctx,
source_desc: PersistentLayerKey {
key_range: delta_layer.key_range().clone(),
lsn_range: delta_layer.lsn_range().clone(),
is_delta: true,
}
.into(),
}
}
fn peek_next_key_lsn_value(&self) -> Option<(&Key, Lsn, Option<&Value>)> {
match self {
Self::Loaded { iter } => iter
Self::Loaded { iter, .. } => iter
.peek()
.as_ref()
.map(|(key, lsn, val)| (key, *lsn, Some(val))),
@@ -191,6 +207,7 @@ impl<'a> IteratorWrapper<'a> {
ctx,
first_key_lower_bound,
layer,
source_desc,
} = self
else {
unreachable!()
@@ -206,7 +223,10 @@ impl<'a> IteratorWrapper<'a> {
);
}
}
*self = Self::Loaded { iter };
*self = Self::Loaded {
iter,
source_desc: source_desc.clone(),
};
Ok(())
}
@@ -220,11 +240,19 @@ impl<'a> IteratorWrapper<'a> {
/// The public interfaces to use are [`crate::tenant::storage_layer::delta_layer::DeltaLayerIterator`] and
/// [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`].
async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
let Self::Loaded { iter } = self else {
let Self::Loaded { iter, .. } = self else {
panic!("must load the iterator before using")
};
iter.next().await
}
/// Get the persistent layer key corresponding to this iterator
fn trace_source(&self) -> Arc<PersistentLayerKey> {
match self {
Self::Loaded { source_desc, .. } => source_desc.clone(),
Self::NotLoaded { source_desc, .. } => source_desc.clone(),
}
}
}
/// A merge iterator over delta/image layer iterators.
@@ -242,6 +270,32 @@ pub struct MergeIterator<'a> {
heap: BinaryHeap<IteratorWrapper<'a>>,
}
pub(crate) trait MergeIteratorItem {
fn new(item: (Key, Lsn, Value), iterator: &IteratorWrapper<'_>) -> Self;
fn key_lsn_value(&self) -> &(Key, Lsn, Value);
}
impl MergeIteratorItem for (Key, Lsn, Value) {
fn new(item: (Key, Lsn, Value), _: &IteratorWrapper<'_>) -> Self {
item
}
fn key_lsn_value(&self) -> &(Key, Lsn, Value) {
self
}
}
impl MergeIteratorItem for ((Key, Lsn, Value), Arc<PersistentLayerKey>) {
fn new(item: (Key, Lsn, Value), iter: &IteratorWrapper<'_>) -> Self {
(item, iter.trace_source().clone())
}
fn key_lsn_value(&self) -> &(Key, Lsn, Value) {
&self.0
}
}
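// Illustrative: the two MergeIteratorItem impls above let next_inner serve both public
// entry points, selected purely by the requested return type:
//   let kv = iter.next().await?;                // Option<(Key, Lsn, Value)>
//   let traced = iter.next_with_trace().await?; // Option<((Key, Lsn, Value), Arc<PersistentLayerKey>)>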
impl<'a> MergeIterator<'a> {
pub fn create(
deltas: &[&'a DeltaLayerInner],
@@ -260,7 +314,7 @@ impl<'a> MergeIterator<'a> {
}
}
pub async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
pub(crate) async fn next_inner<R: MergeIteratorItem>(&mut self) -> anyhow::Result<Option<R>> {
while let Some(mut iter) = self.heap.peek_mut() {
if !iter.is_loaded() {
// Once we load the iterator, we can know the real first key-value pair in the iterator.
@@ -275,10 +329,22 @@ impl<'a> MergeIterator<'a> {
binary_heap::PeekMut::pop(iter);
continue;
};
return Ok(Some(item));
return Ok(Some(R::new(item, &iter)));
}
Ok(None)
}
/// Get the next key-value pair from the iterator.
pub async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
self.next_inner().await
}
/// Get the next key-value pair from the iterator, and trace where the key comes from.
pub async fn next_with_trace(
&mut self,
) -> anyhow::Result<Option<((Key, Lsn, Value), Arc<PersistentLayerKey>)>> {
self.next_inner().await
}
}
#[cfg(test)]

View File

@@ -4,7 +4,7 @@
//!
//! The old legacy algorithm is implemented directly in `timeline.rs`.
use std::collections::{BinaryHeap, HashSet};
use std::collections::{BinaryHeap, HashMap, HashSet};
use std::ops::{Deref, Range};
use std::sync::Arc;
@@ -56,7 +56,7 @@ use pageserver_api::value::Value;
use utils::lsn::Lsn;
use pageserver_compaction::helpers::overlaps_with;
use pageserver_compaction::helpers::{fully_contains, overlaps_with};
use pageserver_compaction::interface::*;
use super::CompactionError;
@@ -64,6 +64,23 @@ use super::CompactionError;
/// Maximum number of deltas before generating an image layer in bottom-most compaction.
const COMPACTION_DELTA_THRESHOLD: usize = 5;
pub struct GcCompactionJobDescription {
/// All layers to read in the compaction job
selected_layers: Vec<Layer>,
/// GC cutoff of the job
gc_cutoff: Lsn,
/// LSNs to retain for the job
retain_lsns_below_horizon: Vec<Lsn>,
/// Maximum layer LSN processed in this compaction
max_layer_lsn: Lsn,
/// Only compact layers overlapping with this range
compaction_key_range: Range<Key>,
/// When partial compaction is enabled, these layers need to be rewritten to ensure no overlap.
/// This field is here solely for debugging. The field will not be read once the compaction
/// description is generated.
rewrite_layers: Vec<Arc<PersistentLayerDesc>>,
}
/// The result of bottom-most compaction for a single key at each LSN.
#[derive(Debug)]
#[cfg_attr(test, derive(PartialEq))]
@@ -1722,7 +1739,8 @@ impl Timeline {
flags: EnumSet<CompactFlags>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
self.partial_compact_with_gc(None, cancel, flags, ctx).await
self.partial_compact_with_gc(Key::MIN..Key::MAX, cancel, flags, ctx)
.await
}
/// An experimental compaction building block that combines compaction with garbage collection.
@@ -1732,12 +1750,15 @@ impl Timeline {
/// layers and image layers, which generates image layers on the gc horizon, drop deltas below gc horizon,
/// and create delta layers with all deltas >= gc horizon.
///
/// If `key_range` is set, it will only compact the keys within the range, aka partial compaction. This functionality
/// is not complete yet, and if it is set, only image layers will be generated.
///
/// If `key_range` is provided, it will only compact the keys within the range, aka partial compaction.
/// Partial compaction will read and process all layers overlapping with the key range, even if it might
/// contain extra keys. After the gc-compaction phase completes, delta layers that are not fully contained
/// within the key range will be rewritten to ensure they do not overlap with the newly produced delta layers. Providing
/// Key::MIN..Key::MAX to the function indicates a full compaction, though technically, `Key::MAX` is not
/// part of the range.
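/// Example (sketch): `partial_compact_with_gc(get_key(0)..get_key(2), &cancel, EnumSet::new(), &ctx)`
/// compacts only that key range, while passing `Key::MIN..Key::MAX` requests a full compaction.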
pub(crate) async fn partial_compact_with_gc(
self: &Arc<Self>,
compaction_key_range: Option<Range<Key>>,
compaction_key_range: Range<Key>,
cancel: &CancellationToken,
flags: EnumSet<CompactFlags>,
ctx: &RequestContext,
@@ -1762,9 +1783,8 @@ impl Timeline {
.await?;
let dry_run = flags.contains(CompactFlags::DryRun);
let partial_compaction = compaction_key_range.is_some();
if let Some(ref compaction_key_range) = compaction_key_range {
if compaction_key_range == (Key::MIN..Key::MAX) {
info!("running enhanced gc bottom-most compaction, dry_run={dry_run}, compaction_key_range={}..{}", compaction_key_range.start, compaction_key_range.end);
} else {
info!("running enhanced gc bottom-most compaction, dry_run={dry_run}");
@@ -1780,7 +1800,7 @@ impl Timeline {
// The layer selection has the following properties:
// 1. If a layer is in the selection, all layers below it are in the selection.
// 2. Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection.
let (layer_selection, gc_cutoff, retain_lsns_below_horizon) = if !partial_compaction {
let job_desc = {
let guard = self.layers.read().await;
let layers = guard.layer_map()?;
let gc_info = self.gc_info.read().unwrap();
@@ -1810,9 +1830,21 @@ impl Timeline {
};
// Then, pick all the layers that are below the max_layer_lsn. This is to ensure we can pick all single-key
// layers to compact.
let mut rewrite_layers = Vec::new();
for desc in layers.iter_historic_layers() {
if desc.get_lsn_range().end <= max_layer_lsn {
if desc.get_lsn_range().end <= max_layer_lsn
&& overlaps_with(&desc.get_key_range(), &compaction_key_range)
{
// If the layer overlaps with the compaction key range, we need to read it to obtain all keys within the range,
// even if it might contain extra keys
selected_layers.push(guard.get_from_desc(&desc));
// If the layer is not fully contained within the key range, we need to rewrite it if it's a delta layer (it's fine
// to overlap image layers)
if desc.is_delta()
&& !fully_contains(&compaction_key_range, &desc.get_key_range())
{
rewrite_layers.push(desc);
}
}
}
if selected_layers.is_empty() {
@@ -1820,82 +1852,59 @@ impl Timeline {
return Ok(());
}
retain_lsns_below_horizon.sort();
(selected_layers, gc_cutoff, retain_lsns_below_horizon)
} else {
// In case of partial compaction, we currently only support generating image layers, and therefore,
// we pick all layers that are below the lowest retain_lsn and do not intersect with any of the layers.
let guard = self.layers.read().await;
let layers = guard.layer_map()?;
let gc_info = self.gc_info.read().unwrap();
let mut min_lsn = gc_info.cutoffs.select_min();
for (lsn, _, _) in &gc_info.retain_lsns {
if lsn < &min_lsn {
min_lsn = *lsn;
}
GcCompactionJobDescription {
selected_layers,
gc_cutoff,
retain_lsns_below_horizon,
max_layer_lsn,
compaction_key_range,
rewrite_layers,
}
for lsn in gc_info.leases.keys() {
if lsn < &min_lsn {
min_lsn = *lsn;
}
}
let mut selected_layers = Vec::new();
drop(gc_info);
// |-------| |-------| |-------|
// | Delta | | Delta | | Delta | -- min_lsn could be intersecting with the layers
// |-------| |-------| |-------| <- we want to pick all the layers below min_lsn, so that
// | Delta | | Delta | | Delta | ...we can remove them after compaction
// |-------| |-------| |-------|
// Pick all the layers intersect or below the min_lsn, get the largest LSN in the selected layers.
let Some(compaction_key_range) = compaction_key_range.as_ref() else {
unreachable!()
};
for desc in layers.iter_historic_layers() {
if desc.get_lsn_range().end <= min_lsn
&& overlaps_with(&desc.key_range, compaction_key_range)
{
selected_layers.push(guard.get_from_desc(&desc));
}
}
if selected_layers.is_empty() {
info!("no layers to compact with gc");
return Ok(());
}
(selected_layers, min_lsn, Vec::new())
};
let lowest_retain_lsn = if self.ancestor_timeline.is_some() {
if partial_compaction {
warn!("partial compaction cannot run on child branches (for now)");
return Ok(());
}
Lsn(self.ancestor_lsn.0 + 1)
} else {
let res = retain_lsns_below_horizon
let res = job_desc
.retain_lsns_below_horizon
.first()
.copied()
.unwrap_or(gc_cutoff);
.unwrap_or(job_desc.gc_cutoff);
if cfg!(debug_assertions) {
assert_eq!(
res,
retain_lsns_below_horizon
job_desc
.retain_lsns_below_horizon
.iter()
.min()
.copied()
.unwrap_or(gc_cutoff)
.unwrap_or(job_desc.gc_cutoff)
);
}
res
};
info!(
"picked {} layers for compaction with gc_cutoff={} lowest_retain_lsn={}",
layer_selection.len(),
gc_cutoff,
lowest_retain_lsn
"picked {} layers for compaction ({} layers need rewriting) with max_layer_lsn={} gc_cutoff={} lowest_retain_lsn={}, key_range={}..{}",
job_desc.selected_layers.len(),
job_desc.rewrite_layers.len(),
job_desc.max_layer_lsn,
job_desc.gc_cutoff,
lowest_retain_lsn,
job_desc.compaction_key_range.start,
job_desc.compaction_key_range.end
);
self.check_compaction_space(&layer_selection).await?;
for layer in &job_desc.selected_layers {
debug!("read layer: {}", layer.layer_desc().key());
}
for layer in &job_desc.rewrite_layers {
debug!("rewrite layer: {}", layer.key());
}
self.check_compaction_space(&job_desc.selected_layers)
.await?;
// Generate statistics for the compaction
for layer in &layer_selection {
for layer in &job_desc.selected_layers {
let desc = layer.layer_desc();
if desc.is_delta() {
stat.visit_delta_layer(desc.file_size());
@@ -1906,25 +1915,25 @@ impl Timeline {
// Step 1: construct a k-merge iterator over all layers.
// Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point.
let layer_names: Vec<crate::tenant::storage_layer::LayerName> = layer_selection
let layer_names = job_desc
.selected_layers
.iter()
.map(|layer| layer.layer_desc().layer_name())
.collect_vec();
if let Some(err) = check_valid_layermap(&layer_names) {
bail!("cannot run gc-compaction because {}", err);
warn!("gc-compaction layer map check failed because {}, this is normal if partial compaction is not finished yet", err);
}
// The maximum LSN we are processing in this compaction loop
let end_lsn = layer_selection
let end_lsn = job_desc
.selected_layers
.iter()
.map(|l| l.layer_desc().lsn_range.end)
.max()
.unwrap();
// We don't want any of the produced layers to cover the full key range (i.e., MIN..MAX) b/c it will then be recognized
// as an L0 layer.
let mut delta_layers = Vec::new();
let mut image_layers = Vec::new();
let mut downloaded_layers = Vec::new();
for layer in &layer_selection {
for layer in &job_desc.selected_layers {
let resident_layer = layer.download_and_keep_resident().await?;
downloaded_layers.push(resident_layer);
}
@@ -1943,8 +1952,8 @@ impl Timeline {
dense_ks,
sparse_ks,
)?;
// Step 2: Produce images+deltas. TODO: ensure newly-produced delta does not overlap with other deltas.
// Data of the same key.
// Step 2: Produce images+deltas.
let mut accumulated_values = Vec::new();
let mut last_key: Option<Key> = None;
@@ -1956,10 +1965,7 @@ impl Timeline {
self.conf,
self.timeline_id,
self.tenant_shard_id,
compaction_key_range
.as_ref()
.map(|x| x.start)
.unwrap_or(Key::MIN),
job_desc.compaction_key_range.start,
lowest_retain_lsn,
self.get_compaction_target_size(),
ctx,
@@ -1979,6 +1985,13 @@ impl Timeline {
)
.await?;
#[derive(Default)]
struct RewritingLayers {
before: Option<DeltaLayerWriter>,
after: Option<DeltaLayerWriter>,
}
let mut delta_layer_rewriters = HashMap::<Arc<PersistentLayerKey>, RewritingLayers>::new();
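// Sketch of the rewrite plumbing: keys falling outside compaction_key_range are routed
// into `before` (key < range start) or `after` (key >= range end) writers, one pair per
// source delta layer, keyed by that layer's PersistentLayerKey.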
/// Returns None if there is no ancestor branch. Throw an error when the key is not found.
///
/// Currently, we always get the ancestor image for each key in the child branch no matter whether the image
@@ -2004,10 +2017,51 @@ impl Timeline {
// the key and LSN range are determined. However, to keep things simple here, we still
// create this writer, and discard the writer in the end.
while let Some((key, lsn, val)) = merge_iter.next().await? {
while let Some(((key, lsn, val), desc)) = merge_iter.next_with_trace().await? {
if cancel.is_cancelled() {
return Err(anyhow!("cancelled")); // TODO: refactor to CompactionError and pass cancel error
}
if !job_desc.compaction_key_range.contains(&key) {
if !desc.is_delta {
continue;
}
let rewriter = delta_layer_rewriters.entry(desc.clone()).or_default();
let rewriter = if key < job_desc.compaction_key_range.start {
if rewriter.before.is_none() {
rewriter.before = Some(
DeltaLayerWriter::new(
self.conf,
self.timeline_id,
self.tenant_shard_id,
desc.key_range.start,
desc.lsn_range.clone(),
ctx,
)
.await?,
);
}
rewriter.before.as_mut().unwrap()
} else if key >= job_desc.compaction_key_range.end {
if rewriter.after.is_none() {
rewriter.after = Some(
DeltaLayerWriter::new(
self.conf,
self.timeline_id,
self.tenant_shard_id,
job_desc.compaction_key_range.end,
desc.lsn_range.clone(),
ctx,
)
.await?,
);
}
rewriter.after.as_mut().unwrap()
} else {
unreachable!()
};
rewriter.put_value(key, lsn, val, ctx).await?;
continue;
}
match val {
Value::Image(_) => stat.visit_image_key(&val),
Value::WalRecord(_) => stat.visit_wal_key(&val),
@@ -2018,35 +2072,27 @@ impl Timeline {
}
accumulated_values.push((key, lsn, val));
} else {
let last_key = last_key.as_mut().unwrap();
stat.on_unique_key_visited();
let skip_adding_key = if let Some(ref compaction_key_range) = compaction_key_range {
!compaction_key_range.contains(last_key)
} else {
false
};
if !skip_adding_key {
let retention = self
.generate_key_retention(
*last_key,
&accumulated_values,
gc_cutoff,
&retain_lsns_below_horizon,
COMPACTION_DELTA_THRESHOLD,
get_ancestor_image(self, *last_key, ctx).await?,
)
.await?;
// Put the image into the image layer. Currently we have a single big layer for the compaction.
retention
.pipe_to(
*last_key,
&mut delta_layer_writer,
image_layer_writer.as_mut(),
&mut stat,
ctx,
)
.await?;
}
let last_key: &mut Key = last_key.as_mut().unwrap();
stat.on_unique_key_visited(); // TODO: adjust statistics for partial compaction
let retention = self
.generate_key_retention(
*last_key,
&accumulated_values,
job_desc.gc_cutoff,
&job_desc.retain_lsns_below_horizon,
COMPACTION_DELTA_THRESHOLD,
get_ancestor_image(self, *last_key, ctx).await?,
)
.await?;
retention
.pipe_to(
*last_key,
&mut delta_layer_writer,
image_layer_writer.as_mut(),
&mut stat,
ctx,
)
.await?;
accumulated_values.clear();
*last_key = key;
accumulated_values.push((key, lsn, val));
@@ -2057,35 +2103,43 @@ impl Timeline {
let last_key = last_key.expect("no keys produced during compaction");
stat.on_unique_key_visited();
let skip_adding_key = if let Some(ref compaction_key_range) = compaction_key_range {
!compaction_key_range.contains(&last_key)
} else {
false
};
if !skip_adding_key {
let retention = self
.generate_key_retention(
last_key,
&accumulated_values,
gc_cutoff,
&retain_lsns_below_horizon,
COMPACTION_DELTA_THRESHOLD,
get_ancestor_image(self, last_key, ctx).await?,
)
.await?;
// Put the image into the image layer. Currently we have a single big layer for the compaction.
retention
.pipe_to(
last_key,
&mut delta_layer_writer,
image_layer_writer.as_mut(),
&mut stat,
ctx,
)
.await?;
}
let retention = self
.generate_key_retention(
last_key,
&accumulated_values,
job_desc.gc_cutoff,
&job_desc.retain_lsns_below_horizon,
COMPACTION_DELTA_THRESHOLD,
get_ancestor_image(self, last_key, ctx).await?,
)
.await?;
retention
.pipe_to(
last_key,
&mut delta_layer_writer,
image_layer_writer.as_mut(),
&mut stat,
ctx,
)
.await?;
// end: move the above part to the loop body
let mut rewrote_delta_layers = Vec::new();
for (key, writers) in delta_layer_rewriters {
if let Some(delta_writer_before) = writers.before {
let (desc, path) = delta_writer_before
.finish(job_desc.compaction_key_range.start, ctx)
.await?;
let layer = Layer::finish_creating(self.conf, self, desc, &path)?;
rewrote_delta_layers.push(layer);
}
if let Some(delta_writer_after) = writers.after {
let (desc, path) = delta_writer_after.finish(key.key_range.end, ctx).await?;
let layer = Layer::finish_creating(self.conf, self, desc, &path)?;
rewrote_delta_layers.push(layer);
}
}
let discard = |key: &PersistentLayerKey| {
let key = key.clone();
async move { KeyHistoryRetention::discard_key(&key, self, dry_run).await }
@@ -2093,10 +2147,7 @@ impl Timeline {
let produced_image_layers = if let Some(writer) = image_layer_writer {
if !dry_run {
let end_key = compaction_key_range
.as_ref()
.map(|x| x.end)
.unwrap_or(Key::MAX);
let end_key = job_desc.compaction_key_range.end;
writer
.finish_with_discard_fn(self, ctx, end_key, discard)
.await?
@@ -2117,10 +2168,8 @@ impl Timeline {
Vec::new()
};
if partial_compaction && !produced_delta_layers.is_empty() {
bail!("implementation error: partial compaction should not be producing delta layers (for now)");
}
// TODO: make image/delta/rewrote_delta layers generation atomic. At this point, we already generated resident layers, and if
// compaction is cancelled at this point, we might have some layers that are not cleaned up.
let mut compact_to = Vec::new();
let mut keep_layers = HashSet::new();
let produced_delta_layers_len = produced_delta_layers.len();
@@ -2128,52 +2177,84 @@ impl Timeline {
for action in produced_delta_layers {
match action {
BatchWriterResult::Produced(layer) => {
if cfg!(debug_assertions) {
info!("produced delta layer: {}", layer.layer_desc().key());
}
stat.produce_delta_layer(layer.layer_desc().file_size());
compact_to.push(layer);
}
BatchWriterResult::Discarded(l) => {
if cfg!(debug_assertions) {
info!("discarded delta layer: {}", l);
}
keep_layers.insert(l);
stat.discard_delta_layer();
}
}
}
for layer in &rewrote_delta_layers {
debug!(
"produced rewritten delta layer: {}",
layer.layer_desc().key()
);
}
compact_to.extend(rewrote_delta_layers);
for action in produced_image_layers {
match action {
BatchWriterResult::Produced(layer) => {
debug!("produced image layer: {}", layer.layer_desc().key());
stat.produce_image_layer(layer.layer_desc().file_size());
compact_to.push(layer);
}
BatchWriterResult::Discarded(l) => {
debug!("discarded image layer: {}", l);
keep_layers.insert(l);
stat.discard_image_layer();
}
}
}
let mut layer_selection = layer_selection;
layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key()));
if let Some(ref compaction_key_range) = compaction_key_range {
// Partial compaction might select more data than it processes, e.g., if
// the compaction_key_range only partially overlaps:
//
// [---compaction_key_range---]
// [---A----][----B----][----C----][----D----]
//
// A,B,C,D are all in the `layer_selection`. The created image layers contain
// whatever is needed from B, C, and from `----]` of A, and from `[--` of D.
//
// In contrast, `[--A-` and `--D----]` have not been processed, so, we must
// keep that data.
//
// The solution for now is to keep A and D completely.
// (layer_selection is what we'll remove from the layer map, so,
// retain what is _not_ fully covered by compaction_key_range).
layer_selection.retain(|x| {
let key_range = &x.layer_desc().key_range;
key_range.start >= compaction_key_range.start
&& key_range.end <= compaction_key_range.end
});
let mut layer_selection = job_desc.selected_layers;
// Partial compaction might select more data than it processes, e.g., if
// the compaction_key_range only partially overlaps:
//
// [---compaction_key_range---]
// [---A----][----B----][----C----][----D----]
//
// For delta layers, we will rewrite them so that they are cut exactly at
// the compaction key range, so we can always discard them. However, for image
// layers, as we do not rewrite them for now, we need to handle them differently.
// Assume image layers A, B, C, D are all in the `layer_selection`.
//
// The created image layers contain whatever is needed from B, C, and from
// `----]` of A, and from `[---` of D.
//
// In contrast, `[---A` and `D----]` have not been processed, so, we must
// keep that data.
//
// The solution for now is to keep A and D completely if they are image layers.
// (layer_selection is what we'll remove from the layer map, so, retain what
// is _not_ fully covered by compaction_key_range).
for layer in &layer_selection {
if !layer.layer_desc().is_delta() {
if !overlaps_with(
&layer.layer_desc().key_range,
&job_desc.compaction_key_range,
) {
bail!("violated constraint: image layer outside of compaction key range");
}
if !fully_contains(
&job_desc.compaction_key_range,
&layer.layer_desc().key_range,
) {
keep_layers.insert(layer.layer_desc().key());
}
}
}
layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key()));
info!(
"gc-compaction statistics: {}",
serde_json::to_string(&stat)?
@@ -2192,6 +2273,7 @@ impl Timeline {
// Step 3: Place back to the layer map.
{
// TODO: sanity check if the layer map is valid (i.e., should not have overlaps)
let mut guard = self.layers.write().await;
guard
.open_mut()?
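
The image-layer retention rule in the hunk above is the subtle part of partial compaction. A minimal Python model of just that rule, with a hypothetical Layer type and integer key ranges standing in for the pageserver's actual types, may help:

```python
# Hypothetical model of the image-layer retention rule above; integer key
# ranges stand in for the pageserver's Key type.
from dataclasses import dataclass


@dataclass(frozen=True)
class Layer:
    key_start: int
    key_end: int  # exclusive, like a half-open Rust range
    is_delta: bool


def overlaps_with(a: tuple[int, int], b: tuple[int, int]) -> bool:
    return a[0] < b[1] and b[0] < a[1]


def fully_contains(outer: tuple[int, int], inner: tuple[int, int]) -> bool:
    return outer[0] <= inner[0] and inner[1] <= outer[1]


def layers_to_remove(selected: list[Layer], compaction_range: tuple[int, int]) -> list[Layer]:
    keep = set()  # layers that must stay in the layer map
    for layer in selected:
        if not layer.is_delta:
            rng = (layer.key_start, layer.key_end)
            if not overlaps_with(rng, compaction_range):
                raise ValueError("image layer outside of compaction key range")
            # Image layers straddling a range boundary (A and D in the diagram)
            # are not rewritten, so they must be kept.
            if not fully_contains(compaction_range, rng):
                keep.add(layer)
    # Delta layers are rewritten exactly at the boundary, so they are always removable.
    return [layer for layer in selected if layer not in keep]


# A=[0,10) straddles the range start and survives; B=[10,20) is fully covered.
sel = [Layer(0, 10, False), Layer(10, 20, False), Layer(0, 30, True)]
assert layers_to_remove(sel, (5, 25)) == [Layer(10, 20, False), Layer(0, 30, True)]
```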

View File

@@ -1,10 +1,8 @@
from __future__ import annotations
import enum
import os
from typing import TYPE_CHECKING
import pytest
from typing_extensions import override
if TYPE_CHECKING:
@@ -18,12 +16,15 @@ This fixture is used to determine which version of Postgres to use for tests.
# Inherit PgVersion from str rather than int to make it easier to pass as a command-line argument
# TODO: use enum.StrEnum for Python >= 3.11
@enum.unique
class PgVersion(str, enum.Enum):
V14 = "14"
V15 = "15"
V16 = "16"
V17 = "17"
# Default Postgres Version for tests that don't really depend on Postgres itself
DEFAULT = V16
# Instead of making version an optional parameter in methods, we can use this fake entry
# to explicitly rely on the default server version (could be different from pg_version fixture value)
NOT_SET = "<-POSTRGRES VERSION IS NOT SET->"
@@ -59,27 +60,3 @@ class PgVersion(str, enum.Enum):
# Make mypy happy
# See https://github.com/python/mypy/issues/3974
return None
DEFAULT_VERSION: PgVersion = PgVersion.V16
def skip_on_postgres(version: PgVersion, reason: str):
return pytest.mark.skipif(
PgVersion(os.environ.get("DEFAULT_PG_VERSION", DEFAULT_VERSION)) is version,
reason=reason,
)
def xfail_on_postgres(version: PgVersion, reason: str):
return pytest.mark.xfail(
PgVersion(os.environ.get("DEFAULT_PG_VERSION", DEFAULT_VERSION)) is version,
reason=reason,
)
def run_only_on_default_postgres(reason: str):
return pytest.mark.skipif(
PgVersion(os.environ.get("DEFAULT_PG_VERSION", DEFAULT_VERSION)) is not DEFAULT_VERSION,
reason=reason,
)

View File

@@ -25,6 +25,7 @@ from fixtures.pageserver.common_types import (
parse_delta_layer,
parse_image_layer,
)
from fixtures.pg_version import PgVersion
if TYPE_CHECKING:
from collections.abc import Iterable
@@ -37,6 +38,7 @@ if TYPE_CHECKING:
Fn = TypeVar("Fn", bound=Callable[..., Any])
COMPONENT_BINARIES = {
"storage_controller": ("storage_controller",),
"storage_broker": ("storage_broker",),
@@ -519,7 +521,7 @@ def assert_pageserver_backups_equal(left: Path, right: Path, skip_files: set[str
This is essentially:
lines=$(comm -3 \
<(mkdir left && cd left && tar xf "$left" && find . -type f -print0 | xargs sha256sum | sort -k2) \
<(mkdir left && cd left && tar xf "$left" && find . -type f -print0 | xargs sha256sum | sort -k2) \
<(mkdir right && cd right && tar xf "$right" && find . -type f -print0 | xargs sha256sum | sort -k2) \
| wc -l)
[ "$lines" = "0" ]
@@ -643,3 +645,40 @@ def allpairs_versions():
)
ids.append(f"combination_{''.join(cur_id)}")
return {"argnames": "combination", "argvalues": tuple(argvalues), "ids": ids}
def skip_on_postgres(version: PgVersion, reason: str):
return pytest.mark.skipif(
PgVersion(os.getenv("DEFAULT_PG_VERSION", PgVersion.DEFAULT)) is version,
reason=reason,
)
def xfail_on_postgres(version: PgVersion, reason: str):
return pytest.mark.xfail(
PgVersion(os.getenv("DEFAULT_PG_VERSION", PgVersion.DEFAULT)) is version,
reason=reason,
)
def run_only_on_default_postgres(reason: str):
return pytest.mark.skipif(
PgVersion(os.getenv("DEFAULT_PG_VERSION", PgVersion.DEFAULT)) is not PgVersion.DEFAULT,
reason=reason,
)
def skip_in_debug_build(reason: str):
return pytest.mark.skipif(
os.getenv("BUILD_TYPE", "debug") == "debug",
reason=reason,
)
def skip_on_ci(reason: str):
# `CI` variable is always set to `true` on GitHub
# Ref: https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/store-information-in-variables#default-environment-variables
return pytest.mark.skipif(
os.getenv("CI", "false") == "true",
reason=reason,
)
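
Taken together, the relocated helpers read like this at a call site. A sketch only: the test names, bodies, and reasons here are made up, but the decorator signatures match the definitions above:

```python
# Illustrative call sites for the relocated helpers (hypothetical tests).
from fixtures.pg_version import PgVersion
from fixtures.utils import skip_in_debug_build, skip_on_ci, skip_on_postgres


@skip_on_postgres(PgVersion.V14, reason="uses a feature added in PG15")
def test_needs_recent_postgres():
    ...


@skip_in_debug_build("too slow under debug assertions")
@skip_on_ci("needs dedicated hardware")
def test_heavyweight():
    ...
```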

View File

@@ -1,7 +1,6 @@
from __future__ import annotations
import json
import os
from pathlib import Path
from typing import TYPE_CHECKING
@@ -14,7 +13,7 @@ from fixtures.neon_fixtures import (
PgBin,
wait_for_last_flush_lsn,
)
from fixtures.utils import get_scale_for_db, humantime_to_ms
from fixtures.utils import get_scale_for_db, humantime_to_ms, skip_on_ci
from performance.pageserver.util import (
setup_pageserver_with_tenants,
@@ -38,9 +37,8 @@ if TYPE_CHECKING:
@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(200)])
@pytest.mark.parametrize("n_tenants", [500])
@pytest.mark.timeout(10000)
@pytest.mark.skipif(
os.getenv("CI", "false") == "true",
reason="This test needs lot of resources and should run on dedicated HW, not in github action runners as part of CI",
@skip_on_ci(
"This test needs lot of resources and should run on dedicated HW, not in github action runners as part of CI"
)
def test_pageserver_characterize_throughput_with_n_tenants(
neon_env_builder: NeonEnvBuilder,
@@ -66,9 +64,8 @@ def test_pageserver_characterize_throughput_with_n_tenants(
@pytest.mark.parametrize("n_clients", [1, 64])
@pytest.mark.parametrize("n_tenants", [1])
@pytest.mark.timeout(2400)
@pytest.mark.skipif(
os.getenv("CI", "false") == "true",
reason="This test needs lot of resources and should run on dedicated HW, not in github action runners as part of CI",
@skip_on_ci(
"This test needs lot of resources and should run on dedicated HW, not in github action runners as part of CI"
)
def test_pageserver_characterize_latencies_with_1_client_and_throughput_with_many_clients_one_tenant(
neon_env_builder: NeonEnvBuilder,

View File

@@ -8,7 +8,7 @@ from fixtures.common_types import Lsn, TimelineId
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv
from fixtures.pageserver.http import TimelineCreate406
from fixtures.utils import query_scalar
from fixtures.utils import query_scalar, skip_in_debug_build
# Test the GC implementation when running with branching.
@@ -48,10 +48,8 @@ from fixtures.utils import query_scalar
# Because the delta layer D covering lsn1 is corrupted, creating a branch
# starting from lsn1 should return an error as follows:
# could not find data for key ... at LSN ..., for request at LSN ...
def test_branch_and_gc(neon_simple_env: NeonEnv, build_type: str):
if build_type == "debug":
pytest.skip("times out in debug builds")
@skip_in_debug_build("times out in debug builds")
def test_branch_and_gc(neon_simple_env: NeonEnv):
env = neon_simple_env
pageserver_http_client = env.pageserver.http_client()

View File

@@ -2,7 +2,6 @@ from __future__ import annotations
import enum
import json
import os
import time
from typing import TYPE_CHECKING
@@ -13,7 +12,7 @@ from fixtures.neon_fixtures import (
generate_uploads_and_deletions,
)
from fixtures.pageserver.http import PageserverApiException
from fixtures.utils import wait_until
from fixtures.utils import skip_in_debug_build, wait_until
from fixtures.workload import Workload
if TYPE_CHECKING:
@@ -32,7 +31,7 @@ AGGRESIVE_COMPACTION_TENANT_CONF = {
}
@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build")
@skip_in_debug_build("only run with release build")
def test_pageserver_compaction_smoke(neon_env_builder: NeonEnvBuilder):
"""
This is a smoke test that compaction kicks in. The workload repeatedly churns

View File

@@ -12,6 +12,7 @@ from fixtures.neon_fixtures import (
NeonEnvBuilder,
)
from fixtures.pg_version import PgVersion
from fixtures.utils import skip_on_postgres
from pytest_httpserver import HTTPServer
from werkzeug.wrappers.request import Request
from werkzeug.wrappers.response import Response
@@ -41,17 +42,14 @@ def neon_env_builder_local(
return neon_env_builder
@skip_on_postgres(PgVersion.V16, reason="TODO: PG16 extension building")
@skip_on_postgres(PgVersion.V17, reason="TODO: PG17 extension building")
def test_remote_extensions(
httpserver: HTTPServer,
neon_env_builder_local: NeonEnvBuilder,
httpserver_listen_address,
pg_version,
):
if pg_version == PgVersion.V16:
pytest.skip("TODO: PG16 extension building")
if pg_version == PgVersion.V17:
pytest.skip("TODO: PG17 extension building")
# setup mock http server
# that expects request for anon.tar.zst
# and returns the requested file

View File

@@ -4,25 +4,22 @@ from collections.abc import Iterable
from dataclasses import dataclass
from typing import TYPE_CHECKING
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_flush_lsn
from fixtures.pageserver.http import HistoricLayerInfo, LayerMapInfo
from fixtures.utils import human_bytes
from fixtures.utils import human_bytes, skip_in_debug_build
if TYPE_CHECKING:
from typing import Union
def test_ingesting_large_batches_of_images(neon_env_builder: NeonEnvBuilder, build_type: str):
@skip_in_debug_build("debug run is unnecessarily slow")
def test_ingesting_large_batches_of_images(neon_env_builder: NeonEnvBuilder):
"""
    Build a non-small GIN index, whose WAL stream batches up images much like pgvector does,
    to show that we no longer create oversized layers.
"""
if build_type == "debug":
pytest.skip("debug run is unnecessarily slow")
minimum_initdb_size = 20 * 1024**2
checkpoint_distance = 32 * 1024**2
minimum_good_layer_size = checkpoint_distance * 0.9

View File

@@ -2,7 +2,6 @@ from __future__ import annotations
import os
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnvBuilder,
@@ -10,12 +9,18 @@ from fixtures.neon_fixtures import (
wait_for_last_flush_lsn,
)
from fixtures.pg_version import PgVersion
from fixtures.utils import skip_on_postgres
@skip_on_postgres(
PgVersion.V14,
reason="pg_log_standby_snapshot() function is available since Postgres 16",
)
@skip_on_postgres(
PgVersion.V15,
reason="pg_log_standby_snapshot() function is available since Postgres 16",
)
def test_layer_bloating(neon_env_builder: NeonEnvBuilder, vanilla_pg):
if neon_env_builder.pg_version != PgVersion.V16:
pytest.skip("pg_log_standby_snapshot() function is available only in PG16")
env = neon_env_builder.init_start(
initial_tenant_conf={
"gc_period": "0s",

View File

@@ -2,7 +2,6 @@ from __future__ import annotations
import time
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnvBuilder,
@@ -12,17 +11,13 @@ from fixtures.neon_fixtures import (
from fixtures.pageserver.common_types import parse_layer_file_name
from fixtures.pageserver.utils import wait_for_upload
from fixtures.remote_storage import RemoteStorageKind
from fixtures.utils import skip_in_debug_build
# Creates a few layers, ensures that we can evict them (removing locally but keeping track of them anyway)
# and then download them back.
def test_basic_eviction(
neon_env_builder: NeonEnvBuilder,
build_type: str,
):
if build_type == "debug":
pytest.skip("times out in debug builds")
@skip_in_debug_build("times out in debug builds")
def test_basic_eviction(neon_env_builder: NeonEnvBuilder):
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
env = neon_env_builder.init_start(

View File

@@ -5,8 +5,7 @@ import uuid
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder
from fixtures.pg_version import run_only_on_default_postgres
from fixtures.utils import wait_until
from fixtures.utils import run_only_on_default_postgres, wait_until
@pytest.mark.parametrize("level", ["trace", "debug", "info", "warn", "error"])

View File

@@ -1,6 +1,5 @@
from __future__ import annotations
import os
import subprocess
from pathlib import Path
from typing import cast
@@ -15,7 +14,7 @@ from fixtures.neon_fixtures import (
parse_project_git_version_output,
)
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.pg_version import PgVersion, skip_on_postgres
from fixtures.utils import run_only_on_default_postgres, skip_in_debug_build
def helper_compare_timeline_list(
@@ -195,10 +194,8 @@ def test_cli_start_stop_multi(neon_env_builder: NeonEnvBuilder):
res.check_returncode()
@skip_on_postgres(PgVersion.V14, reason="does not use postgres")
@pytest.mark.skipif(
os.environ.get("BUILD_TYPE") == "debug", reason="unit test for test support, either build works"
)
@run_only_on_default_postgres(reason="does not use postgres")
@skip_in_debug_build("unit test for test support, either build works")
def test_parse_project_git_version_output_positive():
commit = "b6f77b5816cf1dba12a3bc8747941182ce220846"
@@ -217,10 +214,8 @@ def test_parse_project_git_version_output_positive():
assert parse_project_git_version_output(example) == commit
@skip_on_postgres(PgVersion.V14, reason="does not use postgres")
@pytest.mark.skipif(
os.environ.get("BUILD_TYPE") == "debug", reason="unit test for test support, either build works"
)
@run_only_on_default_postgres(reason="does not use postgres")
@skip_in_debug_build("unit test for test support, either build works")
def test_parse_project_git_version_output_local_docker():
"""
Makes sure the tests don't accept the default version in Dockerfile one gets without providing
@@ -234,10 +229,8 @@ def test_parse_project_git_version_output_local_docker():
assert input in str(e)
@skip_on_postgres(PgVersion.V14, reason="does not use postgres")
@pytest.mark.skipif(
os.environ.get("BUILD_TYPE") == "debug", reason="cli api sanity, either build works"
)
@run_only_on_default_postgres(reason="does not use postgres")
@skip_in_debug_build("unit test for test support, either build works")
def test_binaries_version_parses(neon_binpath: Path):
"""
Ensures that we can parse the actual outputs of --version from a set of binaries.

View File

@@ -1,7 +1,6 @@
from __future__ import annotations
import asyncio
import os
import time
from typing import TYPE_CHECKING
@@ -16,7 +15,7 @@ from fixtures.neon_fixtures import (
)
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
from fixtures.utils import wait_until
from fixtures.utils import skip_in_debug_build, wait_until
if TYPE_CHECKING:
from typing import Optional
@@ -227,12 +226,9 @@ def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder):
assert get_dirty_bytes(env) >= dirty_after_write
@pytest.mark.skipif(
# We have to use at least ~100MB of data to hit the lowest limit we can configure, which is
# prohibitively slow in debug mode
os.getenv("BUILD_TYPE") == "debug",
reason="Avoid running bulkier ingest tests in debug mode",
)
# We have to use at least ~100MB of data to hit the lowest limit we can configure, which is
# prohibitively slow in debug mode
@skip_in_debug_build("Avoid running bulkier ingest tests in debug mode")
def test_total_size_limit(neon_env_builder: NeonEnvBuilder):
"""
Test that checkpoints are done based on total ephemeral layer size, even if no one timeline is

View File

@@ -8,7 +8,7 @@ import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder
from fixtures.remote_storage import s3_storage
from fixtures.utils import wait_until
from fixtures.utils import skip_in_debug_build, wait_until
# Test restarting page server, while safekeeper and compute node keep
@@ -155,12 +155,8 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
# safekeeper and compute node keep running.
@pytest.mark.timeout(540)
@pytest.mark.parametrize("shard_count", [None, 4])
def test_pageserver_chaos(
neon_env_builder: NeonEnvBuilder, build_type: str, shard_count: Optional[int]
):
if build_type == "debug":
pytest.skip("times out in debug builds")
@skip_in_debug_build("times out in debug builds")
def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]):
# same rationale as with the immediate stop; we might leave orphan layers behind.
neon_env_builder.disable_scrub_on_exit()
neon_env_builder.enable_pageserver_remote_storage(s3_storage())

View File

@@ -17,7 +17,7 @@ from fixtures.pageserver.utils import (
wait_for_upload_queue_empty,
)
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, S3Storage, s3_storage
from fixtures.utils import wait_until
from fixtures.utils import skip_in_debug_build, wait_until
from fixtures.workload import Workload
from werkzeug.wrappers.request import Request
from werkzeug.wrappers.response import Response
@@ -765,7 +765,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
assert download_rate < expect_download_rate * 2
@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build")
@skip_in_debug_build("only run with release build")
@pytest.mark.parametrize("via_controller", [True, False])
def test_slow_secondary_downloads(neon_env_builder: NeonEnvBuilder, via_controller: bool):
"""

View File

@@ -3,7 +3,6 @@
#
from __future__ import annotations
import os
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import TYPE_CHECKING, cast
@@ -19,6 +18,7 @@ from fixtures.neon_fixtures import (
)
from fixtures.pg_version import PgVersion
from fixtures.remote_storage import s3_storage
from fixtures.utils import skip_in_debug_build
if TYPE_CHECKING:
from typing import Optional
@@ -329,7 +329,7 @@ def test_sql_regress(
post_checks(env, test_output_dir, DBNAME, endpoint)
@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build")
@skip_in_debug_build("only run with release build")
def test_tx_abort_with_many_relations(
neon_env_builder: NeonEnvBuilder,
):

View File

@@ -30,7 +30,7 @@ import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv, wait_for_last_flush_lsn, wait_replica_caughtup
from fixtures.pg_version import PgVersion
from fixtures.utils import query_scalar, wait_until
from fixtures.utils import query_scalar, skip_on_postgres, wait_until
CREATE_SUBXACTS_FUNC = """
create or replace function create_subxacts(n integer) returns void as $$
@@ -137,6 +137,12 @@ def test_replica_start_scan_clog_crashed_xids(neon_simple_env: NeonEnv):
assert secondary_cur.fetchone() == (1,)
@skip_on_postgres(
PgVersion.V14, reason="pg_log_standby_snapshot() function is available since Postgres 16"
)
@skip_on_postgres(
PgVersion.V15, reason="pg_log_standby_snapshot() function is available since Postgres 16"
)
def test_replica_start_at_running_xacts(neon_simple_env: NeonEnv, pg_version):
"""
Test that starting a replica works right after the primary has
@@ -149,9 +155,6 @@ def test_replica_start_at_running_xacts(neon_simple_env: NeonEnv, pg_version):
"""
env = neon_simple_env
if env.pg_version == PgVersion.V14 or env.pg_version == PgVersion.V15:
pytest.skip("pg_log_standby_snapshot() function is available only in PG16")
primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary")
primary_conn = primary.connect()
primary_cur = primary_conn.cursor()

View File

@@ -20,7 +20,7 @@ from fixtures.neon_fixtures import (
)
from fixtures.pageserver.utils import assert_prefix_empty, assert_prefix_not_empty
from fixtures.remote_storage import s3_storage
from fixtures.utils import wait_until
from fixtures.utils import skip_in_debug_build, wait_until
from fixtures.workload import Workload
from pytest_httpserver import HTTPServer
from typing_extensions import override
@@ -853,12 +853,9 @@ def test_sharding_split_stripe_size(
wait_until(10, 1, assert_restart_notification)
@pytest.mark.skipif(
# The quantity of data isn't huge, but debug can be _very_ slow, and the things we're
# validating in this test don't benefit much from debug assertions.
os.getenv("BUILD_TYPE") == "debug",
reason="Avoid running bulkier ingest tests in debug mode",
)
# The quantity of data isn't huge, but debug can be _very_ slow, and the things we're
# validating in this test don't benefit much from debug assertions.
@skip_in_debug_build("Avoid running bulkier ingest tests in debug mode")
def test_sharding_ingest_layer_sizes(
neon_env_builder: NeonEnvBuilder,
):

View File

@@ -36,11 +36,12 @@ from fixtures.pageserver.utils import (
remote_storage_delete_key,
timeline_delete_wait_completed,
)
from fixtures.pg_version import PgVersion, run_only_on_default_postgres
from fixtures.pg_version import PgVersion
from fixtures.port_distributor import PortDistributor
from fixtures.remote_storage import RemoteStorageKind, s3_storage
from fixtures.storage_controller_proxy import StorageControllerProxy
from fixtures.utils import (
run_only_on_default_postgres,
run_pg_bench_small,
subprocess_capture,
wait_until,

View File

@@ -1,6 +1,5 @@
from __future__ import annotations
import os
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
@@ -21,7 +20,7 @@ from fixtures.pageserver.utils import (
wait_until_tenant_active,
)
from fixtures.pg_version import PgVersion
from fixtures.utils import wait_until
from fixtures.utils import skip_in_debug_build, wait_until
def test_empty_tenant_size(neon_env_builder: NeonEnvBuilder):
@@ -279,7 +278,7 @@ def test_only_heads_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Pa
size_debug_file.write(size_debug)
@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build")
@skip_in_debug_build("only run with release build")
def test_single_branch_get_tenant_size_grows(
neon_env_builder: NeonEnvBuilder, test_output_dir: Path, pg_version: PgVersion
):

View File

@@ -869,8 +869,17 @@ def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder):
assert count == 10000
@pytest.mark.parametrize("mode", ["delete_timeline", "delete_tenant"])
@pytest.mark.parametrize("sharded", [False, True])
@pytest.mark.parametrize(
"mode, sharded",
[
("delete_timeline", False),
("delete_timeline", True),
("delete_tenant", False),
# the shared/exclusive lock for tenant is blocking this:
# timeline detach ancestor takes shared, delete tenant takes exclusive
# ("delete_tenant", True)
],
)
def test_timeline_detach_ancestor_interrupted_by_deletion(
neon_env_builder: NeonEnvBuilder, mode: str, sharded: bool
):
@@ -885,11 +894,6 @@ def test_timeline_detach_ancestor_interrupted_by_deletion(
- shutdown winning over complete, see test_timeline_is_deleted_before_timeline_detach_ancestor_completes
"""
if sharded and mode == "delete_tenant":
# the shared/exclusive lock for tenant is blocking this:
# timeline detach ancestor takes shared, delete tenant takes exclusive
pytest.skip("tenant deletion while timeline ancestor detach is underway cannot happen")
shard_count = 2 if sharded else 1
neon_env_builder.num_pageservers = shard_count
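
As an aside on the parametrize change above: an alternative to dropping the unsupported pair from the grid entirely is to keep it visible but marked, via pytest.param, so it shows up in reports as skipped rather than vanishing from collection. A sketch of that variant (not what this suite does):

```python
# Variant (not what the suite does): keep the pair in the grid, marked skipped.
import pytest


@pytest.mark.parametrize(
    "mode, sharded",
    [
        ("delete_timeline", False),
        ("delete_timeline", True),
        ("delete_tenant", False),
        pytest.param(
            "delete_tenant",
            True,
            marks=pytest.mark.skip(reason="shared/exclusive tenant lock conflict"),
        ),
    ],
)
def test_detach_vs_deletion_variant(mode: str, sharded: bool):
    ...
```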

View File

@@ -54,6 +54,8 @@ from fixtures.utils import (
PropagatingThread,
get_dir_size,
query_scalar,
run_only_on_default_postgres,
skip_in_debug_build,
start_in_background,
wait_until,
)
@@ -2104,10 +2106,9 @@ def test_pull_timeline_while_evicted(neon_env_builder: NeonEnvBuilder):
# The only way to verify this without manipulating time is to sleep for a while.
# In this test we sleep for 60 seconds, so this test takes at least 1 minute to run.
# This is longer than most other tests, we run it only for v16 to save CI resources.
@run_only_on_default_postgres("run only on default Postgres to save CI resources")
@skip_in_debug_build("run only on release build to save CI resources")
def test_idle_reconnections(neon_env_builder: NeonEnvBuilder):
if os.environ.get("PYTEST_CURRENT_TEST", "").find("[debug-pg16]") == -1:
pytest.skip("run only on debug postgres v16 to save CI resources")
neon_env_builder.num_safekeepers = 3
env = neon_env_builder.init_start()

View File

@@ -14,6 +14,7 @@ from fixtures.common_types import Lsn, TenantId, TimelineId
from fixtures.log_helper import getLogger
from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonEnvBuilder, Safekeeper
from fixtures.remote_storage import RemoteStorageKind
from fixtures.utils import skip_in_debug_build
if TYPE_CHECKING:
from typing import Optional
@@ -760,10 +761,8 @@ async def run_wal_lagging(env: NeonEnv, endpoint: Endpoint, test_output_dir: Pat
# The test takes more than default 5 minutes on Postgres 16,
# see https://github.com/neondatabase/neon/issues/5305
@pytest.mark.timeout(600)
@skip_in_debug_build("times out in debug builds")
def test_wal_lagging(neon_env_builder: NeonEnvBuilder, test_output_dir: Path, build_type: str):
if build_type == "debug":
pytest.skip("times out in debug builds")
neon_env_builder.num_safekeepers = 3
env = neon_env_builder.init_start()