add a4 reconfig model

add small models
Fix BecomeLeader, adjust init and target confs.
2026-05-17 13:10:38 +00:00 · 2024-12-02 12:32:51 +01:00 · 2024-12-02 12:26:06 +01:00 · 2024-12-02 12:02:26 +01:00 · 2024-12-02 11:05:39 +01:00 · 2024-12-02 12:11:10 +03:00
250 changed files with 12779 additions and 4155 deletions
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -20,3 +20,4 @@ config-variables:
  - REMOTE_STORAGE_AZURE_REGION
  - SLACK_UPCOMING_RELEASE_CHANNEL_ID
  - DEV_AWS_OIDC_ROLE_ARN
+  - BENCHMARK_INGEST_TARGET_PROJECTID
--- a/.github/actions/allure-report-generate/action.yml
+++ b/.github/actions/allure-report-generate/action.yml
@@ -221,6 +221,8 @@ runs:
        REPORT_URL: ${{ steps.generate-report.outputs.report-url }}
        COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
      with:
+        # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
+        retries: 5
        script: |
          const { REPORT_URL, COMMIT_SHA } = process.env

--- a/.github/actions/set-docker-config-dir/action.yml
+++ b/.github/actions/set-docker-config-dir/action.yml
@@ -1,36 +0,0 @@
-name: "Set custom docker config directory"
-description: "Create a directory for docker config and set DOCKER_CONFIG"
-
-# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
-runs:
-  using: "composite"
-  steps:
-  - name: Show warning on GitHub-hosted runners
-    if: runner.environment == 'github-hosted'
-    shell: bash -euo pipefail {0}
-    run: |
-      # Using the following environment variables to find a path to the workflow file
-      # ${GITHUB_WORKFLOW_REF} - octocat/hello-world/.github/workflows/my-workflow.yml@refs/heads/my_branch
-      # ${GITHUB_REPOSITORY}   - octocat/hello-world
-      # ${GITHUB_REF}          - refs/heads/my_branch
-      # From https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/variables
-
-      filename_with_ref=${GITHUB_WORKFLOW_REF#"$GITHUB_REPOSITORY/"}
-      filename=${filename_with_ref%"@$GITHUB_REF"}
-
-      # https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/workflow-commands-for-github-actions#setting-a-warning-message
-      title='Unnecessary usage of `.github/actions/set-docker-config-dir`'
-      message='No need to use `.github/actions/set-docker-config-dir` action on GitHub-hosted runners'
-      echo "::warning file=${filename},title=${title}::${message}"
-
-  - uses: pyTooling/Actions/with-post-step@74afc5a42a17a046c90c68cb5cfa627e5c6c5b6b # v1.0.7
-    env:
-      DOCKER_CONFIG: .docker-custom-${{ github.run_id }}-${{ github.run_attempt }}
-    with:
-      main: |
-        mkdir -p "${DOCKER_CONFIG}"
-        echo DOCKER_CONFIG=${DOCKER_CONFIG} | tee -a $GITHUB_ENV
-      post: |
-        if [ -d "${DOCKER_CONFIG}" ]; then
-          rm -r "${DOCKER_CONFIG}"
-        fi
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -1,14 +1,3 @@
 ## Problem

 ## Summary of changes
-
-## Checklist before requesting a review
-
- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section.
-
-## Checklist before merging
-
- [ ] Do not forget to reformat commit message to not include the above checklist
--- a/.github/workflows/_check-codestyle-python.yml
+++ b/.github/workflows/_check-codestyle-python.yml
@@ -0,0 +1,37 @@
+name: Check Codestyle Python
+
+on:
+  workflow_call:
+    inputs:
+      build-tools-image:
+        description: 'build-tools image'
+        required: true
+        type: string
+
+defaults:
+  run:
+    shell: bash -euxo pipefail {0}
+
+jobs:
+  check-codestyle-python:
+    runs-on: [ self-hosted, small ]
+    container:
+      image: ${{ inputs.build-tools-image }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+      options: --init
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/cache@v4
+        with:
+          path: ~/.cache/pypoetry/virtualenvs
+          key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }}
+
+      - run: ./scripts/pysync
+
+      - run: poetry run ruff check .
+      - run: poetry run ruff format --check .
+      - run: poetry run mypy .
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -545,12 +545,12 @@ jobs:
        arch=$(uname -m | sed 's/x86_64/amd64/g' | sed 's/aarch64/arm64/g')

        cd /home/nonroot
-        wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-17/libpq5_17.0-1.pgdg110+1_${arch}.deb"
-        wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.4-1.pgdg110+2_${arch}.deb"
-        wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.4-1.pgdg110+2_${arch}.deb"
-        dpkg -x libpq5_17.0-1.pgdg110+1_${arch}.deb pg
-        dpkg -x postgresql-16_16.4-1.pgdg110+2_${arch}.deb pg
-        dpkg -x postgresql-client-16_16.4-1.pgdg110+2_${arch}.deb pg
+        wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-17/libpq5_17.1-1.pgdg110+1_${arch}.deb"
+        wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.5-1.pgdg110+1_${arch}.deb"
+        wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.5-1.pgdg110+1_${arch}.deb"
+        dpkg -x libpq5_17.1-1.pgdg110+1_${arch}.deb pg
+        dpkg -x postgresql-16_16.5-1.pgdg110+1_${arch}.deb pg
+        dpkg -x postgresql-client-16_16.5-1.pgdg110+1_${arch}.deb pg

        mkdir -p /tmp/neon/pg_install/v16/bin
        ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench
--- a/.github/workflows/build-build-tools-image.yml
+++ b/.github/workflows/build-build-tools-image.yml
@@ -64,7 +64,7 @@ jobs:

      - uses: actions/checkout@v4

-      - uses: ./.github/actions/set-docker-config-dir
+      - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193
      - uses: docker/setup-buildx-action@v3
        with:
          cache-binary: false
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -90,35 +90,10 @@ jobs:

  check-codestyle-python:
    needs: [ check-permissions, build-build-tools-image ]
-    runs-on: [ self-hosted, small ]
-    container:
-      image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
-      credentials:
-        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
-        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
-      options: --init
-
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-
-      - name: Cache poetry deps
-        uses: actions/cache@v4
-        with:
-          path: ~/.cache/pypoetry/virtualenvs
-          key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }}
-
-      - name: Install Python deps
-        run: ./scripts/pysync
-
-      - name: Run `ruff check` to ensure code format
-        run: poetry run ruff check .
-
-      - name: Run `ruff format` to ensure code format
-        run: poetry run ruff format --check .
-
-      - name: Run mypy to check types
-        run: poetry run mypy .
+    uses: ./.github/workflows/_check-codestyle-python.yml
+    with:
+      build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
+    secrets: inherit

  check-codestyle-jsonnet:
    needs: [ check-permissions, build-build-tools-image ]
@@ -141,6 +116,7 @@ jobs:
  # Check that the vendor/postgres-* submodules point to the
  # corresponding REL_*_STABLE_neon branches.
  check-submodules:
+    needs: [ check-permissions ]
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout
@@ -521,6 +497,8 @@ jobs:
          REPORT_URL_NEW: ${{ steps.upload-coverage-report-new.outputs.report-url }}
          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
        with:
+          # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
+          retries: 5
          script: |
            const { REPORT_URL_NEW, COMMIT_SHA } = process.env

@@ -552,7 +530,7 @@ jobs:
        with:
          submodules: true

-      - uses: ./.github/actions/set-docker-config-dir
+      - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193
      - uses: docker/setup-buildx-action@v3
        with:
          cache-binary: false
@@ -643,7 +621,7 @@ jobs:
        with:
          submodules: true

-      - uses: ./.github/actions/set-docker-config-dir
+      - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193
      - uses: docker/setup-buildx-action@v3
        with:
          cache-binary: false
@@ -824,7 +802,7 @@ jobs:
          curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder
          chmod +x vm-builder

-      - uses: ./.github/actions/set-docker-config-dir
+      - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193
      - uses: docker/login-action@v3
        with:
          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
@@ -860,7 +838,7 @@ jobs:
    steps:
      - uses: actions/checkout@v4

-      - uses: ./.github/actions/set-docker-config-dir
+      - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193
      - uses: docker/login-action@v3
        with:
          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
--- a/.github/workflows/ingest_benchmark.yml
+++ b/.github/workflows/ingest_benchmark.yml
@@ -0,0 +1,372 @@
+name: Benchmarking
+
+on:
+  # uncomment to run on push for debugging your PR
+  # push:
+  #   branches: [ your branch ]
+  schedule:
+    # * is a special character in YAML so you have to quote this string
+    #          ┌───────────── minute (0 - 59)
+    #          │ ┌───────────── hour (0 - 23)
+    #          │ │ ┌───────────── day of the month (1 - 31)
+    #          │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
+    #          │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
+    - cron:   '0 9 * * *' # run once a day, timezone is utc
+  workflow_dispatch: # adds ability to run this manually
+    
+defaults:
+  run:
+    shell: bash -euxo pipefail {0}
+
+concurrency:
+  # Allow only one workflow globally because we need dedicated resources which only exist once
+  group: ingest-bench-workflow
+  cancel-in-progress: true
+
+jobs:
+  ingest:
+    strategy:
+      matrix:
+        target_project: [new_empty_project, large_existing_project]  
+    permissions:
+      contents: write
+      statuses: write
+      id-token: write # aws-actions/configure-aws-credentials
+    env:
+      PG_CONFIG: /tmp/neon/pg_install/v16/bin/pg_config
+      PSQL: /tmp/neon/pg_install/v16/bin/psql
+      PG_16_LIB_PATH: /tmp/neon/pg_install/v16/lib
+      PGCOPYDB: /pgcopydb/bin/pgcopydb
+      PGCOPYDB_LIB_PATH: /pgcopydb/lib
+    runs-on: [ self-hosted, us-east-2, x64 ]
+    container:
+      image: neondatabase/build-tools:pinned-bookworm
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+      options: --init
+    timeout-minutes: 1440
+
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Configure AWS credentials # necessary to download artefacts
+      uses: aws-actions/configure-aws-credentials@v4
+      with:
+        aws-region: eu-central-1
+        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        role-duration-seconds: 18000 # 5 hours is currently max associated with IAM role 
+
+    - name: Download Neon artifact
+      uses: ./.github/actions/download
+      with:
+        name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
+        path: /tmp/neon/
+        prefix: latest
+
+    - name: Create Neon Project
+      if: ${{ matrix.target_project == 'new_empty_project' }}
+      id: create-neon-project-ingest-target
+      uses: ./.github/actions/neon-project-create
+      with:
+        region_id: aws-us-east-2
+        postgres_version: 16
+        compute_units: '[7, 7]' # we want to test large compute here to avoid compute-side bottleneck
+        api_key: ${{ secrets.NEON_STAGING_API_KEY }}
+
+    - name: Initialize Neon project and retrieve current backpressure seconds
+      if: ${{ matrix.target_project == 'new_empty_project' }}
+      env:
+          NEW_PROJECT_CONNSTR: ${{ steps.create-neon-project-ingest-target.outputs.dsn }}
+          NEW_PROJECT_ID: ${{ steps.create-neon-project-ingest-target.outputs.project_id }}
+      run: |
+        echo "Initializing Neon project with project_id: ${NEW_PROJECT_ID}"
+        export LD_LIBRARY_PATH=${PG_16_LIB_PATH}
+        ${PSQL} "${NEW_PROJECT_CONNSTR}" -c "CREATE EXTENSION IF NOT EXISTS neon; CREATE EXTENSION IF NOT EXISTS neon_utils;"
+        BACKPRESSURE_TIME_BEFORE_INGEST=$(${PSQL} "${NEW_PROJECT_CONNSTR}" -t -c "select backpressure_throttling_time()/1000000;")
+        echo "BACKPRESSURE_TIME_BEFORE_INGEST=${BACKPRESSURE_TIME_BEFORE_INGEST}" >> $GITHUB_ENV
+        echo "NEW_PROJECT_CONNSTR=${NEW_PROJECT_CONNSTR}" >> $GITHUB_ENV
+
+    - name: Create Neon Branch for large tenant
+      if: ${{ matrix.target_project == 'large_existing_project' }}
+      id: create-neon-branch-ingest-target
+      uses: ./.github/actions/neon-branch-create
+      with:
+        project_id: ${{ vars.BENCHMARK_INGEST_TARGET_PROJECTID }}
+        api_key: ${{ secrets.NEON_STAGING_API_KEY }}
+
+    - name: Initialize Neon project and retrieve current backpressure seconds
+      if: ${{ matrix.target_project == 'large_existing_project' }}
+      env:
+          NEW_PROJECT_CONNSTR: ${{ steps.create-neon-branch-ingest-target.outputs.dsn }}
+          NEW_BRANCH_ID: ${{ steps.create-neon-branch-ingest-target.outputs.branch_id }}
+      run: |
+        echo "Initializing Neon branch with branch_id: ${NEW_BRANCH_ID}"
+        export LD_LIBRARY_PATH=${PG_16_LIB_PATH}
+        # Extract the part before the database name (everything up to the last "/")
+        base_connstr="${NEW_PROJECT_CONNSTR%/*}"
+        # Extract the query parameters (if any) after the database name; unchanged string means there was no "?"
+        query_params="${NEW_PROJECT_CONNSTR#*\?}"
+        # Reconstruct a connection string to neondb, from which the ludicrous database is (re)created
+        if [ "$query_params" != "$NEW_PROJECT_CONNSTR" ]; then
+          new_connstr="${base_connstr}/neondb?${query_params}"
+        else
+          new_connstr="${base_connstr}/neondb"
+        fi
+        ${PSQL} "${new_connstr}" -c "DROP DATABASE IF EXISTS ludicrous;" # a missing database must not fail the step
+        ${PSQL} "${new_connstr}" -c "CREATE DATABASE ludicrous;"
+        if [ "$query_params" != "$NEW_PROJECT_CONNSTR" ]; then
+          NEW_PROJECT_CONNSTR="${base_connstr}/ludicrous?${query_params}"
+        else
+          NEW_PROJECT_CONNSTR="${base_connstr}/ludicrous"
+        fi
+        ${PSQL} "${NEW_PROJECT_CONNSTR}" -c "CREATE EXTENSION IF NOT EXISTS neon; CREATE EXTENSION IF NOT EXISTS neon_utils;"
+        BACKPRESSURE_TIME_BEFORE_INGEST=$(${PSQL} "${NEW_PROJECT_CONNSTR}" -t -c "select backpressure_throttling_time()/1000000;")
+        echo "BACKPRESSURE_TIME_BEFORE_INGEST=${BACKPRESSURE_TIME_BEFORE_INGEST}" >> $GITHUB_ENV
+        echo "NEW_PROJECT_CONNSTR=${NEW_PROJECT_CONNSTR}" >> $GITHUB_ENV
+      
+        
+    - name: Create pgcopydb filter file
+      run: |
+        cat << EOF > /tmp/pgcopydb_filter.txt
+          [include-only-table]
+          public.events
+          public.emails
+          public.email_transmissions
+          public.payments
+          public.editions
+          public.edition_modules
+          public.sp_content
+          public.email_broadcasts
+          public.user_collections
+          public.devices
+          public.user_accounts
+          public.lessons
+          public.lesson_users
+          public.payment_methods
+          public.orders
+          public.course_emails
+          public.modules
+          public.users
+          public.module_users
+          public.courses
+          public.payment_gateway_keys
+          public.accounts
+          public.roles
+          public.payment_gateways
+          public.management
+          public.event_names
+        EOF
+
+    - name: Invoke pgcopydb
+      env:
+          BENCHMARK_INGEST_SOURCE_CONNSTR: ${{ secrets.BENCHMARK_INGEST_SOURCE_CONNSTR }}
+      run: |
+        export LD_LIBRARY_PATH=${PGCOPYDB_LIB_PATH}:${PG_16_LIB_PATH}
+        export PGCOPYDB_SOURCE_PGURI="${BENCHMARK_INGEST_SOURCE_CONNSTR}"
+        export PGCOPYDB_TARGET_PGURI="${NEW_PROJECT_CONNSTR}"
+        export PGOPTIONS="-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7"
+        ${PG_CONFIG} --bindir
+        ${PGCOPYDB} --version
+        ${PGCOPYDB} clone --skip-vacuum  --no-owner --no-acl --skip-db-properties --table-jobs 4 \
+          --index-jobs 4 --restore-jobs 4 --split-tables-larger-than 10GB --skip-extensions \
+          --use-copy-binary --filters /tmp/pgcopydb_filter.txt 2>&1 | tee /tmp/pgcopydb_${{ matrix.target_project }}.log
+
+    # create dummy pgcopydb log to test parsing
+    # - name: create dummy log for parser test
+    #   run: |
+    #     cat << EOF > /tmp/pgcopydb_${{ matrix.target_project }}.log
+    #     2024-11-04 18:00:53.433 500861 INFO   main.c:136                Running pgcopydb version 0.17.10.g8361a93 from "/usr/lib/postgresql/17/bin/pgcopydb"
+    #     2024-11-04 18:00:53.434 500861 INFO   cli_common.c:1225         [SOURCE] Copying database from "postgres://neondb_owner@ep-bitter-shape-w2c1ir0a.us-east-2.aws.neon.build/neondb?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60"
+    #     2024-11-04 18:00:53.434 500861 INFO   cli_common.c:1226         [TARGET] Copying database into "postgres://neondb_owner@ep-icy-union-w25qd5pj.us-east-2.aws.neon.build/ludicrous?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60"
+    #     2024-11-04 18:00:53.442 500861 INFO   copydb.c:105              Using work dir "/tmp/pgcopydb"
+    #     2024-11-04 18:00:53.541 500861 INFO   snapshot.c:107            Exported snapshot "00000008-00000033-1" from the source database
+    #     2024-11-04 18:00:53.556 500865 INFO   cli_clone_follow.c:543    STEP 1: fetch source database tables, indexes, and sequences
+    #     2024-11-04 18:00:54.570 500865 INFO   copydb_schema.c:716       Splitting source candidate tables larger than 10 GB
+    #     2024-11-04 18:00:54.570 500865 INFO   copydb_schema.c:829       Table public.events is 96 GB large which is larger than --split-tables-larger-than 10 GB, and does not have a unique column of type integer: splitting by CTID
+    #     2024-11-04 18:01:05.538 500865 INFO   copydb_schema.c:905       Table public.events is 96 GB large, 10 COPY processes will be used, partitioning on ctid.
+    #     2024-11-04 18:01:05.564 500865 INFO   copydb_schema.c:905       Table public.email_transmissions is 27 GB large, 4 COPY processes will be used, partitioning on id.
+    #     2024-11-04 18:01:05.584 500865 INFO   copydb_schema.c:905       Table public.lessons is 25 GB large, 4 COPY processes will be used, partitioning on id.
+    #     2024-11-04 18:01:05.605 500865 INFO   copydb_schema.c:905       Table public.lesson_users is 16 GB large, 3 COPY processes will be used, partitioning on id.
+    #     2024-11-04 18:01:05.605 500865 INFO   copydb_schema.c:761       Fetched information for 26 tables (including 4 tables split in 21 partitions total), with an estimated total of 907 million tuples and 175 GB on-disk
+    #     2024-11-04 18:01:05.687 500865 INFO   copydb_schema.c:968       Fetched information for 57 indexes (supporting 25 constraints)
+    #     2024-11-04 18:01:05.753 500865 INFO   sequences.c:78            Fetching information for 24 sequences
+    #     2024-11-04 18:01:05.903 500865 INFO   copydb_schema.c:1122      Fetched information for 4 extensions
+    #     2024-11-04 18:01:06.178 500865 INFO   copydb_schema.c:1538      Found 0 indexes (supporting 0 constraints) in the target database
+    #     2024-11-04 18:01:06.184 500865 INFO   cli_clone_follow.c:584    STEP 2: dump the source database schema (pre/post data)
+    #     2024-11-04 18:01:06.186 500865 INFO   pgcmd.c:468                /usr/lib/postgresql/16/bin/pg_dump -Fc --snapshot 00000008-00000033-1 --section=pre-data --section=post-data --file /tmp/pgcopydb/schema/schema.dump 'postgres://neondb_owner@ep-bitter-shape-w2c1ir0a.us-east-2.aws.neon.build/neondb?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60'
+    #     2024-11-04 18:01:06.952 500865 INFO   cli_clone_follow.c:592    STEP 3: restore the pre-data section to the target database
+    #     2024-11-04 18:01:07.004 500865 INFO   pgcmd.c:1001               /usr/lib/postgresql/16/bin/pg_restore --dbname 'postgres://neondb_owner@ep-icy-union-w25qd5pj.us-east-2.aws.neon.build/ludicrous?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60' --section pre-data --jobs 4 --no-owner --no-acl --use-list /tmp/pgcopydb/schema/pre-filtered.list /tmp/pgcopydb/schema/schema.dump
+    #     2024-11-04 18:01:07.438 500874 INFO   table-data.c:656          STEP 4: starting 4 table-data COPY processes
+    #     2024-11-04 18:01:07.451 500877 INFO   vacuum.c:139              STEP 8: skipping VACUUM jobs per --skip-vacuum
+    #     2024-11-04 18:01:07.457 500875 INFO   indexes.c:182             STEP 6: starting 4 CREATE INDEX processes
+    #     2024-11-04 18:01:07.457 500875 INFO   indexes.c:183             STEP 7: constraints are built by the CREATE INDEX processes
+    #     2024-11-04 18:01:07.507 500865 INFO   blobs.c:74                Skipping large objects: none found.
+    #     2024-11-04 18:01:07.509 500865 INFO   sequences.c:194           STEP 9: reset sequences values
+    #     2024-11-04 18:01:07.510 500886 INFO   sequences.c:290           Set sequences values on the target database
+    #     2024-11-04 20:49:00.587 500865 INFO   cli_clone_follow.c:608    STEP 10: restore the post-data section to the target database
+    #     2024-11-04 20:49:00.600 500865 INFO   pgcmd.c:1001               /usr/lib/postgresql/16/bin/pg_restore --dbname 'postgres://neondb_owner@ep-icy-union-w25qd5pj.us-east-2.aws.neon.build/ludicrous?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60' --section post-data --jobs 4 --no-owner --no-acl --use-list /tmp/pgcopydb/schema/post-filtered.list /tmp/pgcopydb/schema/schema.dump
+    #     2024-11-05 10:50:58.508 500865 INFO   cli_clone_follow.c:639    All step are now done, 16h49m elapsed
+    #     2024-11-05 10:50:58.508 500865 INFO   summary.c:3155            Printing summary for 26 tables and 57 indexes
+
+    #       OID | Schema |                 Name | Parts | copy duration | transmitted bytes | indexes | create index duration 
+    #     ------+--------+----------------------+-------+---------------+-------------------+---------+----------------------
+    #     24654 | public |               events |    10 |         1d11h |            878 GB |       1 |                 1h41m
+    #     24623 | public |  email_transmissions |     4 |         4h46m |             99 GB |       3 |                 2h04m
+    #     24665 | public |              lessons |     4 |         4h42m |            161 GB |       4 |                 1m11s
+    #     24661 | public |         lesson_users |     3 |         2h46m |             49 GB |       3 |                39m35s
+    #     24631 | public |               emails |     1 |        34m07s |             10 GB |       2 |                   17s
+    #     24739 | public |             payments |     1 |         5m47s |           1848 MB |       4 |                 4m40s
+    #     24681 | public |         module_users |     1 |         4m57s |           1610 MB |       3 |                 1m50s
+    #     24694 | public |               orders |     1 |         2m50s |            835 MB |       3 |                 1m05s
+    #     24597 | public |              devices |     1 |         1m45s |            498 MB |       2 |                   40s
+    #     24723 | public |      payment_methods |     1 |         1m24s |            548 MB |       2 |                   31s
+    #     24765 | public |     user_collections |     1 |         2m17s |           1005 MB |       2 |                 968ms
+    #     24774 | public |                users |     1 |           52s |            291 MB |       4 |                   27s
+    #     24760 | public |        user_accounts |     1 |           16s |            172 MB |       3 |                   16s
+    #     24606 | public |      edition_modules |     1 |         8s983 |             46 MB |       3 |                 4s749
+    #     24583 | public |        course_emails |     1 |         8s526 |             26 MB |       2 |                 996ms
+    #     24685 | public |              modules |     1 |         1s592 |             21 MB |       3 |                 1s696
+    #     24610 | public |             editions |     1 |         2s199 |           7483 kB |       2 |                 1s032
+    #     24755 | public |           sp_content |     1 |         1s555 |           4177 kB |       0 |                   0ms
+    #     24619 | public |     email_broadcasts |     1 |         744ms |           2645 kB |       2 |                 677ms
+    #     24590 | public |              courses |     1 |         387ms |           1540 kB |       2 |                 367ms
+    #     24704 | public | payment_gateway_keys |     1 |         1s972 |            164 kB |       2 |                  27ms
+    #     24576 | public |             accounts |     1 |          58ms |             24 kB |       1 |                  14ms
+    #     24647 | public |          event_names |     1 |          32ms |             397 B |       1 |                   8ms
+    #     24716 | public |     payment_gateways |     1 |         1s675 |             117 B |       1 |                  11ms
+    #     24748 | public |                roles |     1 |          71ms |             173 B |       1 |                   8ms
+    #     24676 | public |           management |     1 |          33ms |              40 B |       1 |                  19ms
+
+
+    #                                                   Step   Connection    Duration    Transfer   Concurrency
+    #     --------------------------------------------------   ----------  ----------  ----------  ------------
+    #       Catalog Queries (table ordering, filtering, etc)       source         12s                         1
+    #                                             Dump Schema       source       765ms                         1
+    #                                         Prepare Schema       target       466ms                         1
+    #           COPY, INDEX, CONSTRAINTS, VACUUM (wall clock)         both       2h47m                        12
+    #                                       COPY (cumulative)         both       7h46m     1225 GB             4
+    #                               CREATE INDEX (cumulative)       target       4h36m                         4
+    #                               CONSTRAINTS (cumulative)       target       8s493                         4
+    #                                     VACUUM (cumulative)       target         0ms                         4
+    #                                         Reset Sequences         both        60ms                         1
+    #                             Large Objects (cumulative)       (null)         0ms                         0
+    #                                         Finalize Schema         both      14h01m                         4
+    #     --------------------------------------------------   ----------  ----------  ----------  ------------
+    #                               Total Wall Clock Duration         both      16h49m                        20
+
+
+    #     EOF
+
+
+    - name: show tables sizes and retrieve current backpressure seconds
+      run: |
+        export LD_LIBRARY_PATH=${PG_16_LIB_PATH}
+        ${PSQL} "${NEW_PROJECT_CONNSTR}" -c "\dt+"
+        BACKPRESSURE_TIME_AFTER_INGEST=$(${PSQL} "${NEW_PROJECT_CONNSTR}" -t -c "select backpressure_throttling_time()/1000000;")
+        echo "BACKPRESSURE_TIME_AFTER_INGEST=${BACKPRESSURE_TIME_AFTER_INGEST}" >> $GITHUB_ENV
+
+    - name: Parse pgcopydb log and report performance metrics
+      env:
+        PERF_TEST_RESULT_CONNSTR: ${{ secrets.PERF_TEST_RESULT_CONNSTR }}
+      run: |
+        export LD_LIBRARY_PATH=${PG_16_LIB_PATH}
+
+        # Define the log file path
+        LOG_FILE="/tmp/pgcopydb_${{ matrix.target_project }}.log"
+        
+        # Get the current git commit hash
+        git config --global --add safe.directory /__w/neon/neon
+        COMMIT_HASH=$(git rev-parse --short HEAD)
+        
+        # Define the platform and test suite
+        PLATFORM="pg16-${{ matrix.target_project }}-us-east-2-staging"
+        SUIT="pgcopydb_ingest_bench"
+        
+        # Convert a pgcopydb duration (e.g., "1d11h", "2h47m", "5m47s", "8s493", "118ms") to whole seconds
+        convert_to_seconds() {
+          local duration=$1
+          local total_seconds=0
+          # Every component is read as base 10 (10#...) so "0", "07" or "08" are neither emptied nor taken as octal
+          # Check for days (d)
+          if [[ "$duration" =~ ([0-9]+)d ]]; then
+            total_seconds=$((total_seconds + 10#${BASH_REMATCH[1]} * 86400))
+          fi
+          # Check for hours (h)
+          if [[ "$duration" =~ ([0-9]+)h ]]; then
+            total_seconds=$((total_seconds + 10#${BASH_REMATCH[1]} * 3600))
+          fi
+          # Check for seconds (s); in "8s493" the digits after the s are milliseconds and are dropped
+          if [[ "$duration" =~ ([0-9]+)s ]]; then
+            total_seconds=$((total_seconds + 10#${BASH_REMATCH[1]}))
+          fi
+          # Check for milliseconds (ms); the integer division truncates to whole seconds
+          if [[ "$duration" =~ ([0-9]+)ms ]]; then
+            total_seconds=$((total_seconds + 10#${BASH_REMATCH[1]} / 1000))
+            duration=${duration/${BASH_REMATCH[0]}/} # need to remove it to avoid double counting with m
+          fi
+          # Check for minutes (m) - must be checked after ms because m is contained in ms
+          if [[ "$duration" =~ ([0-9]+)m ]]; then
+            total_seconds=$((total_seconds + 10#${BASH_REMATCH[1]} * 60))
+          fi
+          echo "$total_seconds"
+        }
+
+        # Calculate the backpressure difference in seconds
+        BACKPRESSURE_TIME_DIFF=$(awk "BEGIN {print $BACKPRESSURE_TIME_AFTER_INGEST - $BACKPRESSURE_TIME_BEFORE_INGEST}")
+
+        # Insert the backpressure time difference into the performance database
+        if [ -n "$BACKPRESSURE_TIME_DIFF" ]; then
+          echo "Inserting backpressure time difference: ${BACKPRESSURE_TIME_DIFF} seconds"
+          # psql is invoked directly (no eval) so the connection string is never re-parsed by the shell
+          ${PSQL} "${PERF_TEST_RESULT_CONNSTR}" -c "
+          INSERT INTO public.perf_test_results (suit, revision, platform, metric_name, metric_value, metric_unit, metric_report_type, recorded_at_timestamp)
+          VALUES ('${SUIT}', '${COMMIT_HASH}', '${PLATFORM}', 'backpressure_time', ${BACKPRESSURE_TIME_DIFF}, 'seconds', 'lower_is_better', now());
+          "
+        fi
+
+        # Extract and process log lines
+        while IFS= read -r line; do
+          METRIC_NAME=""
+          # Match each desired line and extract the relevant information
+          if [[ "$line" =~ COPY,\ INDEX,\ CONSTRAINTS,\ VACUUM.* ]]; then
+            METRIC_NAME="COPY, INDEX, CONSTRAINTS, VACUUM (wall clock)"
+          elif [[ "$line" =~ COPY\ \(cumulative\).* ]]; then
+            METRIC_NAME="COPY (cumulative)"
+          elif [[ "$line" =~ CREATE\ INDEX\ \(cumulative\).* ]]; then
+            METRIC_NAME="CREATE INDEX (cumulative)"
+          elif [[ "$line" =~ CONSTRAINTS\ \(cumulative\).* ]]; then
+            METRIC_NAME="CONSTRAINTS (cumulative)"
+          elif [[ "$line" =~ Finalize\ Schema.* ]]; then
+            METRIC_NAME="Finalize Schema"
+          elif [[ "$line" =~ Total\ Wall\ Clock\ Duration.* ]]; then
+            METRIC_NAME="Total Wall Clock Duration"
+          fi
+
+          # If a metric was matched, insert it into the performance database
+          if [ -n "$METRIC_NAME" ]; then
+            # Capture the whole duration token (e.g. "1d11h", "5m47s", "8s493"), not only its last component
+            DURATION=$(echo "$line" | grep -oP '\d+d\d+h|\d+h\d+m|\d+m\d+s|\d+s\d*|\d+ms|\d+\.\d+s' | head -n 1)
+            METRIC_VALUE=$(convert_to_seconds "$DURATION")
+            echo "Inserting ${METRIC_NAME} with value ${METRIC_VALUE} seconds"
+            ${PSQL} "${PERF_TEST_RESULT_CONNSTR}" -c "
+            INSERT INTO public.perf_test_results (suit, revision, platform, metric_name, metric_value, metric_unit, metric_report_type, recorded_at_timestamp)
+            VALUES ('${SUIT}', '${COMMIT_HASH}', '${PLATFORM}', '${METRIC_NAME}', ${METRIC_VALUE}, 'seconds', 'lower_is_better', now());
+            "
+          fi
+        done < "$LOG_FILE"
+      
+    - name: Delete Neon Project
+      if: ${{ always() && matrix.target_project == 'new_empty_project' }}
+      uses: ./.github/actions/neon-project-delete
+      with:
+        project_id: ${{ steps.create-neon-project-ingest-target.outputs.project_id }}
+        api_key: ${{ secrets.NEON_STAGING_API_KEY }}
+
+    - name: Delete Neon Branch for large tenant
+      if: ${{ always() && matrix.target_project == 'large_existing_project' }}
+      uses: ./.github/actions/neon-branch-delete
+      with:
+        project_id: ${{ vars.BENCHMARK_INGEST_TARGET_PROJECTID }}
+        branch_id: ${{ steps.create-neon-branch-ingest-target.outputs.branch_id }}
+        api_key: ${{ secrets.NEON_STAGING_API_KEY }}
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -201,6 +201,8 @@ jobs:
          REPORT_URL: ${{ steps.upload-stats.outputs.report-url }}
          SHA: ${{ github.event.pull_request.head.sha || github.sha }}
        with:
+          # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
+          retries: 5
          script: |
            const { REPORT_URL, SHA } = process.env

--- a/.github/workflows/pre-merge-checks.yml
+++ b/.github/workflows/pre-merge-checks.yml
@@ -0,0 +1,94 @@
+name: Pre-merge checks
+
+on:
+  merge_group:
+    branches:
+      - main
+
+defaults:
+  run:
+    shell: bash -euxo pipefail {0}
+
+# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
+permissions: {}
+
+jobs:
+  get-changed-files:
+    runs-on: ubuntu-22.04
+    outputs:
+      python-changed: ${{ steps.python-src.outputs.any_changed }}
+    steps:
+      - uses: actions/checkout@v4
+      - uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf # v45.0.4
+        id: python-src
+        with:
+          files: |
+            .github/workflows/pre-merge-checks.yml
+            **/**.py
+            poetry.lock
+            pyproject.toml
+
+      - name: PRINT ALL CHANGED FILES FOR DEBUG PURPOSES
+        env:
+          PYTHON_CHANGED_FILES: ${{ steps.python-src.outputs.all_changed_files }}
+        run: |
+          echo "${PYTHON_CHANGED_FILES}"
+
+  check-build-tools-image:
+    if: needs.get-changed-files.outputs.python-changed == 'true'
+    needs: [ get-changed-files ]
+    uses: ./.github/workflows/check-build-tools-image.yml
+
+  build-build-tools-image:
+    needs: [ check-build-tools-image ]
+    uses: ./.github/workflows/build-build-tools-image.yml
+    with:
+      image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }}
+    secrets: inherit
+
+  check-codestyle-python:
+    if: needs.get-changed-files.outputs.python-changed == 'true'
+    needs: [ get-changed-files, build-build-tools-image ]
+    uses: ./.github/workflows/_check-codestyle-python.yml
+    with:
+      build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
+    secrets: inherit
+
+  # To get items from the merge queue merged into main we need to satisfy "Status checks that are required".
+  # Currently we require 2 jobs (checks with exact name):
+  # - conclusion
+  # - neon-cloud-e2e
+  conclusion:
+    if: always()
+    permissions:
+      statuses: write # for `github.repos.createCommitStatus(...)`
+    needs:
+      - get-changed-files
+      - check-codestyle-python
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Create fake `neon-cloud-e2e` check
+        uses: actions/github-script@v7
+        with:
+          # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
+          retries: 5
+          script: |
+            const { repo, owner } = context.repo;
+            const targetUrl = `${context.serverUrl}/${owner}/${repo}/actions/runs/${context.runId}`;
+
+            await github.rest.repos.createCommitStatus({
+              owner: owner,
+              repo: repo,
+              sha: context.sha,
+              context: `neon-cloud-e2e`,
+              state: `success`,
+              target_url: targetUrl,
+              description: `fake check for merge queue`,
+            });
+
+      - name: Fail the job if any of the dependencies do not succeed or skipped
+        run: exit 1
+        if: |
+          (contains(needs.check-codestyle-python.result, 'skipped') && needs.get-changed-files.outputs.python-changed == 'true')
+          || contains(needs.*.result, 'failure')
+          || contains(needs.*.result, 'cancelled')
--- a/.github/workflows/report-workflow-stats-batch.yml
+++ b/.github/workflows/report-workflow-stats-batch.yml
@@ -0,0 +1,29 @@
+name: Report Workflow Stats Batch
+
+on:
+  schedule:
+    - cron: '*/15 * * * *'
+    - cron: '25 0 * * *'
+
+jobs:
+  gh-workflow-stats-batch:
+    name: GitHub Workflow Stats Batch
+    runs-on: ubuntu-22.04
+    permissions:
+      actions: read
+    steps:
+    - name: Export Workflow Run for the past 2 hours
+      uses: neondatabase/gh-workflow-stats-action@v0.2.1
+      with:
+        db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }}
+        db_table: "gh_workflow_stats_batch_neon"
+        gh_token: ${{ secrets.GITHUB_TOKEN }}
+        duration: '2h'
+    - name: Export Workflow Run for the past 24 hours
+      if: github.event.schedule == '25 0 * * *'
+      uses: neondatabase/gh-workflow-stats-action@v0.2.1
+      with:
+        db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }}
+        db_table: "gh_workflow_stats_batch_neon"
+        gh_token: ${{ secrets.GITHUB_TOKEN }}
+        duration: '24h'
--- a/.github/workflows/report-workflow-stats.yml
+++ b/.github/workflows/report-workflow-stats.yml
@@ -23,6 +23,7 @@ on:
    - Test Postgres client libraries
    - Trigger E2E Tests
    - cleanup caches by a branch
+    - Pre-merge checks
    types: [completed]

 jobs:
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -34,7 +34,7 @@ dependencies = [
 "getrandom 0.2.11",
 "once_cell",
 "version_check",
- "zerocopy 0.7.31",
+ "zerocopy",
 ]

 [[package]]
@@ -310,33 +310,6 @@ dependencies = [
 "zeroize",
 ]

-[[package]]
-name = "aws-lc-rs"
-version = "1.9.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2f95446d919226d587817a7d21379e6eb099b97b45110a7f272a444ca5c54070"
-dependencies = [
- "aws-lc-sys",
- "mirai-annotations",
- "paste",
- "zeroize",
-]
-
-[[package]]
-name = "aws-lc-sys"
-version = "0.21.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b3ddc4a5b231dd6958b140ff3151b6412b3f4321fab354f399eec8f14b06df62"
-dependencies = [
- "bindgen 0.69.5",
- "cc",
- "cmake",
- "dunce",
- "fs_extra",
- "libc",
- "paste",
-]
-
 [[package]]
 name = "aws-runtime"
 version = "1.4.3"
@@ -942,29 +915,6 @@ dependencies = [
 "serde",
 ]

-[[package]]
-name = "bindgen"
-version = "0.69.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088"
-dependencies = [
- "bitflags 2.4.1",
- "cexpr",
- "clang-sys",
- "itertools 0.12.1",
- "lazy_static",
- "lazycell",
- "log",
- "prettyplease",
- "proc-macro2",
- "quote",
- "regex",
- "rustc-hash",
- "shlex",
- "syn 2.0.52",
- "which",
-]
-
 [[package]]
 name = "bindgen"
 version = "0.70.1"
@@ -1220,15 +1170,6 @@ version = "0.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b"

-[[package]]
-name = "cmake"
-version = "0.1.51"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fb1e43aa7fd152b1f968787f7dbcdeb306d1867ff373c69955211876c053f91a"
-dependencies = [
- "cc",
-]
-
 [[package]]
 name = "colorchoice"
 version = "1.0.0"
@@ -1288,12 +1229,15 @@ dependencies = [
 "flate2",
 "futures",
 "hyper 0.14.30",
+ "metrics",
 "nix 0.27.1",
 "notify",
 "num_cpus",
+ "once_cell",
 "opentelemetry",
 "opentelemetry_sdk",
 "postgres",
+ "prometheus",
 "regex",
 "remote_storage",
 "reqwest 0.12.4",
@@ -1329,9 +1273,9 @@ dependencies = [

 [[package]]
 name = "const-oid"
-version = "0.9.5"
+version = "0.9.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "28c122c3980598d243d63d9a704629a2d748d101f278052ff068be5a4423ab6f"
+checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8"

 [[package]]
 name = "const-random"
@@ -1815,12 +1759,6 @@ dependencies = [
 "syn 2.0.52",
 ]

-[[package]]
-name = "dunce"
-version = "1.0.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813"
-
 [[package]]
 name = "dyn-clone"
 version = "1.0.14"
@@ -2125,12 +2063,6 @@ dependencies = [
 "tokio-util",
 ]

-[[package]]
-name = "fs_extra"
-version = "1.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
-
 [[package]]
 name = "fsevent-sys"
 version = "4.1.0"
@@ -2484,15 +2416,6 @@ dependencies = [
 "digest",
 ]

-[[package]]
-name = "home"
-version = "0.5.9"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5"
-dependencies = [
- "windows-sys 0.52.0",
-]
-
 [[package]]
 name = "hostname"
 version = "0.4.0"
@@ -2988,12 +2911,6 @@ dependencies = [
 "spin",
 ]

-[[package]]
-name = "lazycell"
-version = "1.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
-
 [[package]]
 name = "libc"
 version = "0.2.150"
@@ -3224,12 +3141,6 @@ dependencies = [
 "windows-sys 0.48.0",
 ]

-[[package]]
-name = "mirai-annotations"
-version = "1.12.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c9be0862c1b3f26a88803c4a49de6889c10e608b3ee9344e6ef5b45fb37ad3d1"
-
 [[package]]
 name = "multimap"
 version = "0.8.3"
@@ -4098,7 +4009,7 @@ dependencies = [
 [[package]]
 name = "postgres"
 version = "0.19.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#a130197713830a0ea0004b539b1f51a66b4c3e18"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -4111,7 +4022,7 @@ dependencies = [
 [[package]]
 name = "postgres-protocol"
 version = "0.6.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#a130197713830a0ea0004b539b1f51a66b4c3e18"
 dependencies = [
 "base64 0.20.0",
 "byteorder",
@@ -4130,7 +4041,7 @@ dependencies = [
 [[package]]
 name = "postgres-types"
 version = "0.2.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#a130197713830a0ea0004b539b1f51a66b4c3e18"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -4147,7 +4058,7 @@ dependencies = [
 "bytes",
 "once_cell",
 "pq_proto",
- "rustls 0.23.7",
+ "rustls 0.23.16",
 "rustls-pemfile 2.1.1",
 "serde",
 "thiserror",
@@ -4176,7 +4087,7 @@ name = "postgres_ffi"
 version = "0.1.0"
 dependencies = [
 "anyhow",
- "bindgen 0.70.1",
+ "bindgen",
 "bytes",
 "crc32c",
 "env_logger",
@@ -4422,7 +4333,7 @@ dependencies = [
 "rsa",
 "rstest",
 "rustc-hash",
- "rustls 0.23.7",
+ "rustls 0.23.16",
 "rustls-native-certs 0.8.0",
 "rustls-pemfile 2.1.1",
 "scopeguard",
@@ -4457,7 +4368,7 @@ dependencies = [
 "walkdir",
 "workspace_hack",
 "x509-parser",
- "zerocopy 0.8.8",
+ "zerocopy",
 ]

 [[package]]
@@ -4835,6 +4746,7 @@ dependencies = [
 "percent-encoding",
 "pin-project-lite",
 "rustls 0.22.4",
+ "rustls-native-certs 0.7.0",
 "rustls-pemfile 2.1.1",
 "rustls-pki-types",
 "serde",
@@ -5109,23 +5021,22 @@ dependencies = [
 "log",
 "ring",
 "rustls-pki-types",
- "rustls-webpki 0.102.2",
+ "rustls-webpki 0.102.8",
 "subtle",
 "zeroize",
 ]

 [[package]]
 name = "rustls"
-version = "0.23.7"
+version = "0.23.16"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ebbbdb961df0ad3f2652da8f3fdc4b36122f568f968f45ad3316f26c025c677b"
+checksum = "eee87ff5d9b36712a58574e12e9f0ea80f915a5b0ac518d322b24a465617925e"
 dependencies = [
- "aws-lc-rs",
 "log",
 "once_cell",
 "ring",
 "rustls-pki-types",
- "rustls-webpki 0.102.2",
+ "rustls-webpki 0.102.8",
 "subtle",
 "zeroize",
 ]
@@ -5205,11 +5116,10 @@ dependencies = [

 [[package]]
 name = "rustls-webpki"
-version = "0.102.2"
+version = "0.102.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "faaa0a62740bedb9b2ef5afa303da42764c012f743917351dc9a237ea1663610"
+checksum = "64ca1bc8749bd4cf37b5ce386cc146580777b4e8572c7b97baf22c83f444bee9"
 dependencies = [
- "aws-lc-rs",
 "ring",
 "rustls-pki-types",
 "untrusted",
@@ -5240,6 +5150,7 @@ dependencies = [
 "chrono",
 "clap",
 "crc32c",
+ "criterion",
 "desim",
 "fail",
 "futures",
@@ -5247,6 +5158,7 @@ dependencies = [
 "http 1.1.0",
 "humantime",
 "hyper 0.14.30",
+ "itertools 0.10.5",
 "metrics",
 "once_cell",
 "parking_lot 0.12.1",
@@ -5751,9 +5663,9 @@ dependencies = [

 [[package]]
 name = "smallvec"
-version = "1.13.1"
+version = "1.13.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7"
+checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67"

 [[package]]
 name = "smol_str"
@@ -5826,6 +5738,7 @@ dependencies = [
 "once_cell",
 "parking_lot 0.12.1",
 "prost",
+ "rustls 0.23.16",
 "tokio",
 "tonic",
 "tonic-build",
@@ -5908,7 +5821,7 @@ dependencies = [
 "postgres_ffi",
 "remote_storage",
 "reqwest 0.12.4",
- "rustls 0.23.7",
+ "rustls 0.23.16",
 "rustls-native-certs 0.8.0",
 "serde",
 "serde_json",
@@ -6161,9 +6074,9 @@ dependencies = [

 [[package]]
 name = "tikv-jemalloc-ctl"
-version = "0.5.4"
+version = "0.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "619bfed27d807b54f7f776b9430d4f8060e66ee138a28632ca898584d462c31c"
+checksum = "f21f216790c8df74ce3ab25b534e0718da5a1916719771d3fec23315c99e468b"
 dependencies = [
 "libc",
 "paste",
@@ -6172,9 +6085,9 @@ dependencies = [

 [[package]]
 name = "tikv-jemalloc-sys"
-version = "0.5.4+5.3.0-patched"
+version = "0.6.0+5.3.0-1-ge13ca993e8ccb9ba9847cc330696e02839f328f7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9402443cb8fd499b6f327e40565234ff34dbda27460c5b47db0db77443dd85d1"
+checksum = "cd3c60906412afa9c2b5b5a48ca6a5abe5736aec9eb48ad05037a677e52e4e2d"
 dependencies = [
 "cc",
 "libc",
@@ -6182,9 +6095,9 @@ dependencies = [

 [[package]]
 name = "tikv-jemallocator"
-version = "0.5.4"
+version = "0.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "965fe0c26be5c56c94e38ba547249074803efd52adfb66de62107d95aab3eaca"
+checksum = "4cec5ff18518d81584f477e9bfdf957f5bb0979b0bac3af4ca30b5b3ae2d2865"
 dependencies = [
 "libc",
 "tikv-jemalloc-sys",
@@ -6314,7 +6227,7 @@ dependencies = [
 [[package]]
 name = "tokio-postgres"
 version = "0.7.7"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#a130197713830a0ea0004b539b1f51a66b4c3e18"
 dependencies = [
 "async-trait",
 "byteorder",
@@ -6341,7 +6254,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "04fb792ccd6bbcd4bba408eb8a292f70fc4a3589e5d793626f45190e6454b6ab"
 dependencies = [
 "ring",
- "rustls 0.23.7",
+ "rustls 0.23.16",
 "tokio",
 "tokio-postgres",
 "tokio-rustls 0.26.0",
@@ -6375,7 +6288,7 @@ version = "0.26.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4"
 dependencies = [
- "rustls 0.23.7",
+ "rustls 0.23.16",
 "rustls-pki-types",
 "tokio",
 ]
@@ -6784,7 +6697,7 @@ dependencies = [
 "base64 0.22.1",
 "log",
 "once_cell",
- "rustls 0.23.7",
+ "rustls 0.23.16",
 "rustls-pki-types",
 "url",
 "webpki-roots 0.26.1",
@@ -6988,7 +6901,7 @@ name = "walproposer"
 version = "0.1.0"
 dependencies = [
 "anyhow",
- "bindgen 0.70.1",
+ "bindgen",
 "postgres_ffi",
 "utils",
 ]
@@ -7163,18 +7076,6 @@ dependencies = [
 "rustls-pki-types",
 ]

-[[package]]
-name = "which"
-version = "4.4.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7"
-dependencies = [
- "either",
- "home",
- "once_cell",
- "rustix",
-]
-
 [[package]]
 name = "whoami"
 version = "1.5.1"
@@ -7434,7 +7335,7 @@ dependencies = [
 "hyper-util",
 "indexmap 1.9.3",
 "indexmap 2.0.1",
- "itertools 0.10.5",
+ "itertools 0.12.1",
 "lazy_static",
 "libc",
 "log",
@@ -7455,8 +7356,7 @@ dependencies = [
 "regex-automata 0.4.3",
 "regex-syntax 0.8.2",
 "reqwest 0.12.4",
- "rustls 0.23.7",
- "rustls-webpki 0.102.2",
+ "rustls 0.23.16",
 "scopeguard",
 "serde",
 "serde_json",
@@ -7481,6 +7381,7 @@ dependencies = [
 "tracing",
 "tracing-core",
 "url",
+ "zerocopy",
 "zeroize",
 "zstd",
 "zstd-safe",
@@ -7553,16 +7454,8 @@ version = "0.7.31"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1c4061bedbb353041c12f413700357bec76df2c7e2ca8e4df8bac24c6bf68e3d"
 dependencies = [
- "zerocopy-derive 0.7.31",
-]
-
-[[package]]
-name = "zerocopy"
-version = "0.8.8"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5a4e33e6dce36f2adba29746927f8e848ba70989fdb61c772773bbdda8b5d6a7"
-dependencies = [
- "zerocopy-derive 0.8.8",
+ "byteorder",
+ "zerocopy-derive",
 ]

 [[package]]
@@ -7576,17 +7469,6 @@ dependencies = [
 "syn 2.0.52",
 ]

-[[package]]
-name = "zerocopy-derive"
-version = "0.8.8"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3cd137b4cc21bde6ecce3bbbb3350130872cda0be2c6888874279ea76e17d4c1"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn 2.0.52",
-]
-
 [[package]]
 name = "zeroize"
 version = "1.7.0"
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -143,7 +143,7 @@ reqwest-retry = "0.5"
 routerify = "3"
 rpds = "0.13"
 rustc-hash = "1.1.0"
-rustls = "0.23"
+rustls = { version = "0.23.16", default-features = false }
 rustls-pemfile = "2"
 scopeguard = "1.1"
 sysinfo = "0.29.2"
@@ -168,13 +168,13 @@ sync_wrapper = "0.1.2"
 tar = "0.4"
 test-context = "0.3"
 thiserror = "1.0"
-tikv-jemallocator = "0.5"
-tikv-jemalloc-ctl = "0.5"
+tikv-jemallocator = { version = "0.6", features = ["stats"] }
+tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] }
 tokio = { version = "1.17", features = ["macros"] }
 tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
 tokio-io-timeout = "1.2.0"
 tokio-postgres-rustls = "0.12.0"
-tokio-rustls = "0.26"
+tokio-rustls = { version = "0.26.0", default-features = false, features = ["tls12", "ring"]}
 tokio-stream = "0.1"
 tokio-tar = "0.3"
 tokio-util = { version = "0.7.10", features = ["io", "rt"] }
@@ -196,27 +196,17 @@ walkdir = "2.3.2"
 rustls-native-certs = "0.8"
 x509-parser = "0.16"
 whoami = "1.5.1"
+zerocopy = { version = "0.7", features = ["derive"] }

 ## TODO replace this with tracing
 env_logger = "0.10"
 log = "0.4"

 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
-
-# We want to use the 'neon' branch for these, but there's currently one
-# incompatible change on the branch. See:
-#
-# - PR #8076 which contained changes that depended on the new changes in
-#   the rust-postgres crate, and
-# - PR #8654 which reverted those changes and made the code in proxy incompatible
-#   with the tip of the 'neon' branch again.
-#
-# When those proxy changes are re-applied (see PR #8747), we can switch using
-# the tip of the 'neon' branch again.
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }

 ## Local libraries
 compute_api = { version = "0.1", path = "./libs/compute_api/" }
@@ -254,7 +244,7 @@ tonic-build = "0.12"
 [patch.crates-io]

 # Needed to get `tokio-postgres-rustls` to depend on our fork.
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }

 ################# Binary contents sections

--- a/build-tools.Dockerfile
+++ b/build-tools.Dockerfile
@@ -1,12 +1,66 @@
 ARG DEBIAN_VERSION=bullseye

-FROM debian:${DEBIAN_VERSION}-slim
+FROM debian:bookworm-slim AS pgcopydb_builder
+ARG DEBIAN_VERSION
+
+RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \
+        set -e && \
+        apt-get update && \
+        apt-get install -y --no-install-recommends \
+        ca-certificates wget gpg && \
+        wget -qO - https://www.postgresql.org/media/keys/ACCC4CF8.asc | gpg --dearmor -o /usr/share/keyrings/postgresql-keyring.gpg && \
+        echo "deb [signed-by=/usr/share/keyrings/postgresql-keyring.gpg] http://apt.postgresql.org/pub/repos/apt bookworm-pgdg main" > /etc/apt/sources.list.d/pgdg.list && \
+        apt-get update && \
+        apt-get install -y --no-install-recommends \
+        build-essential \
+        autotools-dev \
+        libedit-dev \
+        libgc-dev \
+        libpam0g-dev \
+        libreadline-dev \
+        libselinux1-dev \
+        libxslt1-dev \
+        libssl-dev \
+        libkrb5-dev \
+        zlib1g-dev \
+        liblz4-dev \
+        libpq5 \
+        libpq-dev \
+        libzstd-dev \
+        postgresql-16 \
+        postgresql-server-dev-16 \
+        postgresql-common  \
+        python3-sphinx && \
+        wget -O /tmp/pgcopydb.tar.gz https://github.com/dimitri/pgcopydb/archive/refs/tags/v0.17.tar.gz && \
+        mkdir /tmp/pgcopydb && \
+        tar -xzf /tmp/pgcopydb.tar.gz -C /tmp/pgcopydb --strip-components=1 && \
+        cd /tmp/pgcopydb && \
+        make -s clean && \
+        make -s -j12 install && \
+        libpq_path=$(find /lib /usr/lib -name "libpq.so.5" | head -n 1) && \
+        mkdir -p /pgcopydb/lib && \
+        cp "$libpq_path" /pgcopydb/lib/; \
+    else \
+        # The COPY --from=pgcopydb_builder instructions in the next stage fail if these paths are missing, so create empty placeholder files for other Debian versions
+        mkdir -p /usr/lib/postgresql/16/bin && touch /usr/lib/postgresql/16/bin/pgcopydb && \
+        mkdir -p /pgcopydb/lib && touch /pgcopydb/lib/libpq.so.5; \
+    fi
+
+FROM debian:${DEBIAN_VERSION}-slim AS build_tools
 ARG DEBIAN_VERSION

 # Add nonroot user
 RUN useradd -ms /bin/bash nonroot -b /home
 SHELL ["/bin/bash", "-c"]

+RUN mkdir -p /pgcopydb/bin && \
+    mkdir -p /pgcopydb/lib && \
+    chmod -R 755 /pgcopydb && \
+    chown -R nonroot:nonroot /pgcopydb
+        
+COPY --from=pgcopydb_builder /usr/lib/postgresql/16/bin/pgcopydb /pgcopydb/bin/pgcopydb 
+COPY --from=pgcopydb_builder /pgcopydb/lib/libpq.so.5 /pgcopydb/lib/libpq.so.5 
+
 # System deps
 #
 # 'gdb' is included so that we get backtraces of core dumps produced in
@@ -38,7 +92,7 @@ RUN set -e \
        libseccomp-dev \
        libsqlite3-dev \
        libssl-dev \
-        $([[ "${DEBIAN_VERSION}" = "bullseye" ]] && libstdc++-10-dev || libstdc++-11-dev) \
+        $([[ "${DEBIAN_VERSION}" = "bullseye" ]] && echo libstdc++-10-dev || echo libstdc++-11-dev) \
        libtool \
        libxml2-dev \
        libxmlsec1-dev \
@@ -235,7 +289,13 @@ RUN whoami \
    && cargo --version --verbose \
    && rustup --version --verbose \
    && rustc --version --verbose \
-    && clang --version
+    && clang --version 
+
+RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \
+    LD_LIBRARY_PATH=/pgcopydb/lib /pgcopydb/bin/pgcopydb --version; \
+else \
+    echo "pgcopydb is not available for ${DEBIAN_VERSION}"; \
+fi

 # Set following flag to check in Makefile if its running in Docker
 RUN touch /home/nonroot/.docker_build
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -559,8 +559,8 @@ RUN case "${PG_VERSION}" in \
        export TIMESCALEDB_CHECKSUM=584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d \
        ;; \
      "v17") \
-        export TIMESCALEDB_VERSION=2.17.0 \
-        export TIMESCALEDB_CHECKSUM=155bf64391d3558c42f31ca0e523cfc6252921974f75298c9039ccad1c89811a \
+        export TIMESCALEDB_VERSION=2.17.1 \
+        export TIMESCALEDB_CHECKSUM=6277cf43f5695e23dae1c5cfeba00474d730b66ed53665a84b787a6bb1a57e28 \
        ;; \
    esac && \
    wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \
@@ -624,16 +624,12 @@ FROM build-deps AS pg-cron-pg-build
 ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-# 1.6.4 available, supports v17
 # This is an experimental extension that we do not support on prod yet.
 # !Do not remove!
 # We set it in shared_preload_libraries and computes will fail to start if library is not found.
 ENV PATH="/usr/local/pgsql/bin/:$PATH"
-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \
-    echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.4.tar.gz -O pg_cron.tar.gz && \
+    echo "52d1850ee7beb85a4cb7185731ef4e5a90d1de216709d8988324b0d02e76af61 pg_cron.tar.gz" | sha256sum --check && \
    mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -1151,8 +1147,8 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 # The topmost commit in the `neon` branch at the time of writing this
 # https://github.com/Mooncake-Labs/pg_mooncake/commits/neon/
-# https://github.com/Mooncake-Labs/pg_mooncake/commit/568b5a82b5fc16136bdf4ca5aac3e0cc261ab48d
-ENV PG_MOONCAKE_VERSION=568b5a82b5fc16136bdf4ca5aac3e0cc261ab48d
+# https://github.com/Mooncake-Labs/pg_mooncake/commit/077c92c452bb6896a7b7776ee95f039984f076af
+ENV PG_MOONCAKE_VERSION=077c92c452bb6896a7b7776ee95f039984f076af
 ENV PATH="/usr/local/pgsql/bin/:$PATH"

 RUN case "${PG_VERSION}" in \
@@ -1475,6 +1471,8 @@ RUN mkdir -p /etc/local_proxy && chown postgres:postgres /etc/local_proxy
 COPY --from=postgres-exporter /bin/postgres_exporter /bin/postgres_exporter
 COPY --from=sql-exporter      /bin/sql_exporter      /bin/sql_exporter

+COPY --chown=postgres compute/etc/postgres_exporter.yml /etc/postgres_exporter.yml
+
 COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter.yml               /etc/sql_exporter.yml
 COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector.yml             /etc/neon_collector.yml
 COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter_autoscaling.yml   /etc/sql_exporter_autoscaling.yml
--- a/compute/etc/neon_collector.jsonnet
+++ b/compute/etc/neon_collector.jsonnet
@@ -3,9 +3,10 @@
  metrics: [
    import 'sql_exporter/checkpoints_req.libsonnet',
    import 'sql_exporter/checkpoints_timed.libsonnet',
-    import 'sql_exporter/compute_backpressure_throttling_ms.libsonnet',
+    import 'sql_exporter/compute_backpressure_throttling_seconds.libsonnet',
    import 'sql_exporter/compute_current_lsn.libsonnet',
    import 'sql_exporter/compute_logical_snapshot_files.libsonnet',
+    import 'sql_exporter/compute_max_connections.libsonnet',
    import 'sql_exporter/compute_receive_lsn.libsonnet',
    import 'sql_exporter/compute_subscriptions_count.libsonnet',
    import 'sql_exporter/connection_counts.libsonnet',
--- a/compute/etc/postgres_exporter.yml
+++ b/compute/etc/postgres_exporter.yml
--- a/compute/etc/sql_exporter/compute_backpressure_throttling_ms.sql
+++ b/compute/etc/sql_exporter/compute_backpressure_throttling_ms.sql
@@ -1 +0,0 @@
-SELECT neon.backpressure_throttling_time() AS throttled;
--- a/compute/etc/sql_exporter/compute_backpressure_throttling_seconds.libsonnet
+++ b/compute/etc/sql_exporter/compute_backpressure_throttling_seconds.libsonnet
@@ -1,10 +1,10 @@
 {
-  metric_name: 'compute_backpressure_throttling_ms',
+  metric_name: 'compute_backpressure_throttling_seconds',
  type: 'gauge',
  help: 'Time compute has spent throttled',
  key_labels: null,
  values: [
    'throttled',
  ],
-  query: importstr 'sql_exporter/compute_backpressure_throttling_ms.sql',
+  query: importstr 'sql_exporter/compute_backpressure_throttling_seconds.sql',
 }
--- a/compute/etc/sql_exporter/compute_backpressure_throttling_seconds.sql
+++ b/compute/etc/sql_exporter/compute_backpressure_throttling_seconds.sql
@@ -0,0 +1 @@
+SELECT (neon.backpressure_throttling_time()::float8 / 1000000) AS throttled;
--- a/compute/etc/sql_exporter/compute_max_connections.libsonnet
+++ b/compute/etc/sql_exporter/compute_max_connections.libsonnet
@@ -0,0 +1,10 @@
+{
+  metric_name: 'compute_max_connections',
+  type: 'gauge',
+  help: 'Max connections allowed for Postgres',
+  key_labels: null,
+  values: [
+    'max_connections',
+  ],
+  query: importstr 'sql_exporter/compute_max_connections.sql',
+}
--- a/compute/etc/sql_exporter/compute_max_connections.sql
+++ b/compute/etc/sql_exporter/compute_max_connections.sql
@@ -0,0 +1 @@
+SELECT current_setting('max_connections') as max_connections;
--- a/compute/patches/cloud_regress_pg16.patch
+++ b/compute/patches/cloud_regress_pg16.patch
@@ -147,7 +147,7 @@ index 542c2e098c..0062d3024f 100644
 ALTER TABLE ptnowner1 OWNER TO regress_ptnowner;
 ALTER TABLE ptnowner OWNER TO regress_ptnowner;
 diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out
-index 97bbe53b64..eac3d42a79 100644
+index 3f9a8f539c..0a51b52940 100644
 --- a/src/test/regress/expected/collate.icu.utf8.out
 +++ b/src/test/regress/expected/collate.icu.utf8.out
@@ -1016,7 +1016,7 @@ select * from collate_test1 where b ilike 'ABC';
@@ -309,7 +309,7 @@ index b48365ec98..a6ef910055 100644
 -- the wrong partition. This test is *not* guaranteed to trigger that bug, but
 -- does so when shared_buffers is small enough.  To test if we encountered the
 diff --git a/src/test/regress/expected/copy2.out b/src/test/regress/expected/copy2.out
-index faf1a4d1b0..a44c97db52 100644
+index 9a74820ee8..22400a5551 100644
 --- a/src/test/regress/expected/copy2.out
 +++ b/src/test/regress/expected/copy2.out
@@ -553,8 +553,8 @@ select * from check_con_tbl;
@@ -573,7 +573,7 @@ index 93302a07ef..1a73f083ac 100644
 -- that does not match with what's expected.
 -- This checks all the object types that include schema qualifications.
 diff --git a/src/test/regress/expected/create_view.out b/src/test/regress/expected/create_view.out
-index f3f8c7b5a2..3e3e54ff4c 100644
+index f551624afb..57f1e432d4 100644
 --- a/src/test/regress/expected/create_view.out
 +++ b/src/test/regress/expected/create_view.out
@@ -18,7 +18,8 @@ CREATE TABLE real_city (
@@ -700,12 +700,12 @@ index 6ed50fdcfa..caa00a345d 100644
 COMMENT ON FOREIGN DATA WRAPPER dummy IS 'useless';
 CREATE FOREIGN DATA WRAPPER postgresql VALIDATOR postgresql_fdw_validator;
 diff --git a/src/test/regress/expected/foreign_key.out b/src/test/regress/expected/foreign_key.out
-index 12e523c737..8872e23935 100644
+index 6b8c2f2414..8e13b7fa46 100644
 --- a/src/test/regress/expected/foreign_key.out
 +++ b/src/test/regress/expected/foreign_key.out
-@@ -1968,7 +1968,7 @@ ALTER TABLE fk_partitioned_fk ATTACH PARTITION fk_partitioned_fk_2
-   FOR VALUES IN (1600);
- -- leave these tables around intentionally
+@@ -1985,7 +1985,7 @@ ALTER TABLE fk_partitioned_fk_6 ATTACH PARTITION fk_partitioned_pk_6 FOR VALUES
+ ERROR:  cannot ALTER TABLE "fk_partitioned_pk_61" because it is being used by active queries in this session
+ DROP TABLE fk_partitioned_pk_6, fk_partitioned_fk_6;
 -- test the case when the referenced table is owned by a different user
 -create role regress_other_partitioned_fk_owner;
 +create role regress_other_partitioned_fk_owner PASSWORD NEON_PASSWORD_PLACEHOLDER;
@@ -713,7 +713,7 @@ index 12e523c737..8872e23935 100644
 set role regress_other_partitioned_fk_owner;
 create table other_partitioned_fk(a int, b int) partition by list (a);
 diff --git a/src/test/regress/expected/generated.out b/src/test/regress/expected/generated.out
-index 0f623f7119..b48588a54e 100644
+index 5881420388..4ae21aa43c 100644
 --- a/src/test/regress/expected/generated.out
 +++ b/src/test/regress/expected/generated.out
@@ -534,7 +534,7 @@ CREATE TABLE gtest10a (a int PRIMARY KEY, b int GENERATED ALWAYS AS (a * 2) STOR
@@ -762,7 +762,7 @@ index a2036a1597..805d73b9d2 100644
 -- fields, leading to long bucket chains and lots of table expansion.
 -- this is therefore a stress test of the bucket overflow code (unlike
 diff --git a/src/test/regress/expected/identity.out b/src/test/regress/expected/identity.out
-index cc7772349f..98a08eb48d 100644
+index 1b74958de9..078187b542 100644
 --- a/src/test/regress/expected/identity.out
 +++ b/src/test/regress/expected/identity.out
@@ -520,7 +520,7 @@ ALTER TABLE itest7 ALTER COLUMN a SET GENERATED BY DEFAULT;
@@ -775,10 +775,10 @@ index cc7772349f..98a08eb48d 100644
 GRANT SELECT, INSERT ON itest8 TO regress_identity_user1;
 SET ROLE regress_identity_user1;
 diff --git a/src/test/regress/expected/inherit.out b/src/test/regress/expected/inherit.out
-index 4943429e9b..0257f22b15 100644
+index 8f831c95c3..ec681b52af 100644
 --- a/src/test/regress/expected/inherit.out
 +++ b/src/test/regress/expected/inherit.out
-@@ -2606,7 +2606,7 @@ create index on permtest_parent (left(c, 3));
+@@ -2636,7 +2636,7 @@ create index on permtest_parent (left(c, 3));
 insert into permtest_parent
   select 1, 'a', left(fipshash(i::text), 5) from generate_series(0, 100) i;
 analyze permtest_parent;
@@ -1133,7 +1133,7 @@ index 8475231735..1afae5395f 100644
 SELECT rolname, rolpassword
     FROM pg_authid
 diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out
-index fbb0489a4f..2905194e2c 100644
+index 5b9dba7b32..cc408dad42 100644
 --- a/src/test/regress/expected/privileges.out
 +++ b/src/test/regress/expected/privileges.out
@@ -20,19 +20,19 @@ SELECT lo_unlink(oid) FROM pg_largeobject_metadata WHERE oid >= 1000 AND oid < 3
@@ -1185,7 +1185,7 @@ index fbb0489a4f..2905194e2c 100644
 GRANT pg_read_all_data TO regress_priv_user6;
 GRANT pg_write_all_data TO regress_priv_user7;
 GRANT pg_read_all_settings TO regress_priv_user8 WITH ADMIN OPTION;
-@@ -145,8 +145,8 @@ REVOKE pg_read_all_settings FROM regress_priv_user8;
+@@ -212,8 +212,8 @@ REVOKE pg_read_all_settings FROM regress_priv_user8;
 DROP USER regress_priv_user10;
 DROP USER regress_priv_user9;
 DROP USER regress_priv_user8;
@@ -1196,7 +1196,7 @@ index fbb0489a4f..2905194e2c 100644
 ALTER GROUP regress_priv_group1 ADD USER regress_priv_user4;
 GRANT regress_priv_group2 TO regress_priv_user2 GRANTED BY regress_priv_user1;
 SET SESSION AUTHORIZATION regress_priv_user1;
-@@ -172,12 +172,16 @@ GRANT regress_priv_role TO regress_priv_user1 WITH ADMIN OPTION GRANTED BY regre
+@@ -239,12 +239,16 @@ GRANT regress_priv_role TO regress_priv_user1 WITH ADMIN OPTION GRANTED BY regre
 ERROR:  permission denied to grant privileges as role "regress_priv_role"
 DETAIL:  The grantor must have the ADMIN option on role "regress_priv_role".
 GRANT regress_priv_role TO regress_priv_user1 WITH ADMIN OPTION GRANTED BY CURRENT_ROLE;
@@ -1213,7 +1213,7 @@ index fbb0489a4f..2905194e2c 100644
 DROP ROLE regress_priv_role;
 SET SESSION AUTHORIZATION regress_priv_user1;
 SELECT session_user, current_user;
-@@ -1709,7 +1713,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP
+@@ -1776,7 +1780,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP
 
 -- security-restricted operations
 \c -
@@ -1222,7 +1222,7 @@ index fbb0489a4f..2905194e2c 100644
 -- Check that index expressions and predicates are run as the table's owner
 -- A dummy index function checking current_user
 CREATE FUNCTION sro_ifun(int) RETURNS int AS $$
-@@ -2601,8 +2605,8 @@ drop cascades to function testns.priv_testagg(integer)
+@@ -2668,8 +2672,8 @@ drop cascades to function testns.priv_testagg(integer)
 drop cascades to function testns.priv_testproc(integer)
 -- Change owner of the schema & and rename of new schema owner
 \c -
@@ -1233,7 +1233,7 @@ index fbb0489a4f..2905194e2c 100644
 SET SESSION ROLE regress_schemauser1;
 CREATE SCHEMA testns;
 SELECT nspname, rolname FROM pg_namespace, pg_roles WHERE pg_namespace.nspname = 'testns' AND pg_namespace.nspowner = pg_roles.oid;
-@@ -2725,7 +2729,7 @@ DROP USER regress_priv_user7;
+@@ -2792,7 +2796,7 @@ DROP USER regress_priv_user7;
 DROP USER regress_priv_user8; -- does not exist
 ERROR:  role "regress_priv_user8" does not exist
 -- permissions with LOCK TABLE
@@ -1242,7 +1242,7 @@ index fbb0489a4f..2905194e2c 100644
 CREATE TABLE lock_table (a int);
 -- LOCK TABLE and SELECT permission
 GRANT SELECT ON lock_table TO regress_locktable_user;
-@@ -2807,7 +2811,7 @@ DROP USER regress_locktable_user;
+@@ -2874,7 +2878,7 @@ DROP USER regress_locktable_user;
 -- pg_backend_memory_contexts.
 -- switch to superuser
 \c -
@@ -1251,7 +1251,7 @@ index fbb0489a4f..2905194e2c 100644
 SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no
  has_table_privilege 
 ---------------------
-@@ -2851,10 +2855,10 @@ RESET ROLE;
+@@ -2918,10 +2922,10 @@ RESET ROLE;
 -- clean up
 DROP ROLE regress_readallstats;
 -- test role grantor machinery
@@ -1266,7 +1266,7 @@ index fbb0489a4f..2905194e2c 100644
 GRANT regress_group TO regress_group_direct_manager WITH INHERIT FALSE, ADMIN TRUE;
 GRANT regress_group_direct_manager TO regress_group_indirect_manager;
 SET SESSION AUTHORIZATION regress_group_direct_manager;
-@@ -2883,9 +2887,9 @@ DROP ROLE regress_group_direct_manager;
+@@ -2950,9 +2954,9 @@ DROP ROLE regress_group_direct_manager;
 DROP ROLE regress_group_indirect_manager;
 DROP ROLE regress_group_member;
 -- test SET and INHERIT options with object ownership changes
@@ -1813,7 +1813,7 @@ index 5e6969b173..2c4d52237f 100644
 
 -- clean up roles
 diff --git a/src/test/regress/expected/rowsecurity.out b/src/test/regress/expected/rowsecurity.out
-index 97ca9bf72c..b2a7a6f710 100644
+index 218c0c2863..f7af0cfb12 100644
 --- a/src/test/regress/expected/rowsecurity.out
 +++ b/src/test/regress/expected/rowsecurity.out
@@ -14,13 +14,13 @@ DROP ROLE IF EXISTS regress_rls_group2;
@@ -1917,6 +1917,19 @@ index b79fe9a1c0..e29fab88ab 100644
 ALTER DEFAULT PRIVILEGES FOR ROLE regress_selinto_user
 	  REVOKE INSERT ON TABLES FROM regress_selinto_user;
 GRANT ALL ON SCHEMA selinto_schema TO public;
+diff --git a/src/test/regress/expected/select_parallel.out b/src/test/regress/expected/select_parallel.out
+index afc6ab08c2..dfcd891af3 100644
+--- a/src/test/regress/expected/select_parallel.out
++++ b/src/test/regress/expected/select_parallel.out
+@@ -1220,7 +1220,7 @@ SELECT 1 FROM tenk1_vw_sec
+ 
+ rollback;
+ -- test that function option SET ROLE works in parallel workers.
+-create role regress_parallel_worker;
++create role regress_parallel_worker PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ create function set_and_report_role() returns text as
+   $$ select current_setting('role') $$ language sql parallel safe
+   set role = regress_parallel_worker;
 diff --git a/src/test/regress/expected/select_views.out b/src/test/regress/expected/select_views.out
 index 1aeed8452b..7d9427d070 100644
 --- a/src/test/regress/expected/select_views.out
@@ -2369,7 +2382,7 @@ index 6cb9c926c0..5e689e4062 100644
 ALTER TABLE ptnowner1 OWNER TO regress_ptnowner;
 ALTER TABLE ptnowner OWNER TO regress_ptnowner;
 diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql
-index 3db9e25913..c66d5aa2c2 100644
+index 8aa902d5ab..24bb823b86 100644
 --- a/src/test/regress/sql/collate.icu.utf8.sql
 +++ b/src/test/regress/sql/collate.icu.utf8.sql
@@ -353,7 +353,7 @@ reset enable_seqscan;
@@ -2532,7 +2545,7 @@ index 43d2e906dd..6c993d70f0 100644
 -- An earlier bug (see commit b1ecb9b3fcf) could end up using a buffer from
 -- the wrong partition. This test is *not* guaranteed to trigger that bug, but
 diff --git a/src/test/regress/sql/copy2.sql b/src/test/regress/sql/copy2.sql
-index d759635068..d58e50dcc5 100644
+index cf3828c16e..cf3ca38175 100644
 --- a/src/test/regress/sql/copy2.sql
 +++ b/src/test/regress/sql/copy2.sql
@@ -365,8 +365,8 @@ copy check_con_tbl from stdin;
@@ -2774,7 +2787,7 @@ index 1b7064247a..be5b662ce1 100644
 -- Cases where schema creation fails as objects are qualified with a schema
 -- that does not match with what's expected.
 diff --git a/src/test/regress/sql/create_view.sql b/src/test/regress/sql/create_view.sql
-index 3a78be1b0c..617d2dc8d6 100644
+index ae6841308b..47bc792e30 100644
 --- a/src/test/regress/sql/create_view.sql
 +++ b/src/test/regress/sql/create_view.sql
@@ -23,7 +23,8 @@ CREATE TABLE real_city (
@@ -2901,11 +2914,11 @@ index aa147b14a9..370e0dd570 100644
 CREATE FOREIGN DATA WRAPPER dummy;
 COMMENT ON FOREIGN DATA WRAPPER dummy IS 'useless';
 diff --git a/src/test/regress/sql/foreign_key.sql b/src/test/regress/sql/foreign_key.sql
-index 22e177f89b..7138d5e1d4 100644
+index 45c7a534cb..32dd26b8cd 100644
 --- a/src/test/regress/sql/foreign_key.sql
 +++ b/src/test/regress/sql/foreign_key.sql
-@@ -1418,7 +1418,7 @@ ALTER TABLE fk_partitioned_fk ATTACH PARTITION fk_partitioned_fk_2
- -- leave these tables around intentionally
+@@ -1435,7 +1435,7 @@ ALTER TABLE fk_partitioned_fk_6 ATTACH PARTITION fk_partitioned_pk_6 FOR VALUES
+ DROP TABLE fk_partitioned_pk_6, fk_partitioned_fk_6;
 
 -- test the case when the referenced table is owned by a different user
 -create role regress_other_partitioned_fk_owner;
@@ -2963,7 +2976,7 @@ index 527024f710..de49c0b85f 100644
 -- the data in this file has a lot of duplicates in the index key
 -- fields, leading to long bucket chains and lots of table expansion.
 diff --git a/src/test/regress/sql/identity.sql b/src/test/regress/sql/identity.sql
-index 91d2e443b4..241c93f373 100644
+index 7537258a75..9041e35e34 100644
 --- a/src/test/regress/sql/identity.sql
 +++ b/src/test/regress/sql/identity.sql
@@ -287,7 +287,7 @@ ALTER TABLE itest7 ALTER COLUMN a RESTART;
@@ -2976,10 +2989,10 @@ index 91d2e443b4..241c93f373 100644
 GRANT SELECT, INSERT ON itest8 TO regress_identity_user1;
 SET ROLE regress_identity_user1;
 diff --git a/src/test/regress/sql/inherit.sql b/src/test/regress/sql/inherit.sql
-index fe699c54d5..bdd5993f45 100644
+index b5b554a125..109889ad24 100644
 --- a/src/test/regress/sql/inherit.sql
 +++ b/src/test/regress/sql/inherit.sql
-@@ -950,7 +950,7 @@ create index on permtest_parent (left(c, 3));
+@@ -958,7 +958,7 @@ create index on permtest_parent (left(c, 3));
 insert into permtest_parent
   select 1, 'a', left(fipshash(i::text), 5) from generate_series(0, 100) i;
 analyze permtest_parent;
@@ -3218,7 +3231,7 @@ index 53e86b0b6c..f07cf1ec54 100644
 CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023';
 
 diff --git a/src/test/regress/sql/privileges.sql b/src/test/regress/sql/privileges.sql
-index 3f68cafcd1..004b26831d 100644
+index 249df17a58..b258e7f26a 100644
 --- a/src/test/regress/sql/privileges.sql
 +++ b/src/test/regress/sql/privileges.sql
@@ -24,18 +24,18 @@ RESET client_min_messages;
@@ -3269,7 +3282,7 @@ index 3f68cafcd1..004b26831d 100644
 
 GRANT pg_read_all_data TO regress_priv_user6;
 GRANT pg_write_all_data TO regress_priv_user7;
-@@ -130,8 +130,8 @@ DROP USER regress_priv_user10;
+@@ -163,8 +163,8 @@ DROP USER regress_priv_user10;
 DROP USER regress_priv_user9;
 DROP USER regress_priv_user8;
 
@@ -3280,7 +3293,7 @@ index 3f68cafcd1..004b26831d 100644
 
 ALTER GROUP regress_priv_group1 ADD USER regress_priv_user4;
 
-@@ -1124,7 +1124,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP
+@@ -1157,7 +1157,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP
 
 -- security-restricted operations
 \c -
@@ -3289,7 +3302,7 @@ index 3f68cafcd1..004b26831d 100644
 
 -- Check that index expressions and predicates are run as the table's owner
 
-@@ -1620,8 +1620,8 @@ DROP SCHEMA testns CASCADE;
+@@ -1653,8 +1653,8 @@ DROP SCHEMA testns CASCADE;
 -- Change owner of the schema & and rename of new schema owner
 \c -
 
@@ -3300,7 +3313,7 @@ index 3f68cafcd1..004b26831d 100644
 
 SET SESSION ROLE regress_schemauser1;
 CREATE SCHEMA testns;
-@@ -1715,7 +1715,7 @@ DROP USER regress_priv_user8; -- does not exist
+@@ -1748,7 +1748,7 @@ DROP USER regress_priv_user8; -- does not exist
 
 
 -- permissions with LOCK TABLE
@@ -3309,7 +3322,7 @@ index 3f68cafcd1..004b26831d 100644
 CREATE TABLE lock_table (a int);
 
 -- LOCK TABLE and SELECT permission
-@@ -1803,7 +1803,7 @@ DROP USER regress_locktable_user;
+@@ -1836,7 +1836,7 @@ DROP USER regress_locktable_user;
 -- switch to superuser
 \c -
 
@@ -3318,7 +3331,7 @@ index 3f68cafcd1..004b26831d 100644
 
 SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no
 SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- no
-@@ -1823,10 +1823,10 @@ RESET ROLE;
+@@ -1856,10 +1856,10 @@ RESET ROLE;
 DROP ROLE regress_readallstats;
 
 -- test role grantor machinery
@@ -3333,7 +3346,7 @@ index 3f68cafcd1..004b26831d 100644
 
 GRANT regress_group TO regress_group_direct_manager WITH INHERIT FALSE, ADMIN TRUE;
 GRANT regress_group_direct_manager TO regress_group_indirect_manager;
-@@ -1848,9 +1848,9 @@ DROP ROLE regress_group_indirect_manager;
+@@ -1881,9 +1881,9 @@ DROP ROLE regress_group_indirect_manager;
 DROP ROLE regress_group_member;
 
 -- test SET and INHERIT options with object ownership changes
@@ -3625,7 +3638,7 @@ index c961b2d730..0859b89c4f 100644
 -- clean up roles
 DROP ROLE regress_test_def_superuser;
 diff --git a/src/test/regress/sql/rowsecurity.sql b/src/test/regress/sql/rowsecurity.sql
-index dec7340538..cdbc03a5cc 100644
+index d3bfd53e23..919ce1d0c6 100644
 --- a/src/test/regress/sql/rowsecurity.sql
 +++ b/src/test/regress/sql/rowsecurity.sql
@@ -20,13 +20,13 @@ DROP SCHEMA IF EXISTS regress_rls_schema CASCADE;
@@ -3701,6 +3714,19 @@ index 689c448cc2..223ceb1d75 100644
 ALTER DEFAULT PRIVILEGES FOR ROLE regress_selinto_user
 	  REVOKE INSERT ON TABLES FROM regress_selinto_user;
 GRANT ALL ON SCHEMA selinto_schema TO public;
+diff --git a/src/test/regress/sql/select_parallel.sql b/src/test/regress/sql/select_parallel.sql
+index 33d78e16dc..cb193c9b27 100644
+--- a/src/test/regress/sql/select_parallel.sql
++++ b/src/test/regress/sql/select_parallel.sql
+@@ -464,7 +464,7 @@ SELECT 1 FROM tenk1_vw_sec
+ rollback;
+ 
+ -- test that function option SET ROLE works in parallel workers.
+-create role regress_parallel_worker;
++create role regress_parallel_worker PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ create function set_and_report_role() returns text as
+   $$ select current_setting('role') $$ language sql parallel safe
 diff --git a/src/test/regress/sql/select_views.sql b/src/test/regress/sql/select_views.sql
 index e742f13699..7bd0255df8 100644
 --- a/src/test/regress/sql/select_views.sql
--- a/compute/patches/pg_anon.patch
+++ b/compute/patches/pg_anon.patch
@@ -1,3 +1,45 @@
+commit 00aa659afc9c7336ab81036edec3017168aabf40
+Author: Heikki Linnakangas <heikki@neon.tech>
+Date:   Tue Nov 12 16:59:19 2024 +0200
+
+    Temporarily disable test that depends on timezone
+
+diff --git a/tests/expected/generalization.out b/tests/expected/generalization.out
+index 23ef5fa..9e60deb 100644
+--- a/ext-src/pg_anon-src/tests/expected/generalization.out
++++ b/ext-src/pg_anon-src/tests/expected/generalization.out
+@@ -284,12 +284,9 @@ SELECT anon.generalize_tstzrange('19041107','century');
+  ["Tue Jan 01 00:00:00 1901 PST","Mon Jan 01 00:00:00 2001 PST")
+ (1 row)
+ 
+-SELECT anon.generalize_tstzrange('19041107','millennium');
+-                      generalize_tstzrange                       
+------------------------------------------------------------------
+- ["Thu Jan 01 00:00:00 1001 PST","Mon Jan 01 00:00:00 2001 PST")
+-(1 row)
+-
++-- temporarily disabled, see:
++-- https://gitlab.com/dalibo/postgresql_anonymizer/-/commit/199f0a392b37c59d92ae441fb8f037e094a11a52#note_2148017485
++--SELECT anon.generalize_tstzrange('19041107','millennium');
+ -- generalize_daterange
+ SELECT anon.generalize_daterange('19041107');
+   generalize_daterange   
+diff --git a/tests/sql/generalization.sql b/tests/sql/generalization.sql
+index b868344..b4fc977 100644
+--- a/ext-src/pg_anon-src/tests/sql/generalization.sql
++++ b/ext-src/pg_anon-src/tests/sql/generalization.sql
+@@ -61,7 +61,9 @@ SELECT anon.generalize_tstzrange('19041107','month');
+ SELECT anon.generalize_tstzrange('19041107','year');
+ SELECT anon.generalize_tstzrange('19041107','decade');
+ SELECT anon.generalize_tstzrange('19041107','century');
+-SELECT anon.generalize_tstzrange('19041107','millennium');
++-- temporarily disabled, see:
++-- https://gitlab.com/dalibo/postgresql_anonymizer/-/commit/199f0a392b37c59d92ae441fb8f037e094a11a52#note_2148017485
++--SELECT anon.generalize_tstzrange('19041107','millennium');
+ 
+ -- generalize_daterange
+ SELECT anon.generalize_daterange('19041107');
+
 commit 7dd414ee75f2875cffb1d6ba474df1f135a6fc6f
 Author: Alexey Masterov <alexeymasterov@neon.tech>
 Date:   Fri May 31 06:34:26 2024 +0000
--- a/compute/vm-image-spec-bookworm.yaml
+++ b/compute/vm-image-spec-bookworm.yaml
@@ -26,7 +26,7 @@ commands:
  - name: postgres-exporter
    user: nobody
    sysvInitAction: respawn
-    shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter'
+    shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter --config.file=/etc/postgres_exporter.yml'
  - name: sql-exporter
    user: nobody
    sysvInitAction: respawn
--- a/compute/vm-image-spec-bullseye.yaml
+++ b/compute/vm-image-spec-bullseye.yaml
@@ -26,7 +26,7 @@ commands:
  - name: postgres-exporter
    user: nobody
    sysvInitAction: respawn
-    shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter'
+    shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter --config.file=/etc/postgres_exporter.yml'
  - name: sql-exporter
    user: nobody
    sysvInitAction: respawn
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -18,9 +18,11 @@ clap.workspace = true
 flate2.workspace = true
 futures.workspace = true
 hyper0 = { workspace = true, features = ["full"] }
+metrics.workspace = true
 nix.workspace = true
 notify.workspace = true
 num_cpus.workspace = true
+once_cell.workspace = true
 opentelemetry.workspace = true
 opentelemetry_sdk.workspace = true
 postgres.workspace = true
@@ -39,6 +41,7 @@ tracing-subscriber.workspace = true
 tracing-utils.workspace = true
 thiserror.workspace = true
 url.workspace = true
+prometheus.workspace = true

 compute_api.workspace = true
 utils.workspace = true
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -364,11 +364,29 @@ impl ComputeNode {
        let pageserver_connect_micros = start_time.elapsed().as_micros() as u64;

        let basebackup_cmd = match lsn {
-            Lsn(0) => format!("basebackup {} {} --gzip", spec.tenant_id, spec.timeline_id),
-            _ => format!(
-                "basebackup {} {} {} --gzip",
-                spec.tenant_id, spec.timeline_id, lsn
-            ),
+            Lsn(0) => {
+                if spec.spec.mode != ComputeMode::Primary {
+                    format!(
+                        "basebackup {} {} --gzip --replica",
+                        spec.tenant_id, spec.timeline_id
+                    )
+                } else {
+                    format!("basebackup {} {} --gzip", spec.tenant_id, spec.timeline_id)
+                }
+            }
+            _ => {
+                if spec.spec.mode != ComputeMode::Primary {
+                    format!(
+                        "basebackup {} {} {} --gzip --replica",
+                        spec.tenant_id, spec.timeline_id, lsn
+                    )
+                } else {
+                    format!(
+                        "basebackup {} {} {} --gzip",
+                        spec.tenant_id, spec.timeline_id, lsn
+                    )
+                }
+            }
        };

        let copyreader = client.copy_out(basebackup_cmd.as_str())?;
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -73,6 +73,19 @@ pub fn write_postgres_conf(
        )?;
    }

+    // Locales
+    if cfg!(target_os = "macos") {
+        writeln!(file, "lc_messages='C'")?;
+        writeln!(file, "lc_monetary='C'")?;
+        writeln!(file, "lc_time='C'")?;
+        writeln!(file, "lc_numeric='C'")?;
+    } else {
+        writeln!(file, "lc_messages='C.UTF-8'")?;
+        writeln!(file, "lc_monetary='C.UTF-8'")?;
+        writeln!(file, "lc_time='C.UTF-8'")?;
+        writeln!(file, "lc_numeric='C.UTF-8'")?;
+    }
+
    match spec.mode {
        ComputeMode::Primary => {}
        ComputeMode::Static(lsn) => {
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -9,6 +9,7 @@ use crate::catalog::SchemaDumpError;
 use crate::catalog::{get_database_schema, get_dbs_and_roles};
 use crate::compute::forward_termination_signal;
 use crate::compute::{ComputeNode, ComputeState, ParsedSpec};
+use crate::installed_extensions;
 use compute_api::requests::{ConfigurationRequest, ExtensionInstallRequest, SetRoleGrantsRequest};
 use compute_api::responses::{
    ComputeStatus, ComputeStatusResponse, ExtensionInstallResult, GenericAPIError,
@@ -19,6 +20,8 @@ use anyhow::Result;
 use hyper::header::CONTENT_TYPE;
 use hyper::service::{make_service_fn, service_fn};
 use hyper::{Body, Method, Request, Response, Server, StatusCode};
+use metrics::Encoder;
+use metrics::TextEncoder;
 use tokio::task;
 use tracing::{debug, error, info, warn};
 use tracing_utils::http::OtelName;
@@ -65,6 +68,28 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
            Response::new(Body::from(serde_json::to_string(&metrics).unwrap()))
        }

+        // Prometheus metrics
+        (&Method::GET, "/metrics") => {
+            debug!("serving /metrics GET request");
+
+            let mut buffer = vec![];
+            let metrics = installed_extensions::collect();
+            let encoder = TextEncoder::new();
+            encoder.encode(&metrics, &mut buffer).unwrap();
+
+            match Response::builder()
+                .status(StatusCode::OK)
+                .header(CONTENT_TYPE, encoder.format_type())
+                .body(Body::from(buffer))
+            {
+                Ok(response) => response,
+                Err(err) => {
+                    let msg = format!("error handling /metrics request: {err}");
+                    error!(msg);
+                    render_json_error(&msg, StatusCode::INTERNAL_SERVER_ERROR)
+                }
+            }
+        }
        // Collect Postgres current usage insights
        (&Method::GET, "/insights") => {
            info!("serving /insights GET request");
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -37,6 +37,21 @@ paths:
              schema:
                $ref: "#/components/schemas/ComputeMetrics"

+  /metrics:
+    get:
+      tags:
+      - Info
+      summary: Get compute node metrics in text format.
+      description: ""
+      operationId: getComputeMetrics
+      responses:
+        200:
+          description: ComputeMetrics
+          content:
+            text/plain:
+              schema:
+                type: string
+                description: Metrics in text format.
  /insights:
    get:
      tags:
--- a/compute_tools/src/installed_extensions.rs
+++ b/compute_tools/src/installed_extensions.rs
@@ -1,4 +1,5 @@
 use compute_api::responses::{InstalledExtension, InstalledExtensions};
+use metrics::proto::MetricFamily;
 use std::collections::HashMap;
 use std::collections::HashSet;
 use tracing::info;
@@ -8,6 +9,10 @@ use anyhow::Result;
 use postgres::{Client, NoTls};
 use tokio::task;

+use metrics::core::Collector;
+use metrics::{register_uint_gauge_vec, UIntGaugeVec};
+use once_cell::sync::Lazy;
+
 /// We don't reuse get_existing_dbs() just for code clarity
 /// and to make database listing query here more explicit.
 ///
@@ -59,6 +64,12 @@ pub async fn get_installed_extensions(connstr: Url) -> Result<InstalledExtension

            for (extname, v) in extensions.iter() {
                let version = v.to_string();
+
+                // increment the number of databases where the version of extension is installed
+                INSTALLED_EXTENSIONS
+                    .with_label_values(&[extname, &version])
+                    .inc();
+
                extensions_map
                    .entry(extname.to_string())
                    .and_modify(|e| {
@@ -74,9 +85,11 @@ pub async fn get_installed_extensions(connstr: Url) -> Result<InstalledExtension
            }
        }

-        Ok(InstalledExtensions {
+        let res = InstalledExtensions {
            extensions: extensions_map.values().cloned().collect(),
-        })
+        };
+
+        Ok(res)
    })
    .await?
 }
@@ -97,6 +110,18 @@ pub fn get_installed_extensions_sync(connstr: Url) -> Result<()> {
        "[NEON_EXT_STAT] {}",
        serde_json::to_string(&result).expect("failed to serialize extensions list")
    );
-
    Ok(())
 }
+
+static INSTALLED_EXTENSIONS: Lazy<UIntGaugeVec> = Lazy::new(|| {
+    register_uint_gauge_vec!(
+        "installed_extensions",
+        "Number of databases where the version of extension is installed",
+        &["extension_name", "version"]
+    )
+    .expect("failed to define a metric")
+});
+
+pub fn collect() -> Vec<MetricFamily> {
+    INSTALLED_EXTENSIONS.collect()
+}
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -944,6 +944,9 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result<LocalEnv> {
                        pg_auth_type: AuthType::Trust,
                        http_auth_type: AuthType::Trust,
                        other: Default::default(),
+                        // Typical developer machines use disks with slow fsync, and we don't care
+                        // about data integrity: disable disk syncs.
+                        no_sync: true,
                    }
                })
                .collect(),
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -225,6 +225,7 @@ pub struct PageServerConf {
    pub listen_http_addr: String,
    pub pg_auth_type: AuthType,
    pub http_auth_type: AuthType,
+    pub no_sync: bool,
 }

 impl Default for PageServerConf {
@@ -235,6 +236,7 @@ impl Default for PageServerConf {
            listen_http_addr: String::new(),
            pg_auth_type: AuthType::Trust,
            http_auth_type: AuthType::Trust,
+            no_sync: false,
        }
    }
 }
@@ -249,6 +251,8 @@ pub struct NeonLocalInitPageserverConf {
    pub listen_http_addr: String,
    pub pg_auth_type: AuthType,
    pub http_auth_type: AuthType,
+    #[serde(default, skip_serializing_if = "std::ops::Not::not")]
+    pub no_sync: bool,
    #[serde(flatten)]
    pub other: HashMap<String, toml::Value>,
 }
@@ -261,6 +265,7 @@ impl From<&NeonLocalInitPageserverConf> for PageServerConf {
            listen_http_addr,
            pg_auth_type,
            http_auth_type,
+            no_sync,
            other: _,
        } = conf;
        Self {
@@ -269,6 +274,7 @@ impl From<&NeonLocalInitPageserverConf> for PageServerConf {
            listen_http_addr: listen_http_addr.clone(),
            pg_auth_type: *pg_auth_type,
            http_auth_type: *http_auth_type,
+            no_sync: *no_sync,
        }
    }
 }
@@ -569,6 +575,8 @@ impl LocalEnv {
                    listen_http_addr: String,
                    pg_auth_type: AuthType,
                    http_auth_type: AuthType,
+                    #[serde(default)]
+                    no_sync: bool,
                }
                let config_toml_path = dentry.path().join("pageserver.toml");
                let config_toml: PageserverConfigTomlSubset = toml_edit::de::from_str(
@@ -591,6 +599,7 @@ impl LocalEnv {
                    listen_http_addr,
                    pg_auth_type,
                    http_auth_type,
+                    no_sync,
                } = config_toml;
                let IdentityTomlSubset {
                    id: identity_toml_id,
@@ -607,6 +616,7 @@ impl LocalEnv {
                    listen_http_addr,
                    pg_auth_type,
                    http_auth_type,
+                    no_sync,
                };
                pageservers.push(conf);
            }
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -273,6 +273,7 @@ impl PageServerNode {
            )
        })?;
        let args = vec!["-D", datadir_path_str];
+
        background_process::start_process(
            "pageserver",
            &datadir,
@@ -334,17 +335,20 @@ impl PageServerNode {
            checkpoint_distance: settings
                .remove("checkpoint_distance")
                .map(|x| x.parse::<u64>())
-                .transpose()?,
+                .transpose()
+                .context("Failed to parse 'checkpoint_distance' as an integer")?,
            checkpoint_timeout: settings.remove("checkpoint_timeout").map(|x| x.to_string()),
            compaction_target_size: settings
                .remove("compaction_target_size")
                .map(|x| x.parse::<u64>())
-                .transpose()?,
+                .transpose()
+                .context("Failed to parse 'compaction_target_size' as an integer")?,
            compaction_period: settings.remove("compaction_period").map(|x| x.to_string()),
            compaction_threshold: settings
                .remove("compaction_threshold")
                .map(|x| x.parse::<usize>())
-                .transpose()?,
+                .transpose()
+                .context("Failed to parse 'compaction_threshold' as an integer")?,
            compaction_algorithm: settings
                .remove("compaction_algorithm")
                .map(serde_json::from_str)
@@ -353,16 +357,19 @@ impl PageServerNode {
            gc_horizon: settings
                .remove("gc_horizon")
                .map(|x| x.parse::<u64>())
-                .transpose()?,
+                .transpose()
+                .context("Failed to parse 'gc_horizon' as an integer")?,
            gc_period: settings.remove("gc_period").map(|x| x.to_string()),
            image_creation_threshold: settings
                .remove("image_creation_threshold")
                .map(|x| x.parse::<usize>())
-                .transpose()?,
+                .transpose()
+                .context("Failed to parse 'image_creation_threshold' as non zero integer")?,
            image_layer_creation_check_threshold: settings
                .remove("image_layer_creation_check_threshold")
                .map(|x| x.parse::<u8>())
-                .transpose()?,
+                .transpose()
+                .context("Failed to parse 'image_creation_check_threshold' as integer")?,
            pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
            walreceiver_connect_timeout: settings
                .remove("walreceiver_connect_timeout")
@@ -403,6 +410,11 @@ impl PageServerNode {
            lsn_lease_length_for_ts: settings
                .remove("lsn_lease_length_for_ts")
                .map(|x| x.to_string()),
+            timeline_offloading: settings
+                .remove("timeline_offloading")
+                .map(|x| x.parse::<bool>())
+                .transpose()
+                .context("Failed to parse 'timeline_offloading' as bool")?,
        };
        if !settings.is_empty() {
            bail!("Unrecognized tenant settings: {settings:?}")
@@ -414,97 +426,9 @@ impl PageServerNode {
    pub async fn tenant_config(
        &self,
        tenant_id: TenantId,
-        mut settings: HashMap<&str, &str>,
+        settings: HashMap<&str, &str>,
    ) -> anyhow::Result<()> {
-        let config = {
-            // Braces to make the diff easier to read
-            models::TenantConfig {
-                checkpoint_distance: settings
-                    .remove("checkpoint_distance")
-                    .map(|x| x.parse::<u64>())
-                    .transpose()
-                    .context("Failed to parse 'checkpoint_distance' as an integer")?,
-                checkpoint_timeout: settings.remove("checkpoint_timeout").map(|x| x.to_string()),
-                compaction_target_size: settings
-                    .remove("compaction_target_size")
-                    .map(|x| x.parse::<u64>())
-                    .transpose()
-                    .context("Failed to parse 'compaction_target_size' as an integer")?,
-                compaction_period: settings.remove("compaction_period").map(|x| x.to_string()),
-                compaction_threshold: settings
-                    .remove("compaction_threshold")
-                    .map(|x| x.parse::<usize>())
-                    .transpose()
-                    .context("Failed to parse 'compaction_threshold' as an integer")?,
-                compaction_algorithm: settings
-                    .remove("compactin_algorithm")
-                    .map(serde_json::from_str)
-                    .transpose()
-                    .context("Failed to parse 'compaction_algorithm' json")?,
-                gc_horizon: settings
-                    .remove("gc_horizon")
-                    .map(|x| x.parse::<u64>())
-                    .transpose()
-                    .context("Failed to parse 'gc_horizon' as an integer")?,
-                gc_period: settings.remove("gc_period").map(|x| x.to_string()),
-                image_creation_threshold: settings
-                    .remove("image_creation_threshold")
-                    .map(|x| x.parse::<usize>())
-                    .transpose()
-                    .context("Failed to parse 'image_creation_threshold' as non zero integer")?,
-                image_layer_creation_check_threshold: settings
-                    .remove("image_layer_creation_check_threshold")
-                    .map(|x| x.parse::<u8>())
-                    .transpose()
-                    .context("Failed to parse 'image_creation_check_threshold' as integer")?,
-
-                pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
-                walreceiver_connect_timeout: settings
-                    .remove("walreceiver_connect_timeout")
-                    .map(|x| x.to_string()),
-                lagging_wal_timeout: settings
-                    .remove("lagging_wal_timeout")
-                    .map(|x| x.to_string()),
-                max_lsn_wal_lag: settings
-                    .remove("max_lsn_wal_lag")
-                    .map(|x| x.parse::<NonZeroU64>())
-                    .transpose()
-                    .context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
-                eviction_policy: settings
-                    .remove("eviction_policy")
-                    .map(serde_json::from_str)
-                    .transpose()
-                    .context("Failed to parse 'eviction_policy' json")?,
-                min_resident_size_override: settings
-                    .remove("min_resident_size_override")
-                    .map(|x| x.parse::<u64>())
-                    .transpose()
-                    .context("Failed to parse 'min_resident_size_override' as an integer")?,
-                evictions_low_residence_duration_metric_threshold: settings
-                    .remove("evictions_low_residence_duration_metric_threshold")
-                    .map(|x| x.to_string()),
-                heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
-                lazy_slru_download: settings
-                    .remove("lazy_slru_download")
-                    .map(|x| x.parse::<bool>())
-                    .transpose()
-                    .context("Failed to parse 'lazy_slru_download' as bool")?,
-                timeline_get_throttle: settings
-                    .remove("timeline_get_throttle")
-                    .map(serde_json::from_str)
-                    .transpose()
-                    .context("parse `timeline_get_throttle` from json")?,
-                lsn_lease_length: settings.remove("lsn_lease_length").map(|x| x.to_string()),
-                lsn_lease_length_for_ts: settings
-                    .remove("lsn_lease_length_for_ts")
-                    .map(|x| x.to_string()),
-            }
-        };
-
-        if !settings.is_empty() {
-            bail!("Unrecognized tenant settings: {settings:?}")
-        }
-
+        let config = Self::parse_config(settings)?;
        self.http_client
            .tenant_config(&models::TenantConfigRequest { tenant_id, config })
            .await?;
--- a/deny.toml
+++ b/deny.toml
@@ -37,6 +37,7 @@ allow = [
    "BSD-2-Clause",
    "BSD-3-Clause",
    "CC0-1.0",
+    "CDDL-1.0",
    "ISC",
    "MIT",
    "MPL-2.0",
--- a/docs/rfcs/038-aux-file-v2.md
+++ b/docs/rfcs/038-aux-file-v2.md
@@ -91,7 +91,7 @@ generating the basebackup by scanning the `REPL_ORIGIN_KEY_PREFIX` keyspace.
 There are two places we need to read the aux files from the pageserver:

 * On the write path, when the compute node adds an aux file to the pageserver, we will retrieve the key from the storage, append the file to the hashed key, and write it back. The current `get` API already supports that.
-*  We use the vectored get API to retrieve all aux files during generating the basebackup. Because we need to scan a sparse keyspace, we slightly modified the vectored get path. The vectorized API will attempt to retrieve every single key within the requested key range, and therefore, we modified it in a way that keys within `NON_INHERITED_SPARSE_RANGE` will not trigger missing key error.
+*  We use the vectored get API to retrieve all aux files while generating the basebackup. Because we need to scan a sparse keyspace, we slightly modified the vectored get path. The vectored get API used to always attempt to retrieve every single key within the requested key range, so we modified it such that keys within `NON_INHERITED_SPARSE_RANGE` do not trigger a missing-key error. Furthermore, as aux file reads usually need all layer files intersecting with that key range within the branch and cover a big keyspace, tracking the keyspaces that have not been read incurs a large overhead. Therefore, for sparse keyspaces, we [do not track](https://github.com/neondatabase/neon/pull/9631) `unmapped_keyspace`.

 ## Compaction and Image Layer Generation

--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -110,6 +110,23 @@ static MAXRSS_KB: Lazy<IntGauge> = Lazy::new(|| {
 pub const DISK_FSYNC_SECONDS_BUCKETS: &[f64] =
    &[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 30.0];

+/// Constructs power-of-two histogram buckets covering the end points: start is rounded down and end
+/// is rounded up to a power of two. E.g. start=5,end=20 yields 4,8,16,32, as does start=4,end=32.
+pub fn pow2_buckets(start: usize, end: usize) -> Vec<f64> {
+    assert_ne!(start, 0);
+    assert!(start <= end);
+    let start = match start.checked_next_power_of_two() {
+        Some(n) if n == start => n, // start already power of two
+        Some(n) => n >> 1,          // power of two below start
+        None => panic!("start too large"),
+    };
+    let end = end.checked_next_power_of_two().expect("end too large");
+    std::iter::successors(Some(start), |n| n.checked_mul(2))
+        .take_while(|n| n <= &end)
+        .map(|n| n as f64)
+        .collect()
+}
+
 pub struct BuildInfo {
    pub revision: &'static str,
    pub build_tag: &'static str,
@@ -595,3 +612,67 @@ where
        self.dec.collect_into(metadata, labels, name, &mut enc.0)
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    const POW2_BUCKETS_MAX: usize = 1 << (usize::BITS - 1);
+
+    #[test]
+    fn pow2_buckets_cases() {
+        assert_eq!(pow2_buckets(1, 1), vec![1.0]);
+        assert_eq!(pow2_buckets(1, 2), vec![1.0, 2.0]);
+        assert_eq!(pow2_buckets(1, 3), vec![1.0, 2.0, 4.0]);
+        assert_eq!(pow2_buckets(1, 4), vec![1.0, 2.0, 4.0]);
+        assert_eq!(pow2_buckets(1, 5), vec![1.0, 2.0, 4.0, 8.0]);
+        assert_eq!(pow2_buckets(1, 6), vec![1.0, 2.0, 4.0, 8.0]);
+        assert_eq!(pow2_buckets(1, 7), vec![1.0, 2.0, 4.0, 8.0]);
+        assert_eq!(pow2_buckets(1, 8), vec![1.0, 2.0, 4.0, 8.0]);
+        assert_eq!(
+            pow2_buckets(1, 200),
+            vec![1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0]
+        );
+
+        assert_eq!(pow2_buckets(1, 8), vec![1.0, 2.0, 4.0, 8.0]);
+        assert_eq!(pow2_buckets(2, 8), vec![2.0, 4.0, 8.0]);
+        assert_eq!(pow2_buckets(3, 8), vec![2.0, 4.0, 8.0]);
+        assert_eq!(pow2_buckets(4, 8), vec![4.0, 8.0]);
+        assert_eq!(pow2_buckets(5, 8), vec![4.0, 8.0]);
+        assert_eq!(pow2_buckets(6, 8), vec![4.0, 8.0]);
+        assert_eq!(pow2_buckets(7, 8), vec![4.0, 8.0]);
+        assert_eq!(pow2_buckets(8, 8), vec![8.0]);
+        assert_eq!(pow2_buckets(20, 200), vec![16.0, 32.0, 64.0, 128.0, 256.0]);
+
+        // Largest valid values.
+        assert_eq!(
+            pow2_buckets(1, POW2_BUCKETS_MAX).len(),
+            usize::BITS as usize
+        );
+        assert_eq!(pow2_buckets(POW2_BUCKETS_MAX, POW2_BUCKETS_MAX).len(), 1);
+    }
+
+    #[test]
+    #[should_panic]
+    fn pow2_buckets_zero_start() {
+        pow2_buckets(0, 1);
+    }
+
+    #[test]
+    #[should_panic]
+    fn pow2_buckets_end_lt_start() {
+        pow2_buckets(2, 1);
+    }
+
+    #[test]
+    #[should_panic]
+    fn pow2_buckets_end_overflow_min() {
+        pow2_buckets(1, POW2_BUCKETS_MAX + 1);
+    }
+
+    #[test]
+    #[should_panic]
+    fn pow2_buckets_end_overflow_max() {
+        pow2_buckets(1, usize::MAX);
+    }
+}
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -64,6 +64,7 @@ pub struct ConfigToml {
    #[serde(with = "humantime_serde")]
    pub wal_redo_timeout: Duration,
    pub superuser: String,
+    pub locale: String,
    pub page_cache_size: usize,
    pub max_file_descriptors: usize,
    pub pg_distrib_dir: Option<Utf8PathBuf>,
@@ -106,6 +107,8 @@ pub struct ConfigToml {
    pub ephemeral_bytes_per_memory_kb: usize,
    pub l0_flush: Option<crate::models::L0FlushConfig>,
    pub virtual_file_io_mode: Option<crate::models::virtual_file::IoMode>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub no_sync: Option<bool>,
 }

 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -259,6 +262,10 @@ pub struct TenantConfigToml {
    /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval.
    #[serde(with = "humantime_serde")]
    pub lsn_lease_length_for_ts: Duration,
+
+    /// Enable auto-offloading of timelines.
+    /// (either this flag or the pageserver-global one needs to be set)
+    pub timeline_offloading: bool,
 }

 pub mod defaults {
@@ -270,6 +277,11 @@ pub mod defaults {
    pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s";

    pub const DEFAULT_SUPERUSER: &str = "cloud_admin";
+    pub const DEFAULT_LOCALE: &str = if cfg!(target_os = "macos") {
+        "C"
+    } else {
+        "C.UTF-8"
+    };

    pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192;
    pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100;
@@ -320,6 +332,7 @@ impl Default for ConfigToml {
            wal_redo_timeout: (humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT)
                .expect("cannot parse default wal redo timeout")),
            superuser: (DEFAULT_SUPERUSER.to_string()),
+            locale: DEFAULT_LOCALE.to_string(),
            page_cache_size: (DEFAULT_PAGE_CACHE_SIZE),
            max_file_descriptors: (DEFAULT_MAX_FILE_DESCRIPTORS),
            pg_distrib_dir: None, // Utf8PathBuf::from("./pg_install"), // TODO: formely, this was std::env::current_dir()
@@ -385,6 +398,7 @@ impl Default for ConfigToml {
            l0_flush: None,
            virtual_file_io_mode: None,
            tenant_config: TenantConfigToml::default(),
+            no_sync: None,
        }
    }
 }
@@ -471,6 +485,7 @@ impl Default for TenantConfigToml {
            image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
            lsn_lease_length: LsnLease::DEFAULT_LENGTH,
            lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS,
+            timeline_offloading: false,
        }
    }
 }
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -24,7 +24,7 @@ pub struct Key {

 /// When working with large numbers of Keys in-memory, it is more efficient to handle them as i128 than as
 /// a struct of fields.
-#[derive(Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd)]
+#[derive(Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)]
 pub struct CompactKey(i128);

 /// The storage key size.
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -310,6 +310,7 @@ pub struct TenantConfig {
    pub image_layer_creation_check_threshold: Option<u8>,
    pub lsn_lease_length: Option<String>,
    pub lsn_lease_length_for_ts: Option<String>,
+    pub timeline_offloading: Option<bool>,
 }

 /// The policy for the aux file storage.
--- a/libs/pageserver_api/src/record.rs
+++ b/libs/pageserver_api/src/record.rs
@@ -41,6 +41,11 @@ pub enum NeonWalRecord {
        file_path: String,
        content: Option<Bytes>,
    },
+    /// Truncate visibility map page
+    TruncateVisibilityMap {
+        trunc_byte: usize,
+        trunc_offs: usize,
+    },

    /// A testing record for unit testing purposes. It supports append data to an existing image, or clear it.
    #[cfg(feature = "testing")]
@@ -80,18 +85,18 @@ impl NeonWalRecord {
    }

    #[cfg(feature = "testing")]
-    pub fn wal_clear() -> Self {
+    pub fn wal_clear(s: impl AsRef<str>) -> Self {
        Self::Test {
-            append: "".to_string(),
+            append: s.as_ref().to_string(),
            clear: true,
            will_init: false,
        }
    }

    #[cfg(feature = "testing")]
-    pub fn wal_init() -> Self {
+    pub fn wal_init(s: impl AsRef<str>) -> Self {
        Self::Test {
-            append: "".to_string(),
+            append: s.as_ref().to_string(),
            clear: true,
            will_init: true,
        }
--- a/libs/pageserver_api/src/reltag.rs
+++ b/libs/pageserver_api/src/reltag.rs
@@ -24,7 +24,7 @@ use postgres_ffi::Oid;
 // FIXME: should move 'forknum' as last field to keep this consistent with Postgres.
 // Then we could replace the custom Ord and PartialOrd implementations below with
 // deriving them. This will require changes in walredoproc.c.
-#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize)]
+#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize, Deserialize)]
 pub struct RelTag {
    pub forknum: u8,
    pub spcnode: Oid,
--- a/libs/postgres_backend/tests/simple_select.rs
+++ b/libs/postgres_backend/tests/simple_select.rs
@@ -2,7 +2,7 @@
 use once_cell::sync::Lazy;
 use postgres_backend::{AuthType, Handler, PostgresBackend, QueryError};
 use pq_proto::{BeMessage, RowDescriptor};
-use rustls::crypto::aws_lc_rs;
+use rustls::crypto::ring;
 use std::io::Cursor;
 use std::sync::Arc;
 use tokio::io::{AsyncRead, AsyncWrite};
@@ -94,7 +94,7 @@ async fn simple_select_ssl() {
    let (client_sock, server_sock) = make_tcp_pair().await;

    let server_cfg =
-        rustls::ServerConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider()))
+        rustls::ServerConfig::builder_with_provider(Arc::new(ring::default_provider()))
            .with_safe_default_protocol_versions()
            .expect("aws_lc_rs should support the default protocol versions")
            .with_no_client_auth()
@@ -110,7 +110,7 @@ async fn simple_select_ssl() {
    });

    let client_cfg =
-        rustls::ClientConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider()))
+        rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider()))
            .with_safe_default_protocol_versions()
            .expect("aws_lc_rs should support the default protocol versions")
            .with_root_certificates({
--- a/libs/postgres_ffi/src/pg_constants.rs
+++ b/libs/postgres_ffi/src/pg_constants.rs
@@ -243,8 +243,11 @@ const FSM_LEAF_NODES_PER_PAGE: usize = FSM_NODES_PER_PAGE - FSM_NON_LEAF_NODES_P
 pub const SLOTS_PER_FSM_PAGE: u32 = FSM_LEAF_NODES_PER_PAGE as u32;

 /* From visibilitymap.c */
-pub const VM_HEAPBLOCKS_PER_PAGE: u32 =
-    (BLCKSZ as usize - SIZEOF_PAGE_HEADER_DATA) as u32 * (8 / 2); // MAPSIZE * (BITS_PER_BYTE / BITS_PER_HEAPBLOCK)
+
+pub const VM_MAPSIZE: usize = BLCKSZ as usize - MAXALIGN_SIZE_OF_PAGE_HEADER_DATA;
+pub const VM_BITS_PER_HEAPBLOCK: usize = 2;
+pub const VM_HEAPBLOCKS_PER_BYTE: usize = 8 / VM_BITS_PER_HEAPBLOCK;
+pub const VM_HEAPBLOCKS_PER_PAGE: usize = VM_MAPSIZE * VM_HEAPBLOCKS_PER_BYTE;

 /* From origin.c */
 pub const REPLICATION_STATE_MAGIC: u32 = 0x1257DADE;
--- a/libs/postgres_ffi/src/wal_generator.rs
+++ b/libs/postgres_ffi/src/wal_generator.rs
@@ -1,10 +1,10 @@
-use std::ffi::CStr;
+use std::ffi::{CStr, CString};

 use bytes::{Bytes, BytesMut};
 use crc32c::crc32c_append;
 use utils::lsn::Lsn;

-use super::bindings::{XLogLongPageHeaderData, XLogPageHeaderData, XLOG_PAGE_MAGIC};
+use super::bindings::{RmgrId, XLogLongPageHeaderData, XLogPageHeaderData, XLOG_PAGE_MAGIC};
 use super::xlog_utils::{
    XlLogicalMessage, XLOG_RECORD_CRC_OFFS, XLOG_SIZE_OF_XLOG_RECORD, XLP_BKP_REMOVABLE,
    XLP_FIRST_IS_CONTRECORD,
@@ -16,11 +16,65 @@ use crate::pg_constants::{
 };
 use crate::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};

-/// Generates binary WAL records for use in tests and benchmarks. Currently only generates logical
-/// messages (effectively noops) with a fixed payload. It is used as an iterator which yields
-/// encoded bytes for a single WAL record, including internal page headers if it spans pages.
-/// Concatenating the bytes will yield a complete, well-formed WAL, which can be chunked at segment
-/// boundaries if desired. Not optimized for performance.
+/// A WAL record payload. Will be prefixed by an XLogRecord header when encoded.
+pub struct Record {
+    pub rmid: RmgrId,
+    pub info: u8,
+    pub data: Bytes,
+}
+
+impl Record {
+    /// Encodes the WAL record including an XLogRecord header. prev_lsn is the start position of
+    /// the previous record in the WAL -- this is ignored by the Safekeeper, but not Postgres.
+    pub fn encode(&self, prev_lsn: Lsn) -> Bytes {
+        // Prefix data with block ID and length.
+        let data_header = Bytes::from(match self.data.len() {
+            0 => vec![],
+            1..=255 => vec![XLR_BLOCK_ID_DATA_SHORT, self.data.len() as u8],
+            256.. => {
+                let len_bytes = (self.data.len() as u32).to_le_bytes();
+                [&[XLR_BLOCK_ID_DATA_LONG], len_bytes.as_slice()].concat()
+            }
+        });
+
+        // Construct the WAL record header.
+        let mut header = XLogRecord {
+            xl_tot_len: (XLOG_SIZE_OF_XLOG_RECORD + data_header.len() + self.data.len()) as u32,
+            xl_xid: 0,
+            xl_prev: prev_lsn.into(),
+            xl_info: self.info,
+            xl_rmid: self.rmid,
+            __bindgen_padding_0: [0; 2],
+            xl_crc: 0, // see below
+        };
+
+        // Compute the CRC checksum for the data, and the header up to the CRC field.
+        let mut crc = 0;
+        crc = crc32c_append(crc, &data_header);
+        crc = crc32c_append(crc, &self.data);
+        crc = crc32c_append(crc, &header.encode().unwrap()[0..XLOG_RECORD_CRC_OFFS]);
+        header.xl_crc = crc;
+
+        // Encode the final header and record.
+        let header = header.encode().unwrap();
+
+        [header, data_header, self.data.clone()].concat().into()
+    }
+}
+
+/// Generates WAL record payloads.
+///
+/// TODO: currently only provides LogicalMessageGenerator for trivial noop messages. Add a generator
+/// that creates a table and inserts rows.
+pub trait RecordGenerator: Iterator<Item = Record> {}
+
+impl<I: Iterator<Item = Record>> RecordGenerator for I {}
+
+/// Generates binary WAL for use in tests and benchmarks. The provided record generator constructs
+/// the WAL records. It is used as an iterator which yields encoded bytes for a single WAL record,
+/// including internal page headers if it spans pages. Concatenating the bytes will yield a
+/// complete, well-formed WAL, which can be chunked at segment boundaries if desired. Not optimized
+/// for performance.
 ///
 /// The WAL format is version-dependant (see e.g. `XLOG_PAGE_MAGIC`), so make sure to import this
 /// for the appropriate Postgres version (e.g. `postgres_ffi::v17::wal_generator::WalGenerator`).
@@ -31,10 +85,10 @@ use crate::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
 /// |        Segment 1         |        Segment 2         |        Segment 3         |
 /// | Page 1 | Page 2 | Page 3 | Page 4 | Page 5 | Page 6 | Page 7 | Page 8 | Page 9 |
 /// | R1 |   R2  |R3|  R4  | R5  |  R6  |                 R7            | R8  |
-///
-/// TODO: support generating actual tables and rows.
 #[derive(Default)]
-pub struct WalGenerator {
+pub struct WalGenerator<R: RecordGenerator> {
+    /// Generates record payloads for the WAL.
+    pub record_generator: R,
    /// Current LSN to append the next record at.
    ///
    /// Callers can modify this (and prev_lsn) to restart generation at a different LSN, but should
@@ -46,73 +100,35 @@ pub struct WalGenerator {
    pub prev_lsn: Lsn,
 }

-impl WalGenerator {
-    // For now, hardcode the message payload.
-    // TODO: support specifying the payload size.
-    const PREFIX: &CStr = c"prefix";
-    const MESSAGE: &[u8] = b"message";
-
-    // Hardcode the sys, timeline, and DB IDs. We can make them configurable if we care about them.
+impl<R: RecordGenerator> WalGenerator<R> {
+    // Hardcode the sys and timeline ID. We can make them configurable if we care about them.
    const SYS_ID: u64 = 0;
    const TIMELINE_ID: u32 = 1;
-    const DB_ID: u32 = 0;

-    /// Creates a new WAL generator, which emits logical message records (noops).
-    pub fn new() -> Self {
-        Self::default()
+    /// Creates a new WAL generator with the given record generator.
+    pub fn new(record_generator: R) -> WalGenerator<R> {
+        Self {
+            record_generator,
+            lsn: Lsn(0),
+            prev_lsn: Lsn(0),
+        }
    }

-    /// Encodes a logical message (basically a noop), with the given prefix and message.
-    pub(crate) fn encode_logical_message(prefix: &CStr, message: &[u8]) -> Bytes {
-        let prefix = prefix.to_bytes_with_nul();
-        let header = XlLogicalMessage {
-            db_id: Self::DB_ID,
-            transactional: 0,
-            prefix_size: prefix.len() as u64,
-            message_size: message.len() as u64,
-        };
-        [&header.encode(), prefix, message].concat().into()
+    /// Appends a record with an arbitrary payload at the current LSN, then increments the LSN.
+    /// Returns the WAL bytes for the record, including page headers and padding, and the start LSN.
+    fn append_record(&mut self, record: Record) -> (Lsn, Bytes) {
+        let record = record.encode(self.prev_lsn);
+        let record = Self::insert_pages(record, self.lsn);
+        let record = Self::pad_record(record, self.lsn);
+        let lsn = self.lsn;
+        self.prev_lsn = self.lsn;
+        self.lsn += record.len() as u64;
+        (lsn, record)
    }

-    /// Encode a WAL record with the given payload data (e.g. a logical message).
-    pub(crate) fn encode_record(data: Bytes, rmid: u8, info: u8, prev_lsn: Lsn) -> Bytes {
-        // Prefix data with block ID and length.
-        let data_header = Bytes::from(match data.len() {
-            0 => vec![],
-            1..=255 => vec![XLR_BLOCK_ID_DATA_SHORT, data.len() as u8],
-            256.. => {
-                let len_bytes = (data.len() as u32).to_le_bytes();
-                [&[XLR_BLOCK_ID_DATA_LONG], len_bytes.as_slice()].concat()
-            }
-        });
-
-        // Construct the WAL record header.
-        let mut header = XLogRecord {
-            xl_tot_len: (XLOG_SIZE_OF_XLOG_RECORD + data_header.len() + data.len()) as u32,
-            xl_xid: 0,
-            xl_prev: prev_lsn.into(),
-            xl_info: info,
-            xl_rmid: rmid,
-            __bindgen_padding_0: [0; 2],
-            xl_crc: 0, // see below
-        };
-
-        // Compute the CRC checksum for the data, and the header up to the CRC field.
-        let mut crc = 0;
-        crc = crc32c_append(crc, &data_header);
-        crc = crc32c_append(crc, &data);
-        crc = crc32c_append(crc, &header.encode().unwrap()[0..XLOG_RECORD_CRC_OFFS]);
-        header.xl_crc = crc;
-
-        // Encode the final header and record.
-        let header = header.encode().unwrap();
-
-        [header, data_header, data].concat().into()
-    }
-
-    /// Injects page headers on 8KB page boundaries. Takes the current LSN position where the record
+    /// Inserts page headers on 8KB page boundaries. Takes the current LSN position where the record
    /// is to be appended.
-    fn encode_pages(record: Bytes, mut lsn: Lsn) -> Bytes {
+    fn insert_pages(record: Bytes, mut lsn: Lsn) -> Bytes {
        // Fast path: record fits in current page, and the page already has a header.
        if lsn.remaining_in_block() as usize >= record.len() && lsn.block_offset() > 0 {
            return record;
@@ -173,31 +189,71 @@ impl WalGenerator {
        }
        [record, Bytes::from(vec![0; padding])].concat().into()
    }
-
-    /// Generates a record with an arbitrary payload at the current LSN, then increments the LSN.
-    pub fn generate_record(&mut self, data: Bytes, rmid: u8, info: u8) -> Bytes {
-        let record = Self::encode_record(data, rmid, info, self.prev_lsn);
-        let record = Self::encode_pages(record, self.lsn);
-        let record = Self::pad_record(record, self.lsn);
-        self.prev_lsn = self.lsn;
-        self.lsn += record.len() as u64;
-        record
-    }
-
-    /// Generates a logical message at the current LSN. Can be used to construct arbitrary messages.
-    pub fn generate_logical_message(&mut self, prefix: &CStr, message: &[u8]) -> Bytes {
-        let data = Self::encode_logical_message(prefix, message);
-        self.generate_record(data, RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE)
-    }
 }

-/// Generate WAL records as an iterator.
-impl Iterator for WalGenerator {
+/// Generates WAL records as an iterator.
+impl<R: RecordGenerator> Iterator for WalGenerator<R> {
    type Item = (Lsn, Bytes);

    fn next(&mut self) -> Option<Self::Item> {
-        let lsn = self.lsn;
-        let record = self.generate_logical_message(Self::PREFIX, Self::MESSAGE);
-        Some((lsn, record))
+        let record = self.record_generator.next()?;
+        Some(self.append_record(record))
+    }
+}
+
+/// Generates logical message records (effectively noops) with a fixed message.
+pub struct LogicalMessageGenerator {
+    prefix: CString,
+    message: Vec<u8>,
+}
+
+impl LogicalMessageGenerator {
+    const DB_ID: u32 = 0; // hardcoded for now
+    const RM_ID: RmgrId = RM_LOGICALMSG_ID;
+    const INFO: u8 = XLOG_LOGICAL_MESSAGE;
+
+    /// Creates a new LogicalMessageGenerator.
+    pub fn new(prefix: &CStr, message: &[u8]) -> Self {
+        Self {
+            prefix: prefix.to_owned(),
+            message: message.to_owned(),
+        }
+    }
+
+    /// Encodes a logical message.
+    fn encode(prefix: &CStr, message: &[u8]) -> Bytes {
+        let prefix = prefix.to_bytes_with_nul();
+        let header = XlLogicalMessage {
+            db_id: Self::DB_ID,
+            transactional: 0,
+            prefix_size: prefix.len() as u64,
+            message_size: message.len() as u64,
+        };
+        [&header.encode(), prefix, message].concat().into()
+    }
+}
+
+impl Iterator for LogicalMessageGenerator {
+    type Item = Record;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        Some(Record {
+            rmid: Self::RM_ID,
+            info: Self::INFO,
+            data: Self::encode(&self.prefix, &self.message),
+        })
+    }
+}
+
+impl WalGenerator<LogicalMessageGenerator> {
+    /// Convenience method for appending a WAL record with an arbitrary logical message at the
+    /// current WAL LSN position. Returns the start LSN and resulting WAL bytes.
+    pub fn append_logical_message(&mut self, prefix: &CStr, message: &[u8]) -> (Lsn, Bytes) {
+        let record = Record {
+            rmid: LogicalMessageGenerator::RM_ID,
+            info: LogicalMessageGenerator::INFO,
+            data: LogicalMessageGenerator::encode(prefix, message),
+        };
+        self.append_record(record)
    }
 }
--- a/libs/postgres_ffi/src/walrecord.rs
+++ b/libs/postgres_ffi/src/walrecord.rs
@@ -16,7 +16,7 @@ use utils::bin_ser::DeserializeError;
 use utils::lsn::Lsn;

 #[repr(C)]
-#[derive(Debug)]
+#[derive(Debug, Serialize, Deserialize)]
 pub struct XlMultiXactCreate {
    pub mid: MultiXactId,
    /* new MultiXact's ID */
@@ -46,7 +46,7 @@ impl XlMultiXactCreate {
 }

 #[repr(C)]
-#[derive(Debug)]
+#[derive(Debug, Serialize, Deserialize)]
 pub struct XlMultiXactTruncate {
    pub oldest_multi_db: Oid,
    /* to-be-truncated range of multixact offsets */
@@ -72,7 +72,7 @@ impl XlMultiXactTruncate {
 }

 #[repr(C)]
-#[derive(Debug)]
+#[derive(Debug, Serialize, Deserialize)]
 pub struct XlRelmapUpdate {
    pub dbid: Oid,   /* database ID, or 0 for shared map */
    pub tsid: Oid,   /* database's tablespace, or pg_global */
@@ -90,7 +90,7 @@ impl XlRelmapUpdate {
 }

 #[repr(C)]
-#[derive(Debug)]
+#[derive(Debug, Serialize, Deserialize)]
 pub struct XlReploriginDrop {
    pub node_id: RepOriginId,
 }
@@ -104,7 +104,7 @@ impl XlReploriginDrop {
 }

 #[repr(C)]
-#[derive(Debug)]
+#[derive(Debug, Serialize, Deserialize)]
 pub struct XlReploriginSet {
    pub remote_lsn: Lsn,
    pub node_id: RepOriginId,
@@ -120,7 +120,7 @@ impl XlReploriginSet {
 }

 #[repr(C)]
-#[derive(Debug, Clone, Copy)]
+#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
 pub struct RelFileNode {
    pub spcnode: Oid, /* tablespace */
    pub dbnode: Oid,  /* database */
@@ -911,7 +911,7 @@ impl XlSmgrCreate {
 }

 #[repr(C)]
-#[derive(Debug)]
+#[derive(Debug, Serialize, Deserialize)]
 pub struct XlSmgrTruncate {
    pub blkno: BlockNumber,
    pub rnode: RelFileNode,
@@ -984,7 +984,7 @@ impl XlDropDatabase {
 /// xl_xact_parsed_abort structs in PostgreSQL, but we use the same
 /// struct for commits and aborts.
 ///
-#[derive(Debug)]
+#[derive(Debug, Serialize, Deserialize)]
 pub struct XlXactParsedRecord {
    pub xid: TransactionId,
    pub info: u8,
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -12,9 +12,9 @@ use super::bindings::{
    CheckPoint, ControlFileData, DBState_DB_SHUTDOWNED, FullTransactionId, TimeLineID, TimestampTz,
    XLogLongPageHeaderData, XLogPageHeaderData, XLogRecPtr, XLogRecord, XLogSegNo, XLOG_PAGE_MAGIC,
 };
-use super::wal_generator::WalGenerator;
+use super::wal_generator::LogicalMessageGenerator;
 use super::PG_MAJORVERSION;
-use crate::pg_constants::{self, RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE};
+use crate::pg_constants;
 use crate::PG_TLI;
 use crate::{uint32, uint64, Oid};
 use crate::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
@@ -493,12 +493,10 @@ pub fn encode_logical_message(prefix: &str, message: &str) -> Bytes {
    // This function can take untrusted input, so discard any NUL bytes in the prefix string.
    let prefix = CString::new(prefix.replace('\0', "")).expect("no NULs");
    let message = message.as_bytes();
-    WalGenerator::encode_record(
-        WalGenerator::encode_logical_message(&prefix, message),
-        RM_LOGICALMSG_ID,
-        XLOG_LOGICAL_MESSAGE,
-        Lsn(0),
-    )
+    LogicalMessageGenerator::new(&prefix, message)
+        .next()
+        .unwrap()
+        .encode(Lsn(0))
 }

 #[cfg(test)]
--- a/libs/remote_storage/src/error.rs
+++ b/libs/remote_storage/src/error.rs
@@ -15,6 +15,9 @@ pub enum DownloadError {
    ///
    /// Concurrency control is not timed within timeout.
    Timeout,
+    /// Some integrity/consistency check failed during download. This is used during
+    /// timeline loads to cancel the load of a tenant if some timeline detects fatal corruption.
+    Fatal(String),
    /// The file was found in the remote storage, but the download failed.
    Other(anyhow::Error),
 }
@@ -29,6 +32,7 @@ impl std::fmt::Display for DownloadError {
            DownloadError::Unmodified => write!(f, "File was not modified"),
            DownloadError::Cancelled => write!(f, "Cancelled, shutting down"),
            DownloadError::Timeout => write!(f, "timeout"),
+            DownloadError::Fatal(why) => write!(f, "Fatal read error: {why}"),
            DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"),
        }
    }
@@ -41,7 +45,7 @@ impl DownloadError {
    pub fn is_permanent(&self) -> bool {
        use DownloadError::*;
        match self {
-            BadInput(_) | NotFound | Unmodified | Cancelled => true,
+            BadInput(_) | NotFound | Unmodified | Fatal(_) | Cancelled => true,
            Timeout | Other(_) => false,
        }
    }
--- a/libs/utils/scripts/restore_from_wal.sh
+++ b/libs/utils/scripts/restore_from_wal.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash

 set -euxo pipefail

@@ -6,17 +6,52 @@ PG_BIN=$1
 WAL_PATH=$2
 DATA_DIR=$3
 PORT=$4
+PG_VERSION=$5
 SYSID=$(od -A n -j 24 -N 8 -t d8 "$WAL_PATH"/000000010000000000000002* | cut -c 3-)
+
+# The way that initdb is invoked must match how the pageserver runs initdb.
+function initdb_with_args {
+    local cmd=(
+        "$PG_BIN"/initdb
+        -E utf8
+        -U cloud_admin
+        -D "$DATA_DIR"
+        --locale 'C.UTF-8'
+        --lc-collate 'C.UTF-8'
+        --lc-ctype 'C.UTF-8'
+        --lc-messages 'C.UTF-8'
+        --lc-monetary 'C.UTF-8'
+        --lc-numeric 'C.UTF-8'
+        --lc-time 'C.UTF-8'
+        --sysid="$SYSID"
+    )
+
+    case "$PG_VERSION" in
+        14)
+            # Postgres 14 and below didn't support --locale-provider
+            ;;
+        15 | 16)
+            cmd+=(--locale-provider 'libc')
+            ;;
+        *)
+            # Postgres 17 added the builtin provider
+            cmd+=(--locale-provider 'builtin')
+            ;;
+    esac
+
+    eval env -i LD_LIBRARY_PATH="$PG_BIN"/../lib "${cmd[*]}"
+}
+
 rm -fr "$DATA_DIR"
-env -i LD_LIBRARY_PATH="$PG_BIN"/../lib "$PG_BIN"/initdb -E utf8 -U cloud_admin -D "$DATA_DIR" --sysid="$SYSID"
+initdb_with_args
 echo "port=$PORT" >> "$DATA_DIR"/postgresql.conf
 echo "shared_preload_libraries='\$libdir/neon_rmgr.so'" >> "$DATA_DIR"/postgresql.conf
 REDO_POS=0x$("$PG_BIN"/pg_controldata -D "$DATA_DIR" | grep -F "REDO location"| cut -c 42-)
 declare -i WAL_SIZE=$REDO_POS+114
 "$PG_BIN"/pg_ctl -D "$DATA_DIR" -l "$DATA_DIR/logfile.log" start
 "$PG_BIN"/pg_ctl -D "$DATA_DIR" -l "$DATA_DIR/logfile.log" stop -m immediate
-cp "$DATA_DIR"/pg_wal/000000010000000000000001 .
+cp "$DATA_DIR"/pg_wal/000000010000000000000001 "$DATA_DIR"
 cp "$WAL_PATH"/* "$DATA_DIR"/pg_wal/
 for partial in "$DATA_DIR"/pg_wal/*.partial ; do mv "$partial" "${partial%.partial}" ; done
-dd if=000000010000000000000001 of="$DATA_DIR"/pg_wal/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc
-rm -f 000000010000000000000001
+dd if="$DATA_DIR"/000000010000000000000001 of="$DATA_DIR"/pg_wal/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc
+rm -f "$DATA_DIR"/000000010000000000000001
--- a/libs/utils/scripts/restore_from_wal_initdb.sh
+++ b/libs/utils/scripts/restore_from_wal_initdb.sh
@@ -14,8 +14,8 @@ REDO_POS=0x$("$PG_BIN"/pg_controldata -D "$DATA_DIR" | grep -F "REDO location"|
 declare -i WAL_SIZE=$REDO_POS+114
 "$PG_BIN"/pg_ctl -D "$DATA_DIR" -l "$DATA_DIR/logfile.log" start
 "$PG_BIN"/pg_ctl -D "$DATA_DIR" -l "$DATA_DIR/logfile.log" stop -m immediate
-cp "$DATA_DIR"/pg_wal/000000010000000000000001 .
+cp "$DATA_DIR"/pg_wal/000000010000000000000001 "$DATA_DIR"
 cp "$WAL_PATH"/* "$DATA_DIR"/pg_wal/
 for partial in "$DATA_DIR"/pg_wal/*.partial ; do mv "$partial" "${partial%.partial}" ; done
-dd if=000000010000000000000001 of="$DATA_DIR"/pg_wal/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc
-rm -f 000000010000000000000001
+dd if="$DATA_DIR"/000000010000000000000001 of="$DATA_DIR"/pg_wal/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc
+rm -f "$DATA_DIR"/000000010000000000000001
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -40,6 +40,11 @@ pub enum Scope {
    /// Allows access to storage controller APIs used by the scrubber, to interrogate the state
    /// of a tenant & post scrub results.
    Scrubber,
+
+    /// This scope is used for communication with other storage controller instances.
+    /// At the time of writing, this is only used for the step down request.
+    #[serde(rename = "controller_peer")]
+    ControllerPeer,
 }

 /// JWT payload. See docs/authentication.md for the format
--- a/libs/utils/src/crashsafe.rs
+++ b/libs/utils/src/crashsafe.rs
@@ -123,15 +123,27 @@ pub async fn fsync_async_opt(
    Ok(())
 }

-/// Like postgres' durable_rename, renames file issuing fsyncs do make it
-/// durable. After return, file and rename are guaranteed to be persisted.
+/// Like postgres' durable_rename, renames a file and issues fsyncs to make it durable. After
+/// returning, both the file and rename are guaranteed to be persisted. Both paths must be on the
+/// same file system.
 ///
-/// Unlike postgres, it only does fsyncs to 1) file to be renamed to make
-/// contents durable; 2) its directory entry to make rename durable 3) again to
-/// already renamed file, which is not required by standards but postgres does
-/// it, let's stick to that. Postgres additionally fsyncs newpath *before*
-/// rename if it exists to ensure that at least one of the files survives, but
-/// current callers don't need that.
+/// Unlike postgres, it only fsyncs 1) the file to make contents durable, and 2) the directory to
+/// make the rename durable. This sequence ensures the target file will never be incomplete.
+///
+/// Postgres also:
+///
+/// * Fsyncs the target file, if it exists, before the rename, to ensure either the new or existing
+///   file survives a crash. Current callers don't need this as it should already be fsynced if
+///   durability is needed.
+///
+/// * Fsyncs the file after the rename. This can be required with certain OSes or file systems (e.g.
+///   NFS), but not on Linux with most common file systems like ext4 (which we currently use).
+///
+/// An audit of 8 other databases found that none fsynced the file after a rename:
+/// <https://github.com/neondatabase/neon/pull/9686#discussion_r1837180535>
+///
+/// eBPF probes confirmed that this is sufficient with ext4, XFS, and ZFS, but possibly not Btrfs:
+/// <https://github.com/neondatabase/neon/pull/9686#discussion_r1837926218>
 ///
 /// virtual_file.rs has similar code, but it doesn't use vfs.
 ///
@@ -149,9 +161,6 @@ pub async fn durable_rename(
    // Time to do the real deal.
    tokio::fs::rename(old_path.as_ref(), new_path.as_ref()).await?;

-    // Postgres'ish fsync of renamed file.
-    fsync_async_opt(new_path.as_ref(), do_fsync).await?;
-
    // Now fsync the parent
    let parent = match new_path.as_ref().parent() {
        Some(p) => p,
--- a/libs/utils/src/lsn.rs
+++ b/libs/utils/src/lsn.rs
@@ -138,6 +138,11 @@ impl Lsn {
        self.0.checked_sub(other).map(Lsn)
    }

+    /// Subtract a number, saturating at numeric bounds instead of overflowing.
+    pub fn saturating_sub<T: Into<u64>>(self, other: T) -> Lsn {
+        Lsn(self.0.saturating_sub(other.into()))
+    }
+
    /// Subtract a number, returning the difference as i128 to avoid overflow.
    pub fn widening_sub<T: Into<u64>>(self, other: T) -> i128 {
        let other: u64 = other.into();
--- a/libs/wal_decoder/Cargo.toml
+++ b/libs/wal_decoder/Cargo.toml
@@ -5,7 +5,7 @@ edition.workspace = true
 license.workspace = true

 [features]
-testing = []
+testing = ["pageserver_api/testing"]

 [dependencies]
 anyhow.workspace = true
--- a/libs/wal_decoder/src/decoder.rs
+++ b/libs/wal_decoder/src/decoder.rs
@@ -2,15 +2,13 @@
 //! raw bytes which represent a raw Postgres WAL record.

 use crate::models::*;
-use bytes::{Buf, Bytes, BytesMut};
-use pageserver_api::key::rel_block_to_key;
-use pageserver_api::record::NeonWalRecord;
+use crate::serialized_batch::SerializedValueBatch;
+use bytes::{Buf, Bytes};
 use pageserver_api::reltag::{RelTag, SlruKind};
 use pageserver_api::shard::ShardIdentity;
-use pageserver_api::value::Value;
+use postgres_ffi::pg_constants;
 use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM;
 use postgres_ffi::walrecord::*;
-use postgres_ffi::{page_is_new, page_set_lsn, pg_constants, BLCKSZ};
 use utils::lsn::Lsn;

 impl InterpretedWalRecord {
@@ -21,11 +19,12 @@ impl InterpretedWalRecord {
    pub fn from_bytes_filtered(
        buf: Bytes,
        shard: &ShardIdentity,
-        lsn: Lsn,
+        next_record_lsn: Lsn,
        pg_version: u32,
    ) -> anyhow::Result<InterpretedWalRecord> {
        let mut decoded = DecodedWALRecord::default();
        decode_wal_record(buf, &mut decoded, pg_version)?;
+        let xid = decoded.xl_xid;

        let flush_uncommitted = if decoded.is_dbase_create_copy(pg_version) {
            FlushUncommittedRecords::Yes
@@ -33,96 +32,20 @@ impl InterpretedWalRecord {
            FlushUncommittedRecords::No
        };

-        let metadata_record = MetadataRecord::from_decoded(&decoded, lsn, pg_version)?;
-
-        let mut blocks = Vec::default();
-        for blk in decoded.blocks.iter() {
-            let rel = RelTag {
-                spcnode: blk.rnode_spcnode,
-                dbnode: blk.rnode_dbnode,
-                relnode: blk.rnode_relnode,
-                forknum: blk.forknum,
-            };
-
-            let key = rel_block_to_key(rel, blk.blkno);
-
-            if !key.is_valid_key_on_write_path() {
-                anyhow::bail!("Unsupported key decoded at LSN {}: {}", lsn, key);
-            }
-
-            let key_is_local = shard.is_key_local(&key);
-
-            tracing::debug!(
-                lsn=%lsn,
-                key=%key,
-                "ingest: shard decision {}",
-                if !key_is_local { "drop" } else { "keep" },
-            );
-
-            if !key_is_local {
-                if shard.is_shard_zero() {
-                    // Shard 0 tracks relation sizes.  Although we will not store this block, we will observe
-                    // its blkno in case it implicitly extends a relation.
-                    blocks.push((key.to_compact(), None));
-                }
-
-                continue;
-            }
-
-            // Instead of storing full-page-image WAL record,
-            // it is better to store extracted image: we can skip wal-redo
-            // in this case. Also some FPI records may contain multiple (up to 32) pages,
-            // so them have to be copied multiple times.
-            //
-            let value = if blk.apply_image
-                && blk.has_image
-                && decoded.xl_rmid == pg_constants::RM_XLOG_ID
-                && (decoded.xl_info == pg_constants::XLOG_FPI
-                || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT)
-                // compression of WAL is not yet supported: fall back to storing the original WAL record
-                && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, pg_version)
-                // do not materialize null pages because them most likely be soon replaced with real data
-                && blk.bimg_len != 0
-            {
-                // Extract page image from FPI record
-                let img_len = blk.bimg_len as usize;
-                let img_offs = blk.bimg_offset as usize;
-                let mut image = BytesMut::with_capacity(BLCKSZ as usize);
-                // TODO(vlad): skip the copy
-                image.extend_from_slice(&decoded.record[img_offs..img_offs + img_len]);
-
-                if blk.hole_length != 0 {
-                    let tail = image.split_off(blk.hole_offset as usize);
-                    image.resize(image.len() + blk.hole_length as usize, 0u8);
-                    image.unsplit(tail);
-                }
-                //
-                // Match the logic of XLogReadBufferForRedoExtended:
-                // The page may be uninitialized. If so, we can't set the LSN because
-                // that would corrupt the page.
-                //
-                if !page_is_new(&image) {
-                    page_set_lsn(&mut image, lsn)
-                }
-                assert_eq!(image.len(), BLCKSZ as usize);
-
-                Value::Image(image.freeze())
-            } else {
-                Value::WalRecord(NeonWalRecord::Postgres {
-                    will_init: blk.will_init || blk.apply_image,
-                    rec: decoded.record.clone(),
-                })
-            };
-
-            blocks.push((key.to_compact(), Some(value)));
-        }
+        let metadata_record = MetadataRecord::from_decoded(&decoded, next_record_lsn, pg_version)?;
+        let batch = SerializedValueBatch::from_decoded_filtered(
+            decoded,
+            shard,
+            next_record_lsn,
+            pg_version,
+        )?;

        Ok(InterpretedWalRecord {
            metadata_record,
-            blocks,
-            lsn,
+            batch,
+            next_record_lsn,
            flush_uncommitted,
-            xid: decoded.xl_xid,
+            xid,
        })
    }
 }
@@ -130,7 +53,7 @@ impl InterpretedWalRecord {
 impl MetadataRecord {
    fn from_decoded(
        decoded: &DecodedWALRecord,
-        lsn: Lsn,
+        next_record_lsn: Lsn,
        pg_version: u32,
    ) -> anyhow::Result<Option<MetadataRecord>> {
        // Note: this doesn't actually copy the bytes since
@@ -151,7 +74,9 @@ impl MetadataRecord {
                Ok(None)
            }
            pg_constants::RM_CLOG_ID => Self::decode_clog_record(&mut buf, decoded, pg_version),
-            pg_constants::RM_XACT_ID => Self::decode_xact_record(&mut buf, decoded, lsn),
+            pg_constants::RM_XACT_ID => {
+                Self::decode_xact_record(&mut buf, decoded, next_record_lsn)
+            }
            pg_constants::RM_MULTIXACT_ID => {
                Self::decode_multixact_record(&mut buf, decoded, pg_version)
            }
@@ -163,7 +88,9 @@ impl MetadataRecord {
            //
            // Alternatively, one can make the checkpoint part of the subscription protocol
            // to the pageserver. This should work fine, but can be done at a later point.
-            pg_constants::RM_XLOG_ID => Self::decode_xlog_record(&mut buf, decoded, lsn),
+            pg_constants::RM_XLOG_ID => {
+                Self::decode_xlog_record(&mut buf, decoded, next_record_lsn)
+            }
            pg_constants::RM_LOGICALMSG_ID => {
                Self::decode_logical_message_record(&mut buf, decoded)
            }
--- a/libs/wal_decoder/src/lib.rs
+++ b/libs/wal_decoder/src/lib.rs
@@ -1,2 +1,3 @@
 pub mod decoder;
 pub mod models;
+pub mod serialized_batch;
--- a/libs/wal_decoder/src/models.rs
+++ b/libs/wal_decoder/src/models.rs
@@ -2,7 +2,8 @@
 //! ready for the pageserver to interpret. They are derived from the original
 //! WAL records, so that each struct corresponds closely to one WAL record of
 //! a specific kind. They contain the same information as the original WAL records,
-//! just decoded into structs and fields for easier access.
+//! but the values are already serialized in a [`SerializedValueBatch`], which
+//! is the format that the pageserver is expecting them in.
 //!
 //! The ingestion code uses these structs to help with parsing the WAL records,
 //! and it splits them into a stream of modifications to the key-value pairs that
@@ -25,32 +26,36 @@
 //!                     |--> write to KV store within the pageserver

 use bytes::Bytes;
-use pageserver_api::key::CompactKey;
 use pageserver_api::reltag::{RelTag, SlruKind};
-use pageserver_api::value::Value;
 use postgres_ffi::walrecord::{
    XlMultiXactCreate, XlMultiXactTruncate, XlRelmapUpdate, XlReploriginDrop, XlReploriginSet,
    XlSmgrTruncate, XlXactParsedRecord,
 };
 use postgres_ffi::{Oid, TransactionId};
+use serde::{Deserialize, Serialize};
 use utils::lsn::Lsn;

+use crate::serialized_batch::SerializedValueBatch;
+
+#[derive(Serialize, Deserialize)]
 pub enum FlushUncommittedRecords {
    Yes,
    No,
 }

 /// An interpreted Postgres WAL record, ready to be handled by the pageserver
+#[derive(Serialize, Deserialize)]
 pub struct InterpretedWalRecord {
    /// Optional metadata record - may cause writes to metadata keys
    /// in the storage engine
    pub metadata_record: Option<MetadataRecord>,
-    /// Images or deltas for blocks modified in the original WAL record.
-    /// The [`Value`] is optional to avoid sending superfluous data to
-    /// shard 0 for relation size tracking.
-    pub blocks: Vec<(CompactKey, Option<Value>)>,
-    /// Byte offset within WAL for the end of the original PG WAL record
-    pub lsn: Lsn,
+    /// A pre-serialized batch along with the required metadata for ingestion
+    /// by the pageserver
+    pub batch: SerializedValueBatch,
+    /// Byte offset within WAL for the start of the next PG WAL record.
+    /// Usually this is the end LSN of the current record, but in case of
+    /// XLOG SWITCH records it will be within the next segment.
+    pub next_record_lsn: Lsn,
    /// Whether to flush all uncommitted modifications to the storage engine
    /// before ingesting this record. This is currently only used for legacy PG
    /// database creations which read pages from a template database. Such WAL
@@ -62,6 +67,7 @@ pub struct InterpretedWalRecord {

 /// The interpreted part of the Postgres WAL record which requires metadata
 /// writes to the underlying storage engine.
+#[derive(Serialize, Deserialize)]
 pub enum MetadataRecord {
    Heapam(HeapamRecord),
    Neonrmgr(NeonrmgrRecord),
@@ -77,10 +83,12 @@ pub enum MetadataRecord {
    Replorigin(ReploriginRecord),
 }

+#[derive(Serialize, Deserialize)]
 pub enum HeapamRecord {
    ClearVmBits(ClearVmBits),
 }

+#[derive(Serialize, Deserialize)]
 pub struct ClearVmBits {
    pub new_heap_blkno: Option<u32>,
    pub old_heap_blkno: Option<u32>,
@@ -88,24 +96,29 @@ pub struct ClearVmBits {
    pub flags: u8,
 }

+#[derive(Serialize, Deserialize)]
 pub enum NeonrmgrRecord {
    ClearVmBits(ClearVmBits),
 }

+#[derive(Serialize, Deserialize)]
 pub enum SmgrRecord {
    Create(SmgrCreate),
    Truncate(XlSmgrTruncate),
 }

+#[derive(Serialize, Deserialize)]
 pub struct SmgrCreate {
    pub rel: RelTag,
 }

+#[derive(Serialize, Deserialize)]
 pub enum DbaseRecord {
    Create(DbaseCreate),
    Drop(DbaseDrop),
 }

+#[derive(Serialize, Deserialize)]
 pub struct DbaseCreate {
    pub db_id: Oid,
    pub tablespace_id: Oid,
@@ -113,27 +126,32 @@ pub struct DbaseCreate {
    pub src_tablespace_id: Oid,
 }

+#[derive(Serialize, Deserialize)]
 pub struct DbaseDrop {
    pub db_id: Oid,
    pub tablespace_ids: Vec<Oid>,
 }

+#[derive(Serialize, Deserialize)]
 pub enum ClogRecord {
    ZeroPage(ClogZeroPage),
    Truncate(ClogTruncate),
 }

+#[derive(Serialize, Deserialize)]
 pub struct ClogZeroPage {
    pub segno: u32,
    pub rpageno: u32,
 }

+#[derive(Serialize, Deserialize)]
 pub struct ClogTruncate {
    pub pageno: u32,
    pub oldest_xid: TransactionId,
    pub oldest_xid_db: Oid,
 }

+#[derive(Serialize, Deserialize)]
 pub enum XactRecord {
    Commit(XactCommon),
    Abort(XactCommon),
@@ -142,6 +160,7 @@ pub enum XactRecord {
    Prepare(XactPrepare),
 }

+#[derive(Serialize, Deserialize)]
 pub struct XactCommon {
    pub parsed: XlXactParsedRecord,
    pub origin_id: u16,
@@ -150,61 +169,73 @@ pub struct XactCommon {
    pub lsn: Lsn,
 }

+#[derive(Serialize, Deserialize)]
 pub struct XactPrepare {
    pub xl_xid: TransactionId,
    pub data: Bytes,
 }

+#[derive(Serialize, Deserialize)]
 pub enum MultiXactRecord {
    ZeroPage(MultiXactZeroPage),
    Create(XlMultiXactCreate),
    Truncate(XlMultiXactTruncate),
 }

+#[derive(Serialize, Deserialize)]
 pub struct MultiXactZeroPage {
    pub slru_kind: SlruKind,
    pub segno: u32,
    pub rpageno: u32,
 }

+#[derive(Serialize, Deserialize)]
 pub enum RelmapRecord {
    Update(RelmapUpdate),
 }

+#[derive(Serialize, Deserialize)]
 pub struct RelmapUpdate {
    pub update: XlRelmapUpdate,
    pub buf: Bytes,
 }

+#[derive(Serialize, Deserialize)]
 pub enum XlogRecord {
    Raw(RawXlogRecord),
 }

+#[derive(Serialize, Deserialize)]
 pub struct RawXlogRecord {
    pub info: u8,
    pub lsn: Lsn,
    pub buf: Bytes,
 }

+#[derive(Serialize, Deserialize)]
 pub enum LogicalMessageRecord {
    Put(PutLogicalMessage),
    #[cfg(feature = "testing")]
    Failpoint,
 }

+#[derive(Serialize, Deserialize)]
 pub struct PutLogicalMessage {
    pub path: String,
    pub buf: Bytes,
 }

+#[derive(Serialize, Deserialize)]
 pub enum StandbyRecord {
    RunningXacts(StandbyRunningXacts),
 }

+#[derive(Serialize, Deserialize)]
 pub struct StandbyRunningXacts {
    pub oldest_running_xid: TransactionId,
 }

+#[derive(Serialize, Deserialize)]
 pub enum ReploriginRecord {
    Set(XlReploriginSet),
    Drop(XlReploriginDrop),
--- a/libs/wal_decoder/src/serialized_batch.rs
+++ b/libs/wal_decoder/src/serialized_batch.rs
@@ -0,0 +1,871 @@
+//! This module implements a batch type for serialized [`pageserver_api::value::Value`]
+//! instances. Each batch contains a raw buffer (serialized values)
+//! and a list of metadata for each (key, LSN) tuple present in the batch.
+//!
+//! Such batches are created from decoded PG wal records and ingested
+//! by the pageserver by writing directly to the ephemeral file.
+
+use std::collections::BTreeSet;
+
+use bytes::{Bytes, BytesMut};
+use pageserver_api::key::rel_block_to_key;
+use pageserver_api::keyspace::KeySpace;
+use pageserver_api::record::NeonWalRecord;
+use pageserver_api::reltag::RelTag;
+use pageserver_api::shard::ShardIdentity;
+use pageserver_api::{key::CompactKey, value::Value};
+use postgres_ffi::walrecord::{DecodedBkpBlock, DecodedWALRecord};
+use postgres_ffi::{page_is_new, page_set_lsn, pg_constants, BLCKSZ};
+use serde::{Deserialize, Serialize};
+use utils::bin_ser::BeSer;
+use utils::lsn::Lsn;
+
+use pageserver_api::key::Key;
+
+static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
+
+/// Accompanying metadata for the batch
+/// A value may be serialized and stored into the batch or just "observed".
+/// Shard 0 currently "observes" all values in order to accurately track
+/// relation sizes. In the case of "observed" values, we only need to know
+/// the key and LSN, so two types of metadata are supported to save on network
+/// bandwidth.
+#[derive(Serialize, Deserialize)]
+pub enum ValueMeta {
+    Serialized(SerializedValueMeta),
+    Observed(ObservedValueMeta),
+}
+
+impl ValueMeta {
+    pub fn key(&self) -> CompactKey {
+        match self {
+            Self::Serialized(ser) => ser.key,
+            Self::Observed(obs) => obs.key,
+        }
+    }
+
+    pub fn lsn(&self) -> Lsn {
+        match self {
+            Self::Serialized(ser) => ser.lsn,
+            Self::Observed(obs) => obs.lsn,
+        }
+    }
+}
+
+/// Wrapper around [`ValueMeta`] that implements ordering by
+/// (key, LSN) tuples
+struct OrderedValueMeta(ValueMeta);
+
+impl Ord for OrderedValueMeta {
+    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+        (self.0.key(), self.0.lsn()).cmp(&(other.0.key(), other.0.lsn()))
+    }
+}
+
+impl PartialOrd for OrderedValueMeta {
+    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl PartialEq for OrderedValueMeta {
+    fn eq(&self, other: &Self) -> bool {
+        (self.0.key(), self.0.lsn()) == (other.0.key(), other.0.lsn())
+    }
+}
+
+impl Eq for OrderedValueMeta {}
+
+/// Metadata for a [`Value`] serialized into the batch.
+#[derive(Serialize, Deserialize)]
+pub struct SerializedValueMeta {
+    pub key: CompactKey,
+    pub lsn: Lsn,
+    /// Starting offset of the value for the (key, LSN) tuple
+    /// in [`SerializedValueBatch::raw`]
+    pub batch_offset: u64,
+    pub len: usize,
+    pub will_init: bool,
+}
+
+/// Metadata for a [`Value`] observed by the batch
+#[derive(Serialize, Deserialize)]
+pub struct ObservedValueMeta {
+    pub key: CompactKey,
+    pub lsn: Lsn,
+}
+
+/// Batch of serialized [`Value`]s.
+#[derive(Serialize, Deserialize)]
+pub struct SerializedValueBatch {
+    /// [`Value`]s serialized in EphemeralFile's native format,
+    /// ready for disk write by the pageserver
+    pub raw: Vec<u8>,
+
+    /// Metadata to make sense of the bytes in [`Self::raw`]
+    /// and represent "observed" values.
+    ///
+    /// Invariant: Metadata entries for any given key are ordered
+    /// by LSN. Note that entries for a key do not have to be contiguous.
+    pub metadata: Vec<ValueMeta>,
+
+    /// The highest LSN of any value in the batch
+    pub max_lsn: Lsn,
+
+    /// Number of values encoded by [`Self::raw`]
+    pub len: usize,
+}
+
+impl Default for SerializedValueBatch {
+    fn default() -> Self {
+        Self {
+            raw: Default::default(),
+            metadata: Default::default(),
+            max_lsn: Lsn(0),
+            len: 0,
+        }
+    }
+}
+
+impl SerializedValueBatch {
+    /// Build a batch of serialized values from a decoded PG WAL record
+    ///
+    /// The batch will only contain values for keys targeting the specific
+    /// shard. Shard 0 is a special case, where any keys that don't belong to
+    /// it are "observed" by the batch (i.e. present in [`SerializedValueBatch::metadata`],
+    /// but absent from the raw buffer [`SerializedValueBatch::raw`]).
+    pub(crate) fn from_decoded_filtered(
+        decoded: DecodedWALRecord,
+        shard: &ShardIdentity,
+        next_record_lsn: Lsn,
+        pg_version: u32,
+    ) -> anyhow::Result<SerializedValueBatch> {
+        // First determine how big the buffer needs to be and allocate it up-front.
+        // This duplicates some of the work below, but it's empirically much faster.
+        let estimated_buffer_size = Self::estimate_buffer_size(&decoded, shard, pg_version);
+        let mut buf = Vec::<u8>::with_capacity(estimated_buffer_size);
+
+        let mut metadata: Vec<ValueMeta> = Vec::with_capacity(decoded.blocks.len());
+        let mut max_lsn: Lsn = Lsn(0);
+        let mut len: usize = 0;
+        for blk in decoded.blocks.iter() {
+            let relative_off = buf.len() as u64;
+
+            let rel = RelTag {
+                spcnode: blk.rnode_spcnode,
+                dbnode: blk.rnode_dbnode,
+                relnode: blk.rnode_relnode,
+                forknum: blk.forknum,
+            };
+
+            let key = rel_block_to_key(rel, blk.blkno);
+
+            if !key.is_valid_key_on_write_path() {
+                anyhow::bail!(
+                    "Unsupported key decoded at LSN {}: {}",
+                    next_record_lsn,
+                    key
+                );
+            }
+
+            let key_is_local = shard.is_key_local(&key);
+
+            tracing::debug!(
+                lsn=%next_record_lsn,
+                key=%key,
+                "ingest: shard decision {}",
+                if !key_is_local { "drop" } else { "keep" },
+            );
+
+            if !key_is_local {
+                if shard.is_shard_zero() {
+                    // Shard 0 tracks relation sizes.  Although we will not store this block, we will observe
+                    // its blkno in case it implicitly extends a relation.
+                    metadata.push(ValueMeta::Observed(ObservedValueMeta {
+                        key: key.to_compact(),
+                        lsn: next_record_lsn,
+                    }))
+                }
+
+                continue;
+            }
+
+            // Instead of storing the full-page-image WAL record,
+            // it is better to store the extracted image: we can skip wal-redo
+            // in this case. Also, some FPI records may contain multiple (up to 32) pages,
+            // so they have to be copied multiple times.
+            //
+            let val = if Self::block_is_image(&decoded, blk, pg_version) {
+                // Extract page image from FPI record
+                let img_len = blk.bimg_len as usize;
+                let img_offs = blk.bimg_offset as usize;
+                let mut image = BytesMut::with_capacity(BLCKSZ as usize);
+                // TODO(vlad): skip the copy
+                image.extend_from_slice(&decoded.record[img_offs..img_offs + img_len]);
+
+                if blk.hole_length != 0 {
+                    let tail = image.split_off(blk.hole_offset as usize);
+                    image.resize(image.len() + blk.hole_length as usize, 0u8);
+                    image.unsplit(tail);
+                }
+                //
+                // Match the logic of XLogReadBufferForRedoExtended:
+                // The page may be uninitialized. If so, we can't set the LSN because
+                // that would corrupt the page.
+                //
+                if !page_is_new(&image) {
+                    page_set_lsn(&mut image, next_record_lsn)
+                }
+                assert_eq!(image.len(), BLCKSZ as usize);
+
+                Value::Image(image.freeze())
+            } else {
+                Value::WalRecord(NeonWalRecord::Postgres {
+                    will_init: blk.will_init || blk.apply_image,
+                    rec: decoded.record.clone(),
+                })
+            };
+
+            val.ser_into(&mut buf)
+                .expect("Writing into in-memory buffer is infallible");
+
+            let val_ser_size = buf.len() - relative_off as usize;
+
+            metadata.push(ValueMeta::Serialized(SerializedValueMeta {
+                key: key.to_compact(),
+                lsn: next_record_lsn,
+                batch_offset: relative_off,
+                len: val_ser_size,
+                will_init: val.will_init(),
+            }));
+            max_lsn = std::cmp::max(max_lsn, next_record_lsn);
+            len += 1;
+        }
+
+        if cfg!(any(debug_assertions, test)) {
+            let batch = Self {
+                raw: buf,
+                metadata,
+                max_lsn,
+                len,
+            };
+
+            batch.validate_lsn_order();
+
+            return Ok(batch);
+        }
+
+        Ok(Self {
+            raw: buf,
+            metadata,
+            max_lsn,
+            len,
+        })
+    }
+
+    /// Look into the decoded PG WAL record and determine
+    /// roughly how large the buffer for serialized values needs to be.
+    fn estimate_buffer_size(
+        decoded: &DecodedWALRecord,
+        shard: &ShardIdentity,
+        pg_version: u32,
+    ) -> usize {
+        let mut estimate: usize = 0;
+
+        for blk in decoded.blocks.iter() {
+            let rel = RelTag {
+                spcnode: blk.rnode_spcnode,
+                dbnode: blk.rnode_dbnode,
+                relnode: blk.rnode_relnode,
+                forknum: blk.forknum,
+            };
+
+            let key = rel_block_to_key(rel, blk.blkno);
+            // Blocks whose key is not local to this shard add nothing to the estimate.
+            if !shard.is_key_local(&key) {
+                continue;
+            }
+
+            if Self::block_is_image(decoded, blk, pg_version) {
+                // 4 bytes for the Value::Image discriminator
+                // 8 bytes for encoding the size of the buffer
+                // BLCKSZ for the raw image
+                estimate += (4 + 8 + BLCKSZ) as usize;
+            } else {
+                // 4 bytes for the Value::WalRecord discriminator
+                // 4 bytes for the NeonWalRecord::Postgres discriminator (4 + 4 is the leading 8)
+                // 1 byte for NeonWalRecord::Postgres::will_init
+                // 8 bytes for encoding the size of the buffer
+                // length of the raw record
+                estimate += 8 + 1 + 8 + decoded.record.len();
+            }
+        }
+
+        estimate
+    }
+
+    fn block_is_image(decoded: &DecodedWALRecord, blk: &DecodedBkpBlock, pg_version: u32) -> bool {
+        blk.apply_image
+            && blk.has_image
+            && decoded.xl_rmid == pg_constants::RM_XLOG_ID
+            && (decoded.xl_info == pg_constants::XLOG_FPI
+            || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT)
+            // compression of WAL is not yet supported: fall back to storing the original WAL record
+            && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, pg_version)
+            // do not materialize null pages: they will most likely be replaced with real data soon
+            && blk.bimg_len != 0
+    }
+
+    /// Encode a list of values and metadata into a serialized batch
+    ///
+    /// This is used by the pageserver ingest code to conveniently generate
+    /// batches for metadata writes.
+    ///
+    /// Each tuple is `(key, lsn, serialized size of the value, value)`. The size is
+    /// trusted rather than measured: the sizes are summed up to pre-size the buffer
+    /// and each one is stored as the length of its entry in the metadata, so it must
+    /// match the number of bytes that serializing the value actually produces.
+    pub fn from_values(batch: Vec<(CompactKey, Lsn, usize, Value)>) -> Self {
+        // Pre-allocate a big flat buffer to write into. This should be large but not huge: it is soft-limited in practice by
+        // [`crate::pgdatadir_mapping::DatadirModification::MAX_PENDING_BYTES`]
+        let buffer_size = batch.iter().map(|i| i.2).sum::<usize>();
+        let mut buf = Vec::<u8>::with_capacity(buffer_size);
+
+        let mut metadata: Vec<ValueMeta> = Vec::with_capacity(batch.len());
+        let mut max_lsn: Lsn = Lsn(0);
+        let len = batch.len();
+        for (key, lsn, val_ser_size, val) in batch {
+            let relative_off = buf.len() as u64;
+
+            val.ser_into(&mut buf)
+                .expect("Writing into in-memory buffer is infallible");
+
+            metadata.push(ValueMeta::Serialized(SerializedValueMeta {
+                key,
+                lsn,
+                batch_offset: relative_off,
+                len: val_ser_size,
+                will_init: val.will_init(),
+            }));
+            max_lsn = std::cmp::max(max_lsn, lsn);
+        }
+
+        // Assert that we didn't do any extra allocations while building buffer.
+        debug_assert!(buf.len() <= buffer_size);
+
+        // Build the batch exactly once; debug and test builds additionally verify the
+        // per-key LSN ordering of the metadata before handing it out.
+        let batch = Self {
+            raw: buf,
+            metadata,
+            max_lsn,
+            len,
+        };
+
+        if cfg!(any(debug_assertions, test)) {
+            batch.validate_lsn_order();
+        }
+
+        batch
+    }
+
+    /// Add one value to the batch
+    ///
+    /// This is used by the pageserver ingest code to include metadata block
+    /// updates for a single key.
+    pub fn put(&mut self, key: CompactKey, value: Value, lsn: Lsn) {
+        let relative_off = self.raw.len() as u64;
+        value
+            .ser_into(&mut self.raw)
+            .expect("Writing into in-memory buffer is infallible");
+        let val_ser_size = self.raw.len() - relative_off as usize;
+
+        self.metadata
+            .push(ValueMeta::Serialized(SerializedValueMeta {
+                key,
+                lsn,
+                batch_offset: relative_off,
+                len: val_ser_size,
+                will_init: value.will_init(),
+            }));
+        self.max_lsn = std::cmp::max(self.max_lsn, lsn);
+        self.len += 1;
+        if cfg!(any(debug_assertions, test)) {
+            self.validate_lsn_order();
+        }
+    }
+
+    /// Append the contents of another batch to this one
+    ///
+    /// One batch is generated for each decoded PG WAL record.
+    /// They are then merged to accumulate reasonably sized writes.
+    pub fn extend(&mut self, mut other: SerializedValueBatch) {
+        // Everything from `other` lands after the bytes that are already buffered.
+        let base_offset = self.raw.len() as u64;
+        self.raw.extend(other.raw);
+
+        // Rebase the offsets of the incoming serialized entries onto the merged buffer.
+        // Observed entries are left as they are.
+        for meta in other.metadata.iter_mut() {
+            if let ValueMeta::Serialized(ser) = meta {
+                ser.batch_offset += base_offset;
+                if cfg!(debug_assertions) {
+                    let value_end = ser.batch_offset + ser.len as u64;
+                    assert!((value_end as usize) <= self.raw.len());
+                }
+            }
+        }
+
+        self.metadata.extend(other.metadata);
+
+        self.len += other.len;
+        self.max_lsn = std::cmp::max(self.max_lsn, other.max_lsn);
+
+        if cfg!(any(debug_assertions, test)) {
+            self.validate_lsn_order();
+        }
+    }
+
+    /// Add zero images for the (key, LSN) tuples specified
+    ///
+    /// PG versions below 16 do not zero out pages before extending
+    /// a relation and may leave gaps. Such gaps need to be identified
+    /// by the pageserver ingest logic and get patched up here.
+    ///
+    /// Note that this function does not validate that the gaps have been
+    /// identified correctly (it does not know relation sizes), so it's up
+    /// to the call-site to do it properly.
+    pub fn zero_gaps(&mut self, gaps: Vec<(KeySpace, Lsn)>) {
+        // Implementation note:
+        //
+        // Values within [`SerializedValueBatch::raw`] do not have any ordering requirements,
+        // but the metadata entries should be ordered properly (see
+        // [`SerializedValueBatch::metadata`]).
+        //
+        // Exploiting this observation we do:
+        // 1. Drain all the metadata entries into an ordered set.
+        // The use of a BTreeSet keyed by (Key, Lsn) relies on the observation that Postgres never
+        // includes more than one update to the same block in the same WAL record.
+        // 2. For each (key, LSN) gap tuple, append a zero image to the raw buffer
+        // and add an index entry to the ordered metadata set.
+        // 3. Drain the ordered set back into a metadata vector
+
+        let mut ordered_metas = self
+            .metadata
+            .drain(..)
+            .map(OrderedValueMeta)
+            .collect::<BTreeSet<_>>();
+        for (keyspace, lsn) in gaps {
+            self.max_lsn = std::cmp::max(self.max_lsn, lsn);
+            // Walk each gap range key by key; comparing with `<` turns an inverted range into a no-op.
+            for gap_range in keyspace.ranges {
+                let mut key = gap_range.start;
+                while key < gap_range.end {
+                    let relative_off = self.raw.len() as u64;
+
+                    // TODO(vlad): Can we be cheeky and write only one zero image, and
+                    // make all index entries requiring a zero page point to it?
+                    // Alternatively, we can change the index entry format to represent zero pages
+                    // without writing them at all.
+                    Value::Image(ZERO_PAGE.clone())
+                        .ser_into(&mut self.raw)
+                        .expect("Writing into in-memory buffer is infallible");
+                    let val_ser_size = self.raw.len() - relative_off as usize;
+
+                    ordered_metas.insert(OrderedValueMeta(ValueMeta::Serialized(
+                        SerializedValueMeta {
+                            key: key.to_compact(),
+                            lsn,
+                            batch_offset: relative_off,
+                            len: val_ser_size,
+                            will_init: true,
+                        },
+                    )));
+
+                    self.len += 1;
+
+                    key = key.next();
+                }
+            }
+        }
+
+        self.metadata = ordered_metas.into_iter().map(|ord| ord.0).collect();
+
+        if cfg!(any(debug_assertions, test)) {
+            self.validate_lsn_order();
+        }
+    }
+
+    /// Checks if the batch is empty
+    ///
+    /// A batch is empty when it contains no serialized values.
+    /// Note that it may still contain observed values.
+    pub fn is_empty(&self) -> bool {
+        let empty = self.raw.is_empty();
+        // Debug builds only: an empty buffer implies that all metadata entries are observed ones.
+        if cfg!(debug_assertions) && empty {
+            assert!(self
+                .metadata
+                .iter()
+                .all(|meta| matches!(meta, ValueMeta::Observed(_))));
+        }
+
+        empty
+    }
+
+    /// Returns the number of values serialized in the batch
+    pub fn len(&self) -> usize {
+        self.len
+    }
+
+    /// Returns the size of the buffer wrapped by the batch
+    pub fn buffer_size(&self) -> usize {
+        self.raw.len()
+    }
+
+    pub fn updates_key(&self, key: &Key) -> bool {
+        let compact_key = key.to_compact(); // convert once instead of once per metadata entry
+        self.metadata
+            .iter()
+            .any(|meta| matches!(meta, ValueMeta::Serialized(ser) if ser.key == compact_key))
+    }
+
+    pub fn validate_lsn_order(&self) {
+        use std::collections::HashMap;
+        // Panics unless, for every key, the metadata entries appear in non-decreasing LSN order.
+        let mut last_seen_lsn_per_key: HashMap<CompactKey, Lsn> = HashMap::default();
+
+        for meta in self.metadata.iter() {
+            let lsn = meta.lsn();
+            let key = meta.key();
+            // `insert` returns the LSN previously recorded for this key, if any.
+            if let Some(prev_lsn) = last_seen_lsn_per_key.insert(key, lsn) {
+                assert!(
+                    lsn >= prev_lsn,
+                    "Ordering violated by {}: {} < {}",
+                    Key::from_compact(key),
+                    lsn,
+                    prev_lsn
+                );
+            }
+        }
+    }
+}
+
+#[cfg(all(test, feature = "testing"))]
+mod tests {
+    use super::*;
+
+    fn validate_batch(
+        batch: &SerializedValueBatch,
+        values: &[(CompactKey, Lsn, usize, Value)],
+        gaps: Option<&Vec<(KeySpace, Lsn)>>,
+    ) {
+        // Invariant 1: The metadata for a given entry in the batch
+        // is correct and can be used to deserialize back to the original value.
+        for (key, lsn, size, value) in values.iter() {
+            let meta = batch
+                .metadata
+                .iter()
+                .find(|meta| (meta.key(), meta.lsn()) == (*key, *lsn))
+                .unwrap();
+            let meta = match meta {
+                ValueMeta::Serialized(ser) => ser,
+                ValueMeta::Observed(_) => unreachable!(),
+            };
+
+            assert_eq!(meta.len, *size);
+            assert_eq!(meta.will_init, value.will_init());
+
+            let start = meta.batch_offset as usize;
+            let end = meta.batch_offset as usize + meta.len;
+            let value_from_batch = Value::des(&batch.raw[start..end]).unwrap();
+            assert_eq!(&value_from_batch, value);
+        }
+
+        let mut expected_buffer_size: usize = values.iter().map(|(_, _, size, _)| size).sum();
+        let mut gap_pages_count: usize = 0;
+
+        // Invariant 2: Zero pages were added for identified gaps and their metadata
+        // is correct.
+        if let Some(gaps) = gaps {
+            for (gap_keyspace, lsn) in gaps {
+                for gap_range in &gap_keyspace.ranges {
+                    let mut gap_key = gap_range.start;
+                    while gap_key != gap_range.end {
+                        let meta = batch
+                            .metadata
+                            .iter()
+                            .find(|meta| (meta.key(), meta.lsn()) == (gap_key.to_compact(), *lsn))
+                            .unwrap();
+                        let meta = match meta {
+                            ValueMeta::Serialized(ser) => ser,
+                            ValueMeta::Observed(_) => unreachable!(),
+                        };
+
+                        let zero_value = Value::Image(ZERO_PAGE.clone());
+                        let zero_value_size = zero_value.serialized_size().unwrap() as usize;
+
+                        assert_eq!(meta.len, zero_value_size);
+                        assert_eq!(meta.will_init, zero_value.will_init());
+
+                        let start = meta.batch_offset as usize;
+                        let end = meta.batch_offset as usize + meta.len;
+                        let value_from_batch = Value::des(&batch.raw[start..end]).unwrap();
+                        assert_eq!(value_from_batch, zero_value);
+
+                        gap_pages_count += 1;
+                        expected_buffer_size += zero_value_size;
+                        gap_key = gap_key.next();
+                    }
+                }
+            }
+        }
+
+        // Invariant 3: The length of the batch is equal to the number
+        // of values inserted, plus the number of gap pages. This extends
+        // to the raw buffer size.
+        assert_eq!(batch.len(), values.len() + gap_pages_count);
+        assert_eq!(expected_buffer_size, batch.buffer_size());
+
+        // Invariant 4: Metadata entries for any given key are sorted in LSN order.
+        batch.validate_lsn_order();
+    }
+
+    #[test]
+    fn test_creation_from_values() {
+        const LSN: Lsn = Lsn(0x10);
+        let key = Key::from_hex("110000000033333333444444445500000001").unwrap();
+
+        let values = vec![
+            (
+                key.to_compact(),
+                LSN,
+                Value::WalRecord(NeonWalRecord::wal_append("foo")),
+            ),
+            (
+                key.next().to_compact(),
+                LSN,
+                Value::WalRecord(NeonWalRecord::wal_append("bar")),
+            ),
+            (
+                key.to_compact(),
+                Lsn(LSN.0 + 0x10),
+                Value::WalRecord(NeonWalRecord::wal_append("baz")),
+            ),
+            (
+                key.next().next().to_compact(),
+                LSN,
+                Value::WalRecord(NeonWalRecord::wal_append("taz")),
+            ),
+        ];
+
+        let values = values
+            .into_iter()
+            .map(|(key, lsn, value)| (key, lsn, value.serialized_size().unwrap() as usize, value))
+            .collect::<Vec<_>>();
+        let batch = SerializedValueBatch::from_values(values.clone());
+
+        validate_batch(&batch, &values, None);
+
+        assert!(!batch.is_empty());
+    }
+
+    #[test]
+    fn test_put() {
+        const LSN: Lsn = Lsn(0x10);
+        let key = Key::from_hex("110000000033333333444444445500000001").unwrap();
+
+        let values = vec![
+            (
+                key.to_compact(),
+                LSN,
+                Value::WalRecord(NeonWalRecord::wal_append("foo")),
+            ),
+            (
+                key.next().to_compact(),
+                LSN,
+                Value::WalRecord(NeonWalRecord::wal_append("bar")),
+            ),
+        ];
+
+        let mut values = values
+            .into_iter()
+            .map(|(key, lsn, value)| (key, lsn, value.serialized_size().unwrap() as usize, value))
+            .collect::<Vec<_>>();
+        let mut batch = SerializedValueBatch::from_values(values.clone());
+
+        validate_batch(&batch, &values, None);
+
+        let value = (
+            key.to_compact(),
+            Lsn(LSN.0 + 0x10),
+            Value::WalRecord(NeonWalRecord::wal_append("baz")),
+        );
+        let serialized_size = value.2.serialized_size().unwrap() as usize;
+        let value = (value.0, value.1, serialized_size, value.2);
+        values.push(value.clone());
+        batch.put(value.0, value.3, value.1);
+
+        validate_batch(&batch, &values, None);
+
+        let value = (
+            key.next().next().to_compact(),
+            LSN,
+            Value::WalRecord(NeonWalRecord::wal_append("taz")),
+        );
+        let serialized_size = value.2.serialized_size().unwrap() as usize;
+        let value = (value.0, value.1, serialized_size, value.2);
+        values.push(value.clone());
+        batch.put(value.0, value.3, value.1);
+
+        validate_batch(&batch, &values, None);
+    }
+
+    #[test]
+    fn test_extension() {
+        const LSN: Lsn = Lsn(0x10);
+        let key = Key::from_hex("110000000033333333444444445500000001").unwrap();
+
+        let values = vec![
+            (
+                key.to_compact(),
+                LSN,
+                Value::WalRecord(NeonWalRecord::wal_append("foo")),
+            ),
+            (
+                key.next().to_compact(),
+                LSN,
+                Value::WalRecord(NeonWalRecord::wal_append("bar")),
+            ),
+            (
+                key.next().next().to_compact(),
+                LSN,
+                Value::WalRecord(NeonWalRecord::wal_append("taz")),
+            ),
+        ];
+
+        let mut values = values
+            .into_iter()
+            .map(|(key, lsn, value)| (key, lsn, value.serialized_size().unwrap() as usize, value))
+            .collect::<Vec<_>>();
+        let mut batch = SerializedValueBatch::from_values(values.clone());
+
+        let other_values = vec![
+            (
+                key.to_compact(),
+                Lsn(LSN.0 + 0x10),
+                Value::WalRecord(NeonWalRecord::wal_append("foo")),
+            ),
+            (
+                key.next().to_compact(),
+                Lsn(LSN.0 + 0x10),
+                Value::WalRecord(NeonWalRecord::wal_append("bar")),
+            ),
+            (
+                key.next().next().to_compact(),
+                Lsn(LSN.0 + 0x10),
+                Value::WalRecord(NeonWalRecord::wal_append("taz")),
+            ),
+        ];
+
+        let other_values = other_values
+            .into_iter()
+            .map(|(key, lsn, value)| (key, lsn, value.serialized_size().unwrap() as usize, value))
+            .collect::<Vec<_>>();
+        let other_batch = SerializedValueBatch::from_values(other_values.clone());
+
+        values.extend(other_values);
+        batch.extend(other_batch);
+
+        validate_batch(&batch, &values, None);
+    }
+
+    #[test]
+    fn test_gap_zeroing() {
+        const LSN: Lsn = Lsn(0x10);
+        let rel_foo_base_key = Key::from_hex("110000000033333333444444445500000001").unwrap();
+
+        let rel_bar_base_key = {
+            let mut key = rel_foo_base_key;
+            key.field4 += 1;
+            key
+        };
+
+        let values = vec![
+            (
+                rel_foo_base_key.to_compact(),
+                LSN,
+                Value::WalRecord(NeonWalRecord::wal_append("foo1")),
+            ),
+            (
+                rel_foo_base_key.add(1).to_compact(),
+                LSN,
+                Value::WalRecord(NeonWalRecord::wal_append("foo2")),
+            ),
+            (
+                rel_foo_base_key.add(5).to_compact(),
+                LSN,
+                Value::WalRecord(NeonWalRecord::wal_append("foo3")),
+            ),
+            (
+                rel_foo_base_key.add(1).to_compact(),
+                Lsn(LSN.0 + 0x10),
+                Value::WalRecord(NeonWalRecord::wal_append("foo4")),
+            ),
+            (
+                rel_foo_base_key.add(10).to_compact(),
+                Lsn(LSN.0 + 0x10),
+                Value::WalRecord(NeonWalRecord::wal_append("foo5")),
+            ),
+            (
+                rel_foo_base_key.add(11).to_compact(),
+                Lsn(LSN.0 + 0x10),
+                Value::WalRecord(NeonWalRecord::wal_append("foo6")),
+            ),
+            (
+                rel_foo_base_key.add(12).to_compact(),
+                Lsn(LSN.0 + 0x10),
+                Value::WalRecord(NeonWalRecord::wal_append("foo7")),
+            ),
+            (
+                rel_bar_base_key.to_compact(),
+                LSN,
+                Value::WalRecord(NeonWalRecord::wal_append("bar1")),
+            ),
+            (
+                rel_bar_base_key.add(4).to_compact(),
+                LSN,
+                Value::WalRecord(NeonWalRecord::wal_append("bar2")),
+            ),
+        ];
+
+        let values = values
+            .into_iter()
+            .map(|(key, lsn, value)| (key, lsn, value.serialized_size().unwrap() as usize, value))
+            .collect::<Vec<_>>();
+
+        let mut batch = SerializedValueBatch::from_values(values.clone());
+
+        let gaps = vec![
+            (
+                KeySpace {
+                    ranges: vec![
+                        rel_foo_base_key.add(2)..rel_foo_base_key.add(5),
+                        rel_bar_base_key.add(1)..rel_bar_base_key.add(4),
+                    ],
+                },
+                LSN,
+            ),
+            (
+                KeySpace {
+                    ranges: vec![rel_foo_base_key.add(6)..rel_foo_base_key.add(10)],
+                },
+                Lsn(LSN.0 + 0x10),
+            ),
+        ];
+
+        batch.zero_gaps(gaps.clone());
+        validate_batch(&batch, &values, Some(&gaps));
+    }
+}
--- a/pageserver/benches/bench_ingest.rs
+++ b/pageserver/benches/bench_ingest.rs
@@ -9,7 +9,6 @@ use pageserver::{
    l0_flush::{L0FlushConfig, L0FlushGlobalState},
    page_cache,
    task_mgr::TaskKind,
-    tenant::storage_layer::inmemory_layer::SerializedBatch,
    tenant::storage_layer::InMemoryLayer,
    virtual_file,
 };
@@ -18,6 +17,7 @@ use utils::{
    bin_ser::BeSer,
    id::{TenantId, TimelineId},
 };
+use wal_decoder::serialized_batch::SerializedValueBatch;

 // A very cheap hash for generating non-sequential keys.
 fn murmurhash32(mut h: u32) -> u32 {
@@ -102,13 +102,13 @@ async fn ingest(
        batch.push((key.to_compact(), lsn, data_ser_size, data.clone()));
        if batch.len() >= BATCH_SIZE {
            let this_batch = std::mem::take(&mut batch);
-            let serialized = SerializedBatch::from_values(this_batch).unwrap();
+            let serialized = SerializedValueBatch::from_values(this_batch);
            layer.put_batch(serialized, &ctx).await?;
        }
    }
    if !batch.is_empty() {
        let this_batch = std::mem::take(&mut batch);
-        let serialized = SerializedBatch::from_values(this_batch).unwrap();
+        let serialized = SerializedValueBatch::from_values(this_batch);
        layer.put_batch(serialized, &ctx).await?;
    }
    layer.freeze(lsn + 1).await;
@@ -167,6 +167,7 @@ fn criterion_benchmark(c: &mut Criterion) {
        16384,
        virtual_file::io_engine_for_bench(),
        conf.virtual_file_io_mode,
+        virtual_file::SyncMode::Sync,
    );
    page_cache::init(conf.page_cache_size);

--- a/pageserver/compaction/src/helpers.rs
+++ b/pageserver/compaction/src/helpers.rs
@@ -35,6 +35,15 @@ pub fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
    !(a.end <= b.start || b.end <= a.start)
 }

+/// Returns whether range `a` fully contains range `b`, as in the example below:
+/// ```plain
+/// |      a       |
+///       |  b  |
+/// ```
+pub fn fully_contains<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
+    a.start <= b.start && a.end >= b.end
+}
+
 pub fn union_to_keyspace<K: Ord>(a: &mut CompactionKeySpace<K>, b: CompactionKeySpace<K>) {
    let x = std::mem::take(a);
    let mut all_ranges_iter = [x.into_iter(), b.into_iter()]
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -138,6 +138,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
        10,
        virtual_file::api::IoEngineKind::StdFs,
        IoMode::preferred(),
+        virtual_file::SyncMode::Sync,
    );
    pageserver::page_cache::init(100);

--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -51,6 +51,7 @@ async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result
        10,
        virtual_file::api::IoEngineKind::StdFs,
        IoMode::preferred(),
+        virtual_file::SyncMode::Sync,
    );
    page_cache::init(100);
    let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
@@ -65,6 +66,7 @@ async fn read_image_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result
        10,
        virtual_file::api::IoEngineKind::StdFs,
        IoMode::preferred(),
+        virtual_file::SyncMode::Sync,
    );
    page_cache::init(100);
    let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
@@ -171,6 +173,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
                10,
                virtual_file::api::IoEngineKind::StdFs,
                IoMode::preferred(),
+                virtual_file::SyncMode::Sync,
            );
            pageserver::page_cache::init(100);

--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -209,6 +209,7 @@ async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> {
        10,
        virtual_file::api::IoEngineKind::StdFs,
        IoMode::preferred(),
+        virtual_file::SyncMode::Sync,
    );
    page_cache::init(100);
    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
--- a/pageserver/src/auth.rs
+++ b/pageserver/src/auth.rs
@@ -19,7 +19,8 @@ pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<
            | Scope::SafekeeperData
            | Scope::GenerationsApi
            | Scope::Infra
-            | Scope::Scrubber,
+            | Scope::Scrubber
+            | Scope::ControllerPeer,
            _,
        ) => Err(AuthError(
            format!(
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -154,24 +154,35 @@ fn main() -> anyhow::Result<()> {
            },
        };

-        let started = Instant::now();
-        syncfs(dirfd)?;
-        let elapsed = started.elapsed();
-        info!(
-            elapsed_ms = elapsed.as_millis(),
-            "made tenant directory contents durable"
-        );
+        if conf.no_sync {
+            info!("Skipping syncfs on startup");
+        } else {
+            let started = Instant::now();
+            syncfs(dirfd)?;
+            let elapsed = started.elapsed();
+            info!(
+                elapsed_ms = elapsed.as_millis(),
+                "made tenant directory contents durable"
+            );
+        }
    }

    // Initialize up failpoints support
    let scenario = failpoint_support::init();

    // Basic initialization of things that don't change after startup
+    tracing::info!("Initializing virtual_file...");
    virtual_file::init(
        conf.max_file_descriptors,
        conf.virtual_file_io_engine,
        conf.virtual_file_io_mode,
+        if conf.no_sync {
+            virtual_file::SyncMode::UnsafeNoSync
+        } else {
+            virtual_file::SyncMode::Sync
+        },
    );
+    tracing::info!("Initializing page_cache...");
    page_cache::init(conf.page_cache_size);

    start_pageserver(launch_ts, conf).context("Failed to start pageserver")?;
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -69,6 +69,7 @@ pub struct PageServerConf {
    pub wal_redo_timeout: Duration,

    pub superuser: String,
+    pub locale: String,

    pub page_cache_size: usize,
    pub max_file_descriptors: usize,
@@ -178,6 +179,9 @@ pub struct PageServerConf {

    /// Direct IO settings
    pub virtual_file_io_mode: virtual_file::IoMode,
+
+    /// Optionally disable disk syncs (unsafe!)
+    pub no_sync: bool,
 }

 /// Token for authentication to safekeepers
@@ -298,6 +302,7 @@ impl PageServerConf {
            wait_lsn_timeout,
            wal_redo_timeout,
            superuser,
+            locale,
            page_cache_size,
            max_file_descriptors,
            pg_distrib_dir,
@@ -332,6 +337,7 @@ impl PageServerConf {
            concurrent_tenant_size_logical_size_queries,
            virtual_file_io_engine,
            tenant_config,
+            no_sync,
        } = config_toml;

        let mut conf = PageServerConf {
@@ -344,6 +350,7 @@ impl PageServerConf {
            wait_lsn_timeout,
            wal_redo_timeout,
            superuser,
+            locale,
            page_cache_size,
            max_file_descriptors,
            http_auth_type,
@@ -409,6 +416,7 @@ impl PageServerConf {
                .map(crate::l0_flush::L0FlushConfig::from)
                .unwrap_or_default(),
            virtual_file_io_mode: virtual_file_io_mode.unwrap_or(virtual_file::IoMode::preferred()),
+            no_sync: no_sync.unwrap_or(false),
        };

        // ------------------------------------------------------------
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -37,6 +37,7 @@ use pageserver_api::models::TenantShardLocation;
 use pageserver_api::models::TenantShardSplitRequest;
 use pageserver_api::models::TenantShardSplitResponse;
 use pageserver_api::models::TenantSorting;
+use pageserver_api::models::TenantState;
 use pageserver_api::models::TimelineArchivalConfigRequest;
 use pageserver_api::models::TimelineCreateRequestMode;
 use pageserver_api::models::TimelinesInfoAndOffloaded;
@@ -295,6 +296,9 @@ impl From<GetActiveTenantError> for ApiError {
            GetActiveTenantError::Broken(reason) => {
                ApiError::InternalServerError(anyhow!("tenant is broken: {}", reason))
            }
+            GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping { .. }) => {
+                ApiError::ShuttingDown
+            }
            GetActiveTenantError::WillNotBecomeActive(_) => ApiError::Conflict(format!("{}", e)),
            GetActiveTenantError::Cancelled => ApiError::ShuttingDown,
            GetActiveTenantError::NotFound(gte) => gte.into(),
@@ -320,6 +324,7 @@ impl From<crate::tenant::DeleteTimelineError> for ApiError {
                    .into_boxed_str(),
            ),
            a @ AlreadyInProgress(_) => ApiError::Conflict(a.to_string()),
+            Cancelled => ApiError::ResourceUnavailable("shutting down".into()),
            Other(e) => ApiError::InternalServerError(e),
        }
    }
@@ -1998,9 +2003,9 @@ async fn timeline_offload_handler(
                "timeline has attached children".into(),
            ));
        }
-        if !timeline.can_offload() {
+        if let (false, reason) = timeline.can_offload() {
            return Err(ApiError::PreconditionFailed(
-                "Timeline::can_offload() returned false".into(),
+                format!("Timeline::can_offload() check failed: {}", reason) .into(),
            ));
        }
        offload_timeline(&tenant, &timeline)
@@ -2165,6 +2170,21 @@ async fn timeline_detach_ancestor_handler(
        let ctx = RequestContext::new(TaskKind::DetachAncestor, DownloadBehavior::Download);
        let ctx = &ctx;

+        // Flush the upload queues of all timelines before detaching ancestor. We do the same thing again
+        // during shutdown. Flushing early ensures the pageserver does not have to upload too many things,
+        // and thereby cause downtime, during timeline reloads.
+        for timeline in tenant.list_timelines() {
+            timeline
+                .remote_client
+                .wait_completion()
+                .await
+                .map_err(|e| {
+                    ApiError::PreconditionFailed(format!("cannot drain upload queue: {e}").into())
+                })?;
+        }
+
+        tracing::info!("all timeline upload queues are drained");
+
        let timeline = tenant.get_timeline(timeline_id, true)?;

        let progress = timeline
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -1,10 +1,11 @@
 //! The Page Service listens for client connections and serves their GetPage@LSN
 //! requests.

-use anyhow::Context;
+use anyhow::{bail, Context};
 use async_compression::tokio::write::GzipEncoder;
 use bytes::Buf;
 use futures::FutureExt;
+use itertools::Itertools;
 use once_cell::sync::OnceCell;
 use pageserver_api::models::TenantState;
 use pageserver_api::models::{
@@ -1221,6 +1222,222 @@ impl PageServerHandler {
    }
 }

+/// Parsed form of `basebackup <tenant> <timeline> [<lsn>|latest] [--gzip] [--replica]`.
+#[derive(Debug, Clone, Eq, PartialEq)]
+struct BaseBackupCmd {
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+    lsn: Option<Lsn>, // `None` when the lsn is omitted or given as `latest`
+    gzip: bool,
+    replica: bool,
+}
+
+/// Parsed form of `fullbackup <tenant> <timeline> [<lsn> [<prev_lsn>]]`.
+#[derive(Debug, Clone, Eq, PartialEq)]
+struct FullBackupCmd {
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+    lsn: Option<Lsn>,
+    prev_lsn: Option<Lsn>, // 4th argument, so it can only be given together with `lsn`
+}
+
+/// Parsed form of `pagestream_v2 <tenant> <timeline>`.
+#[derive(Debug, Clone, Eq, PartialEq)]
+struct PageStreamCmd {
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+}
+
+/// Parsed form of `lease lsn <tenant_shard> <timeline> <lsn>`.
+#[derive(Debug, Clone, Eq, PartialEq)]
+struct LeaseLsnCmd {
+    tenant_shard_id: TenantShardId, // a shard id, unlike the plain tenant id of the other commands
+    timeline_id: TimelineId,
+    lsn: Lsn,
+}
+
+#[derive(Debug, Clone, Eq, PartialEq)]
+enum PageServiceCmd {
+    Set, // any `set ...` statement; its arguments are not parsed
+    PageStream(PageStreamCmd), // `pagestream_v2`
+    BaseBackup(BaseBackupCmd), // `basebackup`
+    FullBackup(FullBackupCmd), // `fullbackup`
+    LeaseLsn(LeaseLsnCmd), // `lease lsn`
+}
+
+impl PageStreamCmd {
+    /// Parses the arguments of `pagestream_v2`, which must be exactly `<tenant> <timeline>`.
+    fn parse(query: &str) -> anyhow::Result<Self> {
+        let &[tenant, timeline] = query.split_whitespace().collect_vec().as_slice() else {
+            bail!(
+                "invalid number of parameters for pagestream command: {}",
+                query
+            );
+        };
+        let tenant_id = TenantId::from_str(tenant)
+            .with_context(|| format!("Failed to parse tenant id from {}", tenant))?;
+        let timeline_id = TimelineId::from_str(timeline)
+            .with_context(|| format!("Failed to parse timeline id from {}", timeline))?;
+        Ok(Self {
+            tenant_id,
+            timeline_id,
+        })
+    }
+}
+
+impl FullBackupCmd {
+    /// Parses the arguments of a `fullbackup` command.
+    ///
+    /// Expected form: `<tenant> <timeline> [<lsn> [<prev_lsn>]]`.
+    ///
+    /// # Errors
+    ///
+    /// Fails if there are fewer than 2 or more than 4 arguments, or if an argument
+    /// cannot be parsed as the id / lsn expected at its position.
+    fn parse(query: &str) -> anyhow::Result<Self> {
+        let parameters = query.split_whitespace().collect_vec();
+        if parameters.len() < 2 || parameters.len() > 4 {
+            bail!(
+                "invalid number of parameters for fullbackup command: {}",
+                query
+            );
+        }
+        let tenant_id = TenantId::from_str(parameters[0])
+            .with_context(|| format!("Failed to parse tenant id from {}", parameters[0]))?;
+        let timeline_id = TimelineId::from_str(parameters[1])
+            .with_context(|| format!("Failed to parse timeline id from {}", parameters[1]))?;
+        // The caller is responsible for providing correct lsn and prev_lsn.
+        // Only their syntax is checked here.
+        let parse_lsn = |lsn_str: &&str| {
+            Lsn::from_str(lsn_str)
+                .with_context(|| format!("Failed to parse Lsn from {lsn_str}"))
+        };
+        // Both lsn arguments are optional; a missing one stays `None`.
+        let lsn = parameters.get(2).map(parse_lsn).transpose()?;
+        let prev_lsn = parameters.get(3).map(parse_lsn).transpose()?;
+        Ok(Self {
+            tenant_id,
+            timeline_id,
+            lsn,
+            prev_lsn,
+        })
+    }
+}
+
+impl BaseBackupCmd {
+    /// Parses the arguments of a `basebackup` command.
+    ///
+    /// Expected form: `<tenant> <timeline> [<lsn>|latest] [--gzip] [--replica]`.
+    /// The lsn, if present, must be the third argument; everything after it has to be
+    /// a known flag, and no flag may be repeated.
+    ///
+    /// # Errors
+    ///
+    /// Fails on fewer than two arguments, unparsable ids or lsn, and unknown or duplicate flags.
+    fn parse(query: &str) -> anyhow::Result<Self> {
+        let parameters = query.split_whitespace().collect_vec();
+        // Only the two ids are mandatory.
+        if parameters.len() < 2 {
+            bail!(
+                "invalid number of parameters for basebackup command: {}",
+                query
+            );
+        }
+        let tenant_id = TenantId::from_str(parameters[0])
+            .with_context(|| format!("Failed to parse tenant id from {}", parameters[0]))?;
+        let timeline_id = TimelineId::from_str(parameters[1])
+            .with_context(|| format!("Failed to parse timeline id from {}", parameters[1]))?;
+
+        // Decide what the third argument is, and hence where the flags start.
+        let (lsn, flags_parse_from) = match parameters.get(2) {
+            // Nothing after the ids: no lsn and no flags.
+            None => (None, 2),
+            // Already a flag: there is no lsn, flags start right here.
+            Some(flag) if flag.starts_with("--") => (None, 2),
+            // The literal `latest` also yields no lsn, but occupies the third slot.
+            Some(&"latest") => (None, 3),
+            // Anything else has to be a valid lsn.
+            Some(maybe_lsn) => {
+                let parsed = Lsn::from_str(maybe_lsn)
+                    .with_context(|| format!("Failed to parse lsn from {maybe_lsn}"))?;
+                (Some(parsed), 3)
+            }
+        };
+
+        let mut gzip = false;
+        let mut replica = false;
+        for &param in &parameters[flags_parse_from..] {
+            // Select the switch this flag controls; unknown arguments are rejected.
+            let switch = match param {
+                "--gzip" => &mut gzip,
+                "--replica" => &mut replica,
+                _ => bail!("invalid parameter for basebackup command: {param}"),
+            };
+            if *switch {
+                bail!("duplicate parameter for basebackup command: {param}")
+            }
+            *switch = true;
+        }
+        Ok(Self {
+            tenant_id,
+            timeline_id,
+            lsn,
+            gzip,
+            replica,
+        })
+    }
+}
+
+impl LeaseLsnCmd {
+    /// Parses what follows `lease lsn`, which must be exactly `<tenant_shard> <timeline> <lsn>`.
+    fn parse(query: &str) -> anyhow::Result<Self> {
+        let &[shard, timeline, lsn_str] = query.split_whitespace().collect_vec().as_slice() else {
+            bail!(
+                "invalid number of parameters for lease lsn command: {}",
+                query
+            );
+        };
+        let tenant_shard_id = TenantShardId::from_str(shard)
+            .with_context(|| format!("Failed to parse tenant id from {}", shard))?;
+        let timeline_id = TimelineId::from_str(timeline)
+            .with_context(|| format!("Failed to parse timeline id from {}", timeline))?;
+        let lsn = Lsn::from_str(lsn_str)
+            .with_context(|| format!("Failed to parse lsn from {}", lsn_str))?;
+        Ok(Self {
+            tenant_shard_id,
+            timeline_id,
+            lsn,
+        })
+    }
+}
+
+impl PageServiceCmd {
+    /// Parses a raw query string into a [`PageServiceCmd`].
+    ///
+    /// The command word (and the `lsn` subcommand of `lease`) is case-insensitive and may be
+    /// followed by any run of whitespace, matching the `split_whitespace` argument parsers.
+    fn parse(query: &str) -> anyhow::Result<Self> {
+        let query = query.trim();
+        let Some((cmd, other)) = query.split_once(char::is_whitespace) else {
+            bail!("cannot parse query: {query}")
+        };
+        let other = other.trim_start();
+        match cmd.to_ascii_lowercase().as_str() {
+            "pagestream_v2" => Ok(Self::PageStream(PageStreamCmd::parse(other)?)),
+            "basebackup" => Ok(Self::BaseBackup(BaseBackupCmd::parse(other)?)),
+            "fullbackup" => Ok(Self::FullBackup(FullBackupCmd::parse(other)?)),
+            "lease" => match other.split_once(char::is_whitespace) {
+                Some((sub, rest)) if sub.eq_ignore_ascii_case("lsn") => {
+                    Ok(Self::LeaseLsn(LeaseLsnCmd::parse(rest)?))
+                }
+                _ => bail!("invalid lease command: {query}"),
+            },
+            "set" => Ok(Self::Set),
+            _ => Err(anyhow::anyhow!("unsupported command {cmd} in {query}")),
+        }
+    }
+}
+
 impl<IO> postgres_backend::Handler<IO> for PageServerHandler
 where
    IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
@@ -1277,206 +1494,137 @@ where
        fail::fail_point!("ps::connection-start::process-query");

        let ctx = self.connection_ctx.attached_child();
-        debug!("process query {query_string:?}");
-        let parts = query_string.split_whitespace().collect::<Vec<_>>();
-        if let Some(params) = parts.strip_prefix(&["pagestream_v2"]) {
-            if params.len() != 2 {
-                return Err(QueryError::Other(anyhow::anyhow!(
-                    "invalid param number for pagestream command"
-                )));
-            }
-            let tenant_id = TenantId::from_str(params[0])
-                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
-            let timeline_id = TimelineId::from_str(params[1])
-                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
-
-            tracing::Span::current()
-                .record("tenant_id", field::display(tenant_id))
-                .record("timeline_id", field::display(timeline_id));
-
-            self.check_permission(Some(tenant_id))?;
-
-            COMPUTE_COMMANDS_COUNTERS
-                .for_command(ComputeCommandKind::PageStreamV2)
-                .inc();
-
-            self.handle_pagerequests(
-                pgb,
+        debug!("process query {query_string}");
+        let query = PageServiceCmd::parse(query_string)?;
+        match query {
+            PageServiceCmd::PageStream(PageStreamCmd {
                tenant_id,
                timeline_id,
-                PagestreamProtocolVersion::V2,
-                ctx,
-            )
-            .await?;
-        } else if let Some(params) = parts.strip_prefix(&["basebackup"]) {
-            if params.len() < 2 {
-                return Err(QueryError::Other(anyhow::anyhow!(
-                    "invalid param number for basebackup command"
-                )));
+            }) => {
+                tracing::Span::current()
+                    .record("tenant_id", field::display(tenant_id))
+                    .record("timeline_id", field::display(timeline_id));
+
+                self.check_permission(Some(tenant_id))?;
+
+                COMPUTE_COMMANDS_COUNTERS
+                    .for_command(ComputeCommandKind::PageStreamV2)
+                    .inc();
+
+                self.handle_pagerequests(
+                    pgb,
+                    tenant_id,
+                    timeline_id,
+                    PagestreamProtocolVersion::V2,
+                    ctx,
+                )
+                .await?;
            }
+            PageServiceCmd::BaseBackup(BaseBackupCmd {
+                tenant_id,
+                timeline_id,
+                lsn,
+                gzip,
+                replica,
+            }) => {
+                tracing::Span::current()
+                    .record("tenant_id", field::display(tenant_id))
+                    .record("timeline_id", field::display(timeline_id));

-            let tenant_id = TenantId::from_str(params[0])
-                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
-            let timeline_id = TimelineId::from_str(params[1])
-                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
+                self.check_permission(Some(tenant_id))?;

-            tracing::Span::current()
-                .record("tenant_id", field::display(tenant_id))
-                .record("timeline_id", field::display(timeline_id));
-
-            self.check_permission(Some(tenant_id))?;
-
-            COMPUTE_COMMANDS_COUNTERS
-                .for_command(ComputeCommandKind::Basebackup)
-                .inc();
-
-            let mut lsn = None;
-            let mut replica = false;
-            let mut gzip = false;
-            for param in &params[2..] {
-                if param.starts_with("--") {
-                    match *param {
-                        "--gzip" => gzip = true,
-                        "--replica" => replica = true,
-                        _ => {
-                            return Err(QueryError::Other(anyhow::anyhow!(
-                                "Unknown parameter {param}",
-                            )))
-                        }
-                    }
-                } else {
-                    lsn = Some(
-                        Lsn::from_str(param)
-                            .with_context(|| format!("Failed to parse Lsn from {param}"))?,
-                    );
+                COMPUTE_COMMANDS_COUNTERS
+                    .for_command(ComputeCommandKind::Basebackup)
+                    .inc();
+                let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx);
+                let res = async {
+                    self.handle_basebackup_request(
+                        pgb,
+                        tenant_id,
+                        timeline_id,
+                        lsn,
+                        None,
+                        false,
+                        gzip,
+                        replica,
+                        &ctx,
+                    )
+                    .await?;
+                    pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
+                    Result::<(), QueryError>::Ok(())
                }
+                .await;
+                metric_recording.observe(&res);
+                res?;
            }
+            // same as basebackup, but result includes relational data as well
+            PageServiceCmd::FullBackup(FullBackupCmd {
+                tenant_id,
+                timeline_id,
+                lsn,
+                prev_lsn,
+            }) => {
+                tracing::Span::current()
+                    .record("tenant_id", field::display(tenant_id))
+                    .record("timeline_id", field::display(timeline_id));

-            let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx);
-            let res = async {
+                self.check_permission(Some(tenant_id))?;
+
+                COMPUTE_COMMANDS_COUNTERS
+                    .for_command(ComputeCommandKind::Fullbackup)
+                    .inc();
+
+                // Check that the timeline exists
                self.handle_basebackup_request(
                    pgb,
                    tenant_id,
                    timeline_id,
                    lsn,
-                    None,
+                    prev_lsn,
+                    true,
+                    false,
                    false,
-                    gzip,
-                    replica,
                    &ctx,
                )
                .await?;
                pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
-                Result::<(), QueryError>::Ok(())
            }
-            .await;
-            metric_recording.observe(&res);
-            res?;
-        }
-        // same as basebackup, but result includes relational data as well
-        else if let Some(params) = parts.strip_prefix(&["fullbackup"]) {
-            if params.len() < 2 {
-                return Err(QueryError::Other(anyhow::anyhow!(
-                    "invalid param number for fullbackup command"
-                )));
+            PageServiceCmd::Set => {
+                // important because psycopg2 executes "SET datestyle TO 'ISO'"
+                // on connect
+                pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
            }
-
-            let tenant_id = TenantId::from_str(params[0])
-                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
-            let timeline_id = TimelineId::from_str(params[1])
-                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
-
-            tracing::Span::current()
-                .record("tenant_id", field::display(tenant_id))
-                .record("timeline_id", field::display(timeline_id));
-
-            // The caller is responsible for providing correct lsn and prev_lsn.
-            let lsn = if let Some(lsn_str) = params.get(2) {
-                Some(
-                    Lsn::from_str(lsn_str)
-                        .with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?,
-                )
-            } else {
-                None
-            };
-            let prev_lsn = if let Some(prev_lsn_str) = params.get(3) {
-                Some(
-                    Lsn::from_str(prev_lsn_str)
-                        .with_context(|| format!("Failed to parse Lsn from {prev_lsn_str}"))?,
-                )
-            } else {
-                None
-            };
-
-            self.check_permission(Some(tenant_id))?;
-
-            COMPUTE_COMMANDS_COUNTERS
-                .for_command(ComputeCommandKind::Fullbackup)
-                .inc();
-
-            // Check that the timeline exists
-            self.handle_basebackup_request(
-                pgb,
-                tenant_id,
+            PageServiceCmd::LeaseLsn(LeaseLsnCmd {
+                tenant_shard_id,
                timeline_id,
                lsn,
-                prev_lsn,
-                true,
-                false,
-                false,
-                &ctx,
-            )
-            .await?;
-            pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
-        } else if query_string.to_ascii_lowercase().starts_with("set ") {
-            // important because psycopg2 executes "SET datestyle TO 'ISO'"
-            // on connect
-            pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
-        } else if query_string.starts_with("lease lsn ") {
-            let params = &parts[2..];
-            if params.len() != 3 {
-                return Err(QueryError::Other(anyhow::anyhow!(
-                    "invalid param number {} for lease lsn command",
-                    params.len()
-                )));
+            }) => {
+                tracing::Span::current()
+                    .record("tenant_id", field::display(tenant_shard_id))
+                    .record("timeline_id", field::display(timeline_id));
+
+                self.check_permission(Some(tenant_shard_id.tenant_id))?;
+
+                COMPUTE_COMMANDS_COUNTERS
+                    .for_command(ComputeCommandKind::LeaseLsn)
+                    .inc();
+
+                match self
+                    .handle_make_lsn_lease(pgb, tenant_shard_id, timeline_id, lsn, &ctx)
+                    .await
+                {
+                    Ok(()) => {
+                        pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?
+                    }
+                    Err(e) => {
+                        error!("error obtaining lsn lease for {lsn}: {e:?}");
+                        pgb.write_message_noflush(&BeMessage::ErrorResponse(
+                            &e.to_string(),
+                            Some(e.pg_error_code()),
+                        ))?
+                    }
+                };
            }
-
-            let tenant_shard_id = TenantShardId::from_str(params[0])
-                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
-            let timeline_id = TimelineId::from_str(params[1])
-                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
-
-            tracing::Span::current()
-                .record("tenant_id", field::display(tenant_shard_id))
-                .record("timeline_id", field::display(timeline_id));
-
-            self.check_permission(Some(tenant_shard_id.tenant_id))?;
-
-            COMPUTE_COMMANDS_COUNTERS
-                .for_command(ComputeCommandKind::LeaseLsn)
-                .inc();
-
-            // The caller is responsible for providing correct lsn.
-            let lsn = Lsn::from_str(params[2])
-                .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
-
-            match self
-                .handle_make_lsn_lease(pgb, tenant_shard_id, timeline_id, lsn, &ctx)
-                .await
-            {
-                Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
-                Err(e) => {
-                    error!("error obtaining lsn lease for {lsn}: {e:?}");
-                    pgb.write_message_noflush(&BeMessage::ErrorResponse(
-                        &e.to_string(),
-                        Some(e.pg_error_code()),
-                    ))?
-                }
-            };
-        } else {
-            return Err(QueryError::Other(anyhow::anyhow!(
-                "unknown command {query_string}"
-            )));
        }

        Ok(())
@@ -1525,3 +1673,181 @@ fn set_tracing_field_shard_id(timeline: &Timeline) {
    );
    debug_assert_current_span_has_tenant_and_timeline_id();
 }
+
+#[cfg(test)]
+mod tests {
+    use utils::shard::ShardCount;
+
+    use super::*;
+
+    #[test]
+    fn pageservice_cmd_parse() {
+        let tenant_id = TenantId::generate();
+        let timeline_id = TimelineId::generate();
+        let cmd =
+            PageServiceCmd::parse(&format!("pagestream_v2 {tenant_id} {timeline_id}")).unwrap();
+        assert_eq!(
+            cmd,
+            PageServiceCmd::PageStream(PageStreamCmd {
+                tenant_id,
+                timeline_id
+            })
+        );
+        let cmd = PageServiceCmd::parse(&format!("basebackup {tenant_id} {timeline_id}")).unwrap();
+        assert_eq!(
+            cmd,
+            PageServiceCmd::BaseBackup(BaseBackupCmd {
+                tenant_id,
+                timeline_id,
+                lsn: None,
+                gzip: false,
+                replica: false
+            })
+        );
+        let cmd =
+            PageServiceCmd::parse(&format!("basebackup {tenant_id} {timeline_id} --gzip")).unwrap();
+        assert_eq!(
+            cmd,
+            PageServiceCmd::BaseBackup(BaseBackupCmd {
+                tenant_id,
+                timeline_id,
+                lsn: None, // a flag in third position means there is no lsn
+                gzip: true,
+                replica: false
+            })
+        );
+        let cmd =
+            PageServiceCmd::parse(&format!("basebackup {tenant_id} {timeline_id} latest")).unwrap();
+        assert_eq!(
+            cmd,
+            PageServiceCmd::BaseBackup(BaseBackupCmd {
+                tenant_id,
+                timeline_id,
+                lsn: None, // `latest` parses to the same value as an omitted lsn
+                gzip: false,
+                replica: false
+            })
+        );
+        let cmd = PageServiceCmd::parse(&format!("basebackup {tenant_id} {timeline_id} 0/16ABCDE"))
+            .unwrap();
+        assert_eq!(
+            cmd,
+            PageServiceCmd::BaseBackup(BaseBackupCmd {
+                tenant_id,
+                timeline_id,
+                lsn: Some(Lsn::from_str("0/16ABCDE").unwrap()),
+                gzip: false,
+                replica: false
+            })
+        );
+        let cmd = PageServiceCmd::parse(&format!(
+            "basebackup {tenant_id} {timeline_id} --replica --gzip"
+        ))
+        .unwrap();
+        assert_eq!(
+            cmd,
+            PageServiceCmd::BaseBackup(BaseBackupCmd {
+                tenant_id,
+                timeline_id,
+                lsn: None,
+                gzip: true, // both flags set; they were passed as `--replica --gzip`
+                replica: true
+            })
+        );
+        let cmd = PageServiceCmd::parse(&format!(
+            "basebackup {tenant_id} {timeline_id} 0/16ABCDE --replica --gzip"
+        ))
+        .unwrap();
+        assert_eq!(
+            cmd,
+            PageServiceCmd::BaseBackup(BaseBackupCmd {
+                tenant_id,
+                timeline_id,
+                lsn: Some(Lsn::from_str("0/16ABCDE").unwrap()),
+                gzip: true,
+                replica: true
+            })
+        );
+        let cmd = PageServiceCmd::parse(&format!("fullbackup {tenant_id} {timeline_id}")).unwrap();
+        assert_eq!(
+            cmd,
+            PageServiceCmd::FullBackup(FullBackupCmd {
+                tenant_id,
+                timeline_id,
+                lsn: None,
+                prev_lsn: None
+            })
+        );
+        let cmd = PageServiceCmd::parse(&format!(
+            "fullbackup {tenant_id} {timeline_id} 0/16ABCDE 0/16ABCDF"
+        ))
+        .unwrap();
+        assert_eq!(
+            cmd,
+            PageServiceCmd::FullBackup(FullBackupCmd {
+                tenant_id,
+                timeline_id,
+                lsn: Some(Lsn::from_str("0/16ABCDE").unwrap()),
+                prev_lsn: Some(Lsn::from_str("0/16ABCDF").unwrap()),
+            })
+        );
+        let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+        let cmd = PageServiceCmd::parse(&format!(
+            "lease lsn {tenant_shard_id} {timeline_id} 0/16ABCDE"
+        ))
+        .unwrap();
+        assert_eq!(
+            cmd,
+            PageServiceCmd::LeaseLsn(LeaseLsnCmd {
+                tenant_shard_id, // unsharded id
+                timeline_id,
+                lsn: Lsn::from_str("0/16ABCDE").unwrap(),
+            })
+        );
+        let tenant_shard_id = TenantShardId::split(&tenant_shard_id, ShardCount(8))[1];
+        let cmd = PageServiceCmd::parse(&format!(
+            "lease lsn {tenant_shard_id} {timeline_id} 0/16ABCDE"
+        ))
+        .unwrap();
+        assert_eq!(
+            cmd,
+            PageServiceCmd::LeaseLsn(LeaseLsnCmd {
+                tenant_shard_id, // element [1] of the split by ShardCount(8) above
+                timeline_id,
+                lsn: Lsn::from_str("0/16ABCDE").unwrap(),
+            })
+        );
+        let cmd = PageServiceCmd::parse("set a = b").unwrap();
+        assert_eq!(cmd, PageServiceCmd::Set); // arguments of `set` are ignored
+        let cmd = PageServiceCmd::parse("SET foo").unwrap();
+        assert_eq!(cmd, PageServiceCmd::Set); // the command word is case-insensitive
+    }
+
+    #[test]
+    fn pageservice_cmd_err_handling() {
+        let tenant_id = TenantId::generate();
+        let timeline_id = TimelineId::generate();
+        let cmd = PageServiceCmd::parse("unknown_command");
+        assert!(cmd.is_err()); // a lone word has no arguments to split off
+        let cmd = PageServiceCmd::parse("pagestream_v2");
+        assert!(cmd.is_err()); // known command, but still no arguments
+        let cmd = PageServiceCmd::parse(&format!("pagestream_v2 {tenant_id}xxx"));
+        assert!(cmd.is_err()); // wrong argument count (and a malformed tenant id)
+        let cmd = PageServiceCmd::parse(&format!("pagestream_v2 {tenant_id}xxx {timeline_id}xxx"));
+        assert!(cmd.is_err()); // right argument count, malformed ids
+        let cmd = PageServiceCmd::parse(&format!(
+            "basebackup {tenant_id} {timeline_id} --gzip --gzip"
+        ));
+        assert!(cmd.is_err()); // `--gzip` given twice
+        let cmd = PageServiceCmd::parse(&format!(
+            "basebackup {tenant_id} {timeline_id} --gzip --unknown"
+        ));
+        assert!(cmd.is_err()); // unknown flag
+        let cmd = PageServiceCmd::parse(&format!(
+            "basebackup {tenant_id} {timeline_id} --gzip 0/16ABCDE"
+        ));
+        assert!(cmd.is_err()); // an lsn is not accepted after a flag
+        let cmd = PageServiceCmd::parse(&format!("lease {tenant_id} {timeline_id} gzip 0/16ABCDE"));
+        assert!(cmd.is_err()); // `lease` must be followed by `lsn`
+    }
+}
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -24,6 +24,7 @@ use pageserver_api::key::{
 use pageserver_api::keyspace::SparseKeySpace;
 use pageserver_api::record::NeonWalRecord;
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
+use pageserver_api::shard::ShardIdentity;
 use pageserver_api::value::Value;
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::BLCKSZ;
@@ -38,12 +39,13 @@ use tracing::{debug, trace, warn};
 use utils::bin_ser::DeserializeError;
 use utils::pausable_failpoint;
 use utils::{bin_ser::BeSer, lsn::Lsn};
+use wal_decoder::serialized_batch::SerializedValueBatch;

 /// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached.
 pub const MAX_AUX_FILE_DELTAS: usize = 1024;

 /// Max number of aux-file-related delta layers. The compaction will create a new image layer once this threshold is reached.
-pub const MAX_AUX_FILE_V2_DELTAS: usize = 64;
+pub const MAX_AUX_FILE_V2_DELTAS: usize = 16;

 #[derive(Debug)]
 pub enum LsnForTimestamp {
@@ -170,12 +172,11 @@ impl Timeline {
            tline: self,
            pending_lsns: Vec::new(),
            pending_metadata_pages: HashMap::new(),
-            pending_data_pages: Vec::new(),
-            pending_zero_data_pages: Default::default(),
+            pending_data_batch: None,
            pending_deletions: Vec::new(),
            pending_nblocks: 0,
            pending_directory_entries: Vec::new(),
-            pending_bytes: 0,
+            pending_metadata_bytes: 0,
            lsn,
        }
    }
@@ -1025,21 +1026,14 @@ pub struct DatadirModification<'a> {

    /// Data writes, ready to be flushed into an ephemeral layer. See [`Self::is_data_key`] for
    /// which keys are stored here.
-    pending_data_pages: Vec<(CompactKey, Lsn, usize, Value)>,
-
-    // Sometimes during ingest, for example when extending a relation, we would like to write a zero page.  However,
-    // if we encounter a write from postgres in the same wal record, we will drop this entry.
-    //
-    // Unlike other 'pending' fields, this does not last until the next call to commit(): it is flushed
-    // at the end of each wal record, and all these writes implicitly are at lsn Self::lsn
-    pending_zero_data_pages: HashSet<CompactKey>,
+    pending_data_batch: Option<SerializedValueBatch>,

    /// For special "directory" keys that store key-value maps, track the size of the map
    /// if it was updated in this modification.
    pending_directory_entries: Vec<(DirectoryKind, usize)>,

-    /// An **approximation** of how large our EphemeralFile write will be when committed.
-    pending_bytes: usize,
+    /// An **approximation** of how many metadata bytes will be written to the EphemeralFile.
+    pending_metadata_bytes: usize,
 }

 impl<'a> DatadirModification<'a> {
@@ -1054,11 +1048,17 @@ impl<'a> DatadirModification<'a> {
    }

    pub(crate) fn approx_pending_bytes(&self) -> usize {
-        self.pending_bytes
+        // Data bytes are read from the serialized batch, if there is one;
+        // metadata bytes are tracked separately as an approximation.
+        let batch = self.pending_data_batch.as_ref();
+        batch.map_or(0, |b| b.buffer_size()) + self.pending_metadata_bytes
    }

-    pub(crate) fn has_dirty_data_pages(&self) -> bool {
-        (!self.pending_data_pages.is_empty()) || (!self.pending_zero_data_pages.is_empty())
+    pub(crate) fn has_dirty_data(&self) -> bool {
+        match &self.pending_data_batch {
+            Some(batch) => !batch.is_empty(),
+            None => false, // no batch at all, so nothing can be dirty
+        }
    }

    /// Set the current lsn
@@ -1070,9 +1070,6 @@ impl<'a> DatadirModification<'a> {
            self.lsn
        );

-        // If we are advancing LSN, then state from previous wal record should have been flushed.
-        assert!(self.pending_zero_data_pages.is_empty());
-
        if lsn > self.lsn {
            self.pending_lsns.push(self.lsn);
            self.lsn = lsn;
@@ -1147,6 +1144,107 @@ impl<'a> DatadirModification<'a> {
        Ok(())
    }

+    /// Creates the relation with size 0 if it is not already present.
+    /// Returns the current size of the relation in blocks (0 if it was just created).
+    pub(crate) async fn create_relation_if_required(
+        &mut self,
+        rel: RelTag,
+        ctx: &RequestContext,
+    ) -> Result<u32, PageReconstructError> {
+        // Get current size and put rel creation if rel doesn't exist
+        //
+        // NOTE: we check the cache first even though get_rel_exists and get_rel_size would
+        //       check the cache too. This is because eagerly checking the cache results in
+        //       less work overall and 10% better performance. It's more work on cache miss
+        //       but cache miss is rare.
+        if let Some(nblocks) = self.tline.get_cached_rel_size(&rel, self.get_lsn()) {
+            Ok(nblocks)
+        } else if !self
+            .tline
+            .get_rel_exists(rel, Version::Modified(self), ctx)
+            .await?
+        {
+            // create it with 0 size initially; the caller extends it as needed
+            self.put_rel_creation(rel, 0, ctx)
+                .await
+                .context("Relation Error")?;
+            Ok(0)
+        } else {
+            self.tline
+                .get_rel_size(rel, Version::Modified(self), ctx)
+                .await
+        }
+    }
+
+    /// Given a block number for a relation (which represents a newly written block),
+    /// the previous block count of the relation, and the shard info, find the gaps
+    /// that were created by the newly written block if any.
+    fn find_gaps(
+        rel: RelTag,
+        blkno: u32,
+        previous_nblocks: u32,
+        shard: &ShardIdentity,
+    ) -> Option<KeySpace> {
+        let mut key = rel_block_to_key(rel, blkno);
+        let mut gap_accum = None;
+
+        for gap_blkno in previous_nblocks..blkno {
+            key.field6 = gap_blkno;
+
+            if shard.get_shard_number(&key) != shard.number {
+                continue;
+            }
+
+            gap_accum
+                .get_or_insert_with(KeySpaceAccum::new)
+                .add_key(key);
+        }
+
+        gap_accum.map(|accum| accum.to_keyspace())
+    }
+
+    pub async fn ingest_batch(
+        &mut self,
+        mut batch: SerializedValueBatch,
+        // TODO(vlad): remove this argument and replace the shard check with is_key_local
+        shard: &ShardIdentity,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        let mut gaps_at_lsns = Vec::default();
+
+        for meta in batch.metadata.iter() {
+            let (rel, blkno) = Key::from_compact(meta.key()).to_rel_block()?;
+            let new_nblocks = blkno + 1;
+
+            let old_nblocks = self.create_relation_if_required(rel, ctx).await?;
+            if new_nblocks > old_nblocks {
+                self.put_rel_extend(rel, new_nblocks, ctx).await?;
+            }
+
+            if let Some(gaps) = Self::find_gaps(rel, blkno, old_nblocks, shard) {
+                gaps_at_lsns.push((gaps, meta.lsn()));
+            }
+        }
+
+        if !gaps_at_lsns.is_empty() {
+            batch.zero_gaps(gaps_at_lsns);
+        }
+
+        match self.pending_data_batch.as_mut() {
+            Some(pending_batch) => {
+                pending_batch.extend(batch);
+            }
+            None if !batch.is_empty() => {
+                self.pending_data_batch = Some(batch);
+            }
+            None => {
+                // Nothing to initialize the batch with
+            }
+        }
+
+        Ok(())
+    }
+
    /// Put a new page version that can be constructed from a WAL record
    ///
    /// NOTE: this will *not* implicitly extend the relation, if the page is beyond the
@@ -1229,8 +1327,13 @@ impl<'a> DatadirModification<'a> {
                self.lsn
            );
        }
-        self.pending_zero_data_pages.insert(key.to_compact());
-        self.pending_bytes += ZERO_PAGE.len();
+
+        let batch = self
+            .pending_data_batch
+            .get_or_insert_with(SerializedValueBatch::default);
+
+        batch.put(key.to_compact(), Value::Image(ZERO_PAGE.clone()), self.lsn);
+
        Ok(())
    }

@@ -1248,17 +1351,14 @@ impl<'a> DatadirModification<'a> {
                self.lsn
            );
        }
-        self.pending_zero_data_pages.insert(key.to_compact());
-        self.pending_bytes += ZERO_PAGE.len();
-        Ok(())
-    }

-    /// Call this at the end of each WAL record.
-    pub(crate) fn on_record_end(&mut self) {
-        let pending_zero_data_pages = std::mem::take(&mut self.pending_zero_data_pages);
-        for key in pending_zero_data_pages {
-            self.put_data(key, Value::Image(ZERO_PAGE.clone()));
-        }
+        let batch = self
+            .pending_data_batch
+            .get_or_insert_with(SerializedValueBatch::default);
+
+        batch.put(key.to_compact(), Value::Image(ZERO_PAGE.clone()), self.lsn);
+
+        Ok(())
    }

    /// Store a relmapper file (pg_filenode.map) in the repository
@@ -1750,12 +1850,17 @@ impl<'a> DatadirModification<'a> {
        let mut writer = self.tline.writer().await;

        // Flush relation and  SLRU data blocks, keep metadata.
-        let pending_data_pages = std::mem::take(&mut self.pending_data_pages);
+        if let Some(batch) = self.pending_data_batch.take() {
+            tracing::debug!(
+                "Flushing batch with max_lsn={}. Last record LSN is {}",
+                batch.max_lsn,
+                self.tline.get_last_record_lsn()
+            );

-        // This bails out on first error without modifying pending_updates.
-        // That's Ok, cf this function's doc comment.
-        writer.put_batch(pending_data_pages, ctx).await?;
-        self.pending_bytes = 0;
+            // This bails out on first error without modifying pending_updates.
+            // That's Ok, cf this function's doc comment.
+            writer.put_batch(batch, ctx).await?;
+        }

        if pending_nblocks != 0 {
            writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
@@ -1775,9 +1880,6 @@ impl<'a> DatadirModification<'a> {
    /// All the modifications in this atomic update are stamped by the specified LSN.
    ///
    pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
-        // Commit should never be called mid-wal-record
-        assert!(self.pending_zero_data_pages.is_empty());
-
        let mut writer = self.tline.writer().await;

        let pending_nblocks = self.pending_nblocks;
@@ -1785,21 +1887,49 @@ impl<'a> DatadirModification<'a> {

        // Ordering: the items in this batch do not need to be in any global order, but values for
        // a particular Key must be in Lsn order relative to one another.  InMemoryLayer relies on
-        // this to do efficient updates to its index.
-        let mut write_batch = std::mem::take(&mut self.pending_data_pages);
+        // this to do efficient updates to its index.  See [`wal_decoder::serialized_batch`] for
+        // more details.

-        write_batch.extend(
-            self.pending_metadata_pages
+        let metadata_batch = {
+            let pending_meta = self
+                .pending_metadata_pages
                .drain()
                .flat_map(|(key, values)| {
                    values
                        .into_iter()
                        .map(move |(lsn, value_size, value)| (key, lsn, value_size, value))
-                }),
-        );
+                })
+                .collect::<Vec<_>>();

-        if !write_batch.is_empty() {
-            writer.put_batch(write_batch, ctx).await?;
+            if pending_meta.is_empty() {
+                None
+            } else {
+                Some(SerializedValueBatch::from_values(pending_meta))
+            }
+        };
+
+        let data_batch = self.pending_data_batch.take();
+
+        let maybe_batch = match (data_batch, metadata_batch) {
+            (Some(mut data), Some(metadata)) => {
+                data.extend(metadata);
+                Some(data)
+            }
+            (Some(data), None) => Some(data),
+            (None, Some(metadata)) => Some(metadata),
+            (None, None) => None,
+        };
+
+        if let Some(batch) = maybe_batch {
+            tracing::debug!(
+                "Flushing batch with max_lsn={}. Last record LSN is {}",
+                batch.max_lsn,
+                self.tline.get_last_record_lsn()
+            );
+
+            // This bails out on first error without modifying pending_updates.
+            // That's Ok, cf this function's doc comment.
+            writer.put_batch(batch, ctx).await?;
        }

        if !self.pending_deletions.is_empty() {
@@ -1809,6 +1939,9 @@ impl<'a> DatadirModification<'a> {

        self.pending_lsns.push(self.lsn);
        for pending_lsn in self.pending_lsns.drain(..) {
+            // TODO(vlad): the comment below is probably no longer valid, in which
+            // case we could call finish_write() just once with the latest LSN.
+            //
            // Ideally, we should be able to call writer.finish_write() only once
            // with the highest LSN. However, the last_record_lsn variable in the
            // timeline keeps track of the latest LSN and the immediate previous LSN
@@ -1824,14 +1957,14 @@ impl<'a> DatadirModification<'a> {
            writer.update_directory_entries_count(kind, count as u64);
        }

-        self.pending_bytes = 0;
+        self.pending_metadata_bytes = 0;

        Ok(())
    }

    pub(crate) fn len(&self) -> usize {
        self.pending_metadata_pages.len()
-            + self.pending_data_pages.len()
+            + self.pending_data_batch.as_ref().map_or(0, |b| b.len())
            + self.pending_deletions.len()
    }

@@ -1873,11 +2006,10 @@ impl<'a> DatadirModification<'a> {
            // modifications before ingesting DB create operations, which are the only kind that reads
            // data pages during ingest.
            if cfg!(debug_assertions) {
-                for (dirty_key, _, _, _) in &self.pending_data_pages {
-                    debug_assert!(&key.to_compact() != dirty_key);
-                }
-
-                debug_assert!(!self.pending_zero_data_pages.contains(&key.to_compact()))
+                assert!(!self
+                    .pending_data_batch
+                    .as_ref()
+                    .map_or(false, |b| b.updates_key(&key)));
            }
        }

@@ -1895,18 +2027,10 @@ impl<'a> DatadirModification<'a> {
    }

    fn put_data(&mut self, key: CompactKey, val: Value) {
-        let val_serialized_size = val.serialized_size().unwrap() as usize;
-
-        // If this page was previously zero'd in the same WalRecord, then drop the previous zero page write.  This
-        // is an optimization that avoids persisting both the zero page generated by us (e.g. during a relation extend),
-        // and the subsequent postgres-originating write
-        if self.pending_zero_data_pages.remove(&key) {
-            self.pending_bytes -= ZERO_PAGE.len();
-        }
-
-        self.pending_bytes += val_serialized_size;
-        self.pending_data_pages
-            .push((key, self.lsn, val_serialized_size, val))
+        let batch = self
+            .pending_data_batch
+            .get_or_insert_with(SerializedValueBatch::default);
+        batch.put(key, val, self.lsn);
    }

    fn put_metadata(&mut self, key: CompactKey, val: Value) {
@@ -1914,10 +2038,10 @@ impl<'a> DatadirModification<'a> {
        // Replace the previous value if it exists at the same lsn
        if let Some((last_lsn, last_value_ser_size, last_value)) = values.last_mut() {
            if *last_lsn == self.lsn {
-                // Update the pending_bytes contribution from this entry, and update the serialized size in place
-                self.pending_bytes -= *last_value_ser_size;
+                // Update the pending_metadata_bytes contribution from this entry, and update the serialized size in place
+                self.pending_metadata_bytes -= *last_value_ser_size;
                *last_value_ser_size = val.serialized_size().unwrap() as usize;
-                self.pending_bytes += *last_value_ser_size;
+                self.pending_metadata_bytes += *last_value_ser_size;

                // Use the latest value, this replaces any earlier write to the same (key,lsn), such as much
                // have been generated by synthesized zero page writes prior to the first real write to a page.
@@ -1927,8 +2051,12 @@ impl<'a> DatadirModification<'a> {
        }

        let val_serialized_size = val.serialized_size().unwrap() as usize;
-        self.pending_bytes += val_serialized_size;
+        self.pending_metadata_bytes += val_serialized_size;
        values.push((self.lsn, val_serialized_size, val));
+
+        if key == CHECKPOINT_KEY.to_compact() {
+            tracing::debug!("Checkpoint key added to pending with size {val_serialized_size}");
+        }
    }

    fn delete(&mut self, key_range: Range<Key>) {
@@ -2037,7 +2165,11 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
 #[cfg(test)]
 mod tests {
    use hex_literal::hex;
-    use utils::id::TimelineId;
+    use pageserver_api::{models::ShardParameters, shard::ShardStripeSize};
+    use utils::{
+        id::TimelineId,
+        shard::{ShardCount, ShardNumber},
+    };

    use super::*;

@@ -2091,6 +2223,93 @@ mod tests {
        Ok(())
    }

+    #[test]
+    fn gap_finding() {
+        let rel = RelTag {
+            spcnode: 1663,
+            dbnode: 208101,
+            relnode: 2620,
+            forknum: 0,
+        };
+        let base_blkno = 1;
+
+        let base_key = rel_block_to_key(rel, base_blkno);
+        let before_base_key = rel_block_to_key(rel, base_blkno - 1);
+
+        let shard = ShardIdentity::unsharded();
+
+        let mut previous_nblocks = 0;
+        for i in 0..10 {
+            let crnt_blkno = base_blkno + i;
+            let gaps = DatadirModification::find_gaps(rel, crnt_blkno, previous_nblocks, &shard);
+
+            previous_nblocks = crnt_blkno + 1;
+
+            if i == 0 {
+                // The first block we write is 1, so we should find the gap.
+                assert_eq!(gaps.unwrap(), KeySpace::single(before_base_key..base_key));
+            } else {
+                assert!(gaps.is_none());
+            }
+        }
+
+        // This is an update to an already existing block. No gaps here.
+        let update_blkno = 5;
+        let gaps = DatadirModification::find_gaps(rel, update_blkno, previous_nblocks, &shard);
+        assert!(gaps.is_none());
+
+        // This is an update past the current end block.
+        let after_gap_blkno = 20;
+        let gaps = DatadirModification::find_gaps(rel, after_gap_blkno, previous_nblocks, &shard);
+
+        let gap_start_key = rel_block_to_key(rel, previous_nblocks);
+        let after_gap_key = rel_block_to_key(rel, after_gap_blkno);
+        assert_eq!(
+            gaps.unwrap(),
+            KeySpace::single(gap_start_key..after_gap_key)
+        );
+    }
+
+    #[test]
+    fn sharded_gap_finding() {
+        let rel = RelTag {
+            spcnode: 1663,
+            dbnode: 208101,
+            relnode: 2620,
+            forknum: 0,
+        };
+
+        let first_blkno = 6;
+
+        // This shard will get the even blocks
+        let shard = ShardIdentity::from_params(
+            ShardNumber(0),
+            &ShardParameters {
+                count: ShardCount(2),
+                stripe_size: ShardStripeSize(1),
+            },
+        );
+
+        // Only keys belonging to this shard are considered as gaps.
+        let mut previous_nblocks = 0;
+        let gaps =
+            DatadirModification::find_gaps(rel, first_blkno, previous_nblocks, &shard).unwrap();
+        assert!(!gaps.ranges.is_empty());
+        for gap_range in gaps.ranges {
+            let mut k = gap_range.start;
+            while k != gap_range.end {
+                assert_eq!(shard.get_shard_number(&k), shard.number);
+                k = k.next();
+            }
+        }
+
+        previous_nblocks = first_blkno;
+
+        let update_blkno = 2;
+        let gaps = DatadirModification::find_gaps(rel, update_blkno, previous_nblocks, &shard);
+        assert!(gaps.is_none());
+    }
+
    /*
        fn assert_current_logical_size<R: Repository>(timeline: &DatadirTimeline<R>, lsn: Lsn) {
            let incremental = timeline.get_current_logical_size();
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -39,6 +39,7 @@ use remote_timeline_client::UploadQueueNotReadyError;
 use std::collections::BTreeMap;
 use std::fmt;
 use std::future::Future;
+use std::sync::atomic::AtomicBool;
 use std::sync::Weak;
 use std::time::SystemTime;
 use storage_broker::BrokerClientChannel;
@@ -524,6 +525,9 @@ pub struct OffloadedTimeline {
    /// Prevent two tasks from deleting the timeline at the same time. If held, the
    /// timeline is being deleted. If 'true', the timeline has already been deleted.
    pub delete_progress: TimelineDeleteProgress,
+
+    /// Part of the `OffloadedTimeline` object's lifecycle: this must be set to `true` before the object is dropped (`Drop` logs a warning otherwise)
+    pub deleted_from_ancestor: AtomicBool,
 }

 impl OffloadedTimeline {
@@ -533,9 +537,16 @@ impl OffloadedTimeline {
    /// the timeline is not in a stopped state.
    /// Panics if the timeline is not archived.
    fn from_timeline(timeline: &Timeline) -> Result<Self, UploadQueueNotReadyError> {
-        let ancestor_retain_lsn = timeline
-            .get_ancestor_timeline_id()
-            .map(|_timeline_id| timeline.get_ancestor_lsn());
+        let (ancestor_retain_lsn, ancestor_timeline_id) =
+            if let Some(ancestor_timeline) = timeline.ancestor_timeline() {
+                let ancestor_lsn = timeline.get_ancestor_lsn();
+                let ancestor_timeline_id = ancestor_timeline.timeline_id;
+                let mut gc_info = ancestor_timeline.gc_info.write().unwrap();
+                gc_info.insert_child(timeline.timeline_id, ancestor_lsn, MaybeOffloaded::Yes);
+                (Some(ancestor_lsn), Some(ancestor_timeline_id))
+            } else {
+                (None, None)
+            };
        let archived_at = timeline
            .remote_client
            .archived_at_stopped_queue()?
@@ -543,14 +554,17 @@ impl OffloadedTimeline {
        Ok(Self {
            tenant_shard_id: timeline.tenant_shard_id,
            timeline_id: timeline.timeline_id,
-            ancestor_timeline_id: timeline.get_ancestor_timeline_id(),
+            ancestor_timeline_id,
            ancestor_retain_lsn,
            archived_at,

            delete_progress: timeline.delete_progress.clone(),
+            deleted_from_ancestor: AtomicBool::new(false),
        })
    }
    fn from_manifest(tenant_shard_id: TenantShardId, manifest: &OffloadedTimelineManifest) -> Self {
+        // We expect to reach this case in tenant loading, where the `retain_lsn` is populated in the parent's `gc_info`
+        // by the `initialize_gc_info` function.
        let OffloadedTimelineManifest {
            timeline_id,
            ancestor_timeline_id,
@@ -564,6 +578,7 @@ impl OffloadedTimeline {
            ancestor_retain_lsn,
            archived_at,
            delete_progress: TimelineDeleteProgress::default(),
+            deleted_from_ancestor: AtomicBool::new(false),
        }
    }
    fn manifest(&self) -> OffloadedTimelineManifest {
@@ -581,6 +596,33 @@ impl OffloadedTimeline {
            archived_at: *archived_at,
        }
    }
+    /// Delete this timeline's retain_lsn from its ancestor's `gc_info`, if the ancestor is present in `timelines`
+    fn delete_from_ancestor_with_timelines(
+        &self,
+        timelines: &std::sync::MutexGuard<'_, HashMap<TimelineId, Arc<Timeline>>>,
+    ) {
+        if let (Some(_retain_lsn), Some(ancestor_timeline_id)) =
+            (self.ancestor_retain_lsn, self.ancestor_timeline_id)
+        {
+            if let Some((_, ancestor_timeline)) = timelines
+                .iter()
+                .find(|(tid, _tl)| **tid == ancestor_timeline_id)
+            {
+                ancestor_timeline
+                    .gc_info
+                    .write()
+                    .unwrap()
+                    .remove_child_offloaded(self.timeline_id);
+            }
+        }
+        self.deleted_from_ancestor.store(true, Ordering::Release);
+    }
+    /// Call [`Self::delete_from_ancestor_with_timelines`] instead if possible.
+    ///
+    /// As the entire tenant is being dropped, don't bother deregistering the `retain_lsn` from the ancestor.
+    fn defuse_for_tenant_drop(&self) {
+        self.deleted_from_ancestor.store(true, Ordering::Release);
+    }
 }

 impl fmt::Debug for OffloadedTimeline {
@@ -589,6 +631,17 @@ impl fmt::Debug for OffloadedTimeline {
    }
 }

+impl Drop for OffloadedTimeline {
+    fn drop(&mut self) {
+        if !self.deleted_from_ancestor.load(Ordering::Acquire) {
+            tracing::warn!(
+                "offloaded timeline {} was dropped without having cleaned it up at the ancestor",
+                self.timeline_id
+            );
+        }
+    }
+}
+
 #[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)]
 pub enum MaybeOffloaded {
    Yes,
@@ -700,6 +753,9 @@ pub enum DeleteTimelineError {
    #[error("Timeline deletion is already in progress")]
    AlreadyInProgress(Arc<tokio::sync::Mutex<DeleteTimelineFlow>>),

+    #[error("Cancelled")]
+    Cancelled,
+
    #[error(transparent)]
    Other(#[from] anyhow::Error),
 }
@@ -710,6 +766,7 @@ impl Debug for DeleteTimelineError {
            Self::NotFound => write!(f, "NotFound"),
            Self::HasChildren(c) => f.debug_tuple("HasChildren").field(c).finish(),
            Self::AlreadyInProgress(_) => f.debug_tuple("AlreadyInProgress").finish(),
+            Self::Cancelled => f.debug_tuple("Cancelled").finish(),
            Self::Other(e) => f.debug_tuple("Other").field(e).finish(),
        }
    }
@@ -1433,6 +1490,12 @@ impl Tenant {
                    info!(%timeline_id, "index_part not found on remote");
                    continue;
                }
+                Err(DownloadError::Fatal(why)) => {
+                    // If, while loading one remote timeline, we saw an indication that our generation
+                    // number is likely invalid, then we should not load the whole tenant.
+                    error!(%timeline_id, "Fatal error loading timeline: {why}");
+                    anyhow::bail!(why.to_string());
+                }
                Err(e) => {
                    // Some (possibly ephemeral) error happened during index_part download.
                    // Pretend the timeline exists to not delete the timeline directory,
@@ -1521,7 +1584,7 @@ impl Tenant {
        }
        // Complete deletions for offloaded timeline id's.
        offloaded_timelines_list
-            .retain(|(offloaded_id, _offloaded)| {
+            .retain(|(offloaded_id, offloaded)| {
                // At this point, offloaded_timeline_ids has the list of all offloaded timelines
                // without a prefix in S3, so they are inexistent.
                // In the end, existence of a timeline is finally determined by the existence of an index-part.json in remote storage.
@@ -1529,6 +1592,7 @@ impl Tenant {
                let delete = offloaded_timeline_ids.contains(offloaded_id);
                if delete {
                    tracing::info!("Removing offloaded timeline {offloaded_id} from manifest as no remote prefix was found");
+                    offloaded.defuse_for_tenant_drop();
                }
                !delete
        });
@@ -1917,9 +1981,15 @@ impl Tenant {
                )));
            };
            let mut offloaded_timelines = self.timelines_offloaded.lock().unwrap();
-            if offloaded_timelines.remove(&timeline_id).is_none() {
-                warn!("timeline already removed from offloaded timelines");
+            match offloaded_timelines.remove(&timeline_id) {
+                Some(offloaded) => {
+                    offloaded.delete_from_ancestor_with_timelines(&timelines);
+                }
+                None => warn!("timeline already removed from offloaded timelines"),
            }
+
+            self.initialize_gc_info(&timelines, &offloaded_timelines, Some(timeline_id));
+
            Arc::clone(timeline)
        };

@@ -2493,14 +2563,22 @@ impl Tenant {
            timelines_to_compact_or_offload = timelines
                .iter()
                .filter_map(|(timeline_id, timeline)| {
-                    let (is_active, can_offload) = (timeline.is_active(), timeline.can_offload());
+                    let (is_active, (can_offload, _)) =
+                        (timeline.is_active(), timeline.can_offload());
                    let has_no_unoffloaded_children = {
                        !timelines
                            .iter()
                            .any(|(_id, tl)| tl.get_ancestor_timeline_id() == Some(*timeline_id))
                    };
+                    let config_allows_offload = self.conf.timeline_offloading
+                        || self
+                            .tenant_conf
+                            .load()
+                            .tenant_conf
+                            .timeline_offloading
+                            .unwrap_or_default();
                    let can_offload =
-                        can_offload && has_no_unoffloaded_children && self.conf.timeline_offloading;
+                        can_offload && has_no_unoffloaded_children && config_allows_offload;
                    if (is_active, can_offload) == (false, false) {
                        None
                    } else {
@@ -2649,7 +2727,7 @@ impl Tenant {
                .filter(|timeline| !(timeline.is_broken() || timeline.is_stopping()));

            // Before activation, populate each Timeline's GcInfo with information about its children
-            self.initialize_gc_info(&timelines_accessor, &timelines_offloaded_accessor);
+            self.initialize_gc_info(&timelines_accessor, &timelines_offloaded_accessor, None);

            // Spawn gc and compaction loops. The loops will shut themselves
            // down when they notice that the tenant is inactive.
@@ -2764,8 +2842,14 @@ impl Tenant {
                let timeline_id = timeline.timeline_id;
                let span = tracing::info_span!("timeline_shutdown", %timeline_id, ?shutdown_mode);
                js.spawn(async move { timeline.shutdown(shutdown_mode).instrument(span).await });
-            })
-        };
+            });
+        }
+        {
+            let timelines_offloaded = self.timelines_offloaded.lock().unwrap();
+            timelines_offloaded.values().for_each(|timeline| {
+                timeline.defuse_for_tenant_drop();
+            });
+        }
        // test_long_timeline_create_then_tenant_delete is leaning on this message
        tracing::info!("Waiting for timelines...");
        while let Some(res) = js.join_next().await {
@@ -3749,10 +3833,13 @@ impl Tenant {
        &self,
        timelines: &std::sync::MutexGuard<HashMap<TimelineId, Arc<Timeline>>>,
        timelines_offloaded: &std::sync::MutexGuard<HashMap<TimelineId, Arc<OffloadedTimeline>>>,
+        restrict_to_timeline: Option<TimelineId>,
    ) {
-        // This function must be called before activation: after activation timeline create/delete operations
-        // might happen, and this function is not safe to run concurrently with those.
-        assert!(!self.is_active());
+        if restrict_to_timeline.is_none() {
+            // This function must be called before activation: after activation timeline create/delete operations
+            // might happen, and this function is not safe to run concurrently with those.
+            assert!(!self.is_active());
+        }

        // Scan all timelines. For each timeline, remember the timeline ID and
        // the branch point where it was created.
@@ -3785,7 +3872,12 @@ impl Tenant {
        let horizon = self.get_gc_horizon();

        // Populate each timeline's GcInfo with information about its child branches
-        for timeline in timelines.values() {
+        let timelines_to_write = if let Some(timeline_id) = restrict_to_timeline {
+            itertools::Either::Left(timelines.get(&timeline_id).into_iter())
+        } else {
+            itertools::Either::Right(timelines.values())
+        };
+        for timeline in timelines_to_write {
            let mut branchpoints: Vec<(Lsn, TimelineId, MaybeOffloaded)> = all_branchpoints
                .remove(&timeline.timeline_id)
                .unwrap_or_default();
@@ -4772,10 +4864,12 @@ async fn run_initdb(

    let _permit = INIT_DB_SEMAPHORE.acquire().await;

-    let initdb_command = tokio::process::Command::new(&initdb_bin_path)
+    let mut initdb_command = tokio::process::Command::new(&initdb_bin_path);
+    initdb_command
        .args(["--pgdata", initdb_target_dir.as_ref()])
        .args(["--username", &conf.superuser])
        .args(["--encoding", "utf8"])
+        .args(["--locale", &conf.locale])
        .arg("--no-instructions")
        .arg("--no-sync")
        .env_clear()
@@ -4785,15 +4879,27 @@ async fn run_initdb(
        // stdout invocation produces the same output every time, we don't need it
        .stdout(std::process::Stdio::null())
        // we would be interested in the stderr output, if there was any
-        .stderr(std::process::Stdio::piped())
-        .spawn()?;
+        .stderr(std::process::Stdio::piped());
+
+    // Before version 15, only the libc provider was available.
+    if pg_version > 14 {
+        // Version 17 brought with it a builtin locale provider which only provides
+        // C and C.UTF-8. While being safer for collation purposes since it is
+        // guaranteed to be consistent throughout a major release, it is also more
+        // performant.
+        let locale_provider = if pg_version >= 17 { "builtin" } else { "libc" };
+
+        initdb_command.args(["--locale-provider", locale_provider]);
+    }
+
+    let initdb_proc = initdb_command.spawn()?;

    // Ideally we'd select here with the cancellation token, but the problem is that
    // we can't safely terminate initdb: it launches processes of its own, and killing
    // initdb doesn't kill them. After we return from this function, we want the target
    // directory to be able to be cleaned up.
    // See https://github.com/neondatabase/neon/issues/6385
-    let initdb_output = initdb_command.wait_with_output().await?;
+    let initdb_output = initdb_proc.wait_with_output().await?;
    if !initdb_output.status.success() {
        return Err(InitdbError::Failed(
            initdb_output.status,
@@ -4902,6 +5008,7 @@ pub(crate) mod harness {
                ),
                lsn_lease_length: Some(tenant_conf.lsn_lease_length),
                lsn_lease_length_for_ts: Some(tenant_conf.lsn_lease_length_for_ts),
+                timeline_offloading: Some(tenant_conf.timeline_offloading),
            }
        }
    }
@@ -7734,13 +7841,13 @@ mod tests {
            (
                get_key(3),
                Lsn(0x20),
-                Value::WalRecord(NeonWalRecord::wal_clear()),
+                Value::WalRecord(NeonWalRecord::wal_clear("c")),
            ),
            (get_key(4), Lsn(0x10), Value::Image("0x10".into())),
            (
                get_key(4),
                Lsn(0x20),
-                Value::WalRecord(NeonWalRecord::wal_init()),
+                Value::WalRecord(NeonWalRecord::wal_init("i")),
            ),
        ];
        let image1 = vec![(get_key(1), "0x10".into())];
@@ -7889,8 +7996,30 @@ mod tests {

    #[cfg(feature = "testing")]
    #[tokio::test]
-    async fn test_simple_bottom_most_compaction_deltas() -> anyhow::Result<()> {
-        let harness = TenantHarness::create("test_simple_bottom_most_compaction_deltas").await?;
+    async fn test_simple_bottom_most_compaction_deltas_1() -> anyhow::Result<()> {
+        // Variant 1: the bottom-most layer is the image layer at 0x10.
+        const TEST_NAME: &str = "test_simple_bottom_most_compaction_deltas_1";
+        const USE_DELTA_BOTTOM_LAYER: bool = false;
+
+        test_simple_bottom_most_compaction_deltas_helper(TEST_NAME, USE_DELTA_BOTTOM_LAYER).await
+    }
+
+    #[cfg(feature = "testing")]
+    #[tokio::test]
+    async fn test_simple_bottom_most_compaction_deltas_2() -> anyhow::Result<()> {
+        // Variant 2: the bottom-most layer is a delta layer instead of an image layer.
+        const TEST_NAME: &str = "test_simple_bottom_most_compaction_deltas_2";
+        const USE_DELTA_BOTTOM_LAYER: bool = true;
+
+        test_simple_bottom_most_compaction_deltas_helper(TEST_NAME, USE_DELTA_BOTTOM_LAYER).await
+    }
+
+    #[cfg(feature = "testing")]
+    async fn test_simple_bottom_most_compaction_deltas_helper(
+        test_name: &'static str,
+        use_delta_bottom_layer: bool,
+    ) -> anyhow::Result<()> {
+        let harness = TenantHarness::create(test_name).await?;
        let (tenant, ctx) = harness.load().await;

        fn get_key(id: u32) -> Key {
@@ -7921,6 +8050,16 @@ mod tests {
        let img_layer = (0..10)
            .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
            .collect_vec();
+        // or, the same values as a delta layer at 0x08..0x10 if `use_delta_bottom_layer`
+        let delta4 = (0..10)
+            .map(|id| {
+                (
+                    get_key(id),
+                    Lsn(0x08),
+                    Value::WalRecord(NeonWalRecord::wal_init(format!("value {id}@0x10"))),
+                )
+            })
+            .collect_vec();

        let delta1 = vec![
            (
@@ -7974,21 +8113,61 @@ mod tests {
            ),
        ];

-        let tline = tenant
-            .create_test_timeline_with_layers(
-                TIMELINE_ID,
-                Lsn(0x10),
-                DEFAULT_PG_VERSION,
-                &ctx,
-                vec![
-                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta1),
-                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta2),
-                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3),
-                ], // delta layers
-                vec![(Lsn(0x10), img_layer)], // image layers
-                Lsn(0x50),
-            )
-            .await?;
+        let tline = if use_delta_bottom_layer {
+            tenant
+                .create_test_timeline_with_layers(
+                    TIMELINE_ID,
+                    Lsn(0x08),
+                    DEFAULT_PG_VERSION,
+                    &ctx,
+                    vec![
+                        DeltaLayerTestDesc::new_with_inferred_key_range(
+                            Lsn(0x08)..Lsn(0x10),
+                            delta4,
+                        ),
+                        DeltaLayerTestDesc::new_with_inferred_key_range(
+                            Lsn(0x20)..Lsn(0x48),
+                            delta1,
+                        ),
+                        DeltaLayerTestDesc::new_with_inferred_key_range(
+                            Lsn(0x20)..Lsn(0x48),
+                            delta2,
+                        ),
+                        DeltaLayerTestDesc::new_with_inferred_key_range(
+                            Lsn(0x48)..Lsn(0x50),
+                            delta3,
+                        ),
+                    ], // delta layers
+                    vec![], // image layers
+                    Lsn(0x50),
+                )
+                .await?
+        } else {
+            tenant
+                .create_test_timeline_with_layers(
+                    TIMELINE_ID,
+                    Lsn(0x10),
+                    DEFAULT_PG_VERSION,
+                    &ctx,
+                    vec![
+                        DeltaLayerTestDesc::new_with_inferred_key_range(
+                            Lsn(0x10)..Lsn(0x48),
+                            delta1,
+                        ),
+                        DeltaLayerTestDesc::new_with_inferred_key_range(
+                            Lsn(0x10)..Lsn(0x48),
+                            delta2,
+                        ),
+                        DeltaLayerTestDesc::new_with_inferred_key_range(
+                            Lsn(0x48)..Lsn(0x50),
+                            delta3,
+                        ),
+                    ], // delta layers
+                    vec![(Lsn(0x10), img_layer)], // image layers
+                    Lsn(0x50),
+                )
+                .await?
+        };
        {
            // Update GC info
            let mut guard = tline.gc_info.write().unwrap();
@@ -8098,7 +8277,7 @@ mod tests {
            (
                key,
                Lsn(0x10),
-                Value::Image(Bytes::copy_from_slice(b"0x10")),
+                Value::WalRecord(NeonWalRecord::wal_init("0x10")),
            ),
            (
                key,
@@ -8160,7 +8339,7 @@ mod tests {
                    Lsn(0x20),
                    KeyLogAtLsn(vec![(
                        Lsn(0x20),
-                        Value::Image(Bytes::copy_from_slice(b"0x10;0x20")),
+                        Value::Image(Bytes::from_static(b"0x10;0x20")),
                    )]),
                ),
                (
@@ -9142,7 +9321,7 @@ mod tests {

            let will_init = will_init_keys.contains(&i);
            if will_init {
-                delta_layer_spec.push((key, lsn, Value::WalRecord(NeonWalRecord::wal_init())));
+                delta_layer_spec.push((key, lsn, Value::WalRecord(NeonWalRecord::wal_init(""))));

                expected_key_values.insert(key, "".to_string());
            } else {
@@ -9200,6 +9379,23 @@ mod tests {
        Ok(())
    }

+    /// Total order over layer keys used to normalize layer listings in tests: the tuple
+    /// `(is_delta, key start, key end, lsn start, lsn end)` is compared lexicographically.
+    /// Since `false < true`, image layers sort before delta layers.
+    fn sort_layer_key(k1: &PersistentLayerKey, k2: &PersistentLayerKey) -> std::cmp::Ordering {
+        // Project a key onto the tuple that defines the order.
+        let rank = |key: &PersistentLayerKey| {
+            (
+                key.is_delta,
+                key.key_range.start,
+                key.key_range.end,
+                key.lsn_range.start,
+                key.lsn_range.end,
+            )
+        };
+        rank(k1).cmp(&rank(k2))
+    }
+
    async fn inspect_and_sort(
        tline: &Arc<Timeline>,
        filter: Option<std::ops::Range<Key>>,
@@ -9208,25 +9404,30 @@ mod tests {
        if let Some(filter) = filter {
            all_layers.retain(|layer| overlaps_with(&layer.key_range, &filter));
        }
-        all_layers.sort_by(|k1, k2| {
-            (
-                k1.is_delta,
-                k1.key_range.start,
-                k1.key_range.end,
-                k1.lsn_range.start,
-                k1.lsn_range.end,
-            )
-                .cmp(&(
-                    k2.is_delta,
-                    k2.key_range.start,
-                    k2.key_range.end,
-                    k2.lsn_range.start,
-                    k2.lsn_range.end,
-                ))
-        });
+        all_layers.sort_by(sort_layer_key);
        all_layers
    }

+    /// Asserts that two layer-key lists contain the same keys, ignoring their order. On a
+    /// mismatch, both sorted lists are printed to stderr before the assertion fails.
+    #[cfg(feature = "testing")]
+    fn check_layer_map_key_eq(
+        mut left: Vec<PersistentLayerKey>,
+        mut right: Vec<PersistentLayerKey>,
+    ) {
+        left.sort_by(sort_layer_key);
+        right.sort_by(sort_layer_key);
+        if left == right {
+            return;
+        }
+        // Dump both sides line by line; the `assert_eq!` below then reports the failure.
+        eprintln!("---LEFT---");
+        left.iter().for_each(|key| eprintln!("{}", key));
+        eprintln!("---RIGHT---");
+        right.iter().for_each(|key| eprintln!("{}", key));
+        assert_eq!(left, right);
+    }
+
    #[cfg(feature = "testing")]
    #[tokio::test]
    async fn test_simple_partial_bottom_most_compaction() -> anyhow::Result<()> {
@@ -9319,129 +9520,258 @@ mod tests {

        let cancel = CancellationToken::new();

-        // Do a partial compaction on key range 0..4, we should generate a image layer; no other layers
-        // can be removed because they might be used for other key ranges.
+        // Do a partial compaction on key range 0..2
        tline
-            .partial_compact_with_gc(Some(get_key(0)..get_key(4)), &cancel, EnumSet::new(), &ctx)
+            .partial_compact_with_gc(get_key(0)..get_key(2), &cancel, EnumSet::new(), &ctx)
            .await
            .unwrap();
        let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
-        assert_eq!(
+        check_layer_map_key_eq(
            all_layers,
            vec![
+                // newly-generated image layer for the partial compaction range 0-2
                PersistentLayerKey {
-                    key_range: get_key(0)..get_key(4),
+                    key_range: get_key(0)..get_key(2),
                    lsn_range: Lsn(0x20)..Lsn(0x21),
-                    is_delta: false
+                    is_delta: false,
                },
                PersistentLayerKey {
                    key_range: get_key(0)..get_key(10),
                    lsn_range: Lsn(0x10)..Lsn(0x11),
-                    is_delta: false
+                    is_delta: false,
                },
+                // delta1 is split and the second part is rewritten
                PersistentLayerKey {
-                    key_range: get_key(1)..get_key(4),
+                    key_range: get_key(2)..get_key(4),
                    lsn_range: Lsn(0x20)..Lsn(0x48),
-                    is_delta: true
+                    is_delta: true,
                },
                PersistentLayerKey {
                    key_range: get_key(5)..get_key(7),
                    lsn_range: Lsn(0x20)..Lsn(0x48),
-                    is_delta: true
+                    is_delta: true,
                },
                PersistentLayerKey {
                    key_range: get_key(8)..get_key(10),
                    lsn_range: Lsn(0x48)..Lsn(0x50),
-                    is_delta: true
-                }
-            ]
+                    is_delta: true,
+                },
+            ],
        );

-        // Do a partial compaction on key range 4..10
+        // Do a partial compaction on key range 2..4
        tline
-            .partial_compact_with_gc(Some(get_key(4)..get_key(10)), &cancel, EnumSet::new(), &ctx)
+            .partial_compact_with_gc(get_key(2)..get_key(4), &cancel, EnumSet::new(), &ctx)
            .await
            .unwrap();
        let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
-        assert_eq!(
+        check_layer_map_key_eq(
            all_layers,
            vec![
                PersistentLayerKey {
-                    key_range: get_key(0)..get_key(4),
+                    key_range: get_key(0)..get_key(2),
                    lsn_range: Lsn(0x20)..Lsn(0x21),
-                    is_delta: false
+                    is_delta: false,
                },
                PersistentLayerKey {
-                    // if (in the future) GC kicks in, this layer will be removed
                    key_range: get_key(0)..get_key(10),
                    lsn_range: Lsn(0x10)..Lsn(0x11),
-                    is_delta: false
+                    is_delta: false,
                },
+                // image layer generated for the compaction range 2-4
                PersistentLayerKey {
-                    key_range: get_key(4)..get_key(10),
+                    key_range: get_key(2)..get_key(4),
                    lsn_range: Lsn(0x20)..Lsn(0x21),
-                    is_delta: false
+                    is_delta: false,
                },
+                // we have key2/key3 above the retain_lsn, so we still need this delta layer
                PersistentLayerKey {
-                    key_range: get_key(1)..get_key(4),
+                    key_range: get_key(2)..get_key(4),
                    lsn_range: Lsn(0x20)..Lsn(0x48),
-                    is_delta: true
+                    is_delta: true,
                },
                PersistentLayerKey {
                    key_range: get_key(5)..get_key(7),
                    lsn_range: Lsn(0x20)..Lsn(0x48),
-                    is_delta: true
+                    is_delta: true,
                },
                PersistentLayerKey {
                    key_range: get_key(8)..get_key(10),
                    lsn_range: Lsn(0x48)..Lsn(0x50),
-                    is_delta: true
-                }
-            ]
+                    is_delta: true,
+                },
+            ],
+        );
+
+        // Do a partial compaction on key range 4..9
+        tline
+            .partial_compact_with_gc(get_key(4)..get_key(9), &cancel, EnumSet::new(), &ctx)
+            .await
+            .unwrap();
+        let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
+        check_layer_map_key_eq(
+            all_layers,
+            vec![
+                PersistentLayerKey {
+                    key_range: get_key(0)..get_key(2),
+                    lsn_range: Lsn(0x20)..Lsn(0x21),
+                    is_delta: false,
+                },
+                PersistentLayerKey {
+                    key_range: get_key(0)..get_key(10),
+                    lsn_range: Lsn(0x10)..Lsn(0x11),
+                    is_delta: false,
+                },
+                PersistentLayerKey {
+                    key_range: get_key(2)..get_key(4),
+                    lsn_range: Lsn(0x20)..Lsn(0x21),
+                    is_delta: false,
+                },
+                PersistentLayerKey {
+                    key_range: get_key(2)..get_key(4),
+                    lsn_range: Lsn(0x20)..Lsn(0x48),
+                    is_delta: true,
+                },
+                // image layer generated for this compaction range
+                PersistentLayerKey {
+                    key_range: get_key(4)..get_key(9),
+                    lsn_range: Lsn(0x20)..Lsn(0x21),
+                    is_delta: false,
+                },
+                PersistentLayerKey {
+                    key_range: get_key(8)..get_key(10),
+                    lsn_range: Lsn(0x48)..Lsn(0x50),
+                    is_delta: true,
+                },
+            ],
+        );
+
+        // Do a partial compaction on key range 9..10
+        tline
+            .partial_compact_with_gc(get_key(9)..get_key(10), &cancel, EnumSet::new(), &ctx)
+            .await
+            .unwrap();
+        let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
+        check_layer_map_key_eq(
+            all_layers,
+            vec![
+                PersistentLayerKey {
+                    key_range: get_key(0)..get_key(2),
+                    lsn_range: Lsn(0x20)..Lsn(0x21),
+                    is_delta: false,
+                },
+                PersistentLayerKey {
+                    key_range: get_key(0)..get_key(10),
+                    lsn_range: Lsn(0x10)..Lsn(0x11),
+                    is_delta: false,
+                },
+                PersistentLayerKey {
+                    key_range: get_key(2)..get_key(4),
+                    lsn_range: Lsn(0x20)..Lsn(0x21),
+                    is_delta: false,
+                },
+                PersistentLayerKey {
+                    key_range: get_key(2)..get_key(4),
+                    lsn_range: Lsn(0x20)..Lsn(0x48),
+                    is_delta: true,
+                },
+                PersistentLayerKey {
+                    key_range: get_key(4)..get_key(9),
+                    lsn_range: Lsn(0x20)..Lsn(0x21),
+                    is_delta: false,
+                },
+                // image layer generated for the compaction range
+                PersistentLayerKey {
+                    key_range: get_key(9)..get_key(10),
+                    lsn_range: Lsn(0x20)..Lsn(0x21),
+                    is_delta: false,
+                },
+                PersistentLayerKey {
+                    key_range: get_key(8)..get_key(10),
+                    lsn_range: Lsn(0x48)..Lsn(0x50),
+                    is_delta: true,
+                },
+            ],
        );

        // Do a partial compaction on key range 0..10, all image layers below LSN 20 can be replaced with new ones.
        tline
-            .partial_compact_with_gc(Some(get_key(0)..get_key(10)), &cancel, EnumSet::new(), &ctx)
+            .partial_compact_with_gc(get_key(0)..get_key(10), &cancel, EnumSet::new(), &ctx)
            .await
            .unwrap();
        let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
-        assert_eq!(
+        check_layer_map_key_eq(
            all_layers,
            vec![
-                PersistentLayerKey {
-                    key_range: get_key(0)..get_key(4),
-                    lsn_range: Lsn(0x20)..Lsn(0x21),
-                    is_delta: false
-                },
+                // aha, we removed all unnecessary image/delta layers and got a very clean layer map!
                PersistentLayerKey {
                    key_range: get_key(0)..get_key(10),
                    lsn_range: Lsn(0x20)..Lsn(0x21),
-                    is_delta: false
+                    is_delta: false,
                },
                PersistentLayerKey {
-                    key_range: get_key(4)..get_key(10),
-                    lsn_range: Lsn(0x20)..Lsn(0x21),
-                    is_delta: false
-                },
-                PersistentLayerKey {
-                    key_range: get_key(1)..get_key(4),
+                    key_range: get_key(2)..get_key(4),
                    lsn_range: Lsn(0x20)..Lsn(0x48),
-                    is_delta: true
-                },
-                PersistentLayerKey {
-                    key_range: get_key(5)..get_key(7),
-                    lsn_range: Lsn(0x20)..Lsn(0x48),
-                    is_delta: true
+                    is_delta: true,
                },
                PersistentLayerKey {
                    key_range: get_key(8)..get_key(10),
                    lsn_range: Lsn(0x48)..Lsn(0x50),
-                    is_delta: true
-                }
-            ]
+                    is_delta: true,
+                },
+            ],
        );

        Ok(())
    }
+
+    #[cfg(feature = "testing")]
+    #[tokio::test]
+    async fn test_timeline_offload_retain_lsn() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_timeline_offload_retain_lsn")
+            .await
+            .unwrap();
+        let (tenant, ctx) = harness.load().await;
+        let tline_parent = tenant
+            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
+            .await
+            .unwrap();
+        let tline_child = tenant
+            .branch_timeline_test(&tline_parent, NEW_TIMELINE_ID, Some(Lsn(0x20)), &ctx)
+            .await
+            .unwrap();
+        {
+            let gc_info_parent = tline_parent.gc_info.read().unwrap();
+            assert_eq!(
+                gc_info_parent.retain_lsns,
+                vec![(Lsn(0x20), tline_child.timeline_id, MaybeOffloaded::No)]
+            );
+        }
+        // We have to directly call the remote_client instead of using the archive function to avoid constructing broker client...
+        tline_child
+            .remote_client
+            .schedule_index_upload_for_timeline_archival_state(TimelineArchivalState::Archived)
+            .unwrap();
+        tline_child.remote_client.wait_completion().await.unwrap();
+        offload_timeline(&tenant, &tline_child)
+            .instrument(tracing::info_span!(parent: None, "offload_test", tenant_id=%"test", shard_id=%"test", timeline_id=%"test"))
+            .await.unwrap();
+        let child_timeline_id = tline_child.timeline_id;
+        Arc::try_unwrap(tline_child).unwrap();
+
+        {
+            let gc_info_parent = tline_parent.gc_info.read().unwrap();
+            assert_eq!(
+                gc_info_parent.retain_lsns,
+                vec![(Lsn(0x20), child_timeline_id, MaybeOffloaded::Yes)]
+            );
+        }
+
+        tenant
+            .get_offloaded_timeline(child_timeline_id)
+            .unwrap()
+            .defuse_for_tenant_drop();
+
+        Ok(())
+    }
 }
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -349,6 +349,10 @@ pub struct TenantConfOpt {
    #[serde(with = "humantime_serde")]
    #[serde(default)]
    pub lsn_lease_length_for_ts: Option<Duration>,
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[serde(default)]
+    pub timeline_offloading: Option<bool>,
 }

 impl TenantConfOpt {
@@ -411,6 +415,9 @@ impl TenantConfOpt {
            lsn_lease_length_for_ts: self
                .lsn_lease_length_for_ts
                .unwrap_or(global_conf.lsn_lease_length_for_ts),
+            timeline_offloading: self
+                .timeline_offloading
+                .unwrap_or(global_conf.timeline_offloading),
        }
    }
 }
@@ -464,6 +471,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
            image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
            lsn_lease_length: value.lsn_lease_length.map(humantime),
            lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime),
+            timeline_offloading: value.timeline_offloading,
        }
    }
 }
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -1959,7 +1959,7 @@ impl TenantManager {
            attempt.before_reset_tenant();

            let (_guard, progress) = utils::completion::channel();
-            match tenant.shutdown(progress, ShutdownMode::Hard).await {
+            match tenant.shutdown(progress, ShutdownMode::Flush).await {
                Ok(()) => {
                    slot_guard.drop_old_value().expect("it was just shutdown");
                }
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -243,7 +243,7 @@ use self::index::IndexPart;
 use super::metadata::MetadataUpdate;
 use super::storage_layer::{Layer, LayerName, ResidentLayer};
 use super::upload_queue::{NotInitialized, SetDeletedFlagProgress};
-use super::Generation;
+use super::{DeleteTimelineError, Generation};

 pub(crate) use download::{
    download_index_part, download_tenant_manifest, is_temp_download_file,
@@ -574,12 +574,18 @@ impl RemoteTimelineClient {

            if latest_index_generation > index_generation {
                // Unexpected!  Why are we loading such an old index if a more recent one exists?
-                tracing::warn!(
+                // We will refuse to proceed, as there is no reasonable scenario where this should happen, but
+                // there _is_ a clear bug/corruption scenario where it would happen (controller sets the generation
+                // backwards).
+                tracing::error!(
                    ?index_generation,
                    ?latest_index_generation,
                    ?latest_index_mtime,
                    "Found a newer index while loading an old one"
                );
+                return Err(DownloadError::Fatal(
+                    "Index age exceeds threshold and a newer index exists".into(),
+                ));
            }
        }

@@ -1445,7 +1451,7 @@ impl RemoteTimelineClient {
        let remote_path = remote_layer_path(
            &self.tenant_shard_id.tenant_id,
            &self.timeline_id,
-            self.tenant_shard_id.to_index(),
+            uploaded.metadata().shard,
            &uploaded.layer_desc().layer_name(),
            uploaded.metadata().generation,
        );
@@ -1486,7 +1492,7 @@ impl RemoteTimelineClient {
            &adopted
                .get_timeline_id()
                .expect("Source timeline should be alive"),
-            self.tenant_shard_id.to_index(),
+            adopted.metadata().shard,
            &adopted.layer_desc().layer_name(),
            adopted.metadata().generation,
        );
@@ -1494,7 +1500,7 @@ impl RemoteTimelineClient {
        let target_remote_path = remote_layer_path(
            &self.tenant_shard_id.tenant_id,
            &self.timeline_id,
-            self.tenant_shard_id.to_index(),
+            adopted_as.metadata().shard,
            &adopted_as.layer_desc().layer_name(),
            adopted_as.metadata().generation,
        );
@@ -1544,15 +1550,17 @@ impl RemoteTimelineClient {
    /// Prerequisites: UploadQueue should be in stopped state and deleted_at should be successfuly set.
    /// The function deletes layer files one by one, then lists the prefix to see if we leaked something
    /// deletes leaked files if any and proceeds with deletion of index file at the end.
-    pub(crate) async fn delete_all(self: &Arc<Self>) -> anyhow::Result<()> {
+    pub(crate) async fn delete_all(self: &Arc<Self>) -> Result<(), DeleteTimelineError> {
        debug_assert_current_span_has_tenant_and_timeline_id();

        let layers: Vec<RemotePath> = {
            let mut locked = self.upload_queue.lock().unwrap();
-            let stopped = locked.stopped_mut()?;
+            let stopped = locked.stopped_mut().map_err(DeleteTimelineError::Other)?;

            if !matches!(stopped.deleted_at, SetDeletedFlagProgress::Successful(_)) {
-                anyhow::bail!("deleted_at is not set")
+                return Err(DeleteTimelineError::Other(anyhow::anyhow!(
+                    "deleted_at is not set"
+                )));
            }

            debug_assert!(stopped.upload_queue_for_deletion.no_pending_work());
@@ -1587,7 +1595,10 @@ impl RemoteTimelineClient {
        };

        let layer_deletion_count = layers.len();
-        self.deletion_queue_client.push_immediate(layers).await?;
+        self.deletion_queue_client
+            .push_immediate(layers)
+            .await
+            .map_err(|_| DeleteTimelineError::Cancelled)?;

        // Delete the initdb.tar.zst, which is not always present, but deletion attempts of
        // inexistant objects are not considered errors.
@@ -1595,7 +1606,8 @@ impl RemoteTimelineClient {
            remote_initdb_archive_path(&self.tenant_shard_id.tenant_id, &self.timeline_id);
        self.deletion_queue_client
            .push_immediate(vec![initdb_path])
-            .await?;
+            .await
+            .map_err(|_| DeleteTimelineError::Cancelled)?;

        // Do not delete index part yet, it is needed for possible retry. If we remove it first
        // and retry will arrive to different pageserver there wont be any traces of it on remote storage
@@ -1603,7 +1615,9 @@ impl RemoteTimelineClient {

        // Execute all pending deletions, so that when we proceed to do a listing below, we aren't
        // taking the burden of listing all the layers that we already know we should delete.
-        self.flush_deletion_queue().await?;
+        self.flush_deletion_queue()
+            .await
+            .map_err(|_| DeleteTimelineError::Cancelled)?;

        let cancel = shutdown_token();

@@ -1666,28 +1680,32 @@ impl RemoteTimelineClient {
        if !remaining_layers.is_empty() {
            self.deletion_queue_client
                .push_immediate(remaining_layers)
-                .await?;
+                .await
+                .map_err(|_| DeleteTimelineError::Cancelled)?;
        }

        fail::fail_point!("timeline-delete-before-index-delete", |_| {
-            Err(anyhow::anyhow!(
+            Err(DeleteTimelineError::Other(anyhow::anyhow!(
                "failpoint: timeline-delete-before-index-delete"
-            ))?
+            )))?
        });

        debug!("enqueuing index part deletion");
        self.deletion_queue_client
            .push_immediate([latest_index].to_vec())
-            .await?;
+            .await
+            .map_err(|_| DeleteTimelineError::Cancelled)?;

        // Timeline deletion is rare and we have probably emitted a reasonably number of objects: wait
        // for a flush to a persistent deletion list so that we may be sure deletion will occur.
-        self.flush_deletion_queue().await?;
+        self.flush_deletion_queue()
+            .await
+            .map_err(|_| DeleteTimelineError::Cancelled)?;

        fail::fail_point!("timeline-delete-after-index-delete", |_| {
-            Err(anyhow::anyhow!(
+            Err(DeleteTimelineError::Other(anyhow::anyhow!(
                "failpoint: timeline-delete-after-index-delete"
-            ))?
+            )))?
        });

        info!(prefix=%timeline_storage_path, referenced=layer_deletion_count, not_referenced=%not_referenced_count, "done deleting in timeline prefix, including index_part.json");
@@ -2201,6 +2219,18 @@ impl RemoteTimelineClient {
        inner.initialized_mut()?;
        Ok(UploadQueueAccessor { inner })
    }
+
+    /// Returns true when the upload queue, in whichever state it is in, has no pending work.
+    pub(crate) fn no_pending_work(&self) -> bool {
+        match &*self.upload_queue.lock().unwrap() {
+            UploadQueue::Initialized(queue) => queue.no_pending_work(),
+            UploadQueue::Stopped(UploadQueueStopped::Deletable(stopped)) => {
+                stopped.upload_queue_for_deletion.no_pending_work()
+            }
+            UploadQueue::Uninitialized
+            | UploadQueue::Stopped(UploadQueueStopped::Uninitialized) => true,
+        }
+    }
 }

 pub(crate) struct UploadQueueAccessor<'a> {
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -12,7 +12,7 @@ pub mod merge_iterator;

 use crate::context::{AccessStatsBehavior, RequestContext};
 use bytes::Bytes;
-use pageserver_api::key::Key;
+use pageserver_api::key::{Key, NON_INHERITED_SPARSE_RANGE};
 use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
 use pageserver_api::record::NeonWalRecord;
 use pageserver_api::value::Value;
@@ -196,6 +196,9 @@ impl ValuesReconstructState {
    /// Returns true if this was the last value needed for the key and false otherwise.
    ///
    /// If the key is done after the update, mark it as such.
+    ///
+    /// If the key is in the sparse keyspace (i.e., aux files), we do not track them in
+    /// `key_done`.
    pub(crate) fn update_key(
        &mut self,
        key: &Key,
@@ -206,10 +209,18 @@ impl ValuesReconstructState {
            .keys
            .entry(*key)
            .or_insert(Ok(VectoredValueReconstructState::default()));
-
+        let is_sparse_key = NON_INHERITED_SPARSE_RANGE.contains(key);
        if let Ok(state) = state {
            let key_done = match state.situation {
-                ValueReconstructSituation::Complete => unreachable!(),
+                ValueReconstructSituation::Complete => {
+                    if is_sparse_key {
+                        // Sparse keyspace might be visited multiple times because
+                        // we don't track unmapped keyspaces.
+                        return ValueReconstructSituation::Complete;
+                    } else {
+                        unreachable!()
+                    }
+                }
                ValueReconstructSituation::Continue => match value {
                    Value::Image(img) => {
                        state.img = Some((lsn, img));
@@ -234,7 +245,9 @@ impl ValuesReconstructState {

            if key_done && state.situation == ValueReconstructSituation::Continue {
                state.situation = ValueReconstructSituation::Complete;
-                self.keys_done.add_key(*key);
+                if !is_sparse_key {
+                    self.keys_done.add_key(*key);
+                }
            }

            state.situation
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -653,6 +653,10 @@ impl DeltaLayerWriter {
        })
    }

+    pub fn is_empty(&self) -> bool {
+        self.inner.as_ref().unwrap().num_keys == 0
+    }
+
    ///
    /// Append a key-value pair to the file.
    ///
--- a/pageserver/src/tenant/storage_layer/filter_iterator.rs
+++ b/pageserver/src/tenant/storage_layer/filter_iterator.rs
@@ -1,4 +1,4 @@
-use std::ops::Range;
+use std::{ops::Range, sync::Arc};

 use anyhow::bail;
 use pageserver_api::{
@@ -9,7 +9,10 @@ use utils::lsn::Lsn;

 use pageserver_api::value::Value;

-use super::merge_iterator::MergeIterator;
+use super::{
+    merge_iterator::{MergeIterator, MergeIteratorItem},
+    PersistentLayerKey,
+};

 /// A filter iterator over merge iterators (and can be easily extended to other types of iterators).
 ///
@@ -48,10 +51,10 @@ impl<'a> FilterIterator<'a> {
        })
    }

-    pub async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
-        while let Some(item) = self.inner.next().await? {
+    async fn next_inner<R: MergeIteratorItem>(&mut self) -> anyhow::Result<Option<R>> {
+        while let Some(item) = self.inner.next_inner::<R>().await? {
            while self.current_filter_idx < self.retain_key_filters.len()
-                && item.0 >= self.retain_key_filters[self.current_filter_idx].end
+                && item.key_lsn_value().0 >= self.retain_key_filters[self.current_filter_idx].end
            {
                // [filter region]    [filter region]     [filter region]
                //                                     ^ item
@@ -68,7 +71,7 @@ impl<'a> FilterIterator<'a> {
                //                                                 ^ current filter (nothing)
                return Ok(None);
            }
-            if self.retain_key_filters[self.current_filter_idx].contains(&item.0) {
+            if self.retain_key_filters[self.current_filter_idx].contains(&item.key_lsn_value().0) {
                // [filter region]    [filter region]     [filter region]
                //                                              ^ item
                //                                        ^ current filter
@@ -81,6 +84,16 @@ impl<'a> FilterIterator<'a> {
        }
        Ok(None)
    }
+
+    pub async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
+        self.next_inner().await
+    }
+
+    pub async fn next_with_trace(
+        &mut self,
+    ) -> anyhow::Result<Option<((Key, Lsn, Value), Arc<PersistentLayerKey>)>> {
+        self.next_inner().await
+    }
 }

 #[cfg(test)]
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -12,7 +12,7 @@ use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::PageReconstructError;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
 use crate::{l0_flush, page_cache};
-use anyhow::{anyhow, Context, Result};
+use anyhow::{anyhow, Result};
 use camino::Utf8PathBuf;
 use pageserver_api::key::CompactKey;
 use pageserver_api::key::Key;
@@ -25,6 +25,7 @@ use std::sync::{Arc, OnceLock};
 use std::time::Instant;
 use tracing::*;
 use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap};
+use wal_decoder::serialized_batch::{SerializedValueBatch, SerializedValueMeta, ValueMeta};
 // avoid binding to Write (conflicts with std::io::Write)
 // while being able to use std::fmt::Write's methods
 use crate::metrics::TIMELINE_EPHEMERAL_BYTES;
@@ -66,6 +67,8 @@ pub struct InMemoryLayer {
    /// The above fields never change, except for `end_lsn`, which is only set once.
    /// All other changing parts are in `inner`, and protected by a mutex.
    inner: RwLock<InMemoryLayerInner>,
+
+    estimated_in_mem_size: AtomicU64,
 }

 impl std::fmt::Debug for InMemoryLayer {
@@ -452,6 +455,7 @@ impl InMemoryLayer {
                        len,
                        will_init,
                    } = index_entry.unpack();
+
                    reads.entry(key).or_default().push(ValueRead {
                        entry_lsn: *entry_lsn,
                        read: vectored_dio_read::LogicalRead::new(
@@ -513,68 +517,6 @@ impl InMemoryLayer {
    }
 }

-/// Offset of a particular Value within a serialized batch.
-struct SerializedBatchOffset {
-    key: CompactKey,
-    lsn: Lsn,
-    // TODO: separate type when we start serde-serializing this value, to avoid coupling
-    // in-memory representation to serialization format.
-    index_entry: IndexEntry,
-}
-
-pub struct SerializedBatch {
-    /// Blobs serialized in EphemeralFile's native format, ready for passing to [`EphemeralFile::write_raw`].
-    pub(crate) raw: Vec<u8>,
-
-    /// Index of values in [`Self::raw`], using offsets relative to the start of the buffer.
-    offsets: Vec<SerializedBatchOffset>,
-
-    /// The highest LSN of any value in the batch
-    pub(crate) max_lsn: Lsn,
-}
-
-impl SerializedBatch {
-    pub fn from_values(batch: Vec<(CompactKey, Lsn, usize, Value)>) -> anyhow::Result<Self> {
-        // Pre-allocate a big flat buffer to write into. This should be large but not huge: it is soft-limited in practice by
-        // [`crate::pgdatadir_mapping::DatadirModification::MAX_PENDING_BYTES`]
-        let buffer_size = batch.iter().map(|i| i.2).sum::<usize>();
-        let mut cursor = std::io::Cursor::new(Vec::<u8>::with_capacity(buffer_size));
-
-        let mut offsets: Vec<SerializedBatchOffset> = Vec::with_capacity(batch.len());
-        let mut max_lsn: Lsn = Lsn(0);
-        for (key, lsn, val_ser_size, val) in batch {
-            let relative_off = cursor.position();
-
-            val.ser_into(&mut cursor)
-                .expect("Writing into in-memory buffer is infallible");
-
-            offsets.push(SerializedBatchOffset {
-                key,
-                lsn,
-                index_entry: IndexEntry::new(IndexEntryNewArgs {
-                    base_offset: 0,
-                    batch_offset: relative_off,
-                    len: val_ser_size,
-                    will_init: val.will_init(),
-                })
-                .context("higher-level code ensures that values are within supported ranges")?,
-            });
-            max_lsn = std::cmp::max(max_lsn, lsn);
-        }
-
-        let buffer = cursor.into_inner();
-
-        // Assert that we didn't do any extra allocations while building buffer.
-        debug_assert!(buffer.len() <= buffer_size);
-
-        Ok(Self {
-            raw: buffer,
-            offsets,
-            max_lsn,
-        })
-    }
-}
-
 fn inmem_layer_display(mut f: impl Write, start_lsn: Lsn, end_lsn: Lsn) -> std::fmt::Result {
    write!(f, "inmem-{:016X}-{:016X}", start_lsn.0, end_lsn.0)
 }
@@ -603,6 +545,10 @@ impl InMemoryLayer {
        Ok(inner.file.len())
    }

+    pub fn estimated_in_mem_size(&self) -> u64 {
+        self.estimated_in_mem_size.load(AtomicOrdering::Relaxed)
+    }
+
    /// Create a new, empty, in-memory layer
    pub async fn create(
        conf: &'static PageServerConf,
@@ -632,6 +578,7 @@ impl InMemoryLayer {
                file,
                resource_units: GlobalResourceUnits::new(),
            }),
+            estimated_in_mem_size: AtomicU64::new(0),
        })
    }

@@ -642,7 +589,7 @@ impl InMemoryLayer {
    /// TODO: it can be made retryable if we aborted the process on EphemeralFile write errors.
    pub async fn put_batch(
        &self,
-        serialized_batch: SerializedBatch,
+        serialized_batch: SerializedValueBatch,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        let mut inner = self.inner.write().await;
@@ -650,27 +597,13 @@ impl InMemoryLayer {

        let base_offset = inner.file.len();

-        let SerializedBatch {
+        let SerializedValueBatch {
            raw,
-            mut offsets,
+            metadata,
            max_lsn: _,
+            len: _,
        } = serialized_batch;

-        // Add the base_offset to the batch's index entries which are relative to the batch start.
-        for offset in &mut offsets {
-            let IndexEntryUnpacked {
-                will_init,
-                len,
-                pos,
-            } = offset.index_entry.unpack();
-            offset.index_entry = IndexEntry::new(IndexEntryNewArgs {
-                base_offset,
-                batch_offset: pos,
-                len: len.into_usize(),
-                will_init,
-            })?;
-        }
-
        // Write the batch to the file
        inner.file.write_raw(&raw, ctx).await?;
        let new_size = inner.file.len();
@@ -683,12 +616,28 @@ impl InMemoryLayer {
        assert_eq!(new_size, expected_new_len);

        // Update the index with the new entries
-        for SerializedBatchOffset {
-            key,
-            lsn,
-            index_entry,
-        } in offsets
-        {
+        for meta in metadata {
+            let SerializedValueMeta {
+                key,
+                lsn,
+                batch_offset,
+                len,
+                will_init,
+            } = match meta {
+                ValueMeta::Serialized(ser) => ser,
+                ValueMeta::Observed(_) => {
+                    continue;
+                }
+            };
+
+            // Add the base_offset to the batch's index entries which are relative to the batch start.
+            let index_entry = IndexEntry::new(IndexEntryNewArgs {
+                base_offset,
+                batch_offset,
+                len,
+                will_init,
+            })?;
+
            let vec_map = inner.index.entry(key).or_default();
            let old = vec_map.append_or_update_last(lsn, index_entry).unwrap().0;
            if old.is_some() {
@@ -700,6 +649,12 @@ impl InMemoryLayer {
                // because this case is unexpected, and we would like tests to fail if this happens.
                warn!("Key {} at {} written twice at same LSN", key, lsn);
            }
+            self.estimated_in_mem_size.fetch_add(
+                (std::mem::size_of::<CompactKey>()
+                    + std::mem::size_of::<Lsn>()
+                    + std::mem::size_of::<IndexEntry>()) as u64,
+                AtomicOrdering::Relaxed,
+            );
        }

        inner.resource_units.maybe_publish_size(new_size);
--- a/pageserver/src/tenant/storage_layer/merge_iterator.rs
+++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs
@@ -1,6 +1,7 @@
 use std::{
    cmp::Ordering,
    collections::{binary_heap, BinaryHeap},
+    sync::Arc,
 };

 use anyhow::bail;
@@ -13,10 +14,11 @@ use pageserver_api::value::Value;
 use super::{
    delta_layer::{DeltaLayerInner, DeltaLayerIterator},
    image_layer::{ImageLayerInner, ImageLayerIterator},
+    PersistentLayerDesc, PersistentLayerKey,
 };

 #[derive(Clone, Copy)]
-enum LayerRef<'a> {
+pub(crate) enum LayerRef<'a> {
    Image(&'a ImageLayerInner),
    Delta(&'a DeltaLayerInner),
 }
@@ -62,18 +64,20 @@ impl LayerIterRef<'_> {
 /// 1. Unified iterator for image and delta layers.
 /// 2. `Ord` for use in [`MergeIterator::heap`] (for the k-merge).
 /// 3. Lazy creation of the real delta/image iterator.
-enum IteratorWrapper<'a> {
+pub(crate) enum IteratorWrapper<'a> {
    NotLoaded {
        ctx: &'a RequestContext,
        first_key_lower_bound: (Key, Lsn),
        layer: LayerRef<'a>,
+        source_desc: Arc<PersistentLayerKey>,
    },
    Loaded {
        iter: PeekableLayerIterRef<'a>,
+        source_desc: Arc<PersistentLayerKey>,
    },
 }

-struct PeekableLayerIterRef<'a> {
+pub(crate) struct PeekableLayerIterRef<'a> {
    iter: LayerIterRef<'a>,
    peeked: Option<(Key, Lsn, Value)>, // None == end
 }
@@ -151,6 +155,12 @@ impl<'a> IteratorWrapper<'a> {
            layer: LayerRef::Image(image_layer),
            first_key_lower_bound: (image_layer.key_range().start, image_layer.lsn()),
            ctx,
+            source_desc: PersistentLayerKey {
+                key_range: image_layer.key_range().clone(),
+                lsn_range: PersistentLayerDesc::image_layer_lsn_range(image_layer.lsn()),
+                is_delta: false,
+            }
+            .into(),
        }
    }

@@ -162,12 +172,18 @@ impl<'a> IteratorWrapper<'a> {
            layer: LayerRef::Delta(delta_layer),
            first_key_lower_bound: (delta_layer.key_range().start, delta_layer.lsn_range().start),
            ctx,
+            source_desc: PersistentLayerKey {
+                key_range: delta_layer.key_range().clone(),
+                lsn_range: delta_layer.lsn_range().clone(),
+                is_delta: true,
+            }
+            .into(),
        }
    }

    fn peek_next_key_lsn_value(&self) -> Option<(&Key, Lsn, Option<&Value>)> {
        match self {
-            Self::Loaded { iter } => iter
+            Self::Loaded { iter, .. } => iter
                .peek()
                .as_ref()
                .map(|(key, lsn, val)| (key, *lsn, Some(val))),
@@ -191,6 +207,7 @@ impl<'a> IteratorWrapper<'a> {
            ctx,
            first_key_lower_bound,
            layer,
+            source_desc,
        } = self
        else {
            unreachable!()
@@ -206,7 +223,10 @@ impl<'a> IteratorWrapper<'a> {
                );
            }
        }
-        *self = Self::Loaded { iter };
+        *self = Self::Loaded {
+            iter,
+            source_desc: source_desc.clone(),
+        };
        Ok(())
    }

@@ -220,11 +240,19 @@ impl<'a> IteratorWrapper<'a> {
    /// The public interfaces to use are [`crate::tenant::storage_layer::delta_layer::DeltaLayerIterator`] and
    /// [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`].
    async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
-        let Self::Loaded { iter } = self else {
+        let Self::Loaded { iter, .. } = self else {
            panic!("must load the iterator before using")
        };
        iter.next().await
    }
+
+    /// Get the persistent layer key corresponding to this iterator
+    fn trace_source(&self) -> Arc<PersistentLayerKey> {
+        match self {
+            Self::Loaded { source_desc, .. } => source_desc.clone(),
+            Self::NotLoaded { source_desc, .. } => source_desc.clone(),
+        }
+    }
 }

 /// A merge iterator over delta/image layer iterators.
@@ -242,6 +270,32 @@ pub struct MergeIterator<'a> {
    heap: BinaryHeap<IteratorWrapper<'a>>,
 }

+pub(crate) trait MergeIteratorItem {
+    fn new(item: (Key, Lsn, Value), iterator: &IteratorWrapper<'_>) -> Self;
+
+    fn key_lsn_value(&self) -> &(Key, Lsn, Value);
+}
+
+impl MergeIteratorItem for (Key, Lsn, Value) {
+    fn new(item: (Key, Lsn, Value), _: &IteratorWrapper<'_>) -> Self {
+        item
+    }
+
+    fn key_lsn_value(&self) -> &(Key, Lsn, Value) {
+        self
+    }
+}
+
+impl MergeIteratorItem for ((Key, Lsn, Value), Arc<PersistentLayerKey>) {
+    fn new(item: (Key, Lsn, Value), iter: &IteratorWrapper<'_>) -> Self {
+        (item, iter.trace_source().clone())
+    }
+
+    fn key_lsn_value(&self) -> &(Key, Lsn, Value) {
+        &self.0
+    }
+}
+
 impl<'a> MergeIterator<'a> {
    pub fn create(
        deltas: &[&'a DeltaLayerInner],
@@ -260,7 +314,7 @@ impl<'a> MergeIterator<'a> {
        }
    }

-    pub async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
+    pub(crate) async fn next_inner<R: MergeIteratorItem>(&mut self) -> anyhow::Result<Option<R>> {
        while let Some(mut iter) = self.heap.peek_mut() {
            if !iter.is_loaded() {
                // Once we load the iterator, we can know the real first key-value pair in the iterator.
@@ -275,10 +329,22 @@ impl<'a> MergeIterator<'a> {
                binary_heap::PeekMut::pop(iter);
                continue;
            };
-            return Ok(Some(item));
+            return Ok(Some(R::new(item, &iter)));
        }
        Ok(None)
    }
+
+    /// Get the next key-value pair from the iterator.
+    pub async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
+        self.next_inner().await
+    }
+
+    /// Get the next key-value pair from the iterator, and trace where the key comes from.
+    pub async fn next_with_trace(
+        &mut self,
+    ) -> anyhow::Result<Option<((Key, Lsn, Value), Arc<PersistentLayerKey>)>> {
+        self.next_inner().await
+    }
 }

 #[cfg(test)]
@@ -496,7 +562,7 @@ mod tests {
            (
                get_key(0),
                Lsn(0x10),
-                Value::WalRecord(NeonWalRecord::wal_init()),
+                Value::WalRecord(NeonWalRecord::wal_init("")),
            ),
            (
                get_key(0),
@@ -506,7 +572,7 @@ mod tests {
            (
                get_key(5),
                Lsn(0x10),
-                Value::WalRecord(NeonWalRecord::wal_init()),
+                Value::WalRecord(NeonWalRecord::wal_init("")),
            ),
            (
                get_key(5),
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -23,9 +23,10 @@ use handle::ShardTimelineId;
 use offload::OffloadError;
 use once_cell::sync::Lazy;
 use pageserver_api::{
+    config::tenant_conf_defaults::DEFAULT_COMPACTION_THRESHOLD,
    key::{
-        CompactKey, KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX,
-        NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE,
+        KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE,
+        NON_INHERITED_SPARSE_RANGE,
    },
    keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning},
    models::{
@@ -49,6 +50,7 @@ use utils::{
    fs_ext, pausable_failpoint,
    sync::gate::{Gate, GateGuard},
 };
+use wal_decoder::serialized_batch::SerializedValueBatch;

 use std::sync::atomic::Ordering as AtomicOrdering;
 use std::sync::{Arc, Mutex, RwLock, Weak};
@@ -131,7 +133,6 @@ use crate::task_mgr::TaskKind;
 use crate::tenant::gc_result::GcResult;
 use crate::ZERO_PAGE;
 use pageserver_api::key::Key;
-use pageserver_api::value::Value;

 use self::delete::DeleteTimelineFlow;
 pub(super) use self::eviction_task::EvictionTaskTenantState;
@@ -141,9 +142,7 @@ use self::logical_size::LogicalSize;
 use self::walreceiver::{WalReceiver, WalReceiverConf};

 use super::{
-    config::TenantConf,
-    storage_layer::{inmemory_layer, LayerVisibilityHint},
-    upload_queue::NotInitialized,
+    config::TenantConf, storage_layer::LayerVisibilityHint, upload_queue::NotInitialized,
    MaybeOffloaded,
 };
 use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
@@ -157,6 +156,9 @@ use super::{
    GcError,
 };

+#[cfg(test)]
+use pageserver_api::value::Value;
+
 #[derive(Debug, PartialEq, Eq, Clone, Copy)]
 pub(crate) enum FlushLoopState {
    NotStarted,
@@ -475,8 +477,21 @@ impl GcInfo {
        self.retain_lsns.sort_by_key(|i| i.0);
    }

-    pub(super) fn remove_child(&mut self, child_id: TimelineId) {
-        self.retain_lsns.retain(|i| i.1 != child_id);
+    pub(super) fn remove_child_maybe_offloaded(
+        &mut self,
+        child_id: TimelineId,
+        maybe_offloaded: MaybeOffloaded,
+    ) {
+        self.retain_lsns
+            .retain(|i| !(i.1 == child_id && i.2 == maybe_offloaded));
+    }
+
+    pub(super) fn remove_child_not_offloaded(&mut self, child_id: TimelineId) {
+        self.remove_child_maybe_offloaded(child_id, MaybeOffloaded::No);
+    }
+
+    pub(super) fn remove_child_offloaded(&mut self, child_id: TimelineId) {
+        self.remove_child_maybe_offloaded(child_id, MaybeOffloaded::Yes);
    }
 }

@@ -851,6 +866,10 @@ pub(crate) enum ShutdownMode {
    /// While we are flushing, we continue to accept read I/O for LSNs ingested before
    /// the call to [`Timeline::shutdown`].
    FreezeAndFlush,
+    /// Only flush the layers to the remote storage without freezing any open layers. This is the
+    /// mode used by ancestor detach and any other operation that reloads a tenant without increasing
+    /// the generation number.
+    Flush,
    /// Shut down immediately, without waiting for any open layers to flush.
    Hard,
 }
@@ -1564,12 +1583,16 @@ impl Timeline {
    ///
    /// This is neccessary but not sufficient for offloading of the timeline as it might have
    /// child timelines that are not offloaded yet.
-    pub(crate) fn can_offload(&self) -> bool {
+    pub(crate) fn can_offload(&self) -> (bool, &'static str) {
        if self.remote_client.is_archived() != Some(true) {
-            return false;
+            return (false, "the timeline is not archived");
+        }
+        if !self.remote_client.no_pending_work() {
+            // if the remote client is still processing some work, we can't offload
+            return (false, "the upload queue is not drained yet");
        }

-        true
+        (true, "ok")
    }

    /// Outermost timeline compaction operation; downloads needed layers. Returns whether we have pending
@@ -1677,11 +1700,6 @@ impl Timeline {
    pub(crate) async fn shutdown(&self, mode: ShutdownMode) {
        debug_assert_current_span_has_tenant_and_timeline_id();

-        let try_freeze_and_flush = match mode {
-            ShutdownMode::FreezeAndFlush => true,
-            ShutdownMode::Hard => false,
-        };
-
        // Regardless of whether we're going to try_freeze_and_flush
        // or not, stop ingesting any more data. Walreceiver only provides
        // cancellation but no "wait until gone", because it uses the Timeline::gate.
@@ -1703,7 +1721,7 @@ impl Timeline {
        // ... and inform any waiters for newer LSNs that there won't be any.
        self.last_record_lsn.shutdown();

-        if try_freeze_and_flush {
+        if let ShutdownMode::FreezeAndFlush = mode {
            if let Some((open, frozen)) = self
                .layers
                .read()
@@ -1745,6 +1763,20 @@ impl Timeline {
                    warn!("failed to freeze and flush: {e:#}");
                }
            }
+
+            // `self.remote_client.shutdown().await` above should have already flushed everything from the queue, but
+            // we also do a final check here to ensure that the queue is empty.
+            if !self.remote_client.no_pending_work() {
+                warn!("still have pending work in remote upload queue, but continuing shutting down anyways");
+            }
+        }
+
+        if let ShutdownMode::Flush = mode {
+            // drain the upload queue
+            self.remote_client.shutdown().await;
+            if !self.remote_client.no_pending_work() {
+                warn!("still have pending work in remote upload queue, but continuing shutting down anyways");
+            }
        }

        // Signal any subscribers to our cancellation token to drop out
@@ -3487,18 +3519,37 @@ impl Timeline {

                let timer = self.metrics.flush_time_histo.start_timer();

+                let num_frozen_layers;
+                let frozen_layer_total_size;
                let layer_to_flush = {
                    let guard = self.layers.read().await;
                    let Ok(lm) = guard.layer_map() else {
                        info!("dropping out of flush loop for timeline shutdown");
                        return;
                    };
+                    num_frozen_layers = lm.frozen_layers.len();
+                    frozen_layer_total_size = lm
+                        .frozen_layers
+                        .iter()
+                        .map(|l| l.estimated_in_mem_size())
+                        .sum::<u64>();
                    lm.frozen_layers.front().cloned()
                    // drop 'layers' lock to allow concurrent reads and writes
                };
                let Some(layer_to_flush) = layer_to_flush else {
                    break Ok(());
                };
+                if num_frozen_layers
+                    > std::cmp::max(
+                        self.get_compaction_threshold(),
+                        DEFAULT_COMPACTION_THRESHOLD,
+                    )
+                    && frozen_layer_total_size >= /* 128 MB */ 128000000
+                {
+                    tracing::warn!(
+                        "too many frozen layers: {num_frozen_layers} layers with estimated in-mem size of {frozen_layer_total_size} bytes",
+                    );
+                }
                match self.flush_frozen_layer(layer_to_flush, ctx).await {
                    Ok(this_layer_to_lsn) => {
                        flushed_to_lsn = std::cmp::max(flushed_to_lsn, this_layer_to_lsn);
@@ -4089,6 +4140,7 @@ impl Timeline {
    ) -> Result<ImageLayerCreationOutcome, CreateImageLayersError> {
        // Metadata keys image layer creation.
        let mut reconstruct_state = ValuesReconstructState::default();
+        let begin = Instant::now();
        let data = self
            .get_vectored_impl(partition.clone(), lsn, &mut reconstruct_state, ctx)
            .await?;
@@ -4105,14 +4157,11 @@ impl Timeline {
            (new_data, total_kb_retrieved / 1024, total_keys_retrieved)
        };
        let delta_files_accessed = reconstruct_state.get_delta_layers_visited();
+        let elapsed = begin.elapsed();

        let trigger_generation = delta_files_accessed as usize >= MAX_AUX_FILE_V2_DELTAS;
-        debug!(
-            trigger_generation,
-            delta_files_accessed,
-            total_kb_retrieved,
-            total_keys_retrieved,
-            "generate metadata images"
+        info!(
+            "metadata key compaction: trigger_generation={trigger_generation}, delta_files_accessed={delta_files_accessed}, total_kb_retrieved={total_kb_retrieved}, total_keys_retrieved={total_keys_retrieved}, read_time={}s", elapsed.as_secs_f64()
        );

        if !trigger_generation && mode == ImageLayerCreationMode::Try {
@@ -4465,7 +4514,7 @@ impl Drop for Timeline {
            // This lock should never be poisoned, but in case it is we do a .map() instead of
            // an unwrap(), to avoid panicking in a destructor and thereby aborting the process.
            if let Ok(mut gc_info) = ancestor.gc_info.write() {
-                gc_info.remove_child(self.timeline_id)
+                gc_info.remove_child_not_offloaded(self.timeline_id)
            }
        }
    }
@@ -4994,7 +5043,7 @@ impl Timeline {

            // 1. Is it newer than GC horizon cutoff point?
            if l.get_lsn_range().end > space_cutoff {
-                debug!(
+                info!(
                    "keeping {} because it's newer than space_cutoff {}",
                    l.layer_name(),
                    space_cutoff,
@@ -5005,7 +5054,7 @@ impl Timeline {

            // 2. It is newer than PiTR cutoff point?
            if l.get_lsn_range().end > time_cutoff {
-                debug!(
+                info!(
                    "keeping {} because it's newer than time_cutoff {}",
                    l.layer_name(),
                    time_cutoff,
@@ -5024,7 +5073,7 @@ impl Timeline {
            for retain_lsn in &retain_lsns {
                // start_lsn is inclusive
                if &l.get_lsn_range().start <= retain_lsn {
-                    debug!(
+                    info!(
                        "keeping {} because it's still might be referenced by child branch forked at {} is_dropped: xx is_incremental: {}",
                        l.layer_name(),
                        retain_lsn,
@@ -5039,7 +5088,7 @@ impl Timeline {
            if let Some(lsn) = &max_lsn_with_valid_lease {
                // keep if layer start <= any of the lease
                if &l.get_lsn_range().start <= lsn {
-                    debug!(
+                    info!(
                        "keeping {} because there is a valid lease preventing GC at {}",
                        l.layer_name(),
                        lsn,
@@ -5071,13 +5120,13 @@ impl Timeline {
            if !layers
                .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))
            {
-                debug!("keeping {} because it is the latest layer", l.layer_name());
+                info!("keeping {} because it is the latest layer", l.layer_name());
                result.layers_not_updated += 1;
                continue 'outer;
            }

            // We didn't find any reason to keep this file, so remove it.
-            debug!(
+            info!(
                "garbage collecting {} is_dropped: xx is_incremental: {}",
                l.layer_name(),
                l.is_incremental(),
@@ -5736,23 +5785,22 @@ impl<'a> TimelineWriter<'a> {
    /// Put a batch of keys at the specified Lsns.
    pub(crate) async fn put_batch(
        &mut self,
-        batch: Vec<(CompactKey, Lsn, usize, Value)>,
+        batch: SerializedValueBatch,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        if batch.is_empty() {
            return Ok(());
        }

-        let serialized_batch = inmemory_layer::SerializedBatch::from_values(batch)?;
-        let batch_max_lsn = serialized_batch.max_lsn;
-        let buf_size: u64 = serialized_batch.raw.len() as u64;
+        let batch_max_lsn = batch.max_lsn;
+        let buf_size: u64 = batch.buffer_size() as u64;

        let action = self.get_open_layer_action(batch_max_lsn, buf_size);
        let layer = self
            .handle_open_layer_action(batch_max_lsn, action, ctx)
            .await?;

-        let res = layer.put_batch(serialized_batch, ctx).await;
+        let res = layer.put_batch(batch, ctx).await;

        if res.is_ok() {
            // Update the current size only when the entire write was ok.
@@ -5787,11 +5835,14 @@ impl<'a> TimelineWriter<'a> {
            );
        }
        let val_ser_size = value.serialized_size().unwrap() as usize;
-        self.put_batch(
-            vec![(key.to_compact(), lsn, val_ser_size, value.clone())],
-            ctx,
-        )
-        .await
+        let batch = SerializedValueBatch::from_values(vec![(
+            key.to_compact(),
+            lsn,
+            val_ser_size,
+            value.clone(),
+        )]);
+
+        self.put_batch(batch, ctx).await
    }

    pub(crate) async fn delete_batch(
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -4,7 +4,7 @@
 //!
 //! The old legacy algorithm is implemented directly in `timeline.rs`.

-use std::collections::{BinaryHeap, HashSet};
+use std::collections::{BinaryHeap, HashMap, HashSet};
 use std::ops::{Deref, Range};
 use std::sync::Arc;

@@ -56,7 +56,7 @@ use pageserver_api::value::Value;

 use utils::lsn::Lsn;

-use pageserver_compaction::helpers::overlaps_with;
+use pageserver_compaction::helpers::{fully_contains, overlaps_with};
 use pageserver_compaction::interface::*;

 use super::CompactionError;
@@ -64,6 +64,23 @@ use super::CompactionError;
 /// Maximum number of deltas before generating an image layer in bottom-most compaction.
 const COMPACTION_DELTA_THRESHOLD: usize = 5;

+pub struct GcCompactionJobDescription {
+    /// All layers to read in the compaction job
+    selected_layers: Vec<Layer>,
+    /// GC cutoff of the job
+    gc_cutoff: Lsn,
+    /// LSNs to retain for the job
+    retain_lsns_below_horizon: Vec<Lsn>,
+    /// Maximum layer LSN processed in this compaction
+    max_layer_lsn: Lsn,
+    /// Only compact layers overlapping with this range
+    compaction_key_range: Range<Key>,
+    /// When partial compaction is enabled, these layers need to be rewritten to ensure no overlap.
+    /// This field is here solely for debugging. The field will not be read once the compaction
+    /// description is generated.
+    rewrite_layers: Vec<Arc<PersistentLayerDesc>>,
+}
+
 /// The result of bottom-most compaction for a single key at each LSN.
 #[derive(Debug)]
 #[cfg_attr(test, derive(PartialEq))]
@@ -1722,7 +1739,8 @@ impl Timeline {
        flags: EnumSet<CompactFlags>,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
-        self.partial_compact_with_gc(None, cancel, flags, ctx).await
+        self.partial_compact_with_gc(Key::MIN..Key::MAX, cancel, flags, ctx)
+            .await
    }

    /// An experimental compaction building block that combines compaction with garbage collection.
@@ -1732,12 +1750,15 @@ impl Timeline {
    /// layers and image layers, which generates image layers on the gc horizon, drop deltas below gc horizon,
    /// and create delta layers with all deltas >= gc horizon.
    ///
-    /// If `key_range`, it will only compact the keys within the range, aka partial compaction. This functionality
-    /// is not complete yet, and if it is set, only image layers will be generated.
-    ///
+    /// If `key_range` is provided, it will only compact the keys within the range, aka partial compaction.
+    /// Partial compaction will read and process all layers overlapping with the key range, even if it might
+    /// contain extra keys. After the gc-compaction phase completes, delta layers that are not fully contained
+    /// within the key range will be rewritten to ensure they do not overlap with the delta layers produced by
+    /// the compaction. Providing `Key::MIN..Key::MAX` to the function indicates a full compaction, though
+    /// technically, `Key::MAX` is not part of the range.
    pub(crate) async fn partial_compact_with_gc(
        self: &Arc<Self>,
-        compaction_key_range: Option<Range<Key>>,
+        compaction_key_range: Range<Key>,
        cancel: &CancellationToken,
        flags: EnumSet<CompactFlags>,
        ctx: &RequestContext,
@@ -1762,9 +1783,8 @@ impl Timeline {
        .await?;

        let dry_run = flags.contains(CompactFlags::DryRun);
-        let partial_compaction = compaction_key_range.is_some();

-        if let Some(ref compaction_key_range) = compaction_key_range {
+        if compaction_key_range != (Key::MIN..Key::MAX) {
            info!("running enhanced gc bottom-most compaction, dry_run={dry_run}, compaction_key_range={}..{}", compaction_key_range.start, compaction_key_range.end);
        } else {
            info!("running enhanced gc bottom-most compaction, dry_run={dry_run}");
@@ -1780,7 +1800,7 @@ impl Timeline {
        // The layer selection has the following properties:
        // 1. If a layer is in the selection, all layers below it are in the selection.
        // 2. Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection.
-        let (layer_selection, gc_cutoff, retain_lsns_below_horizon) = if !partial_compaction {
+        let job_desc = {
            let guard = self.layers.read().await;
            let layers = guard.layer_map()?;
            let gc_info = self.gc_info.read().unwrap();
@@ -1810,9 +1830,21 @@ impl Timeline {
            };
            // Then, pick all the layers that are below the max_layer_lsn. This is to ensure we can pick all single-key
            // layers to compact.
+            let mut rewrite_layers = Vec::new();
            for desc in layers.iter_historic_layers() {
-                if desc.get_lsn_range().end <= max_layer_lsn {
+                if desc.get_lsn_range().end <= max_layer_lsn
+                    && overlaps_with(&desc.get_key_range(), &compaction_key_range)
+                {
+                    // If the layer overlaps with the compaction key range, we need to read it to obtain all keys within the range,
+                    // even if it might contain extra keys
                    selected_layers.push(guard.get_from_desc(&desc));
+                    // If the layer is not fully contained within the key range, we need to rewrite it if it's a delta layer (it's fine
+                    // to overlap image layers)
+                    if desc.is_delta()
+                        && !fully_contains(&compaction_key_range, &desc.get_key_range())
+                    {
+                        rewrite_layers.push(desc);
+                    }
                }
            }
            if selected_layers.is_empty() {
@@ -1820,82 +1852,59 @@ impl Timeline {
                return Ok(());
            }
            retain_lsns_below_horizon.sort();
-            (selected_layers, gc_cutoff, retain_lsns_below_horizon)
-        } else {
-            // In case of partial compaction, we currently only support generating image layers, and therefore,
-            // we pick all layers that are below the lowest retain_lsn and does not intersect with any of the layers.
-            let guard = self.layers.read().await;
-            let layers = guard.layer_map()?;
-            let gc_info = self.gc_info.read().unwrap();
-            let mut min_lsn = gc_info.cutoffs.select_min();
-            for (lsn, _, _) in &gc_info.retain_lsns {
-                if lsn < &min_lsn {
-                    min_lsn = *lsn;
-                }
+            GcCompactionJobDescription {
+                selected_layers,
+                gc_cutoff,
+                retain_lsns_below_horizon,
+                max_layer_lsn,
+                compaction_key_range,
+                rewrite_layers,
            }
-            for lsn in gc_info.leases.keys() {
-                if lsn < &min_lsn {
-                    min_lsn = *lsn;
-                }
-            }
-            let mut selected_layers = Vec::new();
-            drop(gc_info);
-            // |-------| |-------| |-------|
-            // | Delta | | Delta | | Delta | -- min_lsn could be intersecting with the layers
-            // |-------| |-------| |-------| <- we want to pick all the layers below min_lsn, so that
-            // | Delta | | Delta | | Delta |    ...we can remove them after compaction
-            // |-------| |-------| |-------|
-            // Pick all the layers intersect or below the min_lsn, get the largest LSN in the selected layers.
-            let Some(compaction_key_range) = compaction_key_range.as_ref() else {
-                unreachable!()
-            };
-            for desc in layers.iter_historic_layers() {
-                if desc.get_lsn_range().end <= min_lsn
-                    && overlaps_with(&desc.key_range, compaction_key_range)
-                {
-                    selected_layers.push(guard.get_from_desc(&desc));
-                }
-            }
-            if selected_layers.is_empty() {
-                info!("no layers to compact with gc");
-                return Ok(());
-            }
-            (selected_layers, min_lsn, Vec::new())
        };
        let lowest_retain_lsn = if self.ancestor_timeline.is_some() {
-            if partial_compaction {
-                warn!("partial compaction cannot run on child branches (for now)");
-                return Ok(());
-            }
            Lsn(self.ancestor_lsn.0 + 1)
        } else {
-            let res = retain_lsns_below_horizon
+            let res = job_desc
+                .retain_lsns_below_horizon
                .first()
                .copied()
-                .unwrap_or(gc_cutoff);
+                .unwrap_or(job_desc.gc_cutoff);
            if cfg!(debug_assertions) {
                assert_eq!(
                    res,
-                    retain_lsns_below_horizon
+                    job_desc
+                        .retain_lsns_below_horizon
                        .iter()
                        .min()
                        .copied()
-                        .unwrap_or(gc_cutoff)
+                        .unwrap_or(job_desc.gc_cutoff)
                );
            }
            res
        };
        info!(
-            "picked {} layers for compaction with gc_cutoff={} lowest_retain_lsn={}",
-            layer_selection.len(),
-            gc_cutoff,
-            lowest_retain_lsn
+            "picked {} layers for compaction ({} layers need rewriting) with max_layer_lsn={} gc_cutoff={} lowest_retain_lsn={}, key_range={}..{}",
+            job_desc.selected_layers.len(),
+            job_desc.rewrite_layers.len(),
+            job_desc.max_layer_lsn,
+            job_desc.gc_cutoff,
+            lowest_retain_lsn,
+            job_desc.compaction_key_range.start,
+            job_desc.compaction_key_range.end
        );

-        self.check_compaction_space(&layer_selection).await?;
+        for layer in &job_desc.selected_layers {
+            debug!("read layer: {}", layer.layer_desc().key());
+        }
+        for layer in &job_desc.rewrite_layers {
+            debug!("rewrite layer: {}", layer.key());
+        }
+
+        self.check_compaction_space(&job_desc.selected_layers)
+            .await?;

        // Generate statistics for the compaction
-        for layer in &layer_selection {
+        for layer in &job_desc.selected_layers {
            let desc = layer.layer_desc();
            if desc.is_delta() {
                stat.visit_delta_layer(desc.file_size());
@@ -1906,25 +1915,25 @@ impl Timeline {

        // Step 1: construct a k-merge iterator over all layers.
        // Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point.
-        let layer_names: Vec<crate::tenant::storage_layer::LayerName> = layer_selection
+        let layer_names = job_desc
+            .selected_layers
            .iter()
            .map(|layer| layer.layer_desc().layer_name())
            .collect_vec();
        if let Some(err) = check_valid_layermap(&layer_names) {
-            bail!("cannot run gc-compaction because {}", err);
+            warn!("gc-compaction layer map check failed because {}, this is normal if partial compaction is not finished yet", err);
        }
        // The maximum LSN we are processing in this compaction loop
-        let end_lsn = layer_selection
+        let end_lsn = job_desc
+            .selected_layers
            .iter()
            .map(|l| l.layer_desc().lsn_range.end)
            .max()
            .unwrap();
-        // We don't want any of the produced layers to cover the full key range (i.e., MIN..MAX) b/c it will then be recognized
-        // as an L0 layer.
        let mut delta_layers = Vec::new();
        let mut image_layers = Vec::new();
        let mut downloaded_layers = Vec::new();
-        for layer in &layer_selection {
+        for layer in &job_desc.selected_layers {
            let resident_layer = layer.download_and_keep_resident().await?;
            downloaded_layers.push(resident_layer);
        }
@@ -1943,8 +1952,8 @@ impl Timeline {
            dense_ks,
            sparse_ks,
        )?;
-        // Step 2: Produce images+deltas. TODO: ensure newly-produced delta does not overlap with other deltas.
-        // Data of the same key.
+
+        // Step 2: Produce images+deltas.
        let mut accumulated_values = Vec::new();
        let mut last_key: Option<Key> = None;

@@ -1956,10 +1965,7 @@ impl Timeline {
                    self.conf,
                    self.timeline_id,
                    self.tenant_shard_id,
-                    compaction_key_range
-                        .as_ref()
-                        .map(|x| x.start)
-                        .unwrap_or(Key::MIN),
+                    job_desc.compaction_key_range.start,
                    lowest_retain_lsn,
                    self.get_compaction_target_size(),
                    ctx,
@@ -1979,6 +1985,13 @@ impl Timeline {
        )
        .await?;

+        #[derive(Default)]
+        struct RewritingLayers {
+            before: Option<DeltaLayerWriter>,
+            after: Option<DeltaLayerWriter>,
+        }
+        let mut delta_layer_rewriters = HashMap::<Arc<PersistentLayerKey>, RewritingLayers>::new();
+
        /// Returns None if there is no ancestor branch. Throw an error when the key is not found.
        ///
        /// Currently, we always get the ancestor image for each key in the child branch no matter whether the image
@@ -2004,10 +2017,51 @@ impl Timeline {
        // the key and LSN range are determined. However, to keep things simple here, we still
        // create this writer, and discard the writer in the end.

-        while let Some((key, lsn, val)) = merge_iter.next().await? {
+        while let Some(((key, lsn, val), desc)) = merge_iter.next_with_trace().await? {
            if cancel.is_cancelled() {
                return Err(anyhow!("cancelled")); // TODO: refactor to CompactionError and pass cancel error
            }
+            if !job_desc.compaction_key_range.contains(&key) {
+                if !desc.is_delta {
+                    continue;
+                }
+                let rewriter = delta_layer_rewriters.entry(desc.clone()).or_default();
+                let rewriter = if key < job_desc.compaction_key_range.start {
+                    if rewriter.before.is_none() {
+                        rewriter.before = Some(
+                            DeltaLayerWriter::new(
+                                self.conf,
+                                self.timeline_id,
+                                self.tenant_shard_id,
+                                desc.key_range.start,
+                                desc.lsn_range.clone(),
+                                ctx,
+                            )
+                            .await?,
+                        );
+                    }
+                    rewriter.before.as_mut().unwrap()
+                } else if key >= job_desc.compaction_key_range.end {
+                    if rewriter.after.is_none() {
+                        rewriter.after = Some(
+                            DeltaLayerWriter::new(
+                                self.conf,
+                                self.timeline_id,
+                                self.tenant_shard_id,
+                                job_desc.compaction_key_range.end,
+                                desc.lsn_range.clone(),
+                                ctx,
+                            )
+                            .await?,
+                        );
+                    }
+                    rewriter.after.as_mut().unwrap()
+                } else {
+                    unreachable!()
+                };
+                rewriter.put_value(key, lsn, val, ctx).await?;
+                continue;
+            }
            match val {
                Value::Image(_) => stat.visit_image_key(&val),
                Value::WalRecord(_) => stat.visit_wal_key(&val),
@@ -2018,35 +2072,27 @@ impl Timeline {
                }
                accumulated_values.push((key, lsn, val));
            } else {
-                let last_key = last_key.as_mut().unwrap();
-                stat.on_unique_key_visited();
-                let skip_adding_key = if let Some(ref compaction_key_range) = compaction_key_range {
-                    !compaction_key_range.contains(last_key)
-                } else {
-                    false
-                };
-                if !skip_adding_key {
-                    let retention = self
-                        .generate_key_retention(
-                            *last_key,
-                            &accumulated_values,
-                            gc_cutoff,
-                            &retain_lsns_below_horizon,
-                            COMPACTION_DELTA_THRESHOLD,
-                            get_ancestor_image(self, *last_key, ctx).await?,
-                        )
-                        .await?;
-                    // Put the image into the image layer. Currently we have a single big layer for the compaction.
-                    retention
-                        .pipe_to(
-                            *last_key,
-                            &mut delta_layer_writer,
-                            image_layer_writer.as_mut(),
-                            &mut stat,
-                            ctx,
-                        )
-                        .await?;
-                }
+                let last_key: &mut Key = last_key.as_mut().unwrap();
+                stat.on_unique_key_visited(); // TODO: adjust statistics for partial compaction
+                let retention = self
+                    .generate_key_retention(
+                        *last_key,
+                        &accumulated_values,
+                        job_desc.gc_cutoff,
+                        &job_desc.retain_lsns_below_horizon,
+                        COMPACTION_DELTA_THRESHOLD,
+                        get_ancestor_image(self, *last_key, ctx).await?,
+                    )
+                    .await?;
+                retention
+                    .pipe_to(
+                        *last_key,
+                        &mut delta_layer_writer,
+                        image_layer_writer.as_mut(),
+                        &mut stat,
+                        ctx,
+                    )
+                    .await?;
                accumulated_values.clear();
                *last_key = key;
                accumulated_values.push((key, lsn, val));
@@ -2057,35 +2103,43 @@ impl Timeline {
        let last_key = last_key.expect("no keys produced during compaction");
        stat.on_unique_key_visited();

-        let skip_adding_key = if let Some(ref compaction_key_range) = compaction_key_range {
-            !compaction_key_range.contains(&last_key)
-        } else {
-            false
-        };
-        if !skip_adding_key {
-            let retention = self
-                .generate_key_retention(
-                    last_key,
-                    &accumulated_values,
-                    gc_cutoff,
-                    &retain_lsns_below_horizon,
-                    COMPACTION_DELTA_THRESHOLD,
-                    get_ancestor_image(self, last_key, ctx).await?,
-                )
-                .await?;
-            // Put the image into the image layer. Currently we have a single big layer for the compaction.
-            retention
-                .pipe_to(
-                    last_key,
-                    &mut delta_layer_writer,
-                    image_layer_writer.as_mut(),
-                    &mut stat,
-                    ctx,
-                )
-                .await?;
-        }
+        let retention = self
+            .generate_key_retention(
+                last_key,
+                &accumulated_values,
+                job_desc.gc_cutoff,
+                &job_desc.retain_lsns_below_horizon,
+                COMPACTION_DELTA_THRESHOLD,
+                get_ancestor_image(self, last_key, ctx).await?,
+            )
+            .await?;
+        retention
+            .pipe_to(
+                last_key,
+                &mut delta_layer_writer,
+                image_layer_writer.as_mut(),
+                &mut stat,
+                ctx,
+            )
+            .await?;
        // end: move the above part to the loop body

+        let mut rewrote_delta_layers = Vec::new();
+        for (key, writers) in delta_layer_rewriters {
+            if let Some(delta_writer_before) = writers.before {
+                let (desc, path) = delta_writer_before
+                    .finish(job_desc.compaction_key_range.start, ctx)
+                    .await?;
+                let layer = Layer::finish_creating(self.conf, self, desc, &path)?;
+                rewrote_delta_layers.push(layer);
+            }
+            if let Some(delta_writer_after) = writers.after {
+                let (desc, path) = delta_writer_after.finish(key.key_range.end, ctx).await?;
+                let layer = Layer::finish_creating(self.conf, self, desc, &path)?;
+                rewrote_delta_layers.push(layer);
+            }
+        }
+
        let discard = |key: &PersistentLayerKey| {
            let key = key.clone();
            async move { KeyHistoryRetention::discard_key(&key, self, dry_run).await }
@@ -2093,10 +2147,7 @@ impl Timeline {

        let produced_image_layers = if let Some(writer) = image_layer_writer {
            if !dry_run {
-                let end_key = compaction_key_range
-                    .as_ref()
-                    .map(|x| x.end)
-                    .unwrap_or(Key::MAX);
+                let end_key = job_desc.compaction_key_range.end;
                writer
                    .finish_with_discard_fn(self, ctx, end_key, discard)
                    .await?
@@ -2117,10 +2168,8 @@ impl Timeline {
            Vec::new()
        };

-        if partial_compaction && !produced_delta_layers.is_empty() {
-            bail!("implementation error: partial compaction should not be producing delta layers (for now)");
-        }
-
+        // TODO: make image/delta/rewrote_delta layers generation atomic. At this point, we already generated resident layers, and if
+        // compaction is cancelled at this point, we might have some layers that are not cleaned up.
        let mut compact_to = Vec::new();
        let mut keep_layers = HashSet::new();
        let produced_delta_layers_len = produced_delta_layers.len();
@@ -2128,52 +2177,84 @@ impl Timeline {
        for action in produced_delta_layers {
            match action {
                BatchWriterResult::Produced(layer) => {
+                    if cfg!(debug_assertions) {
+                        info!("produced delta layer: {}", layer.layer_desc().key());
+                    }
                    stat.produce_delta_layer(layer.layer_desc().file_size());
                    compact_to.push(layer);
                }
                BatchWriterResult::Discarded(l) => {
+                    if cfg!(debug_assertions) {
+                        info!("discarded delta layer: {}", l);
+                    }
                    keep_layers.insert(l);
                    stat.discard_delta_layer();
                }
            }
        }
+        for layer in &rewrote_delta_layers {
+            debug!(
+                "produced rewritten delta layer: {}",
+                layer.layer_desc().key()
+            );
+        }
+        compact_to.extend(rewrote_delta_layers);
        for action in produced_image_layers {
            match action {
                BatchWriterResult::Produced(layer) => {
+                    debug!("produced image layer: {}", layer.layer_desc().key());
                    stat.produce_image_layer(layer.layer_desc().file_size());
                    compact_to.push(layer);
                }
                BatchWriterResult::Discarded(l) => {
+                    debug!("discarded image layer: {}", l);
                    keep_layers.insert(l);
                    stat.discard_image_layer();
                }
            }
        }
-        let mut layer_selection = layer_selection;
-        layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key()));
-        if let Some(ref compaction_key_range) = compaction_key_range {
-            // Partial compaction might select more data than it processes, e.g., if
-            // the compaction_key_range only partially overlaps:
-            //
-            //         [---compaction_key_range---]
-            //   [---A----][----B----][----C----][----D----]
-            //
-            // A,B,C,D are all in the `layer_selection`. The created image layers contain
-            // whatever is needed from B, C, and from `----]` of A, and from  `[--` of D.
-            //
-            // In contrast, `[--A-` and `--D----]` have not been processed, so, we must
-            // keep that data.
-            //
-            // The solution for now is to keep A and D completely.
-            // (layer_selection is what we'll remove from the layer map, so,
-            //  retain what is _not_ fully covered by compaction_key_range).
-            layer_selection.retain(|x| {
-                let key_range = &x.layer_desc().key_range;
-                key_range.start >= compaction_key_range.start
-                    && key_range.end <= compaction_key_range.end
-            });
+
+        let mut layer_selection = job_desc.selected_layers;
+
+        // Partial compaction might select more data than it processes, e.g., if
+        // the compaction_key_range only partially overlaps:
+        //
+        //         [---compaction_key_range---]
+        //   [---A----][----B----][----C----][----D----]
+        //
+        // For delta layers, we will rewrite the layers so that they are cut exactly at
+        // the compaction key range, so we can always discard them. However, for image
+        // layers, as we do not rewrite them for now, we need to handle them differently.
+        // Assume image layers A, B, C, D are all in the `layer_selection`.
+        //
+        // The created image layers contain whatever is needed from B, C, and from
+        // `----]` of A, and from `[---` of D.
+        //
+        // In contrast, `[---A` and `D----]` have not been processed, so, we must
+        // keep that data.
+        //
+        // The solution for now is to keep A and D completely if they are image layers.
+        // (layer_selection is what we'll remove from the layer map, so, retain what
+        // is _not_ fully covered by compaction_key_range).
+        for layer in &layer_selection {
+            if !layer.layer_desc().is_delta() {
+                if !overlaps_with(
+                    &layer.layer_desc().key_range,
+                    &job_desc.compaction_key_range,
+                ) {
+                    bail!("violated constraint: image layer outside of compaction key range");
+                }
+                if !fully_contains(
+                    &job_desc.compaction_key_range,
+                    &layer.layer_desc().key_range,
+                ) {
+                    keep_layers.insert(layer.layer_desc().key());
+                }
+            }
        }

+        layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key()));
+
        info!(
            "gc-compaction statistics: {}",
            serde_json::to_string(&stat)?
@@ -2192,6 +2273,7 @@ impl Timeline {

        // Step 3: Place back to the layer map.
        {
+            // TODO: sanity check if the layer map is valid (i.e., should not have overlaps)
            let mut guard = self.layers.write().await;
            guard
                .open_mut()?
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -5,6 +5,7 @@ use std::{

 use anyhow::Context;
 use pageserver_api::{models::TimelineState, shard::TenantShardId};
+use remote_storage::DownloadError;
 use tokio::sync::OwnedMutexGuard;
 use tracing::{error, info, info_span, instrument, Instrument};
 use utils::{crashsafe, fs_ext, id::TimelineId, pausable_failpoint};
@@ -16,8 +17,9 @@ use crate::{
        metadata::TimelineMetadata,
        remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient},
        CreateTimelineCause, DeleteTimelineError, MaybeDeletedIndexPart, Tenant,
-        TimelineOrOffloaded,
+        TenantManifestError, TimelineOrOffloaded,
    },
+    virtual_file::MaybeFatalIo,
 };

 use super::{Timeline, TimelineResources};
@@ -62,10 +64,10 @@ pub(super) async fn delete_local_timeline_directory(
    conf: &PageServerConf,
    tenant_shard_id: TenantShardId,
    timeline: &Timeline,
-) -> anyhow::Result<()> {
+) {
    // Always ensure the lock order is compaction -> gc.
    let compaction_lock = timeline.compaction_lock.lock();
-    let compaction_lock = crate::timed(
+    let _compaction_lock = crate::timed(
        compaction_lock,
        "acquires compaction lock",
        std::time::Duration::from_secs(5),
@@ -73,7 +75,7 @@ pub(super) async fn delete_local_timeline_directory(
    .await;

    let gc_lock = timeline.gc_lock.lock();
-    let gc_lock = crate::timed(
+    let _gc_lock = crate::timed(
        gc_lock,
        "acquires gc lock",
        std::time::Duration::from_secs(5),
@@ -85,24 +87,15 @@ pub(super) async fn delete_local_timeline_directory(

    let local_timeline_directory = conf.timeline_path(&tenant_shard_id, &timeline.timeline_id);

-    fail::fail_point!("timeline-delete-before-rm", |_| {
-        Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
-    });
-
    // NB: This need not be atomic because the deleted flag in the IndexPart
    // will be observed during tenant/timeline load. The deletion will be resumed there.
    //
-    // Note that here we do not bail out on std::io::ErrorKind::NotFound.
-    // This can happen if we're called a second time, e.g.,
-    // because of a previous failure/cancellation at/after
-    // failpoint timeline-delete-after-rm.
-    //
-    // ErrorKind::NotFound can also happen if we race with tenant detach, because,
+    // ErrorKind::NotFound can happen e.g. if we race with tenant detach, because,
    // no locks are shared.
    tokio::fs::remove_dir_all(local_timeline_directory)
        .await
        .or_else(fs_ext::ignore_not_found)
-        .context("remove local timeline directory")?;
+        .fatal_err("removing timeline directory");

    // Make sure previous deletions are ordered before mark removal.
    // Otherwise there is no guarantee that they reach the disk before mark deletion.
@@ -113,24 +106,9 @@ pub(super) async fn delete_local_timeline_directory(
    let timeline_path = conf.timelines_path(&tenant_shard_id);
    crashsafe::fsync_async(timeline_path)
        .await
-        .context("fsync_pre_mark_remove")?;
+        .fatal_err("fsync after removing timeline directory");

    info!("finished deleting layer files, releasing locks");
-    drop(gc_lock);
-    drop(compaction_lock);
-
-    fail::fail_point!("timeline-delete-after-rm", |_| {
-        Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
-    });
-
-    Ok(())
-}
-
-/// Removes remote layers and an index file after them.
-async fn delete_remote_layers_and_index(
-    remote_client: &Arc<RemoteTimelineClient>,
-) -> anyhow::Result<()> {
-    remote_client.delete_all().await.context("delete_all")
 }

 /// It is important that this gets called when DeletionGuard is being held.
@@ -163,9 +141,10 @@ async fn remove_maybe_offloaded_timeline_from_tenant(
            );
        }
        TimelineOrOffloaded::Offloaded(timeline) => {
-            timelines_offloaded
+            let offloaded_timeline = timelines_offloaded
                .remove(&timeline.timeline_id)
                .expect("timeline that we were deleting was concurrently removed from 'timelines_offloaded' map");
+            offloaded_timeline.delete_from_ancestor_with_timelines(&timelines);
        }
    }

@@ -237,11 +216,24 @@ impl DeleteTimelineFlow {
            None => {
                let remote_client = tenant
                    .build_timeline_client(timeline.timeline_id(), tenant.remote_storage.clone());
-                let result = remote_client
+                let result = match remote_client
                    .download_index_file(&tenant.cancel)
                    .instrument(info_span!("download_index_file"))
                    .await
-                    .map_err(|e| DeleteTimelineError::Other(anyhow::anyhow!("error: {:?}", e)))?;
+                {
+                    Ok(r) => r,
+                    Err(DownloadError::NotFound) => {
+                        // Deletion is already complete
+                        tracing::info!("Timeline already deleted in remote storage");
+                        return Ok(());
+                    }
+                    Err(e) => {
+                        return Err(DeleteTimelineError::Other(anyhow::anyhow!(
+                            "error: {:?}",
+                            e
+                        )));
+                    }
+                };
                let index_part = match result {
                    MaybeDeletedIndexPart::Deleted(p) => {
                        tracing::info!("Timeline already set as deleted in remote index");
@@ -422,7 +414,12 @@ impl DeleteTimelineFlow {
            "timeline_delete",
            async move {
                if let Err(err) = Self::background(guard, conf, &tenant, &timeline, remote_client).await {
-                    error!("Error: {err:#}");
+                    // Only log as an error if it's not a cancellation.
+                    if matches!(err, DeleteTimelineError::Cancelled) {
+                        info!("Shutdown during timeline deletion");
+                    } else {
+                        error!("Error: {err:#}");
+                    }
                    if let TimelineOrOffloaded::Timeline(timeline) = timeline {
                        timeline.set_broken(format!("{err:#}"))
                    }
@@ -440,13 +437,21 @@ impl DeleteTimelineFlow {
        timeline: &TimelineOrOffloaded,
        remote_client: Arc<RemoteTimelineClient>,
    ) -> Result<(), DeleteTimelineError> {
+        fail::fail_point!("timeline-delete-before-rm", |_| {
+            Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
+        });
+
        // Offloaded timelines have no local state
        // TODO: once we persist offloaded information, delete the timeline from there, too
        if let TimelineOrOffloaded::Timeline(timeline) = timeline {
-            delete_local_timeline_directory(conf, tenant.tenant_shard_id, timeline).await?;
+            delete_local_timeline_directory(conf, tenant.tenant_shard_id, timeline).await;
        }

-        delete_remote_layers_and_index(&remote_client).await?;
+        fail::fail_point!("timeline-delete-after-rm", |_| {
+            Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
+        });
+
+        remote_client.delete_all().await?;

        pausable_failpoint!("in_progress_delete");

@@ -457,10 +462,10 @@ impl DeleteTimelineFlow {
        // So indeed, the tenant manifest might refer to an offloaded timeline which has already been deleted.
        // However, we handle this case in tenant loading code so the next time we attach, the issue is
        // resolved.
-        tenant
-            .store_tenant_manifest()
-            .await
-            .map_err(|e| DeleteTimelineError::Other(anyhow::anyhow!(e)))?;
+        tenant.store_tenant_manifest().await.map_err(|e| match e {
+            TenantManifestError::Cancelled => DeleteTimelineError::Cancelled,
+            _ => DeleteTimelineError::Other(e.into()),
+        })?;

        *guard = Self::Finished;

--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -12,7 +12,7 @@ use crate::{
    virtual_file::{MaybeFatalIo, VirtualFile},
 };
 use anyhow::Context;
-use pageserver_api::models::detach_ancestor::AncestorDetached;
+use pageserver_api::{models::detach_ancestor::AncestorDetached, shard::ShardIdentity};
 use tokio::sync::Semaphore;
 use tokio_util::sync::CancellationToken;
 use tracing::Instrument;
@@ -376,8 +376,14 @@ pub(super) async fn prepare(
        tasks.spawn(
            async move {
                let _permit = limiter.acquire().await;
-                let owned =
-                    remote_copy(&adopted, &timeline, timeline.generation, &timeline.cancel).await?;
+                let owned = remote_copy(
+                    &adopted,
+                    &timeline,
+                    timeline.generation,
+                    timeline.shard_identity,
+                    &timeline.cancel,
+                )
+                .await?;
                tracing::info!(layer=%owned, "remote copied");
                Ok(owned)
            }
@@ -629,6 +635,7 @@ async fn remote_copy(
    adopted: &Layer,
    adoptee: &Arc<Timeline>,
    generation: Generation,
+    shard_identity: ShardIdentity,
    cancel: &CancellationToken,
 ) -> Result<Layer, Error> {
    // depending if Layer::keep_resident we could hardlink
@@ -636,6 +643,7 @@ async fn remote_copy(
    let mut metadata = adopted.metadata();
    debug_assert!(metadata.generation <= generation);
    metadata.generation = generation;
+    metadata.shard = shard_identity.shard_index();

    let owned = crate::tenant::storage_layer::Layer::for_evicted(
        adoptee.conf,
--- a/pageserver/src/tenant/timeline/offload.rs
+++ b/pageserver/src/tenant/timeline/offload.rs
@@ -47,31 +47,26 @@ pub(crate) async fn offload_timeline(
    match is_archived {
        Some(true) => (),
        Some(false) => {
-            tracing::warn!(?is_archived, "tried offloading a non-archived timeline");
+            tracing::warn!("tried offloading a non-archived timeline");
            return Err(OffloadError::NotArchived);
        }
        None => {
            // This is legal: calls to this function can race with the timeline shutting down
-            tracing::info!(
-                ?is_archived,
-                "tried offloading a timeline whose remote storage is not initialized"
-            );
+            tracing::info!("tried offloading a timeline whose remote storage is not initialized");
            return Err(OffloadError::Cancelled);
        }
    }

    // Now that the Timeline is in Stopping state, request all the related tasks to shut down.
-    timeline.shutdown(super::ShutdownMode::Hard).await;
+    timeline.shutdown(super::ShutdownMode::Flush).await;

    // TODO extend guard mechanism above with method
    // to make deletions possible while offloading is in progress

    let conf = &tenant.conf;
-    delete_local_timeline_directory(conf, tenant.tenant_shard_id, &timeline)
-        .await
-        .map_err(OffloadError::Other)?;
+    delete_local_timeline_directory(conf, tenant.tenant_shard_id, &timeline).await;

-    remove_timeline_from_tenant(tenant, &timeline, &guard);
+    let remaining_refcount = remove_timeline_from_tenant(tenant, &timeline, &guard);

    {
        let mut offloaded_timelines = tenant.timelines_offloaded.lock().unwrap();
@@ -92,16 +87,20 @@ pub(crate) async fn offload_timeline(
    // not our actual state of offloaded timelines.
    tenant.store_tenant_manifest().await?;

+    tracing::info!("Timeline offload complete (remaining arc refcount: {remaining_refcount})");
+
    Ok(())
 }

 /// It is important that this gets called when DeletionGuard is being held.
 /// For more context see comments in [`DeleteTimelineFlow::prepare`]
+///
+/// Returns the strong count of the timeline `Arc`
 fn remove_timeline_from_tenant(
    tenant: &Tenant,
    timeline: &Timeline,
    _: &DeletionGuard, // using it as a witness
-) {
+) -> usize {
    // Remove the timeline from the map.
    let mut timelines = tenant.timelines.lock().unwrap();
    let children_exist = timelines
@@ -114,7 +113,9 @@ fn remove_timeline_from_tenant(
        panic!("Timeline grew children while we removed layer files");
    }

-    timelines
+    let timeline = timelines
        .remove(&timeline.timeline_id)
        .expect("timeline that we were deleting was concurrently removed from 'timelines' map");
+
+    Arc::strong_count(&timeline)
 }
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -331,11 +331,11 @@ pub(super) async fn handle_walreceiver_connection(
                        Ok(())
                    }

-                    while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
+                    while let Some((next_record_lsn, recdata)) = waldecoder.poll_decode()? {
                        // It is important to deal with the aligned records as lsn in getPage@LSN is
                        // aligned and can be several bytes bigger. Without this alignment we are
                        // at risk of hitting a deadlock.
-                        if !lsn.is_aligned() {
+                        if !next_record_lsn.is_aligned() {
                            return Err(WalReceiverError::Other(anyhow!("LSN not aligned")));
                        }

@@ -343,7 +343,7 @@ pub(super) async fn handle_walreceiver_connection(
                        let interpreted = InterpretedWalRecord::from_bytes_filtered(
                            recdata,
                            modification.tline.get_shard_identity(),
-                            lsn,
+                            next_record_lsn,
                            modification.tline.pg_version,
                        )?;

@@ -366,9 +366,11 @@ pub(super) async fn handle_walreceiver_connection(
                        let ingested = walingest
                            .ingest_record(interpreted, &mut modification, &ctx)
                            .await
-                            .with_context(|| format!("could not ingest record at {lsn}"))?;
+                            .with_context(|| {
+                                format!("could not ingest record at {next_record_lsn}")
+                            })?;
                        if !ingested {
-                            tracing::debug!("ingest: filtered out record @ LSN {lsn}");
+                            tracing::debug!("ingest: filtered out record @ LSN {next_record_lsn}");
                            WAL_INGEST.records_filtered.inc();
                            filtered_records += 1;
                        }
@@ -378,7 +380,7 @@ pub(super) async fn handle_walreceiver_connection(
                        // to timeout the tests.
                        fail_point!("walreceiver-after-ingest");

-                        last_rec_lsn = lsn;
+                        last_rec_lsn = next_record_lsn;

                        // Commit every ingest_batch_size records. Even if we filtered out
                        // all records, we still need to call commit to advance the LSN.
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -175,10 +175,16 @@ impl VirtualFile {
    }

    pub async fn sync_all(&self) -> Result<(), Error> {
+        if SYNC_MODE.load(std::sync::atomic::Ordering::Relaxed) == SyncMode::UnsafeNoSync as u8 {
+            return Ok(());
+        }
        self.inner.sync_all().await
    }

    pub async fn sync_data(&self) -> Result<(), Error> {
+        if SYNC_MODE.load(std::sync::atomic::Ordering::Relaxed) == SyncMode::UnsafeNoSync as u8 {
+            return Ok(());
+        }
        self.inner.sync_data().await
    }

@@ -233,6 +239,27 @@ impl VirtualFile {
    }
 }

+/// Controls durability of file writes (fsync, fdatasync, or O_SYNC/O_DSYNC). Switching it off
+/// is unsafe and is only meant for testing on machines with slow drives.
+#[repr(u8)]
+pub enum SyncMode {
+    Sync,
+    UnsafeNoSync,
+}
+
+impl TryFrom<u8> for SyncMode {
+    type Error = u8;
+    fn try_from(value: u8) -> Result<Self, Self::Error> {
+        const SYNC: u8 = SyncMode::Sync as u8;
+        const NO_SYNC: u8 = SyncMode::UnsafeNoSync as u8;
+        match value {
+            SYNC => Ok(SyncMode::Sync),
+            NO_SYNC => Ok(SyncMode::UnsafeNoSync),
+            unknown => Err(unknown),
+        }
+    }
+}
+
 ///
 /// A virtual file descriptor. You can use this just like std::fs::File, but internally
 /// the underlying file is closed if the system is low on file descriptors,
@@ -1332,12 +1359,13 @@ impl OpenFiles {
 /// server startup.
 ///
 #[cfg(not(test))]
-pub fn init(num_slots: usize, engine: IoEngineKind, mode: IoMode) {
+pub fn init(num_slots: usize, engine: IoEngineKind, mode: IoMode, sync_mode: SyncMode) {
    if OPEN_FILES.set(OpenFiles::new(num_slots)).is_err() {
        panic!("virtual_file::init called twice");
    }
    set_io_mode(mode);
    io_engine::init(engine);
+    SYNC_MODE.store(sync_mode as u8, std::sync::atomic::Ordering::Relaxed);
    crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(num_slots as u64);
 }

@@ -1379,6 +1407,9 @@ pub(crate) fn set_io_mode(mode: IoMode) {
 pub(crate) fn get_io_mode() -> IoMode {
    IoMode::try_from(IO_MODE.load(Ordering::Relaxed)).unwrap()
 }
+
+static SYNC_MODE: AtomicU8 = AtomicU8::new(SyncMode::Sync as u8);
+
 #[cfg(test)]
 mod tests {
    use crate::context::DownloadBehavior;
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -28,14 +28,13 @@ use std::time::Duration;
 use std::time::Instant;
 use std::time::SystemTime;

-use pageserver_api::key::Key;
 use pageserver_api::shard::ShardIdentity;
 use postgres_ffi::fsm_logical_to_physical;
 use postgres_ffi::walrecord::*;
 use postgres_ffi::{dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, TimestampTz};
 use wal_decoder::models::*;

-use anyhow::{bail, Context, Result};
+use anyhow::{bail, Result};
 use bytes::{Buf, Bytes};
 use tracing::*;
 use utils::failpoint_support;
@@ -51,7 +50,6 @@ use crate::ZERO_PAGE;
 use pageserver_api::key::rel_block_to_key;
 use pageserver_api::record::NeonWalRecord;
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
-use pageserver_api::value::Value;
 use postgres_ffi::pg_constants;
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::TransactionId;
@@ -156,12 +154,12 @@ impl WalIngest {
        WAL_INGEST.records_received.inc();
        let prev_len = modification.len();

-        modification.set_lsn(interpreted.lsn)?;
+        modification.set_lsn(interpreted.next_record_lsn)?;

        if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes) {
            // Records of this type should always be preceded by a commit(), as they
            // rely on reading data pages back from the Timeline.
-            assert!(!modification.has_dirty_data_pages());
+            assert!(!modification.has_dirty_data());
        }

        assert!(!self.checkpoint_modified);
@@ -275,28 +273,9 @@ impl WalIngest {
            }
        }

-        // Iterate through all the key value pairs provided in the interpreted block
-        // and update the modification currently in-flight to include them.
-        for (compact_key, maybe_value) in interpreted.blocks.into_iter() {
-            let (rel, blk) = Key::from_compact(compact_key).to_rel_block()?;
-            match maybe_value {
-                Some(Value::Image(img)) => {
-                    self.put_rel_page_image(modification, rel, blk, img, ctx)
-                        .await?;
-                }
-                Some(Value::WalRecord(rec)) => {
-                    self.put_rel_wal_record(modification, rel, blk, rec, ctx)
-                        .await?;
-                }
-                None => {
-                    // Shard 0 tracks relation sizes. We will observe
-                    // its blkno in case it implicitly extends a relation.
-                    assert!(self.shard.is_shard_zero());
-                    self.observe_decoded_block(modification, rel, blk, ctx)
-                        .await?;
-                }
-            }
-        }
+        modification
+            .ingest_batch(interpreted.batch, &self.shard, ctx)
+            .await?;

        // If checkpoint data was updated, store the new version in the repository
        if self.checkpoint_modified {
@@ -310,8 +289,6 @@ impl WalIngest {
        // until commit() is called to flush the data into the repository and update
        // the latest LSN.

-        modification.on_record_end();
-
        Ok(modification.len() > prev_len)
    }

@@ -334,17 +311,6 @@ impl WalIngest {
        Ok((epoch as u64) << 32 | xid as u64)
    }

-    /// Do not store this block, but observe it for the purposes of updating our relation size state.
-    async fn observe_decoded_block(
-        &mut self,
-        modification: &mut DatadirModification<'_>,
-        rel: RelTag,
-        blkno: BlockNumber,
-        ctx: &RequestContext,
-    ) -> Result<(), PageReconstructError> {
-        self.handle_rel_extend(modification, rel, blkno, ctx).await
-    }
-
    async fn ingest_clear_vm_bits(
        &mut self,
        clear_vm_bits: ClearVmBits,
@@ -621,11 +587,29 @@ impl WalIngest {
                forknum: VISIBILITYMAP_FORKNUM,
            };

-            let mut vm_page_no = blkno / pg_constants::VM_HEAPBLOCKS_PER_PAGE;
-            if blkno % pg_constants::VM_HEAPBLOCKS_PER_PAGE != 0 {
-                // Tail of last remaining vm page has to be zeroed.
-                // We are not precise here and instead of digging in VM bitmap format just clear the whole page.
-                modification.put_rel_page_image_zero(rel, vm_page_no)?;
+            // last remaining block, byte, and bit
+            let mut vm_page_no = blkno / (pg_constants::VM_HEAPBLOCKS_PER_PAGE as u32);
+            let trunc_byte = blkno as usize % pg_constants::VM_HEAPBLOCKS_PER_PAGE
+                / pg_constants::VM_HEAPBLOCKS_PER_BYTE;
+            let trunc_offs = blkno as usize % pg_constants::VM_HEAPBLOCKS_PER_BYTE
+                * pg_constants::VM_BITS_PER_HEAPBLOCK;
+
+            // Unless the new size is exactly at a visibility map page boundary, the
+            // tail bits in the last remaining map page, representing truncated heap
+            // blocks, need to be cleared. This is not only tidy, but also necessary
+            // because we don't get a chance to clear the bits if the heap is extended
+            // again.
+            if (trunc_byte != 0 || trunc_offs != 0)
+                && self.shard.is_key_local(&rel_block_to_key(rel, vm_page_no))
+            {
+                modification.put_rel_wal_record(
+                    rel,
+                    vm_page_no,
+                    NeonWalRecord::TruncateVisibilityMap {
+                        trunc_byte,
+                        trunc_offs,
+                    },
+                )?;
                vm_page_no += 1;
            }
            let nblocks = get_relsize(modification, rel, ctx).await?;
@@ -1248,6 +1232,7 @@ impl WalIngest {
        Ok(())
    }

+    #[cfg(test)]
    async fn put_rel_page_image(
        &mut self,
        modification: &mut DatadirModification<'_>,
@@ -1297,36 +1282,7 @@ impl WalIngest {
        let new_nblocks = blknum + 1;
        // Check if the relation exists. We implicitly create relations on first
        // record.
-        // TODO: would be nice if to be more explicit about it
-
-        // Get current size and put rel creation if rel doesn't exist
-        //
-        // NOTE: we check the cache first even though get_rel_exists and get_rel_size would
-        //       check the cache too. This is because eagerly checking the cache results in
-        //       less work overall and 10% better performance. It's more work on cache miss
-        //       but cache miss is rare.
-        let old_nblocks = if let Some(nblocks) = modification
-            .tline
-            .get_cached_rel_size(&rel, modification.get_lsn())
-        {
-            nblocks
-        } else if !modification
-            .tline
-            .get_rel_exists(rel, Version::Modified(modification), ctx)
-            .await?
-        {
-            // create it with 0 size initially, the logic below will extend it
-            modification
-                .put_rel_creation(rel, 0, ctx)
-                .await
-                .context("Relation Error")?;
-            0
-        } else {
-            modification
-                .tline
-                .get_rel_size(rel, Version::Modified(modification), ctx)
-                .await?
-        };
+        let old_nblocks = modification.create_relation_if_required(rel, ctx).await?;

        if new_nblocks > old_nblocks {
            //info!("extending {} {} to {}", rel, old_nblocks, new_nblocks);
@@ -1553,25 +1509,21 @@ mod tests {
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 2"), &ctx)
            .await?;
-        m.on_record_end();
        m.commit(&ctx).await?;
        let mut m = tline.begin_modification(Lsn(0x30));
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 3"), &ctx)
            .await?;
-        m.on_record_end();
        m.commit(&ctx).await?;
        let mut m = tline.begin_modification(Lsn(0x40));
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 1, test_img("foo blk 1 at 4"), &ctx)
            .await?;
-        m.on_record_end();
        m.commit(&ctx).await?;
        let mut m = tline.begin_modification(Lsn(0x50));
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 2, test_img("foo blk 2 at 5"), &ctx)
            .await?;
-        m.on_record_end();
        m.commit(&ctx).await?;

        assert_current_logical_size(&tline, Lsn(0x50));
@@ -1713,7 +1665,6 @@ mod tests {
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 1, test_img("foo blk 1"), &ctx)
            .await?;
-        m.on_record_end();
        m.commit(&ctx).await?;
        assert_eq!(
            tline
@@ -1739,7 +1690,6 @@ mod tests {
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 1500, test_img("foo blk 1500"), &ctx)
            .await?;
-        m.on_record_end();
        m.commit(&ctx).await?;
        assert_eq!(
            tline
--- a/pageserver/src/walredo/apply_neon.rs
+++ b/pageserver/src/walredo/apply_neon.rs
@@ -42,6 +42,34 @@ pub(crate) fn apply_in_neon(
        } => {
            anyhow::bail!("tried to pass postgres wal record to neon WAL redo");
        }
+        //
+        // Code copied from PostgreSQL `visibilitymap_prepare_truncate` function in `visibilitymap.c`
+        //
+        NeonWalRecord::TruncateVisibilityMap {
+            trunc_byte,
+            trunc_offs,
+        } => {
+            // sanity check that this is modifying the correct relation
+            let (rel, _) = key.to_rel_block().context("invalid record")?;
+            assert!(
+                rel.forknum == VISIBILITYMAP_FORKNUM,
+                "TruncateVisibilityMap record on unexpected rel {}",
+                rel
+            );
+            let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..];
+            map[*trunc_byte + 1..].fill(0u8);
+            /*----
+             * Mask out the unwanted bits of the last remaining byte.
+             *
+             * ((1 << 0) - 1) = 00000000
+             * ((1 << 1) - 1) = 00000001
+             * ...
+             * ((1 << 6) - 1) = 00111111
+             * ((1 << 7) - 1) = 01111111
+             *----
+             */
+            map[*trunc_byte] &= (1 << *trunc_offs) - 1;
+        }
        NeonWalRecord::ClearVisibilityMapFlags {
            new_heap_blkno,
            old_heap_blkno,
@@ -67,7 +95,10 @@ pub(crate) fn apply_in_neon(
                let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..];

                map[map_byte as usize] &= !(flags << map_offset);
-                postgres_ffi::page_set_lsn(page, lsn);
+                // The page should never be empty, but we're checking it anyway as a precaution, so that if it is empty for some reason anyway, we don't make matters worse by setting the LSN on it.
+                if !postgres_ffi::page_is_new(page) {
+                    postgres_ffi::page_set_lsn(page, lsn);
+                }
            }

            // Repeat for 'old_heap_blkno', if any
@@ -81,7 +112,10 @@ pub(crate) fn apply_in_neon(
                let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..];

                map[map_byte as usize] &= !(flags << map_offset);
-                postgres_ffi::page_set_lsn(page, lsn);
+                // The page should never be empty, but we're checking it anyway as a precaution, so that if it is empty for some reason anyway, we don't make matters worse by setting the LSN on it.
+                if !postgres_ffi::page_is_new(page) {
+                    postgres_ffi::page_set_lsn(page, lsn);
+                }
            }
        }
        // Non-relational WAL records are handled here, with custom code that has the
@@ -247,6 +281,10 @@ pub(crate) fn apply_in_neon(
            use bytes::BufMut;
            if *will_init {
                assert!(*clear, "init record must be clear to ensure correctness");
+                assert!(
+                    page.is_empty(),
+                    "init record must be the first entry to ensure correctness"
+                );
            }
            if *clear {
                page.clear();
--- a/pgxn/neon/logical_replication_monitor.c
+++ b/pgxn/neon/logical_replication_monitor.c
@@ -1,7 +1,8 @@
+#include <dirent.h>
 #include <limits.h>
 #include <string.h>
-#include <dirent.h>
 #include <signal.h>
+#include <sys/stat.h>

 #include "postgres.h"

@@ -21,17 +22,35 @@

 static int	logical_replication_max_snap_files = 300;

+/*
+ * According to Chi (shyzh), the pageserver _should_ be good with 10 MB worth of
+ * snapshot files. Let's use 8 MB since 8 is a power of 2.
+ */
+static int	logical_replication_max_logicalsnapdir_size = 8000;
+
+/*
+ * A primitive description of a logical snapshot file including the LSN of the
+ * file and its size.
+ */
+typedef struct SnapDesc {
+	XLogRecPtr	lsn;
+	off_t		sz;
+} SnapDesc;
+
 PGDLLEXPORT void LogicalSlotsMonitorMain(Datum main_arg);

+/*
+ * qsort() comparator ordering snapshot descriptors by LSN, highest first.
+ */
 static int
-LsnDescComparator(const void *a, const void *b)
+SnapDescComparator(const void *a, const void *b)
 {
-	XLogRecPtr	lsn1 = *((const XLogRecPtr *) a);
-	XLogRecPtr	lsn2 = *((const XLogRecPtr *) b);
+	const SnapDesc	*desc1 = a;
+	const SnapDesc	*desc2 = b;

-	if (lsn1 < lsn2)
+	if (desc1->lsn < desc2->lsn)
 		return 1;
-	else if (lsn1 == lsn2)
+	else if (desc1->lsn == desc2->lsn)
 		return 0;
 	else
 		return -1;
@@ -43,28 +62,39 @@ LsnDescComparator(const void *a, const void *b)
 * slots having lower restart_lsn should be dropped.
 */
 static XLogRecPtr
-get_num_snap_files_lsn_threshold(void)
+get_snapshots_cutoff_lsn(void)
 {
-	DIR		   *dirdesc;
-	struct dirent *de;
-	char	   *snap_path = "pg_logical/snapshots/";
-	int			lsns_allocated = 1024;
-	int			lsns_num = 0;
-	XLogRecPtr *lsns;
-	XLogRecPtr	cutoff;
+/* PG 18 has a constant defined for this, PG_LOGICAL_SNAPSHOTS_DIR */
+#define SNAPDIR "pg_logical/snapshots"

-	if (logical_replication_max_snap_files < 0)
+	DIR		   *dirdesc;
+	int			dirdesc_fd;
+	struct dirent *de;
+	size_t		snapshot_index = 0;
+	SnapDesc   *snapshot_descriptors;
+	size_t		descriptors_allocated = 1024;
+	XLogRecPtr	cutoff = 0;
+	off_t		logicalsnapdir_size = 0;
+	const off_t logical_replication_max_logicalsnapdir_size_bytes = (off_t) logical_replication_max_logicalsnapdir_size * 1000;
+
+	if (logical_replication_max_snap_files < 0 && logical_replication_max_logicalsnapdir_size < 0)
 		return 0;

-	lsns = palloc(sizeof(XLogRecPtr) * lsns_allocated);
+	snapshot_descriptors = palloc(sizeof(*snapshot_descriptors) * descriptors_allocated);
+
+	dirdesc = AllocateDir(SNAPDIR);
+	dirdesc_fd = dirdesc ? dirfd(dirdesc) : -1;	/* AllocateDir() returns NULL on failure */
+	if (dirdesc_fd == -1)
+		ereport(ERROR, errmsg("failed to get a file descriptor for " SNAPDIR ": %m"));

 	/* find all .snap files and get their lsns */
-	dirdesc = AllocateDir(snap_path);
-	while ((de = ReadDir(dirdesc, snap_path)) != NULL)
+	while ((de = ReadDir(dirdesc, SNAPDIR)) != NULL)
 	{
-		XLogRecPtr	lsn;
 		uint32		hi;
 		uint32		lo;
+		struct stat	st;
+		XLogRecPtr	lsn;
+		SnapDesc   *desc;

 		if (strcmp(de->d_name, ".") == 0 ||
 			strcmp(de->d_name, "..") == 0)
@@ -79,28 +109,69 @@ get_num_snap_files_lsn_threshold(void)

 		lsn = ((uint64) hi) << 32 | lo;
 		elog(DEBUG5, "found snap file %X/%X", LSN_FORMAT_ARGS(lsn));
-		if (lsns_allocated == lsns_num)
+
+		if (fstatat(dirdesc_fd, de->d_name, &st, 0) == -1)
+			ereport(ERROR, errmsg("failed to get the size of " SNAPDIR "/%s: %m", de->d_name));
+
+		if (descriptors_allocated == snapshot_index)
 		{
-			lsns_allocated *= 2;
-			lsns = repalloc(lsns, sizeof(XLogRecPtr) * lsns_allocated);
+			descriptors_allocated *= 2;
+			snapshot_descriptors = repalloc(snapshot_descriptors, sizeof(*snapshot_descriptors) * descriptors_allocated);
 		}
-		lsns[lsns_num++] = lsn;
+
+		desc = &snapshot_descriptors[snapshot_index++];
+		desc->lsn = lsn;
+		desc->sz = st.st_size;
 	}
-	/* sort by lsn desc */
-	qsort(lsns, lsns_num, sizeof(XLogRecPtr), LsnDescComparator);
-	/* and take cutoff at logical_replication_max_snap_files */
-	if (logical_replication_max_snap_files > lsns_num)
-		cutoff = 0;
-	/* have less files than cutoff */
-	else
+
+	qsort(snapshot_descriptors, snapshot_index, sizeof(*snapshot_descriptors), SnapDescComparator);
+
+	/* Are there more snapshot files than specified? */
+	if (logical_replication_max_snap_files <= snapshot_index)
 	{
-		cutoff = lsns[logical_replication_max_snap_files - 1];
-		elog(LOG, "ls_monitor: dropping logical slots with restart_lsn lower %X/%X, found %d .snap files, limit is %d",
-			 LSN_FORMAT_ARGS(cutoff), lsns_num, logical_replication_max_snap_files);
+		cutoff = snapshot_descriptors[logical_replication_max_snap_files - 1].lsn;
+		elog(LOG,
+			"ls_monitor: dropping logical slots with restart_lsn lower %X/%X, found %zu snapshot files, limit is %d",
+			LSN_FORMAT_ARGS(cutoff), snapshot_index, logical_replication_max_snap_files);
 	}
-	pfree(lsns);
+
+	/*
+	 * Is the size of the logical snapshots directory larger than specified?
+	 *
+	 * It's possible we could hit both thresholds, so remove any extra files
+	 * first, and then truncate based on size of the remaining files.
+	 */
+	if (logical_replication_max_logicalsnapdir_size >= 0)
+	{
+		const XLogRecPtr original = cutoff;
+		size_t		nfiles = snapshot_index;
+
+		/* Files past the count limit are already excluded by the cutoff above. */
+		if (logical_replication_max_snap_files >= 0 && (size_t) logical_replication_max_snap_files < nfiles)
+			nfiles = (size_t) logical_replication_max_snap_files;
+
+		/* Newest first: cut at the file that pushes the running total over the limit. */
+		for (size_t i = 0; i < nfiles; ++i)
+		{
+			logicalsnapdir_size += snapshot_descriptors[i].sz;
+			if (logicalsnapdir_size > logical_replication_max_logicalsnapdir_size_bytes)
+			{
+				cutoff = snapshot_descriptors[i].lsn;
+				break;
+			}
+		}
+
+		if (cutoff != original)
+			elog(LOG, "ls_monitor: dropping logical slots with restart_lsn lower than %X/%X, " SNAPDIR " is larger than %d KB",
+					LSN_FORMAT_ARGS(cutoff), logical_replication_max_logicalsnapdir_size);
+	}
+
+	pfree(snapshot_descriptors);
 	FreeDir(dirdesc);
+
 	return cutoff;
+
+#undef SNAPDIR
 }

 void
@@ -118,6 +189,16 @@ InitLogicalReplicationMonitor(void)
 							0,
 							NULL, NULL, NULL);

+	DefineCustomIntVariable(
+							"neon.logical_replication_max_logicalsnapdir_size",
+							"Maximum allowed size of the pg_logical/snapshots directory (KB). When exceeded, slots are dropped until the limit is met. -1 disables the limit.",
+							NULL,
+							&logical_replication_max_logicalsnapdir_size,
+							8000, -1, INT_MAX,
+							PGC_SIGHUP,
+							GUC_UNIT_KB,
+							NULL, NULL, NULL);
+
 	memset(&bgw, 0, sizeof(bgw));
 	bgw.bgw_flags = BGWORKER_SHMEM_ACCESS;
 	bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
@@ -162,7 +243,7 @@ LogicalSlotsMonitorMain(Datum main_arg)
 		 * If there are too many .snap files, just drop all logical slots to
 		 * prevent aux files bloat.
 		 */
-		cutoff_lsn = get_num_snap_files_lsn_threshold();
+		cutoff_lsn = get_snapshots_cutoff_lsn();
 		if (cutoff_lsn > 0)
 		{
 			for (int i = 0; i < max_replication_slots; i++)
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -512,7 +512,7 @@ neon_shmem_startup_hook(void)
 	if (prev_shmem_startup_hook)
 		prev_shmem_startup_hook();

-#if PG_PG_MAJORVERSION_NUM >= 17
+#if PG_MAJORVERSION_NUM >= 17
 	WAIT_EVENT_NEON_LFC_MAINTENANCE = WaitEventExtensionNew("Neon/FileCache_Maintenance");
 	WAIT_EVENT_NEON_LFC_READ = WaitEventExtensionNew("Neon/FileCache_Read");
 	WAIT_EVENT_NEON_LFC_TRUNCATE = WaitEventExtensionNew("Neon/FileCache_Truncate");
--- a/pgxn/neon/neon_walreader.c
+++ b/pgxn/neon/neon_walreader.c
@@ -611,6 +611,17 @@ NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size coun
 	recptr = startptr;
 	nbytes = count;

+/* Try to read directly from WAL buffers first. */
+#if PG_MAJORVERSION_NUM >= 17
+	{
+		Size	rbytes;
+		rbytes = WALReadFromBuffers(p, recptr, nbytes, tli);
+		recptr += rbytes;
+		nbytes -= rbytes;
+		p += rbytes;
+	}
+#endif
+
 	while (nbytes > 0)
 	{
 		uint32		startoff;
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -1361,29 +1361,35 @@ SendAppendRequests(Safekeeper *sk)
 		if (sk->active_state == SS_ACTIVE_READ_WAL)
 		{
 			char	   *errmsg;
+			int			req_len;

 			req = &sk->appendRequest;
+			req_len = req->endLsn - req->beginLsn;

-			switch (wp->api.wal_read(sk,
-									 &sk->outbuf.data[sk->outbuf.len],
-									 req->beginLsn,
-									 req->endLsn - req->beginLsn,
-									 &errmsg))
+			/* We send zero-sized AppendRequests as heartbeats; don't wal_read for these. */
+			if (req_len > 0)
 			{
-				case NEON_WALREAD_SUCCESS:
-					break;
-				case NEON_WALREAD_WOULDBLOCK:
-					return true;
-				case NEON_WALREAD_ERROR:
-					wp_log(WARNING, "WAL reading for node %s:%s failed: %s",
-						   sk->host, sk->port, errmsg);
-					ShutdownConnection(sk);
-					return false;
-				default:
-					Assert(false);
+				switch (wp->api.wal_read(sk,
+										&sk->outbuf.data[sk->outbuf.len],
+										req->beginLsn,
+										req_len,
+										&errmsg))
+				{
+					case NEON_WALREAD_SUCCESS:
+						break;
+					case NEON_WALREAD_WOULDBLOCK:
+						return true;
+					case NEON_WALREAD_ERROR:
+						wp_log(WARNING, "WAL reading for node %s:%s failed: %s",
+							sk->host, sk->port, errmsg);
+						ShutdownConnection(sk);
+						return false;
+					default:
+						Assert(false);
+				}
 			}

-			sk->outbuf.len += req->endLsn - req->beginLsn;
+			sk->outbuf.len += req_len;

 			writeResult = wp->api.conn_async_write(sk, sk->outbuf.data, sk->outbuf.len);

--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -1489,33 +1489,11 @@ walprop_pg_wal_read(Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count,
 {
 	NeonWALReadResult res;

-#if PG_MAJORVERSION_NUM >= 17
-	if (!sk->wp->config->syncSafekeepers)
-	{
-		Size	rbytes;
-		rbytes = WALReadFromBuffers(buf, startptr, count,
-									walprop_pg_get_timeline_id());
-
-		startptr += rbytes;
-		count -= rbytes;
-	}
-#endif
-
-	if (count == 0)
-	{
-		res = NEON_WALREAD_SUCCESS;
-	}
-	else
-	{
-		Assert(count > 0);
-
-		/* Now read the remaining WAL from the WAL file */
-		res = NeonWALRead(sk->xlogreader,
-						  buf,
-						  startptr,
-						  count,
-						  walprop_pg_get_timeline_id());
-	}
+	res = NeonWALRead(sk->xlogreader,
+					  buf,
+					  startptr,
+					  count,
+					  walprop_pg_get_timeline_id());

 	if (res == NEON_WALREAD_SUCCESS)
 	{
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.

 [[package]]
 name = "aiohappyeyeballs"
@@ -2106,83 +2106,78 @@ test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"]

 [[package]]
 name = "psycopg2-binary"
-version = "2.9.9"
+version = "2.9.10"
 description = "psycopg2 - Python-PostgreSQL Database Adapter"
 optional = false
-python-versions = ">=3.7"
+python-versions = ">=3.8"
 files = [
-    {file = "psycopg2-binary-2.9.9.tar.gz", hash = "sha256:7f01846810177d829c7692f1f5ada8096762d9172af1b1a28d4ab5b77c923c1c"},
-    {file = "psycopg2_binary-2.9.9-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c2470da5418b76232f02a2fcd2229537bb2d5a7096674ce61859c3229f2eb202"},
-    {file = "psycopg2_binary-2.9.9-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c6af2a6d4b7ee9615cbb162b0738f6e1fd1f5c3eda7e5da17861eacf4c717ea7"},
-    {file = "psycopg2_binary-2.9.9-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:75723c3c0fbbf34350b46a3199eb50638ab22a0228f93fb472ef4d9becc2382b"},
-    {file = "psycopg2_binary-2.9.9-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:83791a65b51ad6ee6cf0845634859d69a038ea9b03d7b26e703f94c7e93dbcf9"},
-    {file = "psycopg2_binary-2.9.9-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0ef4854e82c09e84cc63084a9e4ccd6d9b154f1dbdd283efb92ecd0b5e2b8c84"},
-    {file = "psycopg2_binary-2.9.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ed1184ab8f113e8d660ce49a56390ca181f2981066acc27cf637d5c1e10ce46e"},
-    {file = "psycopg2_binary-2.9.9-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:d2997c458c690ec2bc6b0b7ecbafd02b029b7b4283078d3b32a852a7ce3ddd98"},
-    {file = "psycopg2_binary-2.9.9-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:b58b4710c7f4161b5e9dcbe73bb7c62d65670a87df7bcce9e1faaad43e715245"},
-    {file = "psycopg2_binary-2.9.9-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:0c009475ee389757e6e34611d75f6e4f05f0cf5ebb76c6037508318e1a1e0d7e"},
-    {file = "psycopg2_binary-2.9.9-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8dbf6d1bc73f1d04ec1734bae3b4fb0ee3cb2a493d35ede9badbeb901fb40f6f"},
-    {file = "psycopg2_binary-2.9.9-cp310-cp310-win32.whl", hash = "sha256:3f78fd71c4f43a13d342be74ebbc0666fe1f555b8837eb113cb7416856c79682"},
-    {file = "psycopg2_binary-2.9.9-cp310-cp310-win_amd64.whl", hash = "sha256:876801744b0dee379e4e3c38b76fc89f88834bb15bf92ee07d94acd06ec890a0"},
-    {file = "psycopg2_binary-2.9.9-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ee825e70b1a209475622f7f7b776785bd68f34af6e7a46e2e42f27b659b5bc26"},
-    {file = "psycopg2_binary-2.9.9-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1ea665f8ce695bcc37a90ee52de7a7980be5161375d42a0b6c6abedbf0d81f0f"},
-    {file = "psycopg2_binary-2.9.9-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:143072318f793f53819048fdfe30c321890af0c3ec7cb1dfc9cc87aa88241de2"},
-    {file = "psycopg2_binary-2.9.9-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c332c8d69fb64979ebf76613c66b985414927a40f8defa16cf1bc028b7b0a7b0"},
-    {file = "psycopg2_binary-2.9.9-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f7fc5a5acafb7d6ccca13bfa8c90f8c51f13d8fb87d95656d3950f0158d3ce53"},
-    {file = "psycopg2_binary-2.9.9-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:977646e05232579d2e7b9c59e21dbe5261f403a88417f6a6512e70d3f8a046be"},
-    {file = "psycopg2_binary-2.9.9-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:b6356793b84728d9d50ead16ab43c187673831e9d4019013f1402c41b1db9b27"},
-    {file = "psycopg2_binary-2.9.9-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:bc7bb56d04601d443f24094e9e31ae6deec9ccb23581f75343feebaf30423359"},
-    {file = "psycopg2_binary-2.9.9-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:77853062a2c45be16fd6b8d6de2a99278ee1d985a7bd8b103e97e41c034006d2"},
-    {file = "psycopg2_binary-2.9.9-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:78151aa3ec21dccd5cdef6c74c3e73386dcdfaf19bced944169697d7ac7482fc"},
-    {file = "psycopg2_binary-2.9.9-cp311-cp311-win32.whl", hash = "sha256:dc4926288b2a3e9fd7b50dc6a1909a13bbdadfc67d93f3374d984e56f885579d"},
-    {file = "psycopg2_binary-2.9.9-cp311-cp311-win_amd64.whl", hash = "sha256:b76bedd166805480ab069612119ea636f5ab8f8771e640ae103e05a4aae3e417"},
-    {file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:8532fd6e6e2dc57bcb3bc90b079c60de896d2128c5d9d6f24a63875a95a088cf"},
-    {file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b0605eaed3eb239e87df0d5e3c6489daae3f7388d455d0c0b4df899519c6a38d"},
-    {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f8544b092a29a6ddd72f3556a9fcf249ec412e10ad28be6a0c0d948924f2212"},
-    {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2d423c8d8a3c82d08fe8af900ad5b613ce3632a1249fd6a223941d0735fce493"},
-    {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2e5afae772c00980525f6d6ecf7cbca55676296b580c0e6abb407f15f3706996"},
-    {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e6f98446430fdf41bd36d4faa6cb409f5140c1c2cf58ce0bbdaf16af7d3f119"},
-    {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:c77e3d1862452565875eb31bdb45ac62502feabbd53429fdc39a1cc341d681ba"},
-    {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:cb16c65dcb648d0a43a2521f2f0a2300f40639f6f8c1ecbc662141e4e3e1ee07"},
-    {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:911dda9c487075abd54e644ccdf5e5c16773470a6a5d3826fda76699410066fb"},
-    {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:57fede879f08d23c85140a360c6a77709113efd1c993923c59fde17aa27599fe"},
-    {file = "psycopg2_binary-2.9.9-cp312-cp312-win32.whl", hash = "sha256:64cf30263844fa208851ebb13b0732ce674d8ec6a0c86a4e160495d299ba3c93"},
-    {file = "psycopg2_binary-2.9.9-cp312-cp312-win_amd64.whl", hash = "sha256:81ff62668af011f9a48787564ab7eded4e9fb17a4a6a74af5ffa6a457400d2ab"},
-    {file = "psycopg2_binary-2.9.9-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:2293b001e319ab0d869d660a704942c9e2cce19745262a8aba2115ef41a0a42a"},
-    {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:03ef7df18daf2c4c07e2695e8cfd5ee7f748a1d54d802330985a78d2a5a6dca9"},
-    {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a602ea5aff39bb9fac6308e9c9d82b9a35c2bf288e184a816002c9fae930b77"},
-    {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8359bf4791968c5a78c56103702000105501adb557f3cf772b2c207284273984"},
-    {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:275ff571376626195ab95a746e6a04c7df8ea34638b99fc11160de91f2fef503"},
-    {file = "psycopg2_binary-2.9.9-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:f9b5571d33660d5009a8b3c25dc1db560206e2d2f89d3df1cb32d72c0d117d52"},
-    {file = "psycopg2_binary-2.9.9-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:420f9bbf47a02616e8554e825208cb947969451978dceb77f95ad09c37791dae"},
-    {file = "psycopg2_binary-2.9.9-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:4154ad09dac630a0f13f37b583eae260c6aa885d67dfbccb5b02c33f31a6d420"},
-    {file = "psycopg2_binary-2.9.9-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:a148c5d507bb9b4f2030a2025c545fccb0e1ef317393eaba42e7eabd28eb6041"},
-    {file = "psycopg2_binary-2.9.9-cp37-cp37m-win32.whl", hash = "sha256:68fc1f1ba168724771e38bee37d940d2865cb0f562380a1fb1ffb428b75cb692"},
-    {file = "psycopg2_binary-2.9.9-cp37-cp37m-win_amd64.whl", hash = "sha256:281309265596e388ef483250db3640e5f414168c5a67e9c665cafce9492eda2f"},
-    {file = "psycopg2_binary-2.9.9-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:60989127da422b74a04345096c10d416c2b41bd7bf2a380eb541059e4e999980"},
-    {file = "psycopg2_binary-2.9.9-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:246b123cc54bb5361588acc54218c8c9fb73068bf227a4a531d8ed56fa3ca7d6"},
-    {file = "psycopg2_binary-2.9.9-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:34eccd14566f8fe14b2b95bb13b11572f7c7d5c36da61caf414d23b91fcc5d94"},
-    {file = "psycopg2_binary-2.9.9-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:18d0ef97766055fec15b5de2c06dd8e7654705ce3e5e5eed3b6651a1d2a9a152"},
-    {file = "psycopg2_binary-2.9.9-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d3f82c171b4ccd83bbaf35aa05e44e690113bd4f3b7b6cc54d2219b132f3ae55"},
-    {file = "psycopg2_binary-2.9.9-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ead20f7913a9c1e894aebe47cccf9dc834e1618b7aa96155d2091a626e59c972"},
-    {file = "psycopg2_binary-2.9.9-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:ca49a8119c6cbd77375ae303b0cfd8c11f011abbbd64601167ecca18a87e7cdd"},
-    {file = "psycopg2_binary-2.9.9-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:323ba25b92454adb36fa425dc5cf6f8f19f78948cbad2e7bc6cdf7b0d7982e59"},
-    {file = "psycopg2_binary-2.9.9-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:1236ed0952fbd919c100bc839eaa4a39ebc397ed1c08a97fc45fee2a595aa1b3"},
-    {file = "psycopg2_binary-2.9.9-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:729177eaf0aefca0994ce4cffe96ad3c75e377c7b6f4efa59ebf003b6d398716"},
-    {file = "psycopg2_binary-2.9.9-cp38-cp38-win32.whl", hash = "sha256:804d99b24ad523a1fe18cc707bf741670332f7c7412e9d49cb5eab67e886b9b5"},
-    {file = "psycopg2_binary-2.9.9-cp38-cp38-win_amd64.whl", hash = "sha256:a6cdcc3ede532f4a4b96000b6362099591ab4a3e913d70bcbac2b56c872446f7"},
-    {file = "psycopg2_binary-2.9.9-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:72dffbd8b4194858d0941062a9766f8297e8868e1dd07a7b36212aaa90f49472"},
-    {file = "psycopg2_binary-2.9.9-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:30dcc86377618a4c8f3b72418df92e77be4254d8f89f14b8e8f57d6d43603c0f"},
-    {file = "psycopg2_binary-2.9.9-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:31a34c508c003a4347d389a9e6fcc2307cc2150eb516462a7a17512130de109e"},
-    {file = "psycopg2_binary-2.9.9-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:15208be1c50b99203fe88d15695f22a5bed95ab3f84354c494bcb1d08557df67"},
-    {file = "psycopg2_binary-2.9.9-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1873aade94b74715be2246321c8650cabf5a0d098a95bab81145ffffa4c13876"},
-    {file = "psycopg2_binary-2.9.9-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a58c98a7e9c021f357348867f537017057c2ed7f77337fd914d0bedb35dace7"},
-    {file = "psycopg2_binary-2.9.9-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4686818798f9194d03c9129a4d9a702d9e113a89cb03bffe08c6cf799e053291"},
-    {file = "psycopg2_binary-2.9.9-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:ebdc36bea43063116f0486869652cb2ed7032dbc59fbcb4445c4862b5c1ecf7f"},
-    {file = "psycopg2_binary-2.9.9-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:ca08decd2697fdea0aea364b370b1249d47336aec935f87b8bbfd7da5b2ee9c1"},
-    {file = "psycopg2_binary-2.9.9-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ac05fb791acf5e1a3e39402641827780fe44d27e72567a000412c648a85ba860"},
-    {file = "psycopg2_binary-2.9.9-cp39-cp39-win32.whl", hash = "sha256:9dba73be7305b399924709b91682299794887cbbd88e38226ed9f6712eabee90"},
-    {file = "psycopg2_binary-2.9.9-cp39-cp39-win_amd64.whl", hash = "sha256:f7ae5d65ccfbebdfa761585228eb4d0df3a8b15cfb53bd953e713e09fbb12957"},
+    {file = "psycopg2-binary-2.9.10.tar.gz", hash = "sha256:4b3df0e6990aa98acda57d983942eff13d824135fe2250e6522edaa782a06de2"},
+    {file = "psycopg2_binary-2.9.10-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:0ea8e3d0ae83564f2fc554955d327fa081d065c8ca5cc6d2abb643e2c9c1200f"},
+    {file = "psycopg2_binary-2.9.10-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:3e9c76f0ac6f92ecfc79516a8034a544926430f7b080ec5a0537bca389ee0906"},
+    {file = "psycopg2_binary-2.9.10-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2ad26b467a405c798aaa1458ba09d7e2b6e5f96b1ce0ac15d82fd9f95dc38a92"},
+    {file = "psycopg2_binary-2.9.10-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:270934a475a0e4b6925b5f804e3809dd5f90f8613621d062848dd82f9cd62007"},
+    {file = "psycopg2_binary-2.9.10-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:48b338f08d93e7be4ab2b5f1dbe69dc5e9ef07170fe1f86514422076d9c010d0"},
+    {file = "psycopg2_binary-2.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f4152f8f76d2023aac16285576a9ecd2b11a9895373a1f10fd9db54b3ff06b4"},
+    {file = "psycopg2_binary-2.9.10-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:32581b3020c72d7a421009ee1c6bf4a131ef5f0a968fab2e2de0c9d2bb4577f1"},
+    {file = "psycopg2_binary-2.9.10-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:2ce3e21dc3437b1d960521eca599d57408a695a0d3c26797ea0f72e834c7ffe5"},
+    {file = "psycopg2_binary-2.9.10-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:e984839e75e0b60cfe75e351db53d6db750b00de45644c5d1f7ee5d1f34a1ce5"},
+    {file = "psycopg2_binary-2.9.10-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3c4745a90b78e51d9ba06e2088a2fe0c693ae19cc8cb051ccda44e8df8a6eb53"},
+    {file = "psycopg2_binary-2.9.10-cp310-cp310-win32.whl", hash = "sha256:e5720a5d25e3b99cd0dc5c8a440570469ff82659bb09431c1439b92caf184d3b"},
+    {file = "psycopg2_binary-2.9.10-cp310-cp310-win_amd64.whl", hash = "sha256:3c18f74eb4386bf35e92ab2354a12c17e5eb4d9798e4c0ad3a00783eae7cd9f1"},
+    {file = "psycopg2_binary-2.9.10-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:04392983d0bb89a8717772a193cfaac58871321e3ec69514e1c4e0d4957b5aff"},
+    {file = "psycopg2_binary-2.9.10-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:1a6784f0ce3fec4edc64e985865c17778514325074adf5ad8f80636cd029ef7c"},
+    {file = "psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b5f86c56eeb91dc3135b3fd8a95dc7ae14c538a2f3ad77a19645cf55bab1799c"},
+    {file = "psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2b3d2491d4d78b6b14f76881905c7a8a8abcf974aad4a8a0b065273a0ed7a2cb"},
+    {file = "psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2286791ececda3a723d1910441c793be44625d86d1a4e79942751197f4d30341"},
+    {file = "psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:512d29bb12608891e349af6a0cccedce51677725a921c07dba6342beaf576f9a"},
+    {file = "psycopg2_binary-2.9.10-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:5a507320c58903967ef7384355a4da7ff3f28132d679aeb23572753cbf2ec10b"},
+    {file = "psycopg2_binary-2.9.10-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:6d4fa1079cab9018f4d0bd2db307beaa612b0d13ba73b5c6304b9fe2fb441ff7"},
+    {file = "psycopg2_binary-2.9.10-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:851485a42dbb0bdc1edcdabdb8557c09c9655dfa2ca0460ff210522e073e319e"},
+    {file = "psycopg2_binary-2.9.10-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:35958ec9e46432d9076286dda67942ed6d968b9c3a6a2fd62b48939d1d78bf68"},
+    {file = "psycopg2_binary-2.9.10-cp311-cp311-win32.whl", hash = "sha256:ecced182e935529727401b24d76634a357c71c9275b356efafd8a2a91ec07392"},
+    {file = "psycopg2_binary-2.9.10-cp311-cp311-win_amd64.whl", hash = "sha256:ee0e8c683a7ff25d23b55b11161c2663d4b099770f6085ff0a20d4505778d6b4"},
+    {file = "psycopg2_binary-2.9.10-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:880845dfe1f85d9d5f7c412efea7a08946a46894537e4e5d091732eb1d34d9a0"},
+    {file = "psycopg2_binary-2.9.10-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:9440fa522a79356aaa482aa4ba500b65f28e5d0e63b801abf6aa152a29bd842a"},
+    {file = "psycopg2_binary-2.9.10-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e3923c1d9870c49a2d44f795df0c889a22380d36ef92440ff618ec315757e539"},
+    {file = "psycopg2_binary-2.9.10-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7b2c956c028ea5de47ff3a8d6b3cc3330ab45cf0b7c3da35a2d6ff8420896526"},
+    {file = "psycopg2_binary-2.9.10-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f758ed67cab30b9a8d2833609513ce4d3bd027641673d4ebc9c067e4d208eec1"},
+    {file = "psycopg2_binary-2.9.10-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8cd9b4f2cfab88ed4a9106192de509464b75a906462fb846b936eabe45c2063e"},
+    {file = "psycopg2_binary-2.9.10-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6dc08420625b5a20b53551c50deae6e231e6371194fa0651dbe0fb206452ae1f"},
+    {file = "psycopg2_binary-2.9.10-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:d7cd730dfa7c36dbe8724426bf5612798734bff2d3c3857f36f2733f5bfc7c00"},
+    {file = "psycopg2_binary-2.9.10-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:155e69561d54d02b3c3209545fb08938e27889ff5a10c19de8d23eb5a41be8a5"},
+    {file = "psycopg2_binary-2.9.10-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c3cc28a6fd5a4a26224007712e79b81dbaee2ffb90ff406256158ec4d7b52b47"},
+    {file = "psycopg2_binary-2.9.10-cp312-cp312-win32.whl", hash = "sha256:ec8a77f521a17506a24a5f626cb2aee7850f9b69a0afe704586f63a464f3cd64"},
+    {file = "psycopg2_binary-2.9.10-cp312-cp312-win_amd64.whl", hash = "sha256:18c5ee682b9c6dd3696dad6e54cc7ff3a1a9020df6a5c0f861ef8bfd338c3ca0"},
+    {file = "psycopg2_binary-2.9.10-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:26540d4a9a4e2b096f1ff9cce51253d0504dca5a85872c7f7be23be5a53eb18d"},
+    {file = "psycopg2_binary-2.9.10-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:e217ce4d37667df0bc1c397fdcd8de5e81018ef305aed9415c3b093faaeb10fb"},
+    {file = "psycopg2_binary-2.9.10-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:245159e7ab20a71d989da00f280ca57da7641fa2cdcf71749c193cea540a74f7"},
+    {file = "psycopg2_binary-2.9.10-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3c4ded1a24b20021ebe677b7b08ad10bf09aac197d6943bfe6fec70ac4e4690d"},
+    {file = "psycopg2_binary-2.9.10-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3abb691ff9e57d4a93355f60d4f4c1dd2d68326c968e7db17ea96df3c023ef73"},
+    {file = "psycopg2_binary-2.9.10-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8608c078134f0b3cbd9f89b34bd60a943b23fd33cc5f065e8d5f840061bd0673"},
+    {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:230eeae2d71594103cd5b93fd29d1ace6420d0b86f4778739cb1a5a32f607d1f"},
+    {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:bb89f0a835bcfc1d42ccd5f41f04870c1b936d8507c6df12b7737febc40f0909"},
+    {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:f0c2d907a1e102526dd2986df638343388b94c33860ff3bbe1384130828714b1"},
+    {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f8157bed2f51db683f31306aa497311b560f2265998122abe1dce6428bd86567"},
+    {file = "psycopg2_binary-2.9.10-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:eb09aa7f9cecb45027683bb55aebaaf45a0df8bf6de68801a6afdc7947bb09d4"},
+    {file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b73d6d7f0ccdad7bc43e6d34273f70d587ef62f824d7261c4ae9b8b1b6af90e8"},
+    {file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ce5ab4bf46a211a8e924d307c1b1fcda82368586a19d0a24f8ae166f5c784864"},
+    {file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:056470c3dc57904bbf63d6f534988bafc4e970ffd50f6271fc4ee7daad9498a5"},
+    {file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:73aa0e31fa4bb82578f3a6c74a73c273367727de397a7a0f07bd83cbea696baa"},
+    {file = "psycopg2_binary-2.9.10-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:8de718c0e1c4b982a54b41779667242bc630b2197948405b7bd8ce16bcecac92"},
+    {file = "psycopg2_binary-2.9.10-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:5c370b1e4975df846b0277b4deba86419ca77dbc25047f535b0bb03d1a544d44"},
+    {file = "psycopg2_binary-2.9.10-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:ffe8ed017e4ed70f68b7b371d84b7d4a790368db9203dfc2d222febd3a9c8863"},
+    {file = "psycopg2_binary-2.9.10-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:8aecc5e80c63f7459a1a2ab2c64df952051df196294d9f739933a9f6687e86b3"},
+    {file = "psycopg2_binary-2.9.10-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:7a813c8bdbaaaab1f078014b9b0b13f5de757e2b5d9be6403639b298a04d218b"},
+    {file = "psycopg2_binary-2.9.10-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d00924255d7fc916ef66e4bf22f354a940c67179ad3fd7067d7a0a9c84d2fbfc"},
+    {file = "psycopg2_binary-2.9.10-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7559bce4b505762d737172556a4e6ea8a9998ecac1e39b5233465093e8cee697"},
+    {file = "psycopg2_binary-2.9.10-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e8b58f0a96e7a1e341fc894f62c1177a7c83febebb5ff9123b579418fdc8a481"},
+    {file = "psycopg2_binary-2.9.10-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b269105e59ac96aba877c1707c600ae55711d9dcd3fc4b5012e4af68e30c648"},
+    {file = "psycopg2_binary-2.9.10-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:79625966e176dc97ddabc142351e0409e28acf4660b88d1cf6adb876d20c490d"},
+    {file = "psycopg2_binary-2.9.10-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:8aabf1c1a04584c168984ac678a668094d831f152859d06e055288fa515e4d30"},
+    {file = "psycopg2_binary-2.9.10-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:19721ac03892001ee8fdd11507e6a2e01f4e37014def96379411ca99d78aeb2c"},
+    {file = "psycopg2_binary-2.9.10-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7f5d859928e635fa3ce3477704acee0f667b3a3d3e4bb109f2b18d4005f38287"},
+    {file = "psycopg2_binary-2.9.10-cp39-cp39-win32.whl", hash = "sha256:3216ccf953b3f267691c90c6fe742e45d890d8272326b4a8b20850a03d05b7b8"},
+    {file = "psycopg2_binary-2.9.10-cp39-cp39-win_amd64.whl", hash = "sha256:30e34c4e97964805f715206c7b789d54a78b70f3ff19fbe590104b71c45600e5"},
 ]

 [[package]]
@@ -3013,13 +3008,13 @@ files = [

 [[package]]
 name = "types-psycopg2"
-version = "2.9.21.10"
+version = "2.9.21.20241019"
 description = "Typing stubs for psycopg2"
 optional = false
-python-versions = "*"
+python-versions = ">=3.8"
 files = [
-    {file = "types-psycopg2-2.9.21.10.tar.gz", hash = "sha256:c2600892312ae1c34e12f145749795d93dc4eac3ef7dbf8a9c1bfd45385e80d7"},
-    {file = "types_psycopg2-2.9.21.10-py3-none-any.whl", hash = "sha256:918224a0731a3650832e46633e720703b5beef7693a064e777d9748654fcf5e5"},
+    {file = "types-psycopg2-2.9.21.20241019.tar.gz", hash = "sha256:bca89b988d2ebd19bcd08b177d22a877ea8b841decb10ed130afcf39404612fa"},
+    {file = "types_psycopg2-2.9.21.20241019-py3-none-any.whl", hash = "sha256:44d091e67732d16a941baae48cd7b53bf91911bc36888652447cf1ef0c1fb3f6"},
 ]

 [[package]]
@@ -3489,4 +3484,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "13bfc7479aacfe051abb92252b8ddc2e0c429f4607b2d9d8c4b353d2f75c1927"
+content-hash = "c656496f9fbb7c29b2df3143c1d72c95b5e121cb6340134c0b8d070f54a08508"
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -23,7 +23,7 @@ bstr.workspace = true
 bytes = { workspace = true, features = ["serde"] }
 camino.workspace = true
 chrono.workspace = true
-clap.workspace = true
+clap = { workspace = true, features = ["derive", "env"] }
 compute_api.workspace = true
 consumption_metrics.workspace = true
 dashmap.workspace = true
@@ -60,7 +60,7 @@ prometheus.workspace = true
 rand.workspace = true
 regex.workspace = true
 remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
-reqwest.workspace = true
+reqwest = { workspace = true, features = ["rustls-tls-native-roots"] }
 reqwest-middleware = { workspace = true, features = ["json"] }
 reqwest-retry.workspace = true
 reqwest-tracing.workspace = true
@@ -98,7 +98,7 @@ rustls-native-certs.workspace = true
 x509-parser.workspace = true
 postgres-protocol.workspace = true
 redis.workspace = true
-zerocopy = { version = "0.8", features = ["derive"] }
+zerocopy.workspace = true

 # jwt stuff
 jose-jwa = "0.1.2"
--- a/proxy/src/auth/backend/classic.rs
+++ b/proxy/src/auth/backend/classic.rs
@@ -51,7 +51,7 @@ pub(super) async fn authenticate(
                sasl::Outcome::Success(key) => key,
                sasl::Outcome::Failure(reason) => {
                    info!("auth backend failed with an error: {reason}");
-                    return Err(auth::AuthError::auth_failed(&*creds.user));
+                    return Err(auth::AuthError::password_failed(&*creds.user));
                }
            };

--- a/proxy/src/auth/backend/console_redirect.rs
+++ b/proxy/src/auth/backend/console_redirect.rs
@@ -9,15 +9,14 @@ use super::ComputeCredentialKeys;
 use crate::cache::Cached;
 use crate::config::AuthenticationConfig;
 use crate::context::RequestMonitoring;
-use crate::control_plane::provider::NodeInfo;
-use crate::control_plane::{self, CachedNodeInfo};
+use crate::control_plane::{self, CachedNodeInfo, NodeInfo};
 use crate::error::{ReportableError, UserFacingError};
 use crate::proxy::connect_compute::ComputeConnectBackend;
 use crate::stream::PqStream;
 use crate::{auth, compute, waiters};

 #[derive(Debug, Error)]
-pub(crate) enum WebAuthError {
+pub(crate) enum ConsoleRedirectError {
    #[error(transparent)]
    WaiterRegister(#[from] waiters::RegisterError),

@@ -33,13 +32,13 @@ pub struct ConsoleRedirectBackend {
    console_uri: reqwest::Url,
 }

-impl UserFacingError for WebAuthError {
+impl UserFacingError for ConsoleRedirectError {
    fn to_string_client(&self) -> String {
        "Internal error".to_string()
    }
 }

-impl ReportableError for WebAuthError {
+impl ReportableError for ConsoleRedirectError {
    fn get_error_kind(&self) -> crate::error::ErrorKind {
        match self {
            Self::WaiterRegister(_) => crate::error::ErrorKind::Service,
@@ -104,7 +103,7 @@ async fn authenticate(
    link_uri: &reqwest::Url,
    client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
 ) -> auth::Result<NodeInfo> {
-    ctx.set_auth_method(crate::context::AuthMethod::Web);
+    ctx.set_auth_method(crate::context::AuthMethod::ConsoleRedirect);

    // registering waiter can fail if we get unlucky with rng.
    // just try again.
@@ -117,7 +116,7 @@ async fn authenticate(
        }
    };

-    let span = info_span!("web", psql_session_id = &psql_session_id);
+    let span = info_span!("console_redirect", psql_session_id = &psql_session_id);
    let greeting = hello_message(link_uri, &psql_session_id);

    // Give user a URL to spawn a new database.
@@ -128,14 +127,16 @@ async fn authenticate(
        .write_message(&Be::NoticeResponse(&greeting))
        .await?;

-    // Wait for web console response (see `mgmt`).
+    // Wait for console response via control plane (see `mgmt`).
    info!(parent: &span, "waiting for console's reply...");
-    let db_info = tokio::time::timeout(auth_config.webauth_confirmation_timeout, waiter)
+    let db_info = tokio::time::timeout(auth_config.console_redirect_confirmation_timeout, waiter)
        .await
        .map_err(|_elapsed| {
-            auth::AuthError::confirmation_timeout(auth_config.webauth_confirmation_timeout.into())
+            auth::AuthError::confirmation_timeout(
+                auth_config.console_redirect_confirmation_timeout.into(),
+            )
        })?
-        .map_err(WebAuthError::from)?;
+        .map_err(ConsoleRedirectError::from)?;

    if auth_config.ip_allowlist_check_enabled {
        if let Some(allowed_ips) = &db_info.allowed_ips {
--- a/proxy/src/auth/backend/hacks.rs
+++ b/proxy/src/auth/backend/hacks.rs
@@ -46,7 +46,7 @@ pub(crate) async fn authenticate_cleartext(
        sasl::Outcome::Success(key) => key,
        sasl::Outcome::Failure(reason) => {
            info!("auth backend failed with an error: {reason}");
-            return Err(auth::AuthError::auth_failed(&*info.user));
+            return Err(auth::AuthError::password_failed(&*info.user));
        }
    };

--- a/Show More
+++ b/Show More
				`@@ -1 +0,0 @@`
				`SELECT neon.backpressure_throttling_time() AS throttled;`
				`@@ -0,0 +1 @@`
				`SELECT (neon.backpressure_throttling_time()::float8 / 1000000) AS throttled;`
				`@@ -0,0 +1 @@`
				`SELECT current_setting('max_connections') as max_connections;`