more wip

wip
wal_decoder: make InterpretedWalRecord serde
2026-05-30 19:40:39 +00:00 · 2024-11-06 19:41:22 +01:00 · 2024-11-06 16:13:14 +01:00 · 2024-11-06 16:13:14 +01:00 · 2024-11-06 16:13:14 +01:00 · 2024-11-06 16:13:14 +01:00
158 changed files with 1765 additions and 4186 deletions
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -20,4 +20,3 @@ config-variables:
  - REMOTE_STORAGE_AZURE_REGION
  - SLACK_UPCOMING_RELEASE_CHANNEL_ID
  - DEV_AWS_OIDC_ROLE_ARN
-  - BENCHMARK_INGEST_TARGET_PROJECTID
--- a/.github/actions/allure-report-generate/action.yml
+++ b/.github/actions/allure-report-generate/action.yml
@@ -221,8 +221,6 @@ runs:
        REPORT_URL: ${{ steps.generate-report.outputs.report-url }}
        COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
      with:
-        # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
-        retries: 5
        script: |
          const { REPORT_URL, COMMIT_SHA } = process.env

--- a/.github/actions/set-docker-config-dir/action.yml
+++ b/.github/actions/set-docker-config-dir/action.yml
@@ -0,0 +1,36 @@
+name: "Set custom docker config directory"
+description: "Create a directory for docker config and set DOCKER_CONFIG"
+
+# Use a per-run DOCKER_CONFIG directory (removed again in the post step) to avoid conflicts with default settings
+runs:
+  using: "composite"
+  steps:
+  - name: Show warning on GitHub-hosted runners
+    if: runner.environment == 'github-hosted'
+    shell: bash -euo pipefail {0}
+    run: |
+      # Derive the path of the calling workflow file from the following environment variables:
+      # ${GITHUB_WORKFLOW_REF} - octocat/hello-world/.github/workflows/my-workflow.yml@refs/heads/my_branch
+      # ${GITHUB_REPOSITORY}   - octocat/hello-world
+      # ${GITHUB_REF}          - refs/heads/my_branch
+      # From https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/variables
+
+      filename_with_ref=${GITHUB_WORKFLOW_REF#"$GITHUB_REPOSITORY/"}
+      filename=${filename_with_ref%"@$GITHUB_REF"}
+
+      # https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/workflow-commands-for-github-actions#setting-a-warning-message
+      title='Unnecessary usage of `.github/actions/set-docker-config-dir`'
+      message='No need to use `.github/actions/set-docker-config-dir` action on GitHub-hosted runners'
+      echo "::warning file=${filename},title=${title}::${message}"
+
+  - uses: pyTooling/Actions/with-post-step@74afc5a42a17a046c90c68cb5cfa627e5c6c5b6b # v1.0.7
+    env:
+      DOCKER_CONFIG: .docker-custom-${{ github.run_id }}-${{ github.run_attempt }}
+    with:
+      main: |
+        mkdir -p "${DOCKER_CONFIG}"
+        echo "DOCKER_CONFIG=${DOCKER_CONFIG}" | tee -a "${GITHUB_ENV}"
+      post: |
+        if [ -d "${DOCKER_CONFIG}" ]; then
+          rm -r "${DOCKER_CONFIG}"
+        fi
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -1,3 +1,14 @@
 ## Problem

 ## Summary of changes
+
+## Checklist before requesting a review
+
+- [ ] I have performed a self-review of my code.
+- [ ] If it is a core feature, I have added thorough tests.
+- [ ] Do we need to implement analytics? If so, did you add the relevant metrics to the dashboard?
+- [ ] If this PR requires a public announcement, mark it with the /release-notes label and add several sentences in this section.
+
+## Checklist before merging
+
+- [ ] Do not forget to reformat the commit message so that it does not include the above checklist
--- a/.github/workflows/_check-codestyle-python.yml
+++ b/.github/workflows/_check-codestyle-python.yml
@@ -1,37 +0,0 @@
-name: Check Codestyle Python
-
-on:
-  workflow_call:
-    inputs:
-      build-tools-image:
-        description: 'build-tools image'
-        required: true
-        type: string
-
-defaults:
-  run:
-    shell: bash -euxo pipefail {0}
-
-jobs:
-  check-codestyle-python:
-    runs-on: [ self-hosted, small ]
-    container:
-      image: ${{ inputs.build-tools-image }}
-      credentials:
-        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
-        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
-      options: --init
-
-    steps:
-      - uses: actions/checkout@v4
-
-      - uses: actions/cache@v4
-        with:
-          path: ~/.cache/pypoetry/virtualenvs
-          key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }}
-
-      - run: ./scripts/pysync
-
-      - run: poetry run ruff check .
-      - run: poetry run ruff format --check .
-      - run: poetry run mypy .
--- a/.github/workflows/build-build-tools-image.yml
+++ b/.github/workflows/build-build-tools-image.yml
@@ -64,7 +64,7 @@ jobs:

      - uses: actions/checkout@v4

-      - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193
+      - uses: ./.github/actions/set-docker-config-dir
      - uses: docker/setup-buildx-action@v3
        with:
          cache-binary: false
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -90,10 +90,35 @@ jobs:

  check-codestyle-python:
    needs: [ check-permissions, build-build-tools-image ]
-    uses: ./.github/workflows/_check-codestyle-python.yml
-    with:
-      build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
-    secrets: inherit
+    runs-on: [ self-hosted, small ]
+    container:
+      image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+      options: --init
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Cache poetry deps
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/pypoetry/virtualenvs
+          key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }}
+
+      - name: Install Python deps
+        run: ./scripts/pysync
+
+      - name: Run `ruff check` to lint code
+        run: poetry run ruff check .
+
+      - name: Run `ruff format` to ensure code format
+        run: poetry run ruff format --check .
+
+      - name: Run mypy to check types
+        run: poetry run mypy .

  check-codestyle-jsonnet:
    needs: [ check-permissions, build-build-tools-image ]
@@ -116,7 +141,6 @@ jobs:
  # Check that the vendor/postgres-* submodules point to the
  # corresponding REL_*_STABLE_neon branches.
  check-submodules:
-    needs: [ check-permissions ]
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout
@@ -497,8 +521,6 @@ jobs:
          REPORT_URL_NEW: ${{ steps.upload-coverage-report-new.outputs.report-url }}
          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
        with:
-          # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
-          retries: 5
          script: |
            const { REPORT_URL_NEW, COMMIT_SHA } = process.env

@@ -530,7 +552,7 @@ jobs:
        with:
          submodules: true

-      - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193
+      - uses: ./.github/actions/set-docker-config-dir
      - uses: docker/setup-buildx-action@v3
        with:
          cache-binary: false
@@ -621,7 +643,7 @@ jobs:
        with:
          submodules: true

-      - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193
+      - uses: ./.github/actions/set-docker-config-dir
      - uses: docker/setup-buildx-action@v3
        with:
          cache-binary: false
@@ -802,7 +824,7 @@ jobs:
          curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder
          chmod +x vm-builder

-      - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193
+      - uses: ./.github/actions/set-docker-config-dir
      - uses: docker/login-action@v3
        with:
          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
@@ -838,7 +860,7 @@ jobs:
    steps:
      - uses: actions/checkout@v4

-      - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193
+      - uses: ./.github/actions/set-docker-config-dir
      - uses: docker/login-action@v3
        with:
          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
--- a/.github/workflows/ingest_benchmark.yml
+++ b/.github/workflows/ingest_benchmark.yml
@@ -1,372 +0,0 @@
-name: Benchmarking
-
-on:
-  # uncomment to run on push for debugging your PR
-  # push:
-  #   branches: [ your branch ]
-  schedule:
-    # * is a special character in YAML so you have to quote this string
-    #          ┌───────────── minute (0 - 59)
-    #          │ ┌───────────── hour (0 - 23)
-    #          │ │ ┌───────────── day of the month (1 - 31)
-    #          │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
-    #          │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
-    - cron:   '0 9 * * *' # run once a day, timezone is utc
-  workflow_dispatch: # adds ability to run this manually
-    
-defaults:
-  run:
-    shell: bash -euxo pipefail {0}
-
-concurrency:
-  # Allow only one workflow globally because we need dedicated resources which only exist once
-  group: ingest-bench-workflow
-  cancel-in-progress: true
-
-jobs:
-  ingest:
-    strategy:
-      matrix:
-        target_project: [new_empty_project, large_existing_project]  
-    permissions:
-      contents: write
-      statuses: write
-      id-token: write # aws-actions/configure-aws-credentials
-    env:
-      PG_CONFIG: /tmp/neon/pg_install/v16/bin/pg_config
-      PSQL: /tmp/neon/pg_install/v16/bin/psql
-      PG_16_LIB_PATH: /tmp/neon/pg_install/v16/lib
-      PGCOPYDB: /pgcopydb/bin/pgcopydb
-      PGCOPYDB_LIB_PATH: /pgcopydb/lib
-    runs-on: [ self-hosted, us-east-2, x64 ]
-    container:
-      image: neondatabase/build-tools:pinned-bookworm
-      credentials:
-        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
-        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
-      options: --init
-    timeout-minutes: 1440
-
-    steps:
-    - uses: actions/checkout@v4
-
-    - name: Configure AWS credentials # necessary to download artefacts
-      uses: aws-actions/configure-aws-credentials@v4
-      with:
-        aws-region: eu-central-1
-        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
-        role-duration-seconds: 18000 # 5 hours is currently max associated with IAM role 
-
-    - name: Download Neon artifact
-      uses: ./.github/actions/download
-      with:
-        name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
-        path: /tmp/neon/
-        prefix: latest
-
-    - name: Create Neon Project
-      if: ${{ matrix.target_project == 'new_empty_project' }}
-      id: create-neon-project-ingest-target
-      uses: ./.github/actions/neon-project-create
-      with:
-        region_id: aws-us-east-2
-        postgres_version: 16
-        compute_units: '[7, 7]' # we want to test large compute here to avoid compute-side bottleneck
-        api_key: ${{ secrets.NEON_STAGING_API_KEY }}
-
-    - name: Initialize Neon project and retrieve current backpressure seconds
-      if: ${{ matrix.target_project == 'new_empty_project' }}
-      env:
-          NEW_PROJECT_CONNSTR: ${{ steps.create-neon-project-ingest-target.outputs.dsn }}
-          NEW_PROJECT_ID: ${{ steps.create-neon-project-ingest-target.outputs.project_id }}
-      run: |
-        echo "Initializing Neon project with project_id: ${NEW_PROJECT_ID}"
-        export LD_LIBRARY_PATH=${PG_16_LIB_PATH}
-        ${PSQL} "${NEW_PROJECT_CONNSTR}" -c "CREATE EXTENSION IF NOT EXISTS neon; CREATE EXTENSION IF NOT EXISTS neon_utils;"
-        BACKPRESSURE_TIME_BEFORE_INGEST=$(${PSQL} "${NEW_PROJECT_CONNSTR}" -t -c "select backpressure_throttling_time()/1000000;")
-        echo "BACKPRESSURE_TIME_BEFORE_INGEST=${BACKPRESSURE_TIME_BEFORE_INGEST}" >> $GITHUB_ENV
-        echo "NEW_PROJECT_CONNSTR=${NEW_PROJECT_CONNSTR}" >> $GITHUB_ENV
-
-    - name: Create Neon Branch for large tenant
-      if: ${{ matrix.target_project == 'large_existing_project' }}
-      id: create-neon-branch-ingest-target
-      uses: ./.github/actions/neon-branch-create
-      with:
-        project_id: ${{ vars.BENCHMARK_INGEST_TARGET_PROJECTID }}
-        api_key: ${{ secrets.NEON_STAGING_API_KEY }}
-
-    - name: Initialize Neon project and retrieve current backpressure seconds
-      if: ${{ matrix.target_project == 'large_existing_project' }}
-      env:
-          NEW_PROJECT_CONNSTR: ${{ steps.create-neon-branch-ingest-target.outputs.dsn }}
-          NEW_BRANCH_ID: ${{ steps.create-neon-branch-ingest-target.outputs.branch_id }}
-      run: |
-        echo "Initializing Neon branch with branch_id: ${NEW_BRANCH_ID}"
-        export LD_LIBRARY_PATH=${PG_16_LIB_PATH}
-        # Extract the part before the database name
-        base_connstr="${NEW_PROJECT_CONNSTR%/*}"
-        # Extract the query parameters (if any) after the database name
-        query_params="${NEW_PROJECT_CONNSTR#*\?}"
-        # Reconstruct the new connection string
-        if [ "$query_params" != "$NEW_PROJECT_CONNSTR" ]; then
-          new_connstr="${base_connstr}/neondb?${query_params}"
-        else
-          new_connstr="${base_connstr}/neondb"
-        fi
-        ${PSQL} "${new_connstr}" -c "drop database ludicrous;"
-        ${PSQL} "${new_connstr}" -c "CREATE DATABASE ludicrous;"
-        if [ "$query_params" != "$NEW_PROJECT_CONNSTR" ]; then
-          NEW_PROJECT_CONNSTR="${base_connstr}/ludicrous?${query_params}"
-        else
-          NEW_PROJECT_CONNSTR="${base_connstr}/ludicrous"
-        fi
-        ${PSQL} "${NEW_PROJECT_CONNSTR}" -c "CREATE EXTENSION IF NOT EXISTS neon; CREATE EXTENSION IF NOT EXISTS neon_utils;"
-        BACKPRESSURE_TIME_BEFORE_INGEST=$(${PSQL} "${NEW_PROJECT_CONNSTR}" -t -c "select backpressure_throttling_time()/1000000;")
-        echo "BACKPRESSURE_TIME_BEFORE_INGEST=${BACKPRESSURE_TIME_BEFORE_INGEST}" >> $GITHUB_ENV
-        echo "NEW_PROJECT_CONNSTR=${NEW_PROJECT_CONNSTR}" >> $GITHUB_ENV
-      
-        
-    - name: Create pgcopydb filter file
-      run: |
-        cat << EOF > /tmp/pgcopydb_filter.txt
-          [include-only-table]
-          public.events
-          public.emails
-          public.email_transmissions
-          public.payments
-          public.editions
-          public.edition_modules
-          public.sp_content
-          public.email_broadcasts
-          public.user_collections
-          public.devices
-          public.user_accounts
-          public.lessons
-          public.lesson_users
-          public.payment_methods
-          public.orders
-          public.course_emails
-          public.modules
-          public.users
-          public.module_users
-          public.courses
-          public.payment_gateway_keys
-          public.accounts
-          public.roles
-          public.payment_gateways
-          public.management
-          public.event_names
-        EOF
-
-    - name: Invoke pgcopydb
-      env:
-          BENCHMARK_INGEST_SOURCE_CONNSTR: ${{ secrets.BENCHMARK_INGEST_SOURCE_CONNSTR }}
-      run: |
-        export LD_LIBRARY_PATH=${PGCOPYDB_LIB_PATH}:${PG_16_LIB_PATH}
-        export PGCOPYDB_SOURCE_PGURI="${BENCHMARK_INGEST_SOURCE_CONNSTR}"
-        export PGCOPYDB_TARGET_PGURI="${NEW_PROJECT_CONNSTR}"
-        export PGOPTIONS="-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7"
-        ${PG_CONFIG} --bindir
-        ${PGCOPYDB} --version
-        ${PGCOPYDB} clone --skip-vacuum  --no-owner --no-acl --skip-db-properties --table-jobs 4 \
-          --index-jobs 4 --restore-jobs 4 --split-tables-larger-than 10GB --skip-extensions \
-          --use-copy-binary --filters /tmp/pgcopydb_filter.txt 2>&1 | tee /tmp/pgcopydb_${{ matrix.target_project }}.log
-
-    # create dummy pgcopydb log to test parsing
-    # - name: create dummy log for parser test
-    #   run: |
-    #     cat << EOF > /tmp/pgcopydb_${{ matrix.target_project }}.log
-    #     2024-11-04 18:00:53.433 500861 INFO   main.c:136                Running pgcopydb version 0.17.10.g8361a93 from "/usr/lib/postgresql/17/bin/pgcopydb"
-    #     2024-11-04 18:00:53.434 500861 INFO   cli_common.c:1225         [SOURCE] Copying database from "postgres://neondb_owner@ep-bitter-shape-w2c1ir0a.us-east-2.aws.neon.build/neondb?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60"
-    #     2024-11-04 18:00:53.434 500861 INFO   cli_common.c:1226         [TARGET] Copying database into "postgres://neondb_owner@ep-icy-union-w25qd5pj.us-east-2.aws.neon.build/ludicrous?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60"
-    #     2024-11-04 18:00:53.442 500861 INFO   copydb.c:105              Using work dir "/tmp/pgcopydb"
-    #     2024-11-04 18:00:53.541 500861 INFO   snapshot.c:107            Exported snapshot "00000008-00000033-1" from the source database
-    #     2024-11-04 18:00:53.556 500865 INFO   cli_clone_follow.c:543    STEP 1: fetch source database tables, indexes, and sequences
-    #     2024-11-04 18:00:54.570 500865 INFO   copydb_schema.c:716       Splitting source candidate tables larger than 10 GB
-    #     2024-11-04 18:00:54.570 500865 INFO   copydb_schema.c:829       Table public.events is 96 GB large which is larger than --split-tables-larger-than 10 GB, and does not have a unique column of type integer: splitting by CTID
-    #     2024-11-04 18:01:05.538 500865 INFO   copydb_schema.c:905       Table public.events is 96 GB large, 10 COPY processes will be used, partitioning on ctid.
-    #     2024-11-04 18:01:05.564 500865 INFO   copydb_schema.c:905       Table public.email_transmissions is 27 GB large, 4 COPY processes will be used, partitioning on id.
-    #     2024-11-04 18:01:05.584 500865 INFO   copydb_schema.c:905       Table public.lessons is 25 GB large, 4 COPY processes will be used, partitioning on id.
-    #     2024-11-04 18:01:05.605 500865 INFO   copydb_schema.c:905       Table public.lesson_users is 16 GB large, 3 COPY processes will be used, partitioning on id.
-    #     2024-11-04 18:01:05.605 500865 INFO   copydb_schema.c:761       Fetched information for 26 tables (including 4 tables split in 21 partitions total), with an estimated total of 907 million tuples and 175 GB on-disk
-    #     2024-11-04 18:01:05.687 500865 INFO   copydb_schema.c:968       Fetched information for 57 indexes (supporting 25 constraints)
-    #     2024-11-04 18:01:05.753 500865 INFO   sequences.c:78            Fetching information for 24 sequences
-    #     2024-11-04 18:01:05.903 500865 INFO   copydb_schema.c:1122      Fetched information for 4 extensions
-    #     2024-11-04 18:01:06.178 500865 INFO   copydb_schema.c:1538      Found 0 indexes (supporting 0 constraints) in the target database
-    #     2024-11-04 18:01:06.184 500865 INFO   cli_clone_follow.c:584    STEP 2: dump the source database schema (pre/post data)
-    #     2024-11-04 18:01:06.186 500865 INFO   pgcmd.c:468                /usr/lib/postgresql/16/bin/pg_dump -Fc --snapshot 00000008-00000033-1 --section=pre-data --section=post-data --file /tmp/pgcopydb/schema/schema.dump 'postgres://neondb_owner@ep-bitter-shape-w2c1ir0a.us-east-2.aws.neon.build/neondb?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60'
-    #     2024-11-04 18:01:06.952 500865 INFO   cli_clone_follow.c:592    STEP 3: restore the pre-data section to the target database
-    #     2024-11-04 18:01:07.004 500865 INFO   pgcmd.c:1001               /usr/lib/postgresql/16/bin/pg_restore --dbname 'postgres://neondb_owner@ep-icy-union-w25qd5pj.us-east-2.aws.neon.build/ludicrous?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60' --section pre-data --jobs 4 --no-owner --no-acl --use-list /tmp/pgcopydb/schema/pre-filtered.list /tmp/pgcopydb/schema/schema.dump
-    #     2024-11-04 18:01:07.438 500874 INFO   table-data.c:656          STEP 4: starting 4 table-data COPY processes
-    #     2024-11-04 18:01:07.451 500877 INFO   vacuum.c:139              STEP 8: skipping VACUUM jobs per --skip-vacuum
-    #     2024-11-04 18:01:07.457 500875 INFO   indexes.c:182             STEP 6: starting 4 CREATE INDEX processes
-    #     2024-11-04 18:01:07.457 500875 INFO   indexes.c:183             STEP 7: constraints are built by the CREATE INDEX processes
-    #     2024-11-04 18:01:07.507 500865 INFO   blobs.c:74                Skipping large objects: none found.
-    #     2024-11-04 18:01:07.509 500865 INFO   sequences.c:194           STEP 9: reset sequences values
-    #     2024-11-04 18:01:07.510 500886 INFO   sequences.c:290           Set sequences values on the target database
-    #     2024-11-04 20:49:00.587 500865 INFO   cli_clone_follow.c:608    STEP 10: restore the post-data section to the target database
-    #     2024-11-04 20:49:00.600 500865 INFO   pgcmd.c:1001               /usr/lib/postgresql/16/bin/pg_restore --dbname 'postgres://neondb_owner@ep-icy-union-w25qd5pj.us-east-2.aws.neon.build/ludicrous?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60' --section post-data --jobs 4 --no-owner --no-acl --use-list /tmp/pgcopydb/schema/post-filtered.list /tmp/pgcopydb/schema/schema.dump
-    #     2024-11-05 10:50:58.508 500865 INFO   cli_clone_follow.c:639    All step are now done, 16h49m elapsed
-    #     2024-11-05 10:50:58.508 500865 INFO   summary.c:3155            Printing summary for 26 tables and 57 indexes
-
-    #       OID | Schema |                 Name | Parts | copy duration | transmitted bytes | indexes | create index duration 
-    #     ------+--------+----------------------+-------+---------------+-------------------+---------+----------------------
-    #     24654 | public |               events |    10 |         1d11h |            878 GB |       1 |                 1h41m
-    #     24623 | public |  email_transmissions |     4 |         4h46m |             99 GB |       3 |                 2h04m
-    #     24665 | public |              lessons |     4 |         4h42m |            161 GB |       4 |                 1m11s
-    #     24661 | public |         lesson_users |     3 |         2h46m |             49 GB |       3 |                39m35s
-    #     24631 | public |               emails |     1 |        34m07s |             10 GB |       2 |                   17s
-    #     24739 | public |             payments |     1 |         5m47s |           1848 MB |       4 |                 4m40s
-    #     24681 | public |         module_users |     1 |         4m57s |           1610 MB |       3 |                 1m50s
-    #     24694 | public |               orders |     1 |         2m50s |            835 MB |       3 |                 1m05s
-    #     24597 | public |              devices |     1 |         1m45s |            498 MB |       2 |                   40s
-    #     24723 | public |      payment_methods |     1 |         1m24s |            548 MB |       2 |                   31s
-    #     24765 | public |     user_collections |     1 |         2m17s |           1005 MB |       2 |                 968ms
-    #     24774 | public |                users |     1 |           52s |            291 MB |       4 |                   27s
-    #     24760 | public |        user_accounts |     1 |           16s |            172 MB |       3 |                   16s
-    #     24606 | public |      edition_modules |     1 |         8s983 |             46 MB |       3 |                 4s749
-    #     24583 | public |        course_emails |     1 |         8s526 |             26 MB |       2 |                 996ms
-    #     24685 | public |              modules |     1 |         1s592 |             21 MB |       3 |                 1s696
-    #     24610 | public |             editions |     1 |         2s199 |           7483 kB |       2 |                 1s032
-    #     24755 | public |           sp_content |     1 |         1s555 |           4177 kB |       0 |                   0ms
-    #     24619 | public |     email_broadcasts |     1 |         744ms |           2645 kB |       2 |                 677ms
-    #     24590 | public |              courses |     1 |         387ms |           1540 kB |       2 |                 367ms
-    #     24704 | public | payment_gateway_keys |     1 |         1s972 |            164 kB |       2 |                  27ms
-    #     24576 | public |             accounts |     1 |          58ms |             24 kB |       1 |                  14ms
-    #     24647 | public |          event_names |     1 |          32ms |             397 B |       1 |                   8ms
-    #     24716 | public |     payment_gateways |     1 |         1s675 |             117 B |       1 |                  11ms
-    #     24748 | public |                roles |     1 |          71ms |             173 B |       1 |                   8ms
-    #     24676 | public |           management |     1 |          33ms |              40 B |       1 |                  19ms
-
-
-    #                                                   Step   Connection    Duration    Transfer   Concurrency
-    #     --------------------------------------------------   ----------  ----------  ----------  ------------
-    #       Catalog Queries (table ordering, filtering, etc)       source         12s                         1
-    #                                             Dump Schema       source       765ms                         1
-    #                                         Prepare Schema       target       466ms                         1
-    #           COPY, INDEX, CONSTRAINTS, VACUUM (wall clock)         both       2h47m                        12
-    #                                       COPY (cumulative)         both       7h46m     1225 GB             4
-    #                               CREATE INDEX (cumulative)       target       4h36m                         4
-    #                               CONSTRAINTS (cumulative)       target       8s493                         4
-    #                                     VACUUM (cumulative)       target         0ms                         4
-    #                                         Reset Sequences         both        60ms                         1
-    #                             Large Objects (cumulative)       (null)         0ms                         0
-    #                                         Finalize Schema         both      14h01m                         4
-    #     --------------------------------------------------   ----------  ----------  ----------  ------------
-    #                               Total Wall Clock Duration         both      16h49m                        20
-
-
-    #     EOF
-
-
-    - name: show tables sizes and retrieve current backpressure seconds
-      run: |
-        export LD_LIBRARY_PATH=${PG_16_LIB_PATH}
-        ${PSQL} "${NEW_PROJECT_CONNSTR}" -c "\dt+"
-        BACKPRESSURE_TIME_AFTER_INGEST=$(${PSQL} "${NEW_PROJECT_CONNSTR}" -t -c "select backpressure_throttling_time()/1000000;")
-        echo "BACKPRESSURE_TIME_AFTER_INGEST=${BACKPRESSURE_TIME_AFTER_INGEST}" >> $GITHUB_ENV
-
-    - name: Parse pgcopydb log and report performance metrics
-      env:
-        PERF_TEST_RESULT_CONNSTR: ${{ secrets.PERF_TEST_RESULT_CONNSTR }}
-      run: |
-        export LD_LIBRARY_PATH=${PG_16_LIB_PATH}
-
-        # Define the log file path
-        LOG_FILE="/tmp/pgcopydb_${{ matrix.target_project }}.log"
-        
-        # Get the current git commit hash
-        git config --global --add safe.directory /__w/neon/neon
-        COMMIT_HASH=$(git rev-parse --short HEAD)
-        
-        # Define the platform and test suite
-        PLATFORM="pg16-${{ matrix.target_project }}-us-east-2-staging"
-        SUIT="pgcopydb_ingest_bench"
-        
-        # Function to convert time (e.g., "2h47m", "4h36m", "118ms", "8s493") to seconds
-        convert_to_seconds() {
-          local duration=$1
-          local total_seconds=0
-    
-          # Check for hours (h)
-          if [[ "$duration" =~ ([0-9]+)h ]]; then
-            total_seconds=$((total_seconds + ${BASH_REMATCH[1]#0} * 3600))
-          fi
-    
-          # Check for seconds (s)
-          if [[ "$duration" =~ ([0-9]+)s ]]; then
-            total_seconds=$((total_seconds + ${BASH_REMATCH[1]#0}))
-          fi
-    
-          # Check for milliseconds (ms) (if applicable)
-          if [[ "$duration" =~ ([0-9]+)ms ]]; then
-            total_seconds=$((total_seconds + ${BASH_REMATCH[1]#0} / 1000))
-            duration=${duration/${BASH_REMATCH[0]}/} # need to remove it to avoid double counting with m 
-          fi
-
-          # Check for minutes (m) - must be checked after ms because m is contained in ms
-          if [[ "$duration" =~ ([0-9]+)m ]]; then
-            total_seconds=$((total_seconds + ${BASH_REMATCH[1]#0} * 60))
-          fi
-    
-          echo $total_seconds
-        }
-
-        # Calculate the backpressure difference in seconds
-        BACKPRESSURE_TIME_DIFF=$(awk "BEGIN {print $BACKPRESSURE_TIME_AFTER_INGEST - $BACKPRESSURE_TIME_BEFORE_INGEST}")
-
-        # Insert the backpressure time difference into the performance database
-        if [ -n "$BACKPRESSURE_TIME_DIFF" ]; then
-          PSQL_CMD="${PSQL} \"${PERF_TEST_RESULT_CONNSTR}\" -c \"
-          INSERT INTO public.perf_test_results (suit, revision, platform, metric_name, metric_value, metric_unit, metric_report_type, recorded_at_timestamp)
-          VALUES ('${SUIT}', '${COMMIT_HASH}', '${PLATFORM}', 'backpressure_time', ${BACKPRESSURE_TIME_DIFF}, 'seconds', 'lower_is_better', now());
-          \""
-          echo "Inserting backpressure time difference: ${BACKPRESSURE_TIME_DIFF} seconds"
-          eval $PSQL_CMD
-        fi
-
-        # Extract and process log lines
-        while IFS= read -r line; do
-          METRIC_NAME=""
-          # Match each desired line and extract the relevant information
-          if [[ "$line" =~ COPY,\ INDEX,\ CONSTRAINTS,\ VACUUM.* ]]; then
-            METRIC_NAME="COPY, INDEX, CONSTRAINTS, VACUUM (wall clock)"
-          elif [[ "$line" =~ COPY\ \(cumulative\).* ]]; then
-            METRIC_NAME="COPY (cumulative)"
-          elif [[ "$line" =~ CREATE\ INDEX\ \(cumulative\).* ]]; then
-            METRIC_NAME="CREATE INDEX (cumulative)"
-          elif [[ "$line" =~ CONSTRAINTS\ \(cumulative\).* ]]; then
-            METRIC_NAME="CONSTRAINTS (cumulative)"
-          elif [[ "$line" =~ Finalize\ Schema.* ]]; then
-            METRIC_NAME="Finalize Schema"
-          elif [[ "$line" =~ Total\ Wall\ Clock\ Duration.* ]]; then
-            METRIC_NAME="Total Wall Clock Duration"
-          fi
-          
-          # If a metric was matched, insert it into the performance database
-          if [ -n "$METRIC_NAME" ]; then
-            DURATION=$(echo "$line" | grep -oP '\d+h\d+m|\d+s|\d+ms|\d{1,2}h\d{1,2}m|\d+\.\d+s' | head -n 1)
-            METRIC_VALUE=$(convert_to_seconds "$DURATION")
-            PSQL_CMD="${PSQL} \"${PERF_TEST_RESULT_CONNSTR}\" -c \"
-            INSERT INTO public.perf_test_results (suit, revision, platform, metric_name, metric_value, metric_unit, metric_report_type, recorded_at_timestamp)
-            VALUES ('${SUIT}', '${COMMIT_HASH}', '${PLATFORM}', '${METRIC_NAME}', ${METRIC_VALUE}, 'seconds', 'lower_is_better', now());
-            \""
-            echo "Inserting ${METRIC_NAME} with value ${METRIC_VALUE} seconds"
-            eval $PSQL_CMD
-          fi
-        done < "$LOG_FILE"
-      
-    - name: Delete Neon Project
-      if: ${{ always() && matrix.target_project == 'new_empty_project' }}
-      uses: ./.github/actions/neon-project-delete
-      with:
-        project_id: ${{ steps.create-neon-project-ingest-target.outputs.project_id }}
-        api_key: ${{ secrets.NEON_STAGING_API_KEY }}
-
-    - name: Delete Neon Branch for large tenant
-      if: ${{ always() && matrix.target_project == 'large_existing_project' }}
-      uses: ./.github/actions/neon-branch-delete
-      with:
-        project_id: ${{ vars.BENCHMARK_INGEST_TARGET_PROJECTID }}
-        branch_id: ${{ steps.create-neon-branch-ingest-target.outputs.branch_id }}
-        api_key: ${{ secrets.NEON_STAGING_API_KEY }}
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -201,8 +201,6 @@ jobs:
          REPORT_URL: ${{ steps.upload-stats.outputs.report-url }}
          SHA: ${{ github.event.pull_request.head.sha || github.sha }}
        with:
-          # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
-          retries: 5
          script: |
            const { REPORT_URL, SHA } = process.env

--- a/.github/workflows/pre-merge-checks.yml
+++ b/.github/workflows/pre-merge-checks.yml
@@ -1,94 +0,0 @@
-name: Pre-merge checks
-
-on:
-  merge_group:
-    branches:
-      - main
-
-defaults:
-  run:
-    shell: bash -euxo pipefail {0}
-
-# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
-permissions: {}
-
-jobs:
-  get-changed-files:
-    runs-on: ubuntu-22.04
-    outputs:
-      python-changed: ${{ steps.python-src.outputs.any_changed }}
-    steps:
-      - uses: actions/checkout@v4
-      - uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf # v45.0.4
-        id: python-src
-        with:
-          files: |
-            .github/workflows/pre-merge-checks.yml
-            **/**.py
-            poetry.lock
-            pyproject.toml
-
-      - name: PRINT ALL CHANGED FILES FOR DEBUG PURPOSES
-        env:
-          PYTHON_CHANGED_FILES: ${{ steps.python-src.outputs.all_changed_files }}
-        run: |
-          echo "${PYTHON_CHANGED_FILES}"
-
-  check-build-tools-image:
-    if: needs.get-changed-files.outputs.python-changed == 'true'
-    needs: [ get-changed-files ]
-    uses: ./.github/workflows/check-build-tools-image.yml
-
-  build-build-tools-image:
-    needs: [ check-build-tools-image ]
-    uses: ./.github/workflows/build-build-tools-image.yml
-    with:
-      image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }}
-    secrets: inherit
-
-  check-codestyle-python:
-    if: needs.get-changed-files.outputs.python-changed == 'true'
-    needs: [ get-changed-files, build-build-tools-image ]
-    uses: ./.github/workflows/_check-codestyle-python.yml
-    with:
-      build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
-    secrets: inherit
-
-  # To get items from the merge queue merged into main we need to satisfy "Status checks that are required".
-  # Currently we require 2 jobs (checks with exact name):
-  # - conclusion
-  # - neon-cloud-e2e
-  conclusion:
-    if: always()
-    permissions:
-      statuses: write # for `github.repos.createCommitStatus(...)`
-    needs:
-      - get-changed-files
-      - check-codestyle-python
-    runs-on: ubuntu-22.04
-    steps:
-      - name: Create fake `neon-cloud-e2e` check
-        uses: actions/github-script@v7
-        with:
-          # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
-          retries: 5
-          script: |
-            const { repo, owner } = context.repo;
-            const targetUrl = `${context.serverUrl}/${owner}/${repo}/actions/runs/${context.runId}`;
-
-            await github.rest.repos.createCommitStatus({
-              owner: owner,
-              repo: repo,
-              sha: context.sha,
-              context: `neon-cloud-e2e`,
-              state: `success`,
-              target_url: targetUrl,
-              description: `fake check for merge queue`,
-            });
-
-      - name: Fail the job if any of the dependencies do not succeed or skipped
-        run: exit 1
-        if: |
-          (contains(needs.check-codestyle-python.result, 'skipped') && needs.get-changed-files.outputs.python-changed == 'true')
-          || contains(needs.*.result, 'failure')
-          || contains(needs.*.result, 'cancelled')
--- a/.github/workflows/report-workflow-stats-batch.yml
+++ b/.github/workflows/report-workflow-stats-batch.yml
@@ -1,29 +0,0 @@
-name: Report Workflow Stats Batch
-
-on:
-  schedule:
-    - cron: '*/15 * * * *'
-    - cron: '25 0 * * *'
-
-jobs:
-  gh-workflow-stats-batch:
-    name: GitHub Workflow Stats Batch
-    runs-on: ubuntu-22.04
-    permissions:
-      actions: read
-    steps:
-    - name: Export Workflow Run for the past 2 hours
-      uses: neondatabase/gh-workflow-stats-action@v0.2.1
-      with:
-        db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }}
-        db_table: "gh_workflow_stats_batch_neon"
-        gh_token: ${{ secrets.GITHUB_TOKEN }}
-        duration: '2h'
-    - name: Export Workflow Run for the past 24 hours
-      if: github.event.schedule == '25 0 * * *'
-      uses: neondatabase/gh-workflow-stats-action@v0.2.1
-      with:
-        db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }}
-        db_table: "gh_workflow_stats_batch_neon"
-        gh_token: ${{ secrets.GITHUB_TOKEN }}
-        duration: '24h'
--- a/.github/workflows/report-workflow-stats.yml
+++ b/.github/workflows/report-workflow-stats.yml
@@ -23,7 +23,6 @@ on:
    - Test Postgres client libraries
    - Trigger E2E Tests
    - cleanup caches by a branch
-    - Pre-merge checks
    types: [completed]

 jobs:
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1229,15 +1229,12 @@ dependencies = [
 "flate2",
 "futures",
 "hyper 0.14.30",
- "metrics",
 "nix 0.27.1",
 "notify",
 "num_cpus",
- "once_cell",
 "opentelemetry",
 "opentelemetry_sdk",
 "postgres",
- "prometheus",
 "regex",
 "remote_storage",
 "reqwest 0.12.4",
@@ -1248,7 +1245,7 @@ dependencies = [
 "tar",
 "thiserror",
 "tokio",
- "tokio-postgres",
+ "tokio-postgres 0.7.7",
 "tokio-stream",
 "tokio-util",
 "tracing",
@@ -1354,7 +1351,7 @@ dependencies = [
 "storage_broker",
 "thiserror",
 "tokio",
- "tokio-postgres",
+ "tokio-postgres 0.7.7",
 "tokio-util",
 "toml",
 "toml_edit",
@@ -3623,8 +3620,8 @@ dependencies = [
 "pageserver_compaction",
 "pin-project-lite",
 "postgres",
- "postgres-protocol",
- "postgres-types",
+ "postgres-protocol 0.6.4",
+ "postgres-types 0.2.4",
 "postgres_backend",
 "postgres_connection",
 "postgres_ffi",
@@ -3652,7 +3649,7 @@ dependencies = [
 "tokio",
 "tokio-epoll-uring",
 "tokio-io-timeout",
- "tokio-postgres",
+ "tokio-postgres 0.7.7",
 "tokio-stream",
 "tokio-tar",
 "tokio-util",
@@ -3710,7 +3707,7 @@ dependencies = [
 "serde",
 "thiserror",
 "tokio",
- "tokio-postgres",
+ "tokio-postgres 0.7.7",
 "tokio-stream",
 "tokio-util",
 "utils",
@@ -4009,14 +4006,31 @@ dependencies = [
 [[package]]
 name = "postgres"
 version = "0.19.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
 dependencies = [
 "bytes",
 "fallible-iterator",
 "futures-util",
 "log",
 "tokio",
- "tokio-postgres",
+ "tokio-postgres 0.7.7",
+]
+
+[[package]]
+name = "postgres-protocol"
+version = "0.6.4"
+dependencies = [
+ "base64 0.20.0",
+ "byteorder",
+ "bytes",
+ "fallible-iterator",
+ "hmac",
+ "lazy_static",
+ "md-5",
+ "memchr",
+ "rand 0.8.5",
+ "sha2",
+ "stringprep",
+ "tokio",
 ]

 [[package]]
@@ -4038,6 +4052,17 @@ dependencies = [
 "tokio",
 ]

+[[package]]
+name = "postgres-types"
+version = "0.2.4"
+dependencies = [
+ "bytes",
+ "fallible-iterator",
+ "postgres-protocol 0.6.4",
+ "serde",
+ "serde_json",
+]
+
 [[package]]
 name = "postgres-types"
 version = "0.2.4"
@@ -4045,7 +4070,7 @@ source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1
 dependencies = [
 "bytes",
 "fallible-iterator",
- "postgres-protocol",
+ "postgres-protocol 0.6.4 (git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2)",
 "serde",
 "serde_json",
 ]
@@ -4063,7 +4088,7 @@ dependencies = [
 "serde",
 "thiserror",
 "tokio",
- "tokio-postgres",
+ "tokio-postgres 0.7.7",
 "tokio-postgres-rustls",
 "tokio-rustls 0.26.0",
 "tokio-util",
@@ -4078,7 +4103,7 @@ dependencies = [
 "itertools 0.10.5",
 "once_cell",
 "postgres",
- "tokio-postgres",
+ "tokio-postgres 0.7.7",
 "url",
 ]

@@ -4130,7 +4155,7 @@ dependencies = [
 "byteorder",
 "bytes",
 "itertools 0.10.5",
- "postgres-protocol",
+ "postgres-protocol 0.6.4",
 "rand 0.8.5",
 "serde",
 "thiserror",
@@ -4316,7 +4341,7 @@ dependencies = [
 "parquet_derive",
 "pbkdf2",
 "pin-project-lite",
- "postgres-protocol",
+ "postgres-protocol 0.6.4",
 "postgres_backend",
 "pq_proto",
 "prometheus",
@@ -4351,7 +4376,7 @@ dependencies = [
 "tikv-jemalloc-ctl",
 "tikv-jemallocator",
 "tokio",
- "tokio-postgres",
+ "tokio-postgres 0.7.7",
 "tokio-postgres-rustls",
 "tokio-rustls 0.26.0",
 "tokio-tungstenite",
@@ -4746,7 +4771,6 @@ dependencies = [
 "percent-encoding",
 "pin-project-lite",
 "rustls 0.22.4",
- "rustls-native-certs 0.7.0",
 "rustls-pemfile 2.1.1",
 "rustls-pki-types",
 "serde",
@@ -5150,7 +5174,6 @@ dependencies = [
 "chrono",
 "clap",
 "crc32c",
- "criterion",
 "desim",
 "fail",
 "futures",
@@ -5158,12 +5181,12 @@ dependencies = [
 "http 1.1.0",
 "humantime",
 "hyper 0.14.30",
- "itertools 0.10.5",
 "metrics",
 "once_cell",
+ "pageserver_api",
 "parking_lot 0.12.1",
 "postgres",
- "postgres-protocol",
+ "postgres-protocol 0.6.4",
 "postgres_backend",
 "postgres_ffi",
 "pq_proto",
@@ -5183,7 +5206,7 @@ dependencies = [
 "thiserror",
 "tokio",
 "tokio-io-timeout",
- "tokio-postgres",
+ "tokio-postgres 0.7.7",
 "tokio-stream",
 "tokio-tar",
 "tokio-util",
@@ -5191,6 +5214,7 @@ dependencies = [
 "tracing-subscriber",
 "url",
 "utils",
+ "wal_decoder",
 "walproposer",
 "workspace_hack",
 ]
@@ -5827,7 +5851,7 @@ dependencies = [
 "serde_json",
 "storage_controller_client",
 "tokio",
- "tokio-postgres",
+ "tokio-postgres 0.7.7",
 "tokio-postgres-rustls",
 "tokio-stream",
 "tokio-util",
@@ -6224,6 +6248,28 @@ dependencies = [
 "syn 2.0.52",
 ]

+[[package]]
+name = "tokio-postgres"
+version = "0.7.7"
+dependencies = [
+ "async-trait",
+ "byteorder",
+ "bytes",
+ "fallible-iterator",
+ "futures-channel",
+ "futures-util",
+ "log",
+ "parking_lot 0.12.1",
+ "percent-encoding",
+ "phf",
+ "pin-project-lite",
+ "postgres-protocol 0.6.4",
+ "postgres-types 0.2.4",
+ "socket2",
+ "tokio",
+ "tokio-util",
+]
+
 [[package]]
 name = "tokio-postgres"
 version = "0.7.7"
@@ -6240,8 +6286,8 @@ dependencies = [
 "percent-encoding",
 "phf",
 "pin-project-lite",
- "postgres-protocol",
- "postgres-types",
+ "postgres-protocol 0.6.4 (git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2)",
+ "postgres-types 0.2.4 (git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2)",
 "socket2",
 "tokio",
 "tokio-util",
@@ -6256,7 +6302,7 @@ dependencies = [
 "ring",
 "rustls 0.23.16",
 "tokio",
- "tokio-postgres",
+ "tokio-postgres 0.7.7",
 "tokio-rustls 0.26.0",
 "x509-certificate",
 ]
@@ -6839,7 +6885,7 @@ dependencies = [
 "serde_json",
 "sysinfo",
 "tokio",
- "tokio-postgres",
+ "tokio-postgres 0.7.7",
 "tokio-util",
 "tracing",
 "tracing-subscriber",
@@ -7346,7 +7392,7 @@ dependencies = [
 "num-traits",
 "once_cell",
 "parquet",
- "postgres-types",
+ "postgres-types 0.2.4 (git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2)",
 "prettyplease",
 "proc-macro2",
 "prost",
@@ -7371,7 +7417,7 @@ dependencies = [
 "time",
 "time-macros",
 "tokio",
- "tokio-postgres",
+ "tokio-postgres 0.7.7 (git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2)",
 "tokio-rustls 0.26.0",
 "tokio-stream",
 "tokio-util",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -214,10 +214,14 @@ log = "0.4"
 #
 # When those proxy changes are re-applied (see PR #8747), we can switch using
 # the tip of the 'neon' branch again.
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
+# postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
+# postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
+# postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
+# tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
+postgres = { path = "../../.cargo/git/checkouts/rust-postgres-e2c00088c8e2b112/20031d7/postgres" }
+postgres-protocol = { path = "../../.cargo/git/checkouts/rust-postgres-e2c00088c8e2b112/20031d7/postgres-protocol" }
+postgres-types = { path = "../../.cargo/git/checkouts/rust-postgres-e2c00088c8e2b112/20031d7/postgres-types" }
+tokio-postgres = { path = "../../.cargo/git/checkouts/rust-postgres-e2c00088c8e2b112/20031d7/tokio-postgres" }

 ## Local libraries
 compute_api = { version = "0.1", path = "./libs/compute_api/" }
@@ -255,7 +259,8 @@ tonic-build = "0.12"
 [patch.crates-io]

 # Needed to get `tokio-postgres-rustls` to depend on our fork.
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
+tokio-postgres = { path = "../../.cargo/git/checkouts/rust-postgres-e2c00088c8e2b112/20031d7/tokio-postgres" }
+# tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }

 ################# Binary contents sections

--- a/build-tools.Dockerfile
+++ b/build-tools.Dockerfile
@@ -1,66 +1,12 @@
 ARG DEBIAN_VERSION=bullseye

-FROM debian:bookworm-slim AS pgcopydb_builder
-ARG DEBIAN_VERSION
-
-RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \
-        set -e && \
-        apt update && \
-        apt install -y --no-install-recommends \
-        ca-certificates wget gpg && \
-        wget -qO - https://www.postgresql.org/media/keys/ACCC4CF8.asc | gpg --dearmor -o /usr/share/keyrings/postgresql-keyring.gpg && \
-        echo "deb [signed-by=/usr/share/keyrings/postgresql-keyring.gpg] http://apt.postgresql.org/pub/repos/apt bookworm-pgdg main" > /etc/apt/sources.list.d/pgdg.list && \
-        apt-get update && \
-        apt install -y --no-install-recommends \
-        build-essential \
-        autotools-dev \
-        libedit-dev \
-        libgc-dev \
-        libpam0g-dev \
-        libreadline-dev \
-        libselinux1-dev \
-        libxslt1-dev \
-        libssl-dev \
-        libkrb5-dev \
-        zlib1g-dev \
-        liblz4-dev \
-        libpq5 \
-        libpq-dev \
-        libzstd-dev \
-        postgresql-16 \
-        postgresql-server-dev-16 \
-        postgresql-common  \
-        python3-sphinx && \
-        wget -O /tmp/pgcopydb.tar.gz https://github.com/dimitri/pgcopydb/archive/refs/tags/v0.17.tar.gz && \
-        mkdir /tmp/pgcopydb && \
-        tar -xzf /tmp/pgcopydb.tar.gz -C /tmp/pgcopydb --strip-components=1 && \
-        cd /tmp/pgcopydb && \
-        make -s clean && \
-        make -s -j12 install && \
-        libpq_path=$(find /lib /usr/lib -name "libpq.so.5" | head -n 1) && \
-        mkdir -p /pgcopydb/lib && \
-        cp "$libpq_path" /pgcopydb/lib/; \
-    else \
-        # copy command below will fail if we don't have dummy files, so we create them for other debian versions
-        mkdir -p /usr/lib/postgresql/16/bin && touch /usr/lib/postgresql/16/bin/pgcopydb && \
-        mkdir -p mkdir -p /pgcopydb/lib && touch /pgcopydb/lib/libpq.so.5; \
-    fi
-
-FROM debian:${DEBIAN_VERSION}-slim AS build_tools
+FROM debian:${DEBIAN_VERSION}-slim
 ARG DEBIAN_VERSION

 # Add nonroot user
 RUN useradd -ms /bin/bash nonroot -b /home
 SHELL ["/bin/bash", "-c"]

-RUN mkdir -p /pgcopydb/bin && \
-    mkdir -p /pgcopydb/lib && \
-    chmod -R 755 /pgcopydb && \
-    chown -R nonroot:nonroot /pgcopydb
-        
-COPY --from=pgcopydb_builder /usr/lib/postgresql/16/bin/pgcopydb /pgcopydb/bin/pgcopydb 
-COPY --from=pgcopydb_builder /pgcopydb/lib/libpq.so.5 /pgcopydb/lib/libpq.so.5 
-
 # System deps
 #
 # 'gdb' is included so that we get backtraces of core dumps produced in
@@ -92,7 +38,7 @@ RUN set -e \
        libseccomp-dev \
        libsqlite3-dev \
        libssl-dev \
-        $([[ "${DEBIAN_VERSION}" = "bullseye" ]] && echo libstdc++-10-dev || echo libstdc++-11-dev) \
+        $([[ "${DEBIAN_VERSION}" = "bullseye" ]] && echo libstdc++-10-dev || echo libstdc++-11-dev) \
        libtool \
        libxml2-dev \
        libxmlsec1-dev \
@@ -289,13 +235,7 @@ RUN whoami \
    && cargo --version --verbose \
    && rustup --version --verbose \
    && rustc --version --verbose \
-    && clang --version 
-
-RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \
-    LD_LIBRARY_PATH=/pgcopydb/lib /pgcopydb/bin/pgcopydb --version; \
-else \
-    echo "pgcopydb is not available for ${DEBIAN_VERSION}"; \
-fi
+    && clang --version

 # Set following flag to check in Makefile if its running in Docker
 RUN touch /home/nonroot/.docker_build
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -559,8 +559,8 @@ RUN case "${PG_VERSION}" in \
        export TIMESCALEDB_CHECKSUM=584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d \
        ;; \
      "v17") \
-        export TIMESCALEDB_VERSION=2.17.1 \
-        export TIMESCALEDB_CHECKSUM=6277cf43f5695e23dae1c5cfeba00474d730b66ed53665a84b787a6bb1a57e28 \
+        export TIMESCALEDB_VERSION=2.17.0 \
+        export TIMESCALEDB_CHECKSUM=155bf64391d3558c42f31ca0e523cfc6252921974f75298c9039ccad1c89811a \
        ;; \
    esac && \
    wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \
@@ -624,12 +624,16 @@ FROM build-deps AS pg-cron-pg-build
 ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

+# 1.6.4 available, supports v17
 # This is an experimental extension that we do not support on prod yet.
 # !Do not remove!
 # We set it in shared_preload_libraries and computes will fail to start if library is not found.
 ENV PATH="/usr/local/pgsql/bin/:$PATH"
-RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.4.tar.gz -O pg_cron.tar.gz && \
-    echo "52d1850ee7beb85a4cb7185731ef4e5a90d1de216709d8988324b0d02e76af61 pg_cron.tar.gz" | sha256sum --check && \
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \
+    echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \
    mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -1147,8 +1151,8 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 # The topmost commit in the `neon` branch at the time of writing this
 # https://github.com/Mooncake-Labs/pg_mooncake/commits/neon/
-# https://github.com/Mooncake-Labs/pg_mooncake/commit/077c92c452bb6896a7b7776ee95f039984f076af
-ENV PG_MOONCAKE_VERSION=077c92c452bb6896a7b7776ee95f039984f076af
+# https://github.com/Mooncake-Labs/pg_mooncake/commit/568b5a82b5fc16136bdf4ca5aac3e0cc261ab48d
+ENV PG_MOONCAKE_VERSION=568b5a82b5fc16136bdf4ca5aac3e0cc261ab48d
 ENV PATH="/usr/local/pgsql/bin/:$PATH"

 RUN case "${PG_VERSION}" in \
@@ -1471,8 +1475,6 @@ RUN mkdir -p /etc/local_proxy && chown postgres:postgres /etc/local_proxy
 COPY --from=postgres-exporter /bin/postgres_exporter /bin/postgres_exporter
 COPY --from=sql-exporter      /bin/sql_exporter      /bin/sql_exporter

-COPY --chown=postgres compute/etc/postgres_exporter.yml /etc/postgres_exporter.yml
-
 COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter.yml               /etc/sql_exporter.yml
 COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector.yml             /etc/neon_collector.yml
 COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter_autoscaling.yml   /etc/sql_exporter_autoscaling.yml
--- a/compute/etc/neon_collector.jsonnet
+++ b/compute/etc/neon_collector.jsonnet
@@ -3,7 +3,7 @@
  metrics: [
    import 'sql_exporter/checkpoints_req.libsonnet',
    import 'sql_exporter/checkpoints_timed.libsonnet',
-    import 'sql_exporter/compute_backpressure_throttling_seconds.libsonnet',
+    import 'sql_exporter/compute_backpressure_throttling_ms.libsonnet',
    import 'sql_exporter/compute_current_lsn.libsonnet',
    import 'sql_exporter/compute_logical_snapshot_files.libsonnet',
    import 'sql_exporter/compute_receive_lsn.libsonnet',
--- a/compute/etc/postgres_exporter.yml
+++ b/compute/etc/postgres_exporter.yml
--- a/compute/etc/sql_exporter/compute_backpressure_throttling_seconds.libsonnet
+++ b/compute/etc/sql_exporter/compute_backpressure_throttling_seconds.libsonnet
@@ -1,10 +1,10 @@
 {
-  metric_name: 'compute_backpressure_throttling_seconds',
+  metric_name: 'compute_backpressure_throttling_ms',
  type: 'gauge',
  help: 'Time compute has spent throttled',
  key_labels: null,
  values: [
    'throttled',
  ],
-  query: importstr 'sql_exporter/compute_backpressure_throttling_seconds.sql',
+  query: importstr 'sql_exporter/compute_backpressure_throttling_ms.sql',
 }
--- a/compute/etc/sql_exporter/compute_backpressure_throttling_ms.sql
+++ b/compute/etc/sql_exporter/compute_backpressure_throttling_ms.sql
@@ -0,0 +1 @@
+SELECT neon.backpressure_throttling_time() AS throttled;
--- a/compute/etc/sql_exporter/compute_backpressure_throttling_seconds.sql
+++ b/compute/etc/sql_exporter/compute_backpressure_throttling_seconds.sql
@@ -1 +0,0 @@
-SELECT (neon.backpressure_throttling_time()::float8 / 1000000) AS throttled;
--- a/compute/patches/pg_anon.patch
+++ b/compute/patches/pg_anon.patch
@@ -1,45 +1,3 @@
-commit 00aa659afc9c7336ab81036edec3017168aabf40
-Author: Heikki Linnakangas <heikki@neon.tech>
-Date:   Tue Nov 12 16:59:19 2024 +0200
-
-    Temporarily disable test that depends on timezone
-
-diff --git a/tests/expected/generalization.out b/tests/expected/generalization.out
-index 23ef5fa..9e60deb 100644
--- a/ext-src/pg_anon-src/tests/expected/generalization.out
-+++ b/ext-src/pg_anon-src/tests/expected/generalization.out
-@@ -284,12 +284,9 @@ SELECT anon.generalize_tstzrange('19041107','century');
-  ["Tue Jan 01 00:00:00 1901 PST","Mon Jan 01 00:00:00 2001 PST")
- (1 row)
- 
-SELECT anon.generalize_tstzrange('19041107','millennium');
-                      generalize_tstzrange                       
------------------------------------------------------------------
- ["Thu Jan 01 00:00:00 1001 PST","Mon Jan 01 00:00:00 2001 PST")
-(1 row)
-
-+-- temporarily disabled, see:
-+-- https://gitlab.com/dalibo/postgresql_anonymizer/-/commit/199f0a392b37c59d92ae441fb8f037e094a11a52#note_2148017485
-+--SELECT anon.generalize_tstzrange('19041107','millennium');
- -- generalize_daterange
- SELECT anon.generalize_daterange('19041107');
-   generalize_daterange   
-diff --git a/tests/sql/generalization.sql b/tests/sql/generalization.sql
-index b868344..b4fc977 100644
--- a/ext-src/pg_anon-src/tests/sql/generalization.sql
-+++ b/ext-src/pg_anon-src/tests/sql/generalization.sql
-@@ -61,7 +61,9 @@ SELECT anon.generalize_tstzrange('19041107','month');
- SELECT anon.generalize_tstzrange('19041107','year');
- SELECT anon.generalize_tstzrange('19041107','decade');
- SELECT anon.generalize_tstzrange('19041107','century');
-SELECT anon.generalize_tstzrange('19041107','millennium');
-+-- temporarily disabled, see:
-+-- https://gitlab.com/dalibo/postgresql_anonymizer/-/commit/199f0a392b37c59d92ae441fb8f037e094a11a52#note_2148017485
-+--SELECT anon.generalize_tstzrange('19041107','millennium');
- 
- -- generalize_daterange
- SELECT anon.generalize_daterange('19041107');
-
 commit 7dd414ee75f2875cffb1d6ba474df1f135a6fc6f
 Author: Alexey Masterov <alexeymasterov@neon.tech>
 Date:   Fri May 31 06:34:26 2024 +0000
--- a/compute/vm-image-spec-bookworm.yaml
+++ b/compute/vm-image-spec-bookworm.yaml
@@ -26,7 +26,7 @@ commands:
  - name: postgres-exporter
    user: nobody
    sysvInitAction: respawn
-    shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter --config.file=/etc/postgres_exporter.yml'
+    shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter'
  - name: sql-exporter
    user: nobody
    sysvInitAction: respawn
--- a/compute/vm-image-spec-bullseye.yaml
+++ b/compute/vm-image-spec-bullseye.yaml
@@ -26,7 +26,7 @@ commands:
  - name: postgres-exporter
    user: nobody
    sysvInitAction: respawn
-    shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter --config.file=/etc/postgres_exporter.yml'
+    shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter'
  - name: sql-exporter
    user: nobody
    sysvInitAction: respawn
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -18,11 +18,9 @@ clap.workspace = true
 flate2.workspace = true
 futures.workspace = true
 hyper0 = { workspace = true, features = ["full"] }
-metrics.workspace = true
 nix.workspace = true
 notify.workspace = true
 num_cpus.workspace = true
-once_cell.workspace = true
 opentelemetry.workspace = true
 opentelemetry_sdk.workspace = true
 postgres.workspace = true
@@ -41,7 +39,6 @@ tracing-subscriber.workspace = true
 tracing-utils.workspace = true
 thiserror.workspace = true
 url.workspace = true
-prometheus.workspace = true

 compute_api.workspace = true
 utils.workspace = true
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -364,29 +364,11 @@ impl ComputeNode {
        let pageserver_connect_micros = start_time.elapsed().as_micros() as u64;

        let basebackup_cmd = match lsn {
-            Lsn(0) => {
-                if spec.spec.mode != ComputeMode::Primary {
-                    format!(
-                        "basebackup {} {} --gzip --replica",
-                        spec.tenant_id, spec.timeline_id
-                    )
-                } else {
-                    format!("basebackup {} {} --gzip", spec.tenant_id, spec.timeline_id)
-                }
-            }
-            _ => {
-                if spec.spec.mode != ComputeMode::Primary {
-                    format!(
-                        "basebackup {} {} {} --gzip --replica",
-                        spec.tenant_id, spec.timeline_id, lsn
-                    )
-                } else {
-                    format!(
-                        "basebackup {} {} {} --gzip",
-                        spec.tenant_id, spec.timeline_id, lsn
-                    )
-                }
-            }
+            Lsn(0) => format!("basebackup {} {} --gzip", spec.tenant_id, spec.timeline_id),
+            _ => format!(
+                "basebackup {} {} {} --gzip",
+                spec.tenant_id, spec.timeline_id, lsn
+            ),
        };

        let copyreader = client.copy_out(basebackup_cmd.as_str())?;
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -73,19 +73,6 @@ pub fn write_postgres_conf(
        )?;
    }

-    // Locales
-    if cfg!(target_os = "macos") {
-        writeln!(file, "lc_messages='C'")?;
-        writeln!(file, "lc_monetary='C'")?;
-        writeln!(file, "lc_time='C'")?;
-        writeln!(file, "lc_numeric='C'")?;
-    } else {
-        writeln!(file, "lc_messages='C.UTF-8'")?;
-        writeln!(file, "lc_monetary='C.UTF-8'")?;
-        writeln!(file, "lc_time='C.UTF-8'")?;
-        writeln!(file, "lc_numeric='C.UTF-8'")?;
-    }
-
    match spec.mode {
        ComputeMode::Primary => {}
        ComputeMode::Static(lsn) => {
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -9,7 +9,6 @@ use crate::catalog::SchemaDumpError;
 use crate::catalog::{get_database_schema, get_dbs_and_roles};
 use crate::compute::forward_termination_signal;
 use crate::compute::{ComputeNode, ComputeState, ParsedSpec};
-use crate::installed_extensions;
 use compute_api::requests::{ConfigurationRequest, ExtensionInstallRequest, SetRoleGrantsRequest};
 use compute_api::responses::{
    ComputeStatus, ComputeStatusResponse, ExtensionInstallResult, GenericAPIError,
@@ -20,8 +19,6 @@ use anyhow::Result;
 use hyper::header::CONTENT_TYPE;
 use hyper::service::{make_service_fn, service_fn};
 use hyper::{Body, Method, Request, Response, Server, StatusCode};
-use metrics::Encoder;
-use metrics::TextEncoder;
 use tokio::task;
 use tracing::{debug, error, info, warn};
 use tracing_utils::http::OtelName;
@@ -68,28 +65,6 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
            Response::new(Body::from(serde_json::to_string(&metrics).unwrap()))
        }

-        // Prometheus metrics
-        (&Method::GET, "/metrics") => {
-            debug!("serving /metrics GET request");
-
-            let mut buffer = vec![];
-            let metrics = installed_extensions::collect();
-            let encoder = TextEncoder::new();
-            encoder.encode(&metrics, &mut buffer).unwrap();
-
-            match Response::builder()
-                .status(StatusCode::OK)
-                .header(CONTENT_TYPE, encoder.format_type())
-                .body(Body::from(buffer))
-            {
-                Ok(response) => response,
-                Err(err) => {
-                    let msg = format!("error handling /metrics request: {err}");
-                    error!(msg);
-                    render_json_error(&msg, StatusCode::INTERNAL_SERVER_ERROR)
-                }
-            }
-        }
        // Collect Postgres current usage insights
        (&Method::GET, "/insights") => {
            info!("serving /insights GET request");
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -37,21 +37,6 @@ paths:
              schema:
                $ref: "#/components/schemas/ComputeMetrics"

-  /metrics
-    get:
-      tags:
-      - Info
-      summary: Get compute node metrics in text format.
-      description: ""
-      operationId: getComputeMetrics
-      responses:
-        200:
-          description: ComputeMetrics
-          content:
-            text/plain:
-              schema:
-                type: string
-                description: Metrics in text format.
  /insights:
    get:
      tags:
--- a/compute_tools/src/installed_extensions.rs
+++ b/compute_tools/src/installed_extensions.rs
@@ -1,5 +1,4 @@
 use compute_api::responses::{InstalledExtension, InstalledExtensions};
-use metrics::proto::MetricFamily;
 use std::collections::HashMap;
 use std::collections::HashSet;
 use tracing::info;
@@ -9,10 +8,6 @@ use anyhow::Result;
 use postgres::{Client, NoTls};
 use tokio::task;

-use metrics::core::Collector;
-use metrics::{register_uint_gauge_vec, UIntGaugeVec};
-use once_cell::sync::Lazy;
-
 /// We don't reuse get_existing_dbs() just for code clarity
 /// and to make database listing query here more explicit.
 ///
@@ -64,12 +59,6 @@ pub async fn get_installed_extensions(connstr: Url) -> Result<InstalledExtension

            for (extname, v) in extensions.iter() {
                let version = v.to_string();
-
-                // increment the number of databases where the version of extension is installed
-                INSTALLED_EXTENSIONS
-                    .with_label_values(&[extname, &version])
-                    .inc();
-
                extensions_map
                    .entry(extname.to_string())
                    .and_modify(|e| {
@@ -85,11 +74,9 @@ pub async fn get_installed_extensions(connstr: Url) -> Result<InstalledExtension
            }
        }

-        let res = InstalledExtensions {
+        Ok(InstalledExtensions {
            extensions: extensions_map.values().cloned().collect(),
-        };
-
-        Ok(res)
+        })
    })
    .await?
 }
@@ -110,18 +97,6 @@ pub fn get_installed_extensions_sync(connstr: Url) -> Result<()> {
        "[NEON_EXT_STAT] {}",
        serde_json::to_string(&result).expect("failed to serialize extensions list")
    );
+
    Ok(())
 }
-
-static INSTALLED_EXTENSIONS: Lazy<UIntGaugeVec> = Lazy::new(|| {
-    register_uint_gauge_vec!(
-        "installed_extensions",
-        "Number of databases where the version of extension is installed",
-        &["extension_name", "version"]
-    )
-    .expect("failed to define a metric")
-});
-
-pub fn collect() -> Vec<MetricFamily> {
-    INSTALLED_EXTENSIONS.collect()
-}
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -944,9 +944,6 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result<LocalEnv> {
                        pg_auth_type: AuthType::Trust,
                        http_auth_type: AuthType::Trust,
                        other: Default::default(),
-                        // Typical developer machines use disks with slow fsync, and we don't care
-                        // about data integrity: disable disk syncs.
-                        no_sync: true,
                    }
                })
                .collect(),
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -225,7 +225,6 @@ pub struct PageServerConf {
    pub listen_http_addr: String,
    pub pg_auth_type: AuthType,
    pub http_auth_type: AuthType,
-    pub no_sync: bool,
 }

 impl Default for PageServerConf {
@@ -236,7 +235,6 @@ impl Default for PageServerConf {
            listen_http_addr: String::new(),
            pg_auth_type: AuthType::Trust,
            http_auth_type: AuthType::Trust,
-            no_sync: false,
        }
    }
 }
@@ -251,8 +249,6 @@ pub struct NeonLocalInitPageserverConf {
    pub listen_http_addr: String,
    pub pg_auth_type: AuthType,
    pub http_auth_type: AuthType,
-    #[serde(default, skip_serializing_if = "std::ops::Not::not")]
-    pub no_sync: bool,
    #[serde(flatten)]
    pub other: HashMap<String, toml::Value>,
 }
@@ -265,7 +261,6 @@ impl From<&NeonLocalInitPageserverConf> for PageServerConf {
            listen_http_addr,
            pg_auth_type,
            http_auth_type,
-            no_sync,
            other: _,
        } = conf;
        Self {
@@ -274,7 +269,6 @@ impl From<&NeonLocalInitPageserverConf> for PageServerConf {
            listen_http_addr: listen_http_addr.clone(),
            pg_auth_type: *pg_auth_type,
            http_auth_type: *http_auth_type,
-            no_sync: *no_sync,
        }
    }
 }
@@ -575,8 +569,6 @@ impl LocalEnv {
                    listen_http_addr: String,
                    pg_auth_type: AuthType,
                    http_auth_type: AuthType,
-                    #[serde(default)]
-                    no_sync: bool,
                }
                let config_toml_path = dentry.path().join("pageserver.toml");
                let config_toml: PageserverConfigTomlSubset = toml_edit::de::from_str(
@@ -599,7 +591,6 @@ impl LocalEnv {
                    listen_http_addr,
                    pg_auth_type,
                    http_auth_type,
-                    no_sync,
                } = config_toml;
                let IdentityTomlSubset {
                    id: identity_toml_id,
@@ -616,7 +607,6 @@ impl LocalEnv {
                    listen_http_addr,
                    pg_auth_type,
                    http_auth_type,
-                    no_sync,
                };
                pageservers.push(conf);
            }
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -273,7 +273,6 @@ impl PageServerNode {
            )
        })?;
        let args = vec!["-D", datadir_path_str];
-
        background_process::start_process(
            "pageserver",
            &datadir,
--- a/docs/rfcs/038-aux-file-v2.md
+++ b/docs/rfcs/038-aux-file-v2.md
@@ -91,7 +91,7 @@ generating the basebackup by scanning the `REPL_ORIGIN_KEY_PREFIX` keyspace.
 There are two places we need to read the aux files from the pageserver:

 * On the write path, when the compute node adds an aux file to the pageserver, we will retrieve the key from the storage, append the file to the hashed key, and write it back. The current `get` API already supports that.
-*  We use the vectored get API to retrieve all aux files during generating the basebackup. Because we need to scan a sparse keyspace, we slightly modified the vectored get path. The vectorized API used to always attempt to retrieve every single key within the requested key range, and therefore, we modified it in a way that keys within `NON_INHERITED_SPARSE_RANGE` will not trigger missing key error. Furthermore, as aux file reads usually need all layer files intersecting with that key range within the branch and cover a big keyspace, it incurs large overhead for tracking keyspaces that have not been read. Therefore, for sparse keyspaces, we [do not track](https://github.com/neondatabase/neon/pull/9631) `ummapped_keyspace`.
+*  We use the vectored get API to retrieve all aux files while generating the basebackup. Because we need to scan a sparse keyspace, we slightly modified the vectored get path. The vectored get API attempts to retrieve every single key within the requested key range; therefore, we modified it so that keys within `NON_INHERITED_SPARSE_RANGE` do not trigger a missing-key error.

 ## Compaction and Image Layer Generation

--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -64,7 +64,6 @@ pub struct ConfigToml {
    #[serde(with = "humantime_serde")]
    pub wal_redo_timeout: Duration,
    pub superuser: String,
-    pub locale: String,
    pub page_cache_size: usize,
    pub max_file_descriptors: usize,
    pub pg_distrib_dir: Option<Utf8PathBuf>,
@@ -107,8 +106,6 @@ pub struct ConfigToml {
    pub ephemeral_bytes_per_memory_kb: usize,
    pub l0_flush: Option<crate::models::L0FlushConfig>,
    pub virtual_file_io_mode: Option<crate::models::virtual_file::IoMode>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub no_sync: Option<bool>,
 }

 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -277,11 +274,6 @@ pub mod defaults {
    pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s";

    pub const DEFAULT_SUPERUSER: &str = "cloud_admin";
-    pub const DEFAULT_LOCALE: &str = if cfg!(target_os = "macos") {
-        "C"
-    } else {
-        "C.UTF-8"
-    };

    pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192;
    pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100;
@@ -332,7 +324,6 @@ impl Default for ConfigToml {
            wal_redo_timeout: (humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT)
                .expect("cannot parse default wal redo timeout")),
            superuser: (DEFAULT_SUPERUSER.to_string()),
-            locale: DEFAULT_LOCALE.to_string(),
            page_cache_size: (DEFAULT_PAGE_CACHE_SIZE),
            max_file_descriptors: (DEFAULT_MAX_FILE_DESCRIPTORS),
            pg_distrib_dir: None, // Utf8PathBuf::from("./pg_install"), // TODO: formely, this was std::env::current_dir()
@@ -398,7 +389,6 @@ impl Default for ConfigToml {
            l0_flush: None,
            virtual_file_io_mode: None,
            tenant_config: TenantConfigToml::default(),
-            no_sync: None,
        }
    }
 }
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -24,7 +24,7 @@ pub struct Key {

 /// When working with large numbers of Keys in-memory, it is more efficient to handle them as i128 than as
 /// a struct of fields.
-#[derive(Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd)]
+#[derive(Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize, Debug)]
 pub struct CompactKey(i128);

 /// The storage key size.
--- a/libs/pageserver_api/src/record.rs
+++ b/libs/pageserver_api/src/record.rs
@@ -80,18 +80,18 @@ impl NeonWalRecord {
    }

    #[cfg(feature = "testing")]
-    pub fn wal_clear(s: impl AsRef<str>) -> Self {
+    pub fn wal_clear() -> Self {
        Self::Test {
-            append: s.as_ref().to_string(),
+            append: "".to_string(),
            clear: true,
            will_init: false,
        }
    }

    #[cfg(feature = "testing")]
-    pub fn wal_init(s: impl AsRef<str>) -> Self {
+    pub fn wal_init() -> Self {
        Self::Test {
-            append: s.as_ref().to_string(),
+            append: "".to_string(),
            clear: true,
            will_init: true,
        }
--- a/libs/pageserver_api/src/reltag.rs
+++ b/libs/pageserver_api/src/reltag.rs
@@ -24,7 +24,7 @@ use postgres_ffi::Oid;
 // FIXME: should move 'forknum' as last field to keep this consistent with Postgres.
 // Then we could replace the custom Ord and PartialOrd implementations below with
 // deriving them. This will require changes in walredoproc.c.
-#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize)]
+#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize, Deserialize)]
 pub struct RelTag {
    pub forknum: u8,
    pub spcnode: Oid,
--- a/libs/postgres_ffi/src/wal_generator.rs
+++ b/libs/postgres_ffi/src/wal_generator.rs
@@ -1,10 +1,10 @@
-use std::ffi::{CStr, CString};
+use std::ffi::CStr;

 use bytes::{Bytes, BytesMut};
 use crc32c::crc32c_append;
 use utils::lsn::Lsn;

-use super::bindings::{RmgrId, XLogLongPageHeaderData, XLogPageHeaderData, XLOG_PAGE_MAGIC};
+use super::bindings::{XLogLongPageHeaderData, XLogPageHeaderData, XLOG_PAGE_MAGIC};
 use super::xlog_utils::{
    XlLogicalMessage, XLOG_RECORD_CRC_OFFS, XLOG_SIZE_OF_XLOG_RECORD, XLP_BKP_REMOVABLE,
    XLP_FIRST_IS_CONTRECORD,
@@ -16,65 +16,11 @@ use crate::pg_constants::{
 };
 use crate::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};

-/// A WAL record payload. Will be prefixed by an XLogRecord header when encoded.
-pub struct Record {
-    pub rmid: RmgrId,
-    pub info: u8,
-    pub data: Bytes,
-}
-
-impl Record {
-    /// Encodes the WAL record including an XLogRecord header. prev_lsn is the start position of
-    /// the previous record in the WAL -- this is ignored by the Safekeeper, but not Postgres.
-    pub fn encode(&self, prev_lsn: Lsn) -> Bytes {
-        // Prefix data with block ID and length.
-        let data_header = Bytes::from(match self.data.len() {
-            0 => vec![],
-            1..=255 => vec![XLR_BLOCK_ID_DATA_SHORT, self.data.len() as u8],
-            256.. => {
-                let len_bytes = (self.data.len() as u32).to_le_bytes();
-                [&[XLR_BLOCK_ID_DATA_LONG], len_bytes.as_slice()].concat()
-            }
-        });
-
-        // Construct the WAL record header.
-        let mut header = XLogRecord {
-            xl_tot_len: (XLOG_SIZE_OF_XLOG_RECORD + data_header.len() + self.data.len()) as u32,
-            xl_xid: 0,
-            xl_prev: prev_lsn.into(),
-            xl_info: self.info,
-            xl_rmid: self.rmid,
-            __bindgen_padding_0: [0; 2],
-            xl_crc: 0, // see below
-        };
-
-        // Compute the CRC checksum for the data, and the header up to the CRC field.
-        let mut crc = 0;
-        crc = crc32c_append(crc, &data_header);
-        crc = crc32c_append(crc, &self.data);
-        crc = crc32c_append(crc, &header.encode().unwrap()[0..XLOG_RECORD_CRC_OFFS]);
-        header.xl_crc = crc;
-
-        // Encode the final header and record.
-        let header = header.encode().unwrap();
-
-        [header, data_header, self.data.clone()].concat().into()
-    }
-}
-
-/// Generates WAL record payloads.
-///
-/// TODO: currently only provides LogicalMessageGenerator for trivial noop messages. Add a generator
-/// that creates a table and inserts rows.
-pub trait RecordGenerator: Iterator<Item = Record> {}
-
-impl<I: Iterator<Item = Record>> RecordGenerator for I {}
-
-/// Generates binary WAL for use in tests and benchmarks. The provided record generator constructs
-/// the WAL records. It is used as an iterator which yields encoded bytes for a single WAL record,
-/// including internal page headers if it spans pages. Concatenating the bytes will yield a
-/// complete, well-formed WAL, which can be chunked at segment boundaries if desired. Not optimized
-/// for performance.
+/// Generates binary WAL records for use in tests and benchmarks. Currently only generates logical
+/// messages (effectively noops) with a fixed payload. It is used as an iterator which yields
+/// encoded bytes for a single WAL record, including internal page headers if it spans pages.
+/// Concatenating the bytes will yield a complete, well-formed WAL, which can be chunked at segment
+/// boundaries if desired. Not optimized for performance.
 ///
 /// The WAL format is version-dependant (see e.g. `XLOG_PAGE_MAGIC`), so make sure to import this
 /// for the appropriate Postgres version (e.g. `postgres_ffi::v17::wal_generator::WalGenerator`).
@@ -85,10 +31,10 @@ impl<I: Iterator<Item = Record>> RecordGenerator for I {}
 /// |        Segment 1         |        Segment 2         |        Segment 3         |
 /// | Page 1 | Page 2 | Page 3 | Page 4 | Page 5 | Page 6 | Page 7 | Page 8 | Page 9 |
 /// | R1 |   R2  |R3|  R4  | R5  |  R6  |                 R7            | R8  |
+///
+/// TODO: support generating actual tables and rows.
 #[derive(Default)]
-pub struct WalGenerator<R: RecordGenerator> {
-    /// Generates record payloads for the WAL.
-    pub record_generator: R,
+pub struct WalGenerator {
    /// Current LSN to append the next record at.
    ///
    /// Callers can modify this (and prev_lsn) to restart generation at a different LSN, but should
@@ -100,35 +46,73 @@ pub struct WalGenerator<R: RecordGenerator> {
    pub prev_lsn: Lsn,
 }

-impl<R: RecordGenerator> WalGenerator<R> {
-    // Hardcode the sys and timeline ID. We can make them configurable if we care about them.
+impl WalGenerator {
+    // For now, hardcode the message payload.
+    // TODO: support specifying the payload size.
+    const PREFIX: &CStr = c"prefix";
+    const MESSAGE: &[u8] = b"message";
+
+    // Hardcode the sys, timeline, and DB IDs. We can make them configurable if we care about them.
    const SYS_ID: u64 = 0;
    const TIMELINE_ID: u32 = 1;
+    const DB_ID: u32 = 0;

-    /// Creates a new WAL generator with the given record generator.
-    pub fn new(record_generator: R) -> WalGenerator<R> {
-        Self {
-            record_generator,
-            lsn: Lsn(0),
-            prev_lsn: Lsn(0),
-        }
+    /// Creates a new WAL generator, which emits logical message records (noops).
+    pub fn new() -> Self {
+        Self::default()
    }

-    /// Appends a record with an arbitrary payload at the current LSN, then increments the LSN.
-    /// Returns the WAL bytes for the record, including page headers and padding, and the start LSN.
-    fn append_record(&mut self, record: Record) -> (Lsn, Bytes) {
-        let record = record.encode(self.prev_lsn);
-        let record = Self::insert_pages(record, self.lsn);
-        let record = Self::pad_record(record, self.lsn);
-        let lsn = self.lsn;
-        self.prev_lsn = self.lsn;
-        self.lsn += record.len() as u64;
-        (lsn, record)
+    /// Encodes a logical message (basically a noop), with the given prefix and message.
+    pub(crate) fn encode_logical_message(prefix: &CStr, message: &[u8]) -> Bytes {
+        let prefix = prefix.to_bytes_with_nul();
+        let header = XlLogicalMessage {
+            db_id: Self::DB_ID,
+            transactional: 0,
+            prefix_size: prefix.len() as u64,
+            message_size: message.len() as u64,
+        };
+        [&header.encode(), prefix, message].concat().into()
    }

-    /// Inserts page headers on 8KB page boundaries. Takes the current LSN position where the record
+    /// Encode a WAL record with the given payload data (e.g. a logical message).
+    pub(crate) fn encode_record(data: Bytes, rmid: u8, info: u8, prev_lsn: Lsn) -> Bytes {
+        // Prefix data with block ID and length.
+        let data_header = Bytes::from(match data.len() {
+            0 => vec![],
+            1..=255 => vec![XLR_BLOCK_ID_DATA_SHORT, data.len() as u8],
+            256.. => {
+                let len_bytes = (data.len() as u32).to_le_bytes();
+                [&[XLR_BLOCK_ID_DATA_LONG], len_bytes.as_slice()].concat()
+            }
+        });
+
+        // Construct the WAL record header.
+        let mut header = XLogRecord {
+            xl_tot_len: (XLOG_SIZE_OF_XLOG_RECORD + data_header.len() + data.len()) as u32,
+            xl_xid: 0,
+            xl_prev: prev_lsn.into(),
+            xl_info: info,
+            xl_rmid: rmid,
+            __bindgen_padding_0: [0; 2],
+            xl_crc: 0, // see below
+        };
+
+        // Compute the CRC checksum for the data, and the header up to the CRC field.
+        let mut crc = 0;
+        crc = crc32c_append(crc, &data_header);
+        crc = crc32c_append(crc, &data);
+        crc = crc32c_append(crc, &header.encode().unwrap()[0..XLOG_RECORD_CRC_OFFS]);
+        header.xl_crc = crc;
+
+        // Encode the final header and record.
+        let header = header.encode().unwrap();
+
+        [header, data_header, data].concat().into()
+    }
+
+    /// Injects page headers on 8KB page boundaries. Takes the current LSN position where the record
    /// is to be appended.
-    fn insert_pages(record: Bytes, mut lsn: Lsn) -> Bytes {
+    fn encode_pages(record: Bytes, mut lsn: Lsn) -> Bytes {
        // Fast path: record fits in current page, and the page already has a header.
        if lsn.remaining_in_block() as usize >= record.len() && lsn.block_offset() > 0 {
            return record;
@@ -189,71 +173,31 @@ impl<R: RecordGenerator> WalGenerator<R> {
        }
        [record, Bytes::from(vec![0; padding])].concat().into()
    }
+
+    /// Generates a record with an arbitrary payload at the current LSN, then increments the LSN.
+    pub fn generate_record(&mut self, data: Bytes, rmid: u8, info: u8) -> Bytes {
+        let record = Self::encode_record(data, rmid, info, self.prev_lsn);
+        let record = Self::encode_pages(record, self.lsn);
+        let record = Self::pad_record(record, self.lsn);
+        self.prev_lsn = self.lsn;
+        self.lsn += record.len() as u64;
+        record
+    }
+
+    /// Generates a logical message at the current LSN. Can be used to construct arbitrary messages.
+    pub fn generate_logical_message(&mut self, prefix: &CStr, message: &[u8]) -> Bytes {
+        let data = Self::encode_logical_message(prefix, message);
+        self.generate_record(data, RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE)
+    }
 }

-/// Generates WAL records as an iterator.
-impl<R: RecordGenerator> Iterator for WalGenerator<R> {
+/// Generate WAL records as an iterator.
+impl Iterator for WalGenerator {
    type Item = (Lsn, Bytes);

    fn next(&mut self) -> Option<Self::Item> {
-        let record = self.record_generator.next()?;
-        Some(self.append_record(record))
-    }
-}
-
-/// Generates logical message records (effectively noops) with a fixed message.
-pub struct LogicalMessageGenerator {
-    prefix: CString,
-    message: Vec<u8>,
-}
-
-impl LogicalMessageGenerator {
-    const DB_ID: u32 = 0; // hardcoded for now
-    const RM_ID: RmgrId = RM_LOGICALMSG_ID;
-    const INFO: u8 = XLOG_LOGICAL_MESSAGE;
-
-    /// Creates a new LogicalMessageGenerator.
-    pub fn new(prefix: &CStr, message: &[u8]) -> Self {
-        Self {
-            prefix: prefix.to_owned(),
-            message: message.to_owned(),
-        }
-    }
-
-    /// Encodes a logical message.
-    fn encode(prefix: &CStr, message: &[u8]) -> Bytes {
-        let prefix = prefix.to_bytes_with_nul();
-        let header = XlLogicalMessage {
-            db_id: Self::DB_ID,
-            transactional: 0,
-            prefix_size: prefix.len() as u64,
-            message_size: message.len() as u64,
-        };
-        [&header.encode(), prefix, message].concat().into()
-    }
-}
-
-impl Iterator for LogicalMessageGenerator {
-    type Item = Record;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        Some(Record {
-            rmid: Self::RM_ID,
-            info: Self::INFO,
-            data: Self::encode(&self.prefix, &self.message),
-        })
-    }
-}
-
-impl WalGenerator<LogicalMessageGenerator> {
-    /// Convenience method for appending a WAL record with an arbitrary logical message at the
-    /// current WAL LSN position. Returns the start LSN and resulting WAL bytes.
-    pub fn append_logical_message(&mut self, prefix: &CStr, message: &[u8]) -> (Lsn, Bytes) {
-        let record = Record {
-            rmid: LogicalMessageGenerator::RM_ID,
-            info: LogicalMessageGenerator::INFO,
-            data: LogicalMessageGenerator::encode(prefix, message),
-        };
-        self.append_record(record)
+        let lsn = self.lsn;
+        let record = self.generate_logical_message(Self::PREFIX, Self::MESSAGE);
+        Some((lsn, record))
    }
 }
--- a/libs/postgres_ffi/src/walrecord.rs
+++ b/libs/postgres_ffi/src/walrecord.rs
@@ -16,7 +16,7 @@ use utils::bin_ser::DeserializeError;
 use utils::lsn::Lsn;

 #[repr(C)]
-#[derive(Debug)]
+#[derive(Debug, Serialize, Deserialize)]
 pub struct XlMultiXactCreate {
    pub mid: MultiXactId,
    /* new MultiXact's ID */
@@ -46,7 +46,7 @@ impl XlMultiXactCreate {
 }

 #[repr(C)]
-#[derive(Debug)]
+#[derive(Debug, Serialize, Deserialize)]
 pub struct XlMultiXactTruncate {
    pub oldest_multi_db: Oid,
    /* to-be-truncated range of multixact offsets */
@@ -72,7 +72,7 @@ impl XlMultiXactTruncate {
 }

 #[repr(C)]
-#[derive(Debug)]
+#[derive(Debug, Serialize, Deserialize)]
 pub struct XlRelmapUpdate {
    pub dbid: Oid,   /* database ID, or 0 for shared map */
    pub tsid: Oid,   /* database's tablespace, or pg_global */
@@ -90,7 +90,7 @@ impl XlRelmapUpdate {
 }

 #[repr(C)]
-#[derive(Debug)]
+#[derive(Debug, Serialize, Deserialize)]
 pub struct XlReploriginDrop {
    pub node_id: RepOriginId,
 }
@@ -104,7 +104,7 @@ impl XlReploriginDrop {
 }

 #[repr(C)]
-#[derive(Debug)]
+#[derive(Debug, Serialize, Deserialize)]
 pub struct XlReploriginSet {
    pub remote_lsn: Lsn,
    pub node_id: RepOriginId,
@@ -120,7 +120,7 @@ impl XlReploriginSet {
 }

 #[repr(C)]
-#[derive(Debug, Clone, Copy)]
+#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
 pub struct RelFileNode {
    pub spcnode: Oid, /* tablespace */
    pub dbnode: Oid,  /* database */
@@ -911,7 +911,7 @@ impl XlSmgrCreate {
 }

 #[repr(C)]
-#[derive(Debug)]
+#[derive(Debug, Serialize, Deserialize)]
 pub struct XlSmgrTruncate {
    pub blkno: BlockNumber,
    pub rnode: RelFileNode,
@@ -984,7 +984,7 @@ impl XlDropDatabase {
 /// xl_xact_parsed_abort structs in PostgreSQL, but we use the same
 /// struct for commits and aborts.
 ///
-#[derive(Debug)]
+#[derive(Debug, Serialize, Deserialize)]
 pub struct XlXactParsedRecord {
    pub xid: TransactionId,
    pub info: u8,
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -12,9 +12,9 @@ use super::bindings::{
    CheckPoint, ControlFileData, DBState_DB_SHUTDOWNED, FullTransactionId, TimeLineID, TimestampTz,
    XLogLongPageHeaderData, XLogPageHeaderData, XLogRecPtr, XLogRecord, XLogSegNo, XLOG_PAGE_MAGIC,
 };
-use super::wal_generator::LogicalMessageGenerator;
+use super::wal_generator::WalGenerator;
 use super::PG_MAJORVERSION;
-use crate::pg_constants;
+use crate::pg_constants::{self, RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE};
 use crate::PG_TLI;
 use crate::{uint32, uint64, Oid};
 use crate::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
@@ -493,10 +493,12 @@ pub fn encode_logical_message(prefix: &str, message: &str) -> Bytes {
    // This function can take untrusted input, so discard any NUL bytes in the prefix string.
    let prefix = CString::new(prefix.replace('\0', "")).expect("no NULs");
    let message = message.as_bytes();
-    LogicalMessageGenerator::new(&prefix, message)
-        .next()
-        .unwrap()
-        .encode(Lsn(0))
+    WalGenerator::encode_record(
+        WalGenerator::encode_logical_message(&prefix, message),
+        RM_LOGICALMSG_ID,
+        XLOG_LOGICAL_MESSAGE,
+        Lsn(0),
+    )
 }

 #[cfg(test)]
--- a/libs/pq_proto/Cargo.toml
+++ b/libs/pq_proto/Cargo.toml
@@ -13,3 +13,4 @@ rand.workspace = true
 tokio = { workspace = true, features = ["io-util"] }
 thiserror.workspace = true
 serde.workspace = true
+# wal_decoder.workspace = true
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -562,6 +562,7 @@ pub enum BeMessage<'a> {
        options: &'a [&'a str],
    },
    KeepAlive(WalSndKeepAlive),
+    InterpretedWalRecord(InterpretedWalRecordBody<'a>),
 }

 /// Common shorthands.
@@ -665,6 +666,12 @@ pub struct XLogDataBody<'a> {
    pub data: &'a [u8],
 }

+#[derive(Debug)]
+pub struct InterpretedWalRecordBody<'a> {
+    pub wal_end: u64,
+    pub data: &'a [u8],
+}
+
 #[derive(Debug)]
 pub struct WalSndKeepAlive {
    pub wal_end: u64, // current end of WAL on the server
@@ -996,6 +1003,15 @@ impl BeMessage<'_> {
                    Ok(())
                })?
            }
+
+            BeMessage::InterpretedWalRecord(rec) => {
+                buf.put_u8(b'd'); // arbitrary?
+                write_body(buf, |buf| {
+                    buf.put_u8(b'0');
+                    buf.put_u64(rec.wal_end);
+                    buf.put_slice(rec.data);
+                });
+            }
        }
        Ok(())
    }
--- a/libs/utils/scripts/restore_from_wal.sh
+++ b/libs/utils/scripts/restore_from_wal.sh
@@ -1,4 +1,4 @@
-#!/usr/bin/env bash
+#!/bin/bash

 set -euxo pipefail

@@ -6,44 +6,9 @@ PG_BIN=$1
 WAL_PATH=$2
 DATA_DIR=$3
 PORT=$4
-PG_VERSION=$5
 SYSID=$(od -A n -j 24 -N 8 -t d8 "$WAL_PATH"/000000010000000000000002* | cut -c 3-)
-
-# The way that initdb is invoked must match how the pageserver runs initdb.
-function initdb_with_args {
-    local cmd=(
-        "$PG_BIN"/initdb
-        -E utf8
-        -U cloud_admin
-        -D "$DATA_DIR"
-        --locale 'C.UTF-8'
-        --lc-collate 'C.UTF-8'
-        --lc-ctype 'C.UTF-8'
-        --lc-messages 'C.UTF-8'
-        --lc-monetary 'C.UTF-8'
-        --lc-numeric 'C.UTF-8'
-        --lc-time 'C.UTF-8'
-        --sysid="$SYSID"
-    )
-
-    case "$PG_VERSION" in
-        14)
-            # Postgres 14 and below didn't support --locale-provider
-            ;;
-        15 | 16)
-            cmd+=(--locale-provider 'libc')
-            ;;
-        *)
-            # Postgres 17 added the builtin provider
-            cmd+=(--locale-provider 'builtin')
-            ;;
-    esac
-
-    eval env -i LD_LIBRARY_PATH="$PG_BIN"/../lib "${cmd[*]}"
-}
-
 rm -fr "$DATA_DIR"
-initdb_with_args
+env -i LD_LIBRARY_PATH="$PG_BIN"/../lib "$PG_BIN"/initdb -E utf8 -U cloud_admin -D "$DATA_DIR" --sysid="$SYSID"
 echo "port=$PORT" >> "$DATA_DIR"/postgresql.conf
 echo "shared_preload_libraries='\$libdir/neon_rmgr.so'" >> "$DATA_DIR"/postgresql.conf
 REDO_POS=0x$("$PG_BIN"/pg_controldata -D "$DATA_DIR" | grep -F "REDO location"| cut -c 42-)
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -40,11 +40,6 @@ pub enum Scope {
    /// Allows access to storage controller APIs used by the scrubber, to interrogate the state
    /// of a tenant & post scrub results.
    Scrubber,
-
-    /// This scope is used for communication with other storage controller instances.
-    /// At the time of writing, this is only used for the step down request.
-    #[serde(rename = "controller_peer")]
-    ControllerPeer,
 }

 /// JWT payload. See docs/authentication.md for the format
--- a/libs/utils/src/crashsafe.rs
+++ b/libs/utils/src/crashsafe.rs
@@ -123,27 +123,15 @@ pub async fn fsync_async_opt(
    Ok(())
 }

-/// Like postgres' durable_rename, renames a file and issues fsyncs to make it durable. After
-/// returning, both the file and rename are guaranteed to be persisted. Both paths must be on the
-/// same file system.
+/// Like postgres' durable_rename, renames a file, issuing fsyncs to make it
+/// durable. After return, the file and the rename are guaranteed to be persisted.
 ///
-/// Unlike postgres, it only fsyncs 1) the file to make contents durable, and 2) the directory to
-/// make the rename durable. This sequence ensures the target file will never be incomplete.
-///
-/// Postgres also:
-///
-/// * Fsyncs the target file, if it exists, before the rename, to ensure either the new or existing
-///   file survives a crash. Current callers don't need this as it should already be fsynced if
-///   durability is needed.
-///
-/// * Fsyncs the file after the rename. This can be required with certain OSes or file systems (e.g.
-///   NFS), but not on Linux with most common file systems like ext4 (which we currently use).
-///
-/// An audit of 8 other databases found that none fsynced the file after a rename:
-/// <https://github.com/neondatabase/neon/pull/9686#discussion_r1837180535>
-///
-/// eBPF probes confirmed that this is sufficient with ext4, XFS, and ZFS, but possibly not Btrfs:
-/// <https://github.com/neondatabase/neon/pull/9686#discussion_r1837926218>
+/// Unlike postgres, it only fsyncs 1) the file to be renamed, to make its
+/// contents durable; 2) its directory entry, to make the rename durable; and
+/// 3) the already renamed file again, which is not required by standards but
+/// postgres does it, so let's stick to that. Postgres additionally fsyncs
+/// newpath *before* the rename if it exists, to ensure that at least one of
+/// the files survives, but current callers don't need that.
 ///
 /// virtual_file.rs has similar code, but it doesn't use vfs.
 ///
@@ -161,6 +149,9 @@ pub async fn durable_rename(
    // Time to do the real deal.
    tokio::fs::rename(old_path.as_ref(), new_path.as_ref()).await?;

+    // Postgres'ish fsync of renamed file.
+    fsync_async_opt(new_path.as_ref(), do_fsync).await?;
+
    // Now fsync the parent
    let parent = match new_path.as_ref().parent() {
        Some(p) => p,
--- a/libs/utils/src/lsn.rs
+++ b/libs/utils/src/lsn.rs
@@ -138,11 +138,6 @@ impl Lsn {
        self.0.checked_sub(other).map(Lsn)
    }

-    /// Subtract a number, saturating at numeric bounds instead of overflowing.
-    pub fn saturating_sub<T: Into<u64>>(self, other: T) -> Lsn {
-        Lsn(self.0.saturating_sub(other.into()))
-    }
-
    /// Subtract a number, returning the difference as i128 to avoid overflow.
    pub fn widening_sub<T: Into<u64>>(self, other: T) -> i128 {
        let other: u64 = other.into();
--- a/libs/utils/src/postgres_client.rs
+++ b/libs/utils/src/postgres_client.rs
@@ -7,29 +7,65 @@ use postgres_connection::{parse_host_port, PgConnectionConfig};

 use crate::id::TenantTimelineId;

+/// Protocol used for safekeeper recovery. This sends raw Postgres WAL.
+pub const POSTGRES_PROTO_VERSION: u8 = 0;
+/// Protocol used for safekeeper to pageserver communication.
+/// This sends interpreted WAL records for the pageserver to ingest
+/// and is shard-aware.
+pub const PAGESERVER_SAFEKEEPER_PROTO_VERSION: u8 = 1;
+
+/// Arguments for [`wal_stream_connection_config`].
+pub struct ConnectionConfigArgs<'a> {
+    pub protocol_version: u8,
+
+    pub ttid: TenantTimelineId,
+    pub shard_number: Option<u8>,
+    pub shard_count: Option<u8>,
+    pub shard_stripe_size: Option<u32>,
+
+    pub listen_pg_addr_str: &'a str,
+
+    pub auth_token: Option<&'a str>,
+    pub availability_zone: Option<&'a str>,
+}
+
+impl<'a> ConnectionConfigArgs<'a> {
+    /// Builds the connection options. Shard parameters are emitted only when
+    /// `shard_number` is set; panics if the other shard fields are then unset.
+    fn options(&self) -> Vec<String> {
+        let mut options = vec![
+            "-c".to_owned(),
+            format!("timeline_id={}", self.ttid.timeline_id),
+            format!("tenant_id={}", self.ttid.tenant_id),
+            format!("protocol_version={}", self.protocol_version),
+        ];
+
+        if let Some(shard_number) = self.shard_number {
+            let count = self.shard_count.expect("shard_count must be set");
+            let stripe = self.shard_stripe_size.expect("shard_stripe_size must be set");
+
+            options.push(format!("shard_count={count}"));
+            options.push(format!("shard_number={shard_number}"));
+            options.push(format!("shard_stripe_size={stripe}"));
+        }
+
+        options
+    }
+}
+
 /// Create client config for fetching WAL from safekeeper on particular timeline.
 /// listen_pg_addr_str is in form host:\[port\].
 pub fn wal_stream_connection_config(
-    TenantTimelineId {
-        tenant_id,
-        timeline_id,
-    }: TenantTimelineId,
-    listen_pg_addr_str: &str,
-    auth_token: Option<&str>,
-    availability_zone: Option<&str>,
+    args: ConnectionConfigArgs,
 ) -> anyhow::Result<PgConnectionConfig> {
    let (host, port) =
-        parse_host_port(listen_pg_addr_str).context("Unable to parse listen_pg_addr_str")?;
+        parse_host_port(args.listen_pg_addr_str).context("Unable to parse listen_pg_addr_str")?;
    let port = port.unwrap_or(5432);
    let mut connstr = PgConnectionConfig::new_host_port(host, port)
-        .extend_options([
-            "-c".to_owned(),
-            format!("timeline_id={}", timeline_id),
-            format!("tenant_id={}", tenant_id),
-        ])
-        .set_password(auth_token.map(|s| s.to_owned()));
+        .extend_options(args.options())
+        .set_password(args.auth_token.map(|s| s.to_owned()));

-    if let Some(availability_zone) = availability_zone {
+    if let Some(availability_zone) = args.availability_zone {
        connstr = connstr.extend_options([format!("availability_zone={}", availability_zone)]);
    }

--- a/libs/wal_decoder/src/models.rs
+++ b/libs/wal_decoder/src/models.rs
@@ -32,16 +32,19 @@ use postgres_ffi::walrecord::{
    XlSmgrTruncate, XlXactParsedRecord,
 };
 use postgres_ffi::{Oid, TransactionId};
+use serde::{Deserialize, Serialize};
 use utils::lsn::Lsn;

 use crate::serialized_batch::SerializedValueBatch;

+#[derive(Serialize, Deserialize)]
 pub enum FlushUncommittedRecords {
    Yes,
    No,
 }

 /// An interpreted Postgres WAL record, ready to be handled by the pageserver
+#[derive(Serialize, Deserialize)]
 pub struct InterpretedWalRecord {
    /// Optional metadata record - may cause writes to metadata keys
    /// in the storage engine
@@ -62,6 +65,7 @@ pub struct InterpretedWalRecord {

 /// The interpreted part of the Postgres WAL record which requires metadata
 /// writes to the underlying storage engine.
+#[derive(Serialize, Deserialize)]
 pub enum MetadataRecord {
    Heapam(HeapamRecord),
    Neonrmgr(NeonrmgrRecord),
@@ -77,10 +81,12 @@ pub enum MetadataRecord {
    Replorigin(ReploriginRecord),
 }

+#[derive(Serialize, Deserialize)]
 pub enum HeapamRecord {
    ClearVmBits(ClearVmBits),
 }

+#[derive(Serialize, Deserialize)]
 pub struct ClearVmBits {
    pub new_heap_blkno: Option<u32>,
    pub old_heap_blkno: Option<u32>,
@@ -88,24 +94,29 @@ pub struct ClearVmBits {
    pub flags: u8,
 }

+#[derive(Serialize, Deserialize)]
 pub enum NeonrmgrRecord {
    ClearVmBits(ClearVmBits),
 }

+#[derive(Serialize, Deserialize)]
 pub enum SmgrRecord {
    Create(SmgrCreate),
    Truncate(XlSmgrTruncate),
 }

+#[derive(Serialize, Deserialize)]
 pub struct SmgrCreate {
    pub rel: RelTag,
 }

+#[derive(Serialize, Deserialize)]
 pub enum DbaseRecord {
    Create(DbaseCreate),
    Drop(DbaseDrop),
 }

+#[derive(Serialize, Deserialize)]
 pub struct DbaseCreate {
    pub db_id: Oid,
    pub tablespace_id: Oid,
@@ -113,27 +124,32 @@ pub struct DbaseCreate {
    pub src_tablespace_id: Oid,
 }

+#[derive(Serialize, Deserialize)]
 pub struct DbaseDrop {
    pub db_id: Oid,
    pub tablespace_ids: Vec<Oid>,
 }

+#[derive(Serialize, Deserialize)]
 pub enum ClogRecord {
    ZeroPage(ClogZeroPage),
    Truncate(ClogTruncate),
 }

+#[derive(Serialize, Deserialize)]
 pub struct ClogZeroPage {
    pub segno: u32,
    pub rpageno: u32,
 }

+#[derive(Serialize, Deserialize)]
 pub struct ClogTruncate {
    pub pageno: u32,
    pub oldest_xid: TransactionId,
    pub oldest_xid_db: Oid,
 }

+#[derive(Serialize, Deserialize)]
 pub enum XactRecord {
    Commit(XactCommon),
    Abort(XactCommon),
@@ -142,6 +158,7 @@ pub enum XactRecord {
    Prepare(XactPrepare),
 }

+#[derive(Serialize, Deserialize)]
 pub struct XactCommon {
    pub parsed: XlXactParsedRecord,
    pub origin_id: u16,
@@ -150,61 +167,73 @@ pub struct XactCommon {
    pub lsn: Lsn,
 }

+#[derive(Serialize, Deserialize)]
 pub struct XactPrepare {
    pub xl_xid: TransactionId,
    pub data: Bytes,
 }

+#[derive(Serialize, Deserialize)]
 pub enum MultiXactRecord {
    ZeroPage(MultiXactZeroPage),
    Create(XlMultiXactCreate),
    Truncate(XlMultiXactTruncate),
 }

+#[derive(Serialize, Deserialize)]
 pub struct MultiXactZeroPage {
    pub slru_kind: SlruKind,
    pub segno: u32,
    pub rpageno: u32,
 }

+#[derive(Serialize, Deserialize)]
 pub enum RelmapRecord {
    Update(RelmapUpdate),
 }

+#[derive(Serialize, Deserialize)]
 pub struct RelmapUpdate {
    pub update: XlRelmapUpdate,
    pub buf: Bytes,
 }

+#[derive(Serialize, Deserialize)]
 pub enum XlogRecord {
    Raw(RawXlogRecord),
 }

+#[derive(Serialize, Deserialize)]
 pub struct RawXlogRecord {
    pub info: u8,
    pub lsn: Lsn,
    pub buf: Bytes,
 }

+#[derive(Serialize, Deserialize)]
 pub enum LogicalMessageRecord {
    Put(PutLogicalMessage),
    #[cfg(feature = "testing")]
    Failpoint,
 }

+#[derive(Serialize, Deserialize)]
 pub struct PutLogicalMessage {
    pub path: String,
    pub buf: Bytes,
 }

+#[derive(Serialize, Deserialize)]
 pub enum StandbyRecord {
    RunningXacts(StandbyRunningXacts),
 }

+#[derive(Serialize, Deserialize)]
 pub struct StandbyRunningXacts {
    pub oldest_running_xid: TransactionId,
 }

+#[derive(Serialize, Deserialize)]
 pub enum ReploriginRecord {
    Set(XlReploriginSet),
    Drop(XlReploriginDrop),
--- a/libs/wal_decoder/src/serialized_batch.rs
+++ b/libs/wal_decoder/src/serialized_batch.rs
@@ -16,6 +16,7 @@ use pageserver_api::shard::ShardIdentity;
 use pageserver_api::{key::CompactKey, value::Value};
 use postgres_ffi::walrecord::{DecodedBkpBlock, DecodedWALRecord};
 use postgres_ffi::{page_is_new, page_set_lsn, pg_constants, BLCKSZ};
+use serde::{Deserialize, Serialize};
 use utils::bin_ser::BeSer;
 use utils::lsn::Lsn;

@@ -29,6 +30,7 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
 /// relation sizes. In the case of "observed" values, we only need to know
 /// the key and LSN, so two types of metadata are supported to save on network
 /// bandwidth.
+#[derive(Serialize, Deserialize, Debug)]
 pub enum ValueMeta {
    Serialized(SerializedValueMeta),
    Observed(ObservedValueMeta),
@@ -75,6 +77,7 @@ impl PartialEq for OrderedValueMeta {
 impl Eq for OrderedValueMeta {}

 /// Metadata for a [`Value`] serialized into the batch.
+#[derive(Serialize, Deserialize, Debug)]
 pub struct SerializedValueMeta {
    pub key: CompactKey,
    pub lsn: Lsn,
@@ -86,12 +89,14 @@ pub struct SerializedValueMeta {
 }

 /// Metadata for a [`Value`] observed by the batch
+#[derive(Serialize, Deserialize, Debug)]
 pub struct ObservedValueMeta {
    pub key: CompactKey,
    pub lsn: Lsn,
 }

 /// Batch of serialized [`Value`]s.
+#[derive(Serialize, Deserialize)]
 pub struct SerializedValueBatch {
    /// [`Value`]s serialized in EphemeralFile's native format,
    /// ready for disk write by the pageserver
--- a/pageserver/compaction/src/helpers.rs
+++ b/pageserver/compaction/src/helpers.rs
@@ -35,15 +35,6 @@ pub fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
    !(a.end <= b.start || b.end <= a.start)
 }

-/// Whether a fully contains b, example as below
-/// ```plain
-/// |      a       |
-///       |  b  |
-/// ```
-pub fn fully_contains<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
-    a.start <= b.start && a.end >= b.end
-}
-
 pub fn union_to_keyspace<K: Ord>(a: &mut CompactionKeySpace<K>, b: CompactionKeySpace<K>) {
    let x = std::mem::take(a);
    let mut all_ranges_iter = [x.into_iter(), b.into_iter()]
--- a/pageserver/src/auth.rs
+++ b/pageserver/src/auth.rs
@@ -19,8 +19,7 @@ pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<
            | Scope::SafekeeperData
            | Scope::GenerationsApi
            | Scope::Infra
-            | Scope::Scrubber
-            | Scope::ControllerPeer,
+            | Scope::Scrubber,
            _,
        ) => Err(AuthError(
            format!(
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -154,17 +154,13 @@ fn main() -> anyhow::Result<()> {
            },
        };

-        if conf.no_sync {
-            info!("Skipping syncfs on startup");
-        } else {
-            let started = Instant::now();
-            syncfs(dirfd)?;
-            let elapsed = started.elapsed();
-            info!(
-                elapsed_ms = elapsed.as_millis(),
-                "made tenant directory contents durable"
-            );
-        }
+        let started = Instant::now();
+        syncfs(dirfd)?;
+        let elapsed = started.elapsed();
+        info!(
+            elapsed_ms = elapsed.as_millis(),
+            "made tenant directory contents durable"
+        );
    }

    // Initialize up failpoints support
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -69,7 +69,6 @@ pub struct PageServerConf {
    pub wal_redo_timeout: Duration,

    pub superuser: String,
-    pub locale: String,

    pub page_cache_size: usize,
    pub max_file_descriptors: usize,
@@ -179,9 +178,6 @@ pub struct PageServerConf {

    /// Direct IO settings
    pub virtual_file_io_mode: virtual_file::IoMode,
-
-    /// Optionally disable disk syncs (unsafe!)
-    pub no_sync: bool,
 }

 /// Token for authentication to safekeepers
@@ -302,7 +298,6 @@ impl PageServerConf {
            wait_lsn_timeout,
            wal_redo_timeout,
            superuser,
-            locale,
            page_cache_size,
            max_file_descriptors,
            pg_distrib_dir,
@@ -337,7 +332,6 @@ impl PageServerConf {
            concurrent_tenant_size_logical_size_queries,
            virtual_file_io_engine,
            tenant_config,
-            no_sync,
        } = config_toml;

        let mut conf = PageServerConf {
@@ -350,7 +344,6 @@ impl PageServerConf {
            wait_lsn_timeout,
            wal_redo_timeout,
            superuser,
-            locale,
            page_cache_size,
            max_file_descriptors,
            http_auth_type,
@@ -416,7 +409,6 @@ impl PageServerConf {
                .map(crate::l0_flush::L0FlushConfig::from)
                .unwrap_or_default(),
            virtual_file_io_mode: virtual_file_io_mode.unwrap_or(virtual_file::IoMode::preferred()),
-            no_sync: no_sync.unwrap_or(false),
        };

        // ------------------------------------------------------------
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -2002,9 +2002,9 @@ async fn timeline_offload_handler(
                "timeline has attached children".into(),
            ));
        }
-        if let (false, reason) = timeline.can_offload() {
+        if !timeline.can_offload() {
            return Err(ApiError::PreconditionFailed(
-                format!("Timeline::can_offload() check failed: {}", reason) .into(),
+                "Timeline::can_offload() returned false".into(),
            ));
        }
        offload_timeline(&tenant, &timeline)
@@ -2169,21 +2169,6 @@ async fn timeline_detach_ancestor_handler(
        let ctx = RequestContext::new(TaskKind::DetachAncestor, DownloadBehavior::Download);
        let ctx = &ctx;

-        // Flush the upload queues of all timelines before detaching ancestor. We do the same thing again
-        // during shutdown. This early upload ensures the pageserver does not need to upload too many
-        // things and creates downtime during timeline reloads.
-        for timeline in tenant.list_timelines() {
-            timeline
-                .remote_client
-                .wait_completion()
-                .await
-                .map_err(|e| {
-                    ApiError::PreconditionFailed(format!("cannot drain upload queue: {e}").into())
-                })?;
-        }
-
-        tracing::info!("all timeline upload queues are drained");
-
        let timeline = tenant.get_timeline(timeline_id, true)?;

        let progress = timeline
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -1,11 +1,10 @@
 //! The Page Service listens for client connections and serves their GetPage@LSN
 //! requests.

-use anyhow::{bail, Context};
+use anyhow::Context;
 use async_compression::tokio::write::GzipEncoder;
 use bytes::Buf;
 use futures::FutureExt;
-use itertools::Itertools;
 use once_cell::sync::OnceCell;
 use pageserver_api::models::TenantState;
 use pageserver_api::models::{
@@ -1222,222 +1221,6 @@ impl PageServerHandler {
    }
 }

-/// `basebackup tenant timeline [lsn] [--gzip] [--replica]`
-#[derive(Debug, Clone, Eq, PartialEq)]
-struct BaseBackupCmd {
-    tenant_id: TenantId,
-    timeline_id: TimelineId,
-    lsn: Option<Lsn>,
-    gzip: bool,
-    replica: bool,
-}
-
-/// `fullbackup tenant timeline [lsn] [prev_lsn]`
-#[derive(Debug, Clone, Eq, PartialEq)]
-struct FullBackupCmd {
-    tenant_id: TenantId,
-    timeline_id: TimelineId,
-    lsn: Option<Lsn>,
-    prev_lsn: Option<Lsn>,
-}
-
-/// `pagestream_v2 tenant timeline`
-#[derive(Debug, Clone, Eq, PartialEq)]
-struct PageStreamCmd {
-    tenant_id: TenantId,
-    timeline_id: TimelineId,
-}
-
-/// `lease lsn tenant timeline lsn`
-#[derive(Debug, Clone, Eq, PartialEq)]
-struct LeaseLsnCmd {
-    tenant_shard_id: TenantShardId,
-    timeline_id: TimelineId,
-    lsn: Lsn,
-}
-
-#[derive(Debug, Clone, Eq, PartialEq)]
-enum PageServiceCmd {
-    Set,
-    PageStream(PageStreamCmd),
-    BaseBackup(BaseBackupCmd),
-    FullBackup(FullBackupCmd),
-    LeaseLsn(LeaseLsnCmd),
-}
-
-impl PageStreamCmd {
-    fn parse(query: &str) -> anyhow::Result<Self> {
-        let parameters = query.split_whitespace().collect_vec();
-        if parameters.len() != 2 {
-            bail!(
-                "invalid number of parameters for pagestream command: {}",
-                query
-            );
-        }
-        let tenant_id = TenantId::from_str(parameters[0])
-            .with_context(|| format!("Failed to parse tenant id from {}", parameters[0]))?;
-        let timeline_id = TimelineId::from_str(parameters[1])
-            .with_context(|| format!("Failed to parse timeline id from {}", parameters[1]))?;
-        Ok(Self {
-            tenant_id,
-            timeline_id,
-        })
-    }
-}
-
-impl FullBackupCmd {
-    fn parse(query: &str) -> anyhow::Result<Self> {
-        let parameters = query.split_whitespace().collect_vec();
-        if parameters.len() < 2 || parameters.len() > 4 {
-            bail!(
-                "invalid number of parameters for basebackup command: {}",
-                query
-            );
-        }
-        let tenant_id = TenantId::from_str(parameters[0])
-            .with_context(|| format!("Failed to parse tenant id from {}", parameters[0]))?;
-        let timeline_id = TimelineId::from_str(parameters[1])
-            .with_context(|| format!("Failed to parse timeline id from {}", parameters[1]))?;
-        // The caller is responsible for providing correct lsn and prev_lsn.
-        let lsn = if let Some(lsn_str) = parameters.get(2) {
-            Some(
-                Lsn::from_str(lsn_str)
-                    .with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?,
-            )
-        } else {
-            None
-        };
-        let prev_lsn = if let Some(prev_lsn_str) = parameters.get(3) {
-            Some(
-                Lsn::from_str(prev_lsn_str)
-                    .with_context(|| format!("Failed to parse Lsn from {prev_lsn_str}"))?,
-            )
-        } else {
-            None
-        };
-        Ok(Self {
-            tenant_id,
-            timeline_id,
-            lsn,
-            prev_lsn,
-        })
-    }
-}
-
-impl BaseBackupCmd {
-    fn parse(query: &str) -> anyhow::Result<Self> {
-        let parameters = query.split_whitespace().collect_vec();
-        if parameters.len() < 2 {
-            bail!(
-                "invalid number of parameters for basebackup command: {}",
-                query
-            );
-        }
-        let tenant_id = TenantId::from_str(parameters[0])
-            .with_context(|| format!("Failed to parse tenant id from {}", parameters[0]))?;
-        let timeline_id = TimelineId::from_str(parameters[1])
-            .with_context(|| format!("Failed to parse timeline id from {}", parameters[1]))?;
-        let lsn;
-        let flags_parse_from;
-        if let Some(maybe_lsn) = parameters.get(2) {
-            if *maybe_lsn == "latest" {
-                lsn = None;
-                flags_parse_from = 3;
-            } else if maybe_lsn.starts_with("--") {
-                lsn = None;
-                flags_parse_from = 2;
-            } else {
-                lsn = Some(
-                    Lsn::from_str(maybe_lsn)
-                        .with_context(|| format!("Failed to parse lsn from {maybe_lsn}"))?,
-                );
-                flags_parse_from = 3;
-            }
-        } else {
-            lsn = None;
-            flags_parse_from = 2;
-        }
-
-        let mut gzip = false;
-        let mut replica = false;
-
-        for &param in &parameters[flags_parse_from..] {
-            match param {
-                "--gzip" => {
-                    if gzip {
-                        bail!("duplicate parameter for basebackup command: {param}")
-                    }
-                    gzip = true
-                }
-                "--replica" => {
-                    if replica {
-                        bail!("duplicate parameter for basebackup command: {param}")
-                    }
-                    replica = true
-                }
-                _ => bail!("invalid parameter for basebackup command: {param}"),
-            }
-        }
-        Ok(Self {
-            tenant_id,
-            timeline_id,
-            lsn,
-            gzip,
-            replica,
-        })
-    }
-}
-
-impl LeaseLsnCmd {
-    fn parse(query: &str) -> anyhow::Result<Self> {
-        let parameters = query.split_whitespace().collect_vec();
-        if parameters.len() != 3 {
-            bail!(
-                "invalid number of parameters for lease lsn command: {}",
-                query
-            );
-        }
-        let tenant_shard_id = TenantShardId::from_str(parameters[0])
-            .with_context(|| format!("Failed to parse tenant id from {}", parameters[0]))?;
-        let timeline_id = TimelineId::from_str(parameters[1])
-            .with_context(|| format!("Failed to parse timeline id from {}", parameters[1]))?;
-        let lsn = Lsn::from_str(parameters[2])
-            .with_context(|| format!("Failed to parse lsn from {}", parameters[2]))?;
-        Ok(Self {
-            tenant_shard_id,
-            timeline_id,
-            lsn,
-        })
-    }
-}
-
-impl PageServiceCmd {
-    fn parse(query: &str) -> anyhow::Result<Self> {
-        let query = query.trim();
-        let Some((cmd, other)) = query.split_once(' ') else {
-            bail!("cannot parse query: {query}")
-        };
-        match cmd.to_ascii_lowercase().as_str() {
-            "pagestream_v2" => Ok(Self::PageStream(PageStreamCmd::parse(other)?)),
-            "basebackup" => Ok(Self::BaseBackup(BaseBackupCmd::parse(other)?)),
-            "fullbackup" => Ok(Self::FullBackup(FullBackupCmd::parse(other)?)),
-            "lease" => {
-                let Some((cmd2, other)) = other.split_once(' ') else {
-                    bail!("invalid lease command: {cmd}");
-                };
-                let cmd2 = cmd2.to_ascii_lowercase();
-                if cmd2 == "lsn" {
-                    Ok(Self::LeaseLsn(LeaseLsnCmd::parse(other)?))
-                } else {
-                    bail!("invalid lease command: {cmd}");
-                }
-            }
-            "set" => Ok(Self::Set),
-            _ => Err(anyhow::anyhow!("unsupported command {cmd} in {query}")),
-        }
-    }
-}
-
 impl<IO> postgres_backend::Handler<IO> for PageServerHandler
 where
    IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
@@ -1494,137 +1277,206 @@ where
        fail::fail_point!("ps::connection-start::process-query");

        let ctx = self.connection_ctx.attached_child();
-        debug!("process query {query_string}");
-        let query = PageServiceCmd::parse(query_string)?;
-        match query {
-            PageServiceCmd::PageStream(PageStreamCmd {
-                tenant_id,
-                timeline_id,
-            }) => {
-                tracing::Span::current()
-                    .record("tenant_id", field::display(tenant_id))
-                    .record("timeline_id", field::display(timeline_id));
-
-                self.check_permission(Some(tenant_id))?;
-
-                COMPUTE_COMMANDS_COUNTERS
-                    .for_command(ComputeCommandKind::PageStreamV2)
-                    .inc();
-
-                self.handle_pagerequests(
-                    pgb,
-                    tenant_id,
-                    timeline_id,
-                    PagestreamProtocolVersion::V2,
-                    ctx,
-                )
-                .await?;
+        debug!("process query {query_string:?}");
+        let parts = query_string.split_whitespace().collect::<Vec<_>>();
+        if let Some(params) = parts.strip_prefix(&["pagestream_v2"]) {
+            if params.len() != 2 {
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "invalid param number for pagestream command"
+                )));
            }
-            PageServiceCmd::BaseBackup(BaseBackupCmd {
+            let tenant_id = TenantId::from_str(params[0])
+                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
+            let timeline_id = TimelineId::from_str(params[1])
+                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
+
+            tracing::Span::current()
+                .record("tenant_id", field::display(tenant_id))
+                .record("timeline_id", field::display(timeline_id));
+
+            self.check_permission(Some(tenant_id))?;
+
+            COMPUTE_COMMANDS_COUNTERS
+                .for_command(ComputeCommandKind::PageStreamV2)
+                .inc();
+
+            self.handle_pagerequests(
+                pgb,
                tenant_id,
                timeline_id,
-                lsn,
-                gzip,
-                replica,
-            }) => {
-                tracing::Span::current()
-                    .record("tenant_id", field::display(tenant_id))
-                    .record("timeline_id", field::display(timeline_id));
+                PagestreamProtocolVersion::V2,
+                ctx,
+            )
+            .await?;
+        } else if let Some(params) = parts.strip_prefix(&["basebackup"]) {
+            if params.len() < 2 {
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "invalid param number for basebackup command"
+                )));
+            }

-                self.check_permission(Some(tenant_id))?;
+            let tenant_id = TenantId::from_str(params[0])
+                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
+            let timeline_id = TimelineId::from_str(params[1])
+                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;

-                COMPUTE_COMMANDS_COUNTERS
-                    .for_command(ComputeCommandKind::Basebackup)
-                    .inc();
-                let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx);
-                let res = async {
-                    self.handle_basebackup_request(
-                        pgb,
-                        tenant_id,
-                        timeline_id,
-                        lsn,
-                        None,
-                        false,
-                        gzip,
-                        replica,
-                        &ctx,
-                    )
-                    .await?;
-                    pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
-                    Result::<(), QueryError>::Ok(())
+            tracing::Span::current()
+                .record("tenant_id", field::display(tenant_id))
+                .record("timeline_id", field::display(timeline_id));
+
+            self.check_permission(Some(tenant_id))?;
+
+            COMPUTE_COMMANDS_COUNTERS
+                .for_command(ComputeCommandKind::Basebackup)
+                .inc();
+
+            let mut lsn = None;
+            let mut replica = false;
+            let mut gzip = false;
+            for param in &params[2..] {
+                if param.starts_with("--") {
+                    match *param {
+                        "--gzip" => gzip = true,
+                        "--replica" => replica = true,
+                        _ => {
+                            return Err(QueryError::Other(anyhow::anyhow!(
+                                "Unknown parameter {param}",
+                            )))
+                        }
+                    }
+                } else {
+                    lsn = Some(
+                        Lsn::from_str(param)
+                            .with_context(|| format!("Failed to parse Lsn from {param}"))?,
+                    );
                }
-                .await;
-                metric_recording.observe(&res);
-                res?;
            }
-            // same as basebackup, but result includes relational data as well
-            PageServiceCmd::FullBackup(FullBackupCmd {
-                tenant_id,
-                timeline_id,
-                lsn,
-                prev_lsn,
-            }) => {
-                tracing::Span::current()
-                    .record("tenant_id", field::display(tenant_id))
-                    .record("timeline_id", field::display(timeline_id));

-                self.check_permission(Some(tenant_id))?;
-
-                COMPUTE_COMMANDS_COUNTERS
-                    .for_command(ComputeCommandKind::Fullbackup)
-                    .inc();
-
-                // Check that the timeline exists
+            let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx);
+            let res = async {
                self.handle_basebackup_request(
                    pgb,
                    tenant_id,
                    timeline_id,
                    lsn,
-                    prev_lsn,
-                    true,
-                    false,
+                    None,
                    false,
+                    gzip,
+                    replica,
                    &ctx,
                )
                .await?;
                pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
+                Result::<(), QueryError>::Ok(())
            }
-            PageServiceCmd::Set => {
-                // important because psycopg2 executes "SET datestyle TO 'ISO'"
-                // on connect
-                pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
+            .await;
+            metric_recording.observe(&res);
+            res?;
+        }
+        // same as basebackup, but result includes relational data as well
+        else if let Some(params) = parts.strip_prefix(&["fullbackup"]) {
+            if params.len() < 2 {
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "invalid param number for fullbackup command"
+                )));
            }
-            PageServiceCmd::LeaseLsn(LeaseLsnCmd {
-                tenant_shard_id,
+
+            let tenant_id = TenantId::from_str(params[0])
+                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
+            let timeline_id = TimelineId::from_str(params[1])
+                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
+
+            tracing::Span::current()
+                .record("tenant_id", field::display(tenant_id))
+                .record("timeline_id", field::display(timeline_id));
+
+            // The caller is responsible for providing correct lsn and prev_lsn.
+            let lsn = if let Some(lsn_str) = params.get(2) {
+                Some(
+                    Lsn::from_str(lsn_str)
+                        .with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?,
+                )
+            } else {
+                None
+            };
+            let prev_lsn = if let Some(prev_lsn_str) = params.get(3) {
+                Some(
+                    Lsn::from_str(prev_lsn_str)
+                        .with_context(|| format!("Failed to parse Lsn from {prev_lsn_str}"))?,
+                )
+            } else {
+                None
+            };
+
+            self.check_permission(Some(tenant_id))?;
+
+            COMPUTE_COMMANDS_COUNTERS
+                .for_command(ComputeCommandKind::Fullbackup)
+                .inc();
+
+            // Check that the timeline exists
+            self.handle_basebackup_request(
+                pgb,
+                tenant_id,
                timeline_id,
                lsn,
-            }) => {
-                tracing::Span::current()
-                    .record("tenant_id", field::display(tenant_shard_id))
-                    .record("timeline_id", field::display(timeline_id));
-
-                self.check_permission(Some(tenant_shard_id.tenant_id))?;
-
-                COMPUTE_COMMANDS_COUNTERS
-                    .for_command(ComputeCommandKind::LeaseLsn)
-                    .inc();
-
-                match self
-                    .handle_make_lsn_lease(pgb, tenant_shard_id, timeline_id, lsn, &ctx)
-                    .await
-                {
-                    Ok(()) => {
-                        pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?
-                    }
-                    Err(e) => {
-                        error!("error obtaining lsn lease for {lsn}: {e:?}");
-                        pgb.write_message_noflush(&BeMessage::ErrorResponse(
-                            &e.to_string(),
-                            Some(e.pg_error_code()),
-                        ))?
-                    }
-                };
+                prev_lsn,
+                true,
+                false,
+                false,
+                &ctx,
+            )
+            .await?;
+            pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
+        } else if query_string.to_ascii_lowercase().starts_with("set ") {
+            // important because psycopg2 executes "SET datestyle TO 'ISO'"
+            // on connect
+            pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
+        } else if query_string.starts_with("lease lsn ") {
+            let params = &parts[2..];
+            if params.len() != 3 {
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "invalid param number {} for lease lsn command",
+                    params.len()
+                )));
            }
+
+            let tenant_shard_id = TenantShardId::from_str(params[0])
+                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
+            let timeline_id = TimelineId::from_str(params[1])
+                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
+
+            tracing::Span::current()
+                .record("tenant_id", field::display(tenant_shard_id))
+                .record("timeline_id", field::display(timeline_id));
+
+            self.check_permission(Some(tenant_shard_id.tenant_id))?;
+
+            COMPUTE_COMMANDS_COUNTERS
+                .for_command(ComputeCommandKind::LeaseLsn)
+                .inc();
+
+            // The caller is responsible for providing correct lsn.
+            let lsn = Lsn::from_str(params[2])
+                .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
+
+            match self
+                .handle_make_lsn_lease(pgb, tenant_shard_id, timeline_id, lsn, &ctx)
+                .await
+            {
+                Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
+                Err(e) => {
+                    error!("error obtaining lsn lease for {lsn}: {e:?}");
+                    pgb.write_message_noflush(&BeMessage::ErrorResponse(
+                        &e.to_string(),
+                        Some(e.pg_error_code()),
+                    ))?
+                }
+            };
+        } else {
+            return Err(QueryError::Other(anyhow::anyhow!(
+                "unknown command {query_string}"
+            )));
        }

        Ok(())
@@ -1673,181 +1525,3 @@ fn set_tracing_field_shard_id(timeline: &Timeline) {
    );
    debug_assert_current_span_has_tenant_and_timeline_id();
 }
-
-#[cfg(test)]
-mod tests {
-    use utils::shard::ShardCount;
-
-    use super::*;
-
-    #[test]
-    fn pageservice_cmd_parse() {
-        let tenant_id = TenantId::generate();
-        let timeline_id = TimelineId::generate();
-        let cmd =
-            PageServiceCmd::parse(&format!("pagestream_v2 {tenant_id} {timeline_id}")).unwrap();
-        assert_eq!(
-            cmd,
-            PageServiceCmd::PageStream(PageStreamCmd {
-                tenant_id,
-                timeline_id
-            })
-        );
-        let cmd = PageServiceCmd::parse(&format!("basebackup {tenant_id} {timeline_id}")).unwrap();
-        assert_eq!(
-            cmd,
-            PageServiceCmd::BaseBackup(BaseBackupCmd {
-                tenant_id,
-                timeline_id,
-                lsn: None,
-                gzip: false,
-                replica: false
-            })
-        );
-        let cmd =
-            PageServiceCmd::parse(&format!("basebackup {tenant_id} {timeline_id} --gzip")).unwrap();
-        assert_eq!(
-            cmd,
-            PageServiceCmd::BaseBackup(BaseBackupCmd {
-                tenant_id,
-                timeline_id,
-                lsn: None,
-                gzip: true,
-                replica: false
-            })
-        );
-        let cmd =
-            PageServiceCmd::parse(&format!("basebackup {tenant_id} {timeline_id} latest")).unwrap();
-        assert_eq!(
-            cmd,
-            PageServiceCmd::BaseBackup(BaseBackupCmd {
-                tenant_id,
-                timeline_id,
-                lsn: None,
-                gzip: false,
-                replica: false
-            })
-        );
-        let cmd = PageServiceCmd::parse(&format!("basebackup {tenant_id} {timeline_id} 0/16ABCDE"))
-            .unwrap();
-        assert_eq!(
-            cmd,
-            PageServiceCmd::BaseBackup(BaseBackupCmd {
-                tenant_id,
-                timeline_id,
-                lsn: Some(Lsn::from_str("0/16ABCDE").unwrap()),
-                gzip: false,
-                replica: false
-            })
-        );
-        let cmd = PageServiceCmd::parse(&format!(
-            "basebackup {tenant_id} {timeline_id} --replica --gzip"
-        ))
-        .unwrap();
-        assert_eq!(
-            cmd,
-            PageServiceCmd::BaseBackup(BaseBackupCmd {
-                tenant_id,
-                timeline_id,
-                lsn: None,
-                gzip: true,
-                replica: true
-            })
-        );
-        let cmd = PageServiceCmd::parse(&format!(
-            "basebackup {tenant_id} {timeline_id} 0/16ABCDE --replica --gzip"
-        ))
-        .unwrap();
-        assert_eq!(
-            cmd,
-            PageServiceCmd::BaseBackup(BaseBackupCmd {
-                tenant_id,
-                timeline_id,
-                lsn: Some(Lsn::from_str("0/16ABCDE").unwrap()),
-                gzip: true,
-                replica: true
-            })
-        );
-        let cmd = PageServiceCmd::parse(&format!("fullbackup {tenant_id} {timeline_id}")).unwrap();
-        assert_eq!(
-            cmd,
-            PageServiceCmd::FullBackup(FullBackupCmd {
-                tenant_id,
-                timeline_id,
-                lsn: None,
-                prev_lsn: None
-            })
-        );
-        let cmd = PageServiceCmd::parse(&format!(
-            "fullbackup {tenant_id} {timeline_id} 0/16ABCDE 0/16ABCDF"
-        ))
-        .unwrap();
-        assert_eq!(
-            cmd,
-            PageServiceCmd::FullBackup(FullBackupCmd {
-                tenant_id,
-                timeline_id,
-                lsn: Some(Lsn::from_str("0/16ABCDE").unwrap()),
-                prev_lsn: Some(Lsn::from_str("0/16ABCDF").unwrap()),
-            })
-        );
-        let tenant_shard_id = TenantShardId::unsharded(tenant_id);
-        let cmd = PageServiceCmd::parse(&format!(
-            "lease lsn {tenant_shard_id} {timeline_id} 0/16ABCDE"
-        ))
-        .unwrap();
-        assert_eq!(
-            cmd,
-            PageServiceCmd::LeaseLsn(LeaseLsnCmd {
-                tenant_shard_id,
-                timeline_id,
-                lsn: Lsn::from_str("0/16ABCDE").unwrap(),
-            })
-        );
-        let tenant_shard_id = TenantShardId::split(&tenant_shard_id, ShardCount(8))[1];
-        let cmd = PageServiceCmd::parse(&format!(
-            "lease lsn {tenant_shard_id} {timeline_id} 0/16ABCDE"
-        ))
-        .unwrap();
-        assert_eq!(
-            cmd,
-            PageServiceCmd::LeaseLsn(LeaseLsnCmd {
-                tenant_shard_id,
-                timeline_id,
-                lsn: Lsn::from_str("0/16ABCDE").unwrap(),
-            })
-        );
-        let cmd = PageServiceCmd::parse("set a = b").unwrap();
-        assert_eq!(cmd, PageServiceCmd::Set);
-        let cmd = PageServiceCmd::parse("SET foo").unwrap();
-        assert_eq!(cmd, PageServiceCmd::Set);
-    }
-
-    #[test]
-    fn pageservice_cmd_err_handling() {
-        let tenant_id = TenantId::generate();
-        let timeline_id = TimelineId::generate();
-        let cmd = PageServiceCmd::parse("unknown_command");
-        assert!(cmd.is_err());
-        let cmd = PageServiceCmd::parse("pagestream_v2");
-        assert!(cmd.is_err());
-        let cmd = PageServiceCmd::parse(&format!("pagestream_v2 {tenant_id}xxx"));
-        assert!(cmd.is_err());
-        let cmd = PageServiceCmd::parse(&format!("pagestream_v2 {tenant_id}xxx {timeline_id}xxx"));
-        assert!(cmd.is_err());
-        let cmd = PageServiceCmd::parse(&format!(
-            "basebackup {tenant_id} {timeline_id} --gzip --gzip"
-        ));
-        assert!(cmd.is_err());
-        let cmd = PageServiceCmd::parse(&format!(
-            "basebackup {tenant_id} {timeline_id} --gzip --unknown"
-        ));
-        assert!(cmd.is_err());
-        let cmd = PageServiceCmd::parse(&format!(
-            "basebackup {tenant_id} {timeline_id} --gzip 0/16ABCDE"
-        ));
-        assert!(cmd.is_err());
-        let cmd = PageServiceCmd::parse(&format!("lease {tenant_id} {timeline_id} gzip 0/16ABCDE"));
-        assert!(cmd.is_err());
-    }
-}
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -45,7 +45,7 @@ use wal_decoder::serialized_batch::SerializedValueBatch;
 pub const MAX_AUX_FILE_DELTAS: usize = 1024;

 /// Max number of aux-file-related delta layers. The compaction will create a new image layer once this threshold is reached.
-pub const MAX_AUX_FILE_V2_DELTAS: usize = 16;
+pub const MAX_AUX_FILE_V2_DELTAS: usize = 64;

 #[derive(Debug)]
 pub enum LsnForTimestamp {
@@ -1164,12 +1164,19 @@ impl<'a> DatadirModification<'a> {
            .get_rel_exists(rel, Version::Modified(self), ctx)
            .await?
        {
+            tracing::debug!("Creating relation {rel:?} at lsn {}", self.get_lsn());
+
            // create it with 0 size initially, the logic below will extend it
            self.put_rel_creation(rel, 0, ctx)
                .await
                .context("Relation Error")?;
            Ok(0)
        } else {
+            tracing::debug!(
+                "Skipping relation {rel:?} creation at lsn {}",
+                self.get_lsn()
+            );
+
            self.tline
                .get_rel_size(rel, Version::Modified(self), ctx)
                .await
@@ -1210,6 +1217,8 @@ impl<'a> DatadirModification<'a> {
        shard: &ShardIdentity,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
+        tracing::debug!("Ingesting batch with metadata: {:?}", batch.metadata);
+
        let mut gaps_at_lsns = Vec::default();

        for meta in batch.metadata.iter() {
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -2493,8 +2493,7 @@ impl Tenant {
            timelines_to_compact_or_offload = timelines
                .iter()
                .filter_map(|(timeline_id, timeline)| {
-                    let (is_active, (can_offload, _)) =
-                        (timeline.is_active(), timeline.can_offload());
+                    let (is_active, can_offload) = (timeline.is_active(), timeline.can_offload());
                    let has_no_unoffloaded_children = {
                        !timelines
                            .iter()
@@ -4780,12 +4779,10 @@ async fn run_initdb(

    let _permit = INIT_DB_SEMAPHORE.acquire().await;

-    let mut initdb_command = tokio::process::Command::new(&initdb_bin_path);
-    initdb_command
+    let initdb_command = tokio::process::Command::new(&initdb_bin_path)
        .args(["--pgdata", initdb_target_dir.as_ref()])
        .args(["--username", &conf.superuser])
        .args(["--encoding", "utf8"])
-        .args(["--locale", &conf.locale])
        .arg("--no-instructions")
        .arg("--no-sync")
        .env_clear()
@@ -4795,27 +4792,15 @@ async fn run_initdb(
        // stdout invocation produces the same output every time, we don't need it
        .stdout(std::process::Stdio::null())
        // we would be interested in the stderr output, if there was any
-        .stderr(std::process::Stdio::piped());
-
-    // Before version 14, only the libc provide was available.
-    if pg_version > 14 {
-        // Version 17 brought with it a builtin locale provider which only provides
-        // C and C.UTF-8. While being safer for collation purposes since it is
-        // guaranteed to be consistent throughout a major release, it is also more
-        // performant.
-        let locale_provider = if pg_version >= 17 { "builtin" } else { "libc" };
-
-        initdb_command.args(["--locale-provider", locale_provider]);
-    }
-
-    let initdb_proc = initdb_command.spawn()?;
+        .stderr(std::process::Stdio::piped())
+        .spawn()?;

    // Ideally we'd select here with the cancellation token, but the problem is that
    // we can't safely terminate initdb: it launches processes of its own, and killing
    // initdb doesn't kill them. After we return from this function, we want the target
    // directory to be able to be cleaned up.
    // See https://github.com/neondatabase/neon/issues/6385
-    let initdb_output = initdb_proc.wait_with_output().await?;
+    let initdb_output = initdb_command.wait_with_output().await?;
    if !initdb_output.status.success() {
        return Err(InitdbError::Failed(
            initdb_output.status,
@@ -7757,13 +7742,13 @@ mod tests {
            (
                get_key(3),
                Lsn(0x20),
-                Value::WalRecord(NeonWalRecord::wal_clear("c")),
+                Value::WalRecord(NeonWalRecord::wal_clear()),
            ),
            (get_key(4), Lsn(0x10), Value::Image("0x10".into())),
            (
                get_key(4),
                Lsn(0x20),
-                Value::WalRecord(NeonWalRecord::wal_init("i")),
+                Value::WalRecord(NeonWalRecord::wal_init()),
            ),
        ];
        let image1 = vec![(get_key(1), "0x10".into())];
@@ -7912,30 +7897,8 @@ mod tests {

    #[cfg(feature = "testing")]
    #[tokio::test]
-    async fn test_simple_bottom_most_compaction_deltas_1() -> anyhow::Result<()> {
-        test_simple_bottom_most_compaction_deltas_helper(
-            "test_simple_bottom_most_compaction_deltas_1",
-            false,
-        )
-        .await
-    }
-
-    #[cfg(feature = "testing")]
-    #[tokio::test]
-    async fn test_simple_bottom_most_compaction_deltas_2() -> anyhow::Result<()> {
-        test_simple_bottom_most_compaction_deltas_helper(
-            "test_simple_bottom_most_compaction_deltas_2",
-            true,
-        )
-        .await
-    }
-
-    #[cfg(feature = "testing")]
-    async fn test_simple_bottom_most_compaction_deltas_helper(
-        test_name: &'static str,
-        use_delta_bottom_layer: bool,
-    ) -> anyhow::Result<()> {
-        let harness = TenantHarness::create(test_name).await?;
+    async fn test_simple_bottom_most_compaction_deltas() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_simple_bottom_most_compaction_deltas").await?;
        let (tenant, ctx) = harness.load().await;

        fn get_key(id: u32) -> Key {
@@ -7966,16 +7929,6 @@ mod tests {
        let img_layer = (0..10)
            .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
            .collect_vec();
-        // or, delta layer at 0x10 if `use_delta_bottom_layer` is true
-        let delta4 = (0..10)
-            .map(|id| {
-                (
-                    get_key(id),
-                    Lsn(0x08),
-                    Value::WalRecord(NeonWalRecord::wal_init(format!("value {id}@0x10"))),
-                )
-            })
-            .collect_vec();

        let delta1 = vec![
            (
@@ -8029,61 +7982,21 @@ mod tests {
            ),
        ];

-        let tline = if use_delta_bottom_layer {
-            tenant
-                .create_test_timeline_with_layers(
-                    TIMELINE_ID,
-                    Lsn(0x08),
-                    DEFAULT_PG_VERSION,
-                    &ctx,
-                    vec![
-                        DeltaLayerTestDesc::new_with_inferred_key_range(
-                            Lsn(0x08)..Lsn(0x10),
-                            delta4,
-                        ),
-                        DeltaLayerTestDesc::new_with_inferred_key_range(
-                            Lsn(0x20)..Lsn(0x48),
-                            delta1,
-                        ),
-                        DeltaLayerTestDesc::new_with_inferred_key_range(
-                            Lsn(0x20)..Lsn(0x48),
-                            delta2,
-                        ),
-                        DeltaLayerTestDesc::new_with_inferred_key_range(
-                            Lsn(0x48)..Lsn(0x50),
-                            delta3,
-                        ),
-                    ], // delta layers
-                    vec![], // image layers
-                    Lsn(0x50),
-                )
-                .await?
-        } else {
-            tenant
-                .create_test_timeline_with_layers(
-                    TIMELINE_ID,
-                    Lsn(0x10),
-                    DEFAULT_PG_VERSION,
-                    &ctx,
-                    vec![
-                        DeltaLayerTestDesc::new_with_inferred_key_range(
-                            Lsn(0x10)..Lsn(0x48),
-                            delta1,
-                        ),
-                        DeltaLayerTestDesc::new_with_inferred_key_range(
-                            Lsn(0x10)..Lsn(0x48),
-                            delta2,
-                        ),
-                        DeltaLayerTestDesc::new_with_inferred_key_range(
-                            Lsn(0x48)..Lsn(0x50),
-                            delta3,
-                        ),
-                    ], // delta layers
-                    vec![(Lsn(0x10), img_layer)], // image layers
-                    Lsn(0x50),
-                )
-                .await?
-        };
+        let tline = tenant
+            .create_test_timeline_with_layers(
+                TIMELINE_ID,
+                Lsn(0x10),
+                DEFAULT_PG_VERSION,
+                &ctx,
+                vec![
+                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta1),
+                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta2),
+                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3),
+                ], // delta layers
+                vec![(Lsn(0x10), img_layer)], // image layers
+                Lsn(0x50),
+            )
+            .await?;
        {
            // Update GC info
            let mut guard = tline.gc_info.write().unwrap();
@@ -8193,7 +8106,7 @@ mod tests {
            (
                key,
                Lsn(0x10),
-                Value::WalRecord(NeonWalRecord::wal_init("0x10")),
+                Value::Image(Bytes::copy_from_slice(b"0x10")),
            ),
            (
                key,
@@ -8255,7 +8168,7 @@ mod tests {
                    Lsn(0x20),
                    KeyLogAtLsn(vec![(
                        Lsn(0x20),
-                        Value::Image(Bytes::from_static(b"0x10;0x20")),
+                        Value::Image(Bytes::copy_from_slice(b"0x10;0x20")),
                    )]),
                ),
                (
@@ -9237,7 +9150,7 @@ mod tests {

            let will_init = will_init_keys.contains(&i);
            if will_init {
-                delta_layer_spec.push((key, lsn, Value::WalRecord(NeonWalRecord::wal_init(""))));
+                delta_layer_spec.push((key, lsn, Value::WalRecord(NeonWalRecord::wal_init())));

                expected_key_values.insert(key, "".to_string());
            } else {
@@ -9295,23 +9208,6 @@ mod tests {
        Ok(())
    }

-    fn sort_layer_key(k1: &PersistentLayerKey, k2: &PersistentLayerKey) -> std::cmp::Ordering {
-        (
-            k1.is_delta,
-            k1.key_range.start,
-            k1.key_range.end,
-            k1.lsn_range.start,
-            k1.lsn_range.end,
-        )
-            .cmp(&(
-                k2.is_delta,
-                k2.key_range.start,
-                k2.key_range.end,
-                k2.lsn_range.start,
-                k2.lsn_range.end,
-            ))
-    }
-
    async fn inspect_and_sort(
        tline: &Arc<Timeline>,
        filter: Option<std::ops::Range<Key>>,
@@ -9320,30 +9216,25 @@ mod tests {
        if let Some(filter) = filter {
            all_layers.retain(|layer| overlaps_with(&layer.key_range, &filter));
        }
-        all_layers.sort_by(sort_layer_key);
+        all_layers.sort_by(|k1, k2| {
+            (
+                k1.is_delta,
+                k1.key_range.start,
+                k1.key_range.end,
+                k1.lsn_range.start,
+                k1.lsn_range.end,
+            )
+                .cmp(&(
+                    k2.is_delta,
+                    k2.key_range.start,
+                    k2.key_range.end,
+                    k2.lsn_range.start,
+                    k2.lsn_range.end,
+                ))
+        });
        all_layers
    }

-    #[cfg(feature = "testing")]
-    fn check_layer_map_key_eq(
-        mut left: Vec<PersistentLayerKey>,
-        mut right: Vec<PersistentLayerKey>,
-    ) {
-        left.sort_by(sort_layer_key);
-        right.sort_by(sort_layer_key);
-        if left != right {
-            eprintln!("---LEFT---");
-            for left in left.iter() {
-                eprintln!("{}", left);
-            }
-            eprintln!("---RIGHT---");
-            for right in right.iter() {
-                eprintln!("{}", right);
-            }
-            assert_eq!(left, right);
-        }
-    }
-
    #[cfg(feature = "testing")]
    #[tokio::test]
    async fn test_simple_partial_bottom_most_compaction() -> anyhow::Result<()> {
@@ -9436,206 +9327,127 @@ mod tests {

        let cancel = CancellationToken::new();

-        // Do a partial compaction on key range 0..2
+        // Do a partial compaction on key range 0..4, we should generate an image layer; no other layers
+        // can be removed because they might be used for other key ranges.
        tline
-            .partial_compact_with_gc(get_key(0)..get_key(2), &cancel, EnumSet::new(), &ctx)
+            .partial_compact_with_gc(Some(get_key(0)..get_key(4)), &cancel, EnumSet::new(), &ctx)
            .await
            .unwrap();
        let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
-        check_layer_map_key_eq(
+        assert_eq!(
            all_layers,
            vec![
-                // newly-generated image layer for the partial compaction range 0-2
                PersistentLayerKey {
-                    key_range: get_key(0)..get_key(2),
+                    key_range: get_key(0)..get_key(4),
                    lsn_range: Lsn(0x20)..Lsn(0x21),
-                    is_delta: false,
+                    is_delta: false
                },
                PersistentLayerKey {
                    key_range: get_key(0)..get_key(10),
                    lsn_range: Lsn(0x10)..Lsn(0x11),
-                    is_delta: false,
+                    is_delta: false
                },
-                // delta1 is split and the second part is rewritten
                PersistentLayerKey {
-                    key_range: get_key(2)..get_key(4),
+                    key_range: get_key(1)..get_key(4),
                    lsn_range: Lsn(0x20)..Lsn(0x48),
-                    is_delta: true,
+                    is_delta: true
                },
                PersistentLayerKey {
                    key_range: get_key(5)..get_key(7),
                    lsn_range: Lsn(0x20)..Lsn(0x48),
-                    is_delta: true,
+                    is_delta: true
                },
                PersistentLayerKey {
                    key_range: get_key(8)..get_key(10),
                    lsn_range: Lsn(0x48)..Lsn(0x50),
-                    is_delta: true,
-                },
-            ],
+                    is_delta: true
+                }
+            ]
        );

-        // Do a partial compaction on key range 2..4
+        // Do a partial compaction on key range 4..10
        tline
-            .partial_compact_with_gc(get_key(2)..get_key(4), &cancel, EnumSet::new(), &ctx)
+            .partial_compact_with_gc(Some(get_key(4)..get_key(10)), &cancel, EnumSet::new(), &ctx)
            .await
            .unwrap();
        let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
-        check_layer_map_key_eq(
+        assert_eq!(
            all_layers,
            vec![
                PersistentLayerKey {
-                    key_range: get_key(0)..get_key(2),
+                    key_range: get_key(0)..get_key(4),
                    lsn_range: Lsn(0x20)..Lsn(0x21),
-                    is_delta: false,
+                    is_delta: false
                },
                PersistentLayerKey {
+                    // if (in the future) GC kicks in, this layer will be removed
                    key_range: get_key(0)..get_key(10),
                    lsn_range: Lsn(0x10)..Lsn(0x11),
-                    is_delta: false,
+                    is_delta: false
                },
-                // image layer generated for the compaction range 2-4
                PersistentLayerKey {
-                    key_range: get_key(2)..get_key(4),
+                    key_range: get_key(4)..get_key(10),
                    lsn_range: Lsn(0x20)..Lsn(0x21),
-                    is_delta: false,
+                    is_delta: false
                },
-                // we have key2/key3 above the retain_lsn, so we still need this delta layer
                PersistentLayerKey {
-                    key_range: get_key(2)..get_key(4),
+                    key_range: get_key(1)..get_key(4),
                    lsn_range: Lsn(0x20)..Lsn(0x48),
-                    is_delta: true,
+                    is_delta: true
                },
                PersistentLayerKey {
                    key_range: get_key(5)..get_key(7),
                    lsn_range: Lsn(0x20)..Lsn(0x48),
-                    is_delta: true,
+                    is_delta: true
                },
                PersistentLayerKey {
                    key_range: get_key(8)..get_key(10),
                    lsn_range: Lsn(0x48)..Lsn(0x50),
-                    is_delta: true,
-                },
-            ],
-        );
-
-        // Do a partial compaction on key range 4..9
-        tline
-            .partial_compact_with_gc(get_key(4)..get_key(9), &cancel, EnumSet::new(), &ctx)
-            .await
-            .unwrap();
-        let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
-        check_layer_map_key_eq(
-            all_layers,
-            vec![
-                PersistentLayerKey {
-                    key_range: get_key(0)..get_key(2),
-                    lsn_range: Lsn(0x20)..Lsn(0x21),
-                    is_delta: false,
-                },
-                PersistentLayerKey {
-                    key_range: get_key(0)..get_key(10),
-                    lsn_range: Lsn(0x10)..Lsn(0x11),
-                    is_delta: false,
-                },
-                PersistentLayerKey {
-                    key_range: get_key(2)..get_key(4),
-                    lsn_range: Lsn(0x20)..Lsn(0x21),
-                    is_delta: false,
-                },
-                PersistentLayerKey {
-                    key_range: get_key(2)..get_key(4),
-                    lsn_range: Lsn(0x20)..Lsn(0x48),
-                    is_delta: true,
-                },
-                // image layer generated for this compaction range
-                PersistentLayerKey {
-                    key_range: get_key(4)..get_key(9),
-                    lsn_range: Lsn(0x20)..Lsn(0x21),
-                    is_delta: false,
-                },
-                PersistentLayerKey {
-                    key_range: get_key(8)..get_key(10),
-                    lsn_range: Lsn(0x48)..Lsn(0x50),
-                    is_delta: true,
-                },
-            ],
-        );
-
-        // Do a partial compaction on key range 9..10
-        tline
-            .partial_compact_with_gc(get_key(9)..get_key(10), &cancel, EnumSet::new(), &ctx)
-            .await
-            .unwrap();
-        let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
-        check_layer_map_key_eq(
-            all_layers,
-            vec![
-                PersistentLayerKey {
-                    key_range: get_key(0)..get_key(2),
-                    lsn_range: Lsn(0x20)..Lsn(0x21),
-                    is_delta: false,
-                },
-                PersistentLayerKey {
-                    key_range: get_key(0)..get_key(10),
-                    lsn_range: Lsn(0x10)..Lsn(0x11),
-                    is_delta: false,
-                },
-                PersistentLayerKey {
-                    key_range: get_key(2)..get_key(4),
-                    lsn_range: Lsn(0x20)..Lsn(0x21),
-                    is_delta: false,
-                },
-                PersistentLayerKey {
-                    key_range: get_key(2)..get_key(4),
-                    lsn_range: Lsn(0x20)..Lsn(0x48),
-                    is_delta: true,
-                },
-                PersistentLayerKey {
-                    key_range: get_key(4)..get_key(9),
-                    lsn_range: Lsn(0x20)..Lsn(0x21),
-                    is_delta: false,
-                },
-                // image layer generated for the compaction range
-                PersistentLayerKey {
-                    key_range: get_key(9)..get_key(10),
-                    lsn_range: Lsn(0x20)..Lsn(0x21),
-                    is_delta: false,
-                },
-                PersistentLayerKey {
-                    key_range: get_key(8)..get_key(10),
-                    lsn_range: Lsn(0x48)..Lsn(0x50),
-                    is_delta: true,
-                },
-            ],
+                    is_delta: true
+                }
+            ]
        );

        // Do a partial compaction on key range 0..10, all image layers below LSN 20 can be replaced with new ones.
        tline
-            .partial_compact_with_gc(get_key(0)..get_key(10), &cancel, EnumSet::new(), &ctx)
+            .partial_compact_with_gc(Some(get_key(0)..get_key(10)), &cancel, EnumSet::new(), &ctx)
            .await
            .unwrap();
        let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
-        check_layer_map_key_eq(
+        assert_eq!(
            all_layers,
            vec![
-                // aha, we removed all unnecessary image/delta layers and got a very clean layer map!
+                PersistentLayerKey {
+                    key_range: get_key(0)..get_key(4),
+                    lsn_range: Lsn(0x20)..Lsn(0x21),
+                    is_delta: false
+                },
                PersistentLayerKey {
                    key_range: get_key(0)..get_key(10),
                    lsn_range: Lsn(0x20)..Lsn(0x21),
-                    is_delta: false,
+                    is_delta: false
                },
                PersistentLayerKey {
-                    key_range: get_key(2)..get_key(4),
+                    key_range: get_key(4)..get_key(10),
+                    lsn_range: Lsn(0x20)..Lsn(0x21),
+                    is_delta: false
+                },
+                PersistentLayerKey {
+                    key_range: get_key(1)..get_key(4),
                    lsn_range: Lsn(0x20)..Lsn(0x48),
-                    is_delta: true,
+                    is_delta: true
+                },
+                PersistentLayerKey {
+                    key_range: get_key(5)..get_key(7),
+                    lsn_range: Lsn(0x20)..Lsn(0x48),
+                    is_delta: true
                },
                PersistentLayerKey {
                    key_range: get_key(8)..get_key(10),
                    lsn_range: Lsn(0x48)..Lsn(0x50),
-                    is_delta: true,
-                },
-            ],
+                    is_delta: true
+                }
+            ]
        );

        Ok(())
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -1959,7 +1959,7 @@ impl TenantManager {
            attempt.before_reset_tenant();

            let (_guard, progress) = utils::completion::channel();
-            match tenant.shutdown(progress, ShutdownMode::Flush).await {
+            match tenant.shutdown(progress, ShutdownMode::Hard).await {
                Ok(()) => {
                    slot_guard.drop_old_value().expect("it was just shutdown");
                }
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -1445,7 +1445,7 @@ impl RemoteTimelineClient {
        let remote_path = remote_layer_path(
            &self.tenant_shard_id.tenant_id,
            &self.timeline_id,
-            uploaded.metadata().shard,
+            self.tenant_shard_id.to_index(),
            &uploaded.layer_desc().layer_name(),
            uploaded.metadata().generation,
        );
@@ -1486,7 +1486,7 @@ impl RemoteTimelineClient {
            &adopted
                .get_timeline_id()
                .expect("Source timeline should be alive"),
-            adopted.metadata().shard,
+            self.tenant_shard_id.to_index(),
            &adopted.layer_desc().layer_name(),
            adopted.metadata().generation,
        );
@@ -1494,7 +1494,7 @@ impl RemoteTimelineClient {
        let target_remote_path = remote_layer_path(
            &self.tenant_shard_id.tenant_id,
            &self.timeline_id,
-            adopted_as.metadata().shard,
+            self.tenant_shard_id.to_index(),
            &adopted_as.layer_desc().layer_name(),
            adopted_as.metadata().generation,
        );
@@ -2201,18 +2201,6 @@ impl RemoteTimelineClient {
        inner.initialized_mut()?;
        Ok(UploadQueueAccessor { inner })
    }
-
-    pub(crate) fn no_pending_work(&self) -> bool {
-        let inner = self.upload_queue.lock().unwrap();
-        match &*inner {
-            UploadQueue::Uninitialized
-            | UploadQueue::Stopped(UploadQueueStopped::Uninitialized) => true,
-            UploadQueue::Stopped(UploadQueueStopped::Deletable(x)) => {
-                x.upload_queue_for_deletion.no_pending_work()
-            }
-            UploadQueue::Initialized(x) => x.no_pending_work(),
-        }
-    }
 }

 pub(crate) struct UploadQueueAccessor<'a> {
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -12,7 +12,7 @@ pub mod merge_iterator;

 use crate::context::{AccessStatsBehavior, RequestContext};
 use bytes::Bytes;
-use pageserver_api::key::{Key, NON_INHERITED_SPARSE_RANGE};
+use pageserver_api::key::Key;
 use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
 use pageserver_api::record::NeonWalRecord;
 use pageserver_api::value::Value;
@@ -196,9 +196,6 @@ impl ValuesReconstructState {
    /// Returns true if this was the last value needed for the key and false otherwise.
    ///
    /// If the key is done after the update, mark it as such.
-    ///
-    /// If the key is in the sparse keyspace (i.e., aux files), we do not track them in
-    /// `key_done`.
    pub(crate) fn update_key(
        &mut self,
        key: &Key,
@@ -209,18 +206,10 @@ impl ValuesReconstructState {
            .keys
            .entry(*key)
            .or_insert(Ok(VectoredValueReconstructState::default()));
-        let is_sparse_key = NON_INHERITED_SPARSE_RANGE.contains(key);
+
        if let Ok(state) = state {
            let key_done = match state.situation {
-                ValueReconstructSituation::Complete => {
-                    if is_sparse_key {
-                        // Sparse keyspace might be visited multiple times because
-                        // we don't track unmapped keyspaces.
-                        return ValueReconstructSituation::Complete;
-                    } else {
-                        unreachable!()
-                    }
-                }
+                ValueReconstructSituation::Complete => unreachable!(),
                ValueReconstructSituation::Continue => match value {
                    Value::Image(img) => {
                        state.img = Some((lsn, img));
@@ -245,9 +234,7 @@ impl ValuesReconstructState {

            if key_done && state.situation == ValueReconstructSituation::Continue {
                state.situation = ValueReconstructSituation::Complete;
-                if !is_sparse_key {
-                    self.keys_done.add_key(*key);
-                }
+                self.keys_done.add_key(*key);
            }

            state.situation
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -653,10 +653,6 @@ impl DeltaLayerWriter {
        })
    }

-    pub fn is_empty(&self) -> bool {
-        self.inner.as_ref().unwrap().num_keys == 0
-    }
-
    ///
    /// Append a key-value pair to the file.
    ///
--- a/pageserver/src/tenant/storage_layer/filter_iterator.rs
+++ b/pageserver/src/tenant/storage_layer/filter_iterator.rs
@@ -1,4 +1,4 @@
-use std::{ops::Range, sync::Arc};
+use std::ops::Range;

 use anyhow::bail;
 use pageserver_api::{
@@ -9,10 +9,7 @@ use utils::lsn::Lsn;

 use pageserver_api::value::Value;

-use super::{
-    merge_iterator::{MergeIterator, MergeIteratorItem},
-    PersistentLayerKey,
-};
+use super::merge_iterator::MergeIterator;

 /// A filter iterator over merge iterators (and can be easily extended to other types of iterators).
 ///
@@ -51,10 +48,10 @@ impl<'a> FilterIterator<'a> {
        })
    }

-    async fn next_inner<R: MergeIteratorItem>(&mut self) -> anyhow::Result<Option<R>> {
-        while let Some(item) = self.inner.next_inner::<R>().await? {
+    pub async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
+        while let Some(item) = self.inner.next().await? {
            while self.current_filter_idx < self.retain_key_filters.len()
-                && item.key_lsn_value().0 >= self.retain_key_filters[self.current_filter_idx].end
+                && item.0 >= self.retain_key_filters[self.current_filter_idx].end
            {
                // [filter region]    [filter region]     [filter region]
                //                                     ^ item
@@ -71,7 +68,7 @@ impl<'a> FilterIterator<'a> {
                //                                                 ^ current filter (nothing)
                return Ok(None);
            }
-            if self.retain_key_filters[self.current_filter_idx].contains(&item.key_lsn_value().0) {
+            if self.retain_key_filters[self.current_filter_idx].contains(&item.0) {
                // [filter region]    [filter region]     [filter region]
                //                                              ^ item
                //                                        ^ current filter
@@ -84,16 +81,6 @@ impl<'a> FilterIterator<'a> {
        }
        Ok(None)
    }
-
-    pub async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
-        self.next_inner().await
-    }
-
-    pub async fn next_with_trace(
-        &mut self,
-    ) -> anyhow::Result<Option<((Key, Lsn, Value), Arc<PersistentLayerKey>)>> {
-        self.next_inner().await
-    }
 }

 #[cfg(test)]
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -67,8 +67,6 @@ pub struct InMemoryLayer {
    /// The above fields never change, except for `end_lsn`, which is only set once.
    /// All other changing parts are in `inner`, and protected by a mutex.
    inner: RwLock<InMemoryLayerInner>,
-
-    estimated_in_mem_size: AtomicU64,
 }

 impl std::fmt::Debug for InMemoryLayer {
@@ -545,10 +543,6 @@ impl InMemoryLayer {
        Ok(inner.file.len())
    }

-    pub fn estimated_in_mem_size(&self) -> u64 {
-        self.estimated_in_mem_size.load(AtomicOrdering::Relaxed)
-    }
-
    /// Create a new, empty, in-memory layer
    pub async fn create(
        conf: &'static PageServerConf,
@@ -578,7 +572,6 @@ impl InMemoryLayer {
                file,
                resource_units: GlobalResourceUnits::new(),
            }),
-            estimated_in_mem_size: AtomicU64::new(0),
        })
    }

@@ -649,12 +642,6 @@ impl InMemoryLayer {
                // because this case is unexpected, and we would like tests to fail if this happens.
                warn!("Key {} at {} written twice at same LSN", key, lsn);
            }
-            self.estimated_in_mem_size.fetch_add(
-                (std::mem::size_of::<CompactKey>()
-                    + std::mem::size_of::<Lsn>()
-                    + std::mem::size_of::<IndexEntry>()) as u64,
-                AtomicOrdering::Relaxed,
-            );
        }

        inner.resource_units.maybe_publish_size(new_size);
--- a/pageserver/src/tenant/storage_layer/merge_iterator.rs
+++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs
@@ -1,7 +1,6 @@
 use std::{
    cmp::Ordering,
    collections::{binary_heap, BinaryHeap},
-    sync::Arc,
 };

 use anyhow::bail;
@@ -14,11 +13,10 @@ use pageserver_api::value::Value;
 use super::{
    delta_layer::{DeltaLayerInner, DeltaLayerIterator},
    image_layer::{ImageLayerInner, ImageLayerIterator},
-    PersistentLayerDesc, PersistentLayerKey,
 };

 #[derive(Clone, Copy)]
-pub(crate) enum LayerRef<'a> {
+enum LayerRef<'a> {
    Image(&'a ImageLayerInner),
    Delta(&'a DeltaLayerInner),
 }
@@ -64,20 +62,18 @@ impl LayerIterRef<'_> {
 /// 1. Unified iterator for image and delta layers.
 /// 2. `Ord` for use in [`MergeIterator::heap`] (for the k-merge).
 /// 3. Lazy creation of the real delta/image iterator.
-pub(crate) enum IteratorWrapper<'a> {
+enum IteratorWrapper<'a> {
    NotLoaded {
        ctx: &'a RequestContext,
        first_key_lower_bound: (Key, Lsn),
        layer: LayerRef<'a>,
-        source_desc: Arc<PersistentLayerKey>,
    },
    Loaded {
        iter: PeekableLayerIterRef<'a>,
-        source_desc: Arc<PersistentLayerKey>,
    },
 }

-pub(crate) struct PeekableLayerIterRef<'a> {
+struct PeekableLayerIterRef<'a> {
    iter: LayerIterRef<'a>,
    peeked: Option<(Key, Lsn, Value)>, // None == end
 }
@@ -155,12 +151,6 @@ impl<'a> IteratorWrapper<'a> {
            layer: LayerRef::Image(image_layer),
            first_key_lower_bound: (image_layer.key_range().start, image_layer.lsn()),
            ctx,
-            source_desc: PersistentLayerKey {
-                key_range: image_layer.key_range().clone(),
-                lsn_range: PersistentLayerDesc::image_layer_lsn_range(image_layer.lsn()),
-                is_delta: false,
-            }
-            .into(),
        }
    }

@@ -172,18 +162,12 @@ impl<'a> IteratorWrapper<'a> {
            layer: LayerRef::Delta(delta_layer),
            first_key_lower_bound: (delta_layer.key_range().start, delta_layer.lsn_range().start),
            ctx,
-            source_desc: PersistentLayerKey {
-                key_range: delta_layer.key_range().clone(),
-                lsn_range: delta_layer.lsn_range().clone(),
-                is_delta: true,
-            }
-            .into(),
        }
    }

    fn peek_next_key_lsn_value(&self) -> Option<(&Key, Lsn, Option<&Value>)> {
        match self {
-            Self::Loaded { iter, .. } => iter
+            Self::Loaded { iter } => iter
                .peek()
                .as_ref()
                .map(|(key, lsn, val)| (key, *lsn, Some(val))),
@@ -207,7 +191,6 @@ impl<'a> IteratorWrapper<'a> {
            ctx,
            first_key_lower_bound,
            layer,
-            source_desc,
        } = self
        else {
            unreachable!()
@@ -223,10 +206,7 @@ impl<'a> IteratorWrapper<'a> {
                );
            }
        }
-        *self = Self::Loaded {
-            iter,
-            source_desc: source_desc.clone(),
-        };
+        *self = Self::Loaded { iter };
        Ok(())
    }

@@ -240,19 +220,11 @@ impl<'a> IteratorWrapper<'a> {
    /// The public interfaces to use are [`crate::tenant::storage_layer::delta_layer::DeltaLayerIterator`] and
    /// [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`].
    async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
-        let Self::Loaded { iter, .. } = self else {
+        let Self::Loaded { iter } = self else {
            panic!("must load the iterator before using")
        };
        iter.next().await
    }
-
-    /// Get the persistent layer key corresponding to this iterator
-    fn trace_source(&self) -> Arc<PersistentLayerKey> {
-        match self {
-            Self::Loaded { source_desc, .. } => source_desc.clone(),
-            Self::NotLoaded { source_desc, .. } => source_desc.clone(),
-        }
-    }
 }

 /// A merge iterator over delta/image layer iterators.
@@ -270,32 +242,6 @@ pub struct MergeIterator<'a> {
    heap: BinaryHeap<IteratorWrapper<'a>>,
 }

-pub(crate) trait MergeIteratorItem {
-    fn new(item: (Key, Lsn, Value), iterator: &IteratorWrapper<'_>) -> Self;
-
-    fn key_lsn_value(&self) -> &(Key, Lsn, Value);
-}
-
-impl MergeIteratorItem for (Key, Lsn, Value) {
-    fn new(item: (Key, Lsn, Value), _: &IteratorWrapper<'_>) -> Self {
-        item
-    }
-
-    fn key_lsn_value(&self) -> &(Key, Lsn, Value) {
-        self
-    }
-}
-
-impl MergeIteratorItem for ((Key, Lsn, Value), Arc<PersistentLayerKey>) {
-    fn new(item: (Key, Lsn, Value), iter: &IteratorWrapper<'_>) -> Self {
-        (item, iter.trace_source().clone())
-    }
-
-    fn key_lsn_value(&self) -> &(Key, Lsn, Value) {
-        &self.0
-    }
-}
-
 impl<'a> MergeIterator<'a> {
    pub fn create(
        deltas: &[&'a DeltaLayerInner],
@@ -314,7 +260,7 @@ impl<'a> MergeIterator<'a> {
        }
    }

-    pub(crate) async fn next_inner<R: MergeIteratorItem>(&mut self) -> anyhow::Result<Option<R>> {
+    pub async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
        while let Some(mut iter) = self.heap.peek_mut() {
            if !iter.is_loaded() {
                // Once we load the iterator, we can know the real first key-value pair in the iterator.
@@ -329,22 +275,10 @@ impl<'a> MergeIterator<'a> {
                binary_heap::PeekMut::pop(iter);
                continue;
            };
-            return Ok(Some(R::new(item, &iter)));
+            return Ok(Some(item));
        }
        Ok(None)
    }
-
-    /// Get the next key-value pair from the iterator.
-    pub async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
-        self.next_inner().await
-    }
-
-    /// Get the next key-value pair from the iterator, and trace where the key comes from.
-    pub async fn next_with_trace(
-        &mut self,
-    ) -> anyhow::Result<Option<((Key, Lsn, Value), Arc<PersistentLayerKey>)>> {
-        self.next_inner().await
-    }
 }

 #[cfg(test)]
@@ -562,7 +496,7 @@ mod tests {
            (
                get_key(0),
                Lsn(0x10),
-                Value::WalRecord(NeonWalRecord::wal_init("")),
+                Value::WalRecord(NeonWalRecord::wal_init()),
            ),
            (
                get_key(0),
@@ -572,7 +506,7 @@ mod tests {
            (
                get_key(5),
                Lsn(0x10),
-                Value::WalRecord(NeonWalRecord::wal_init("")),
+                Value::WalRecord(NeonWalRecord::wal_init()),
            ),
            (
                get_key(5),
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -23,7 +23,6 @@ use handle::ShardTimelineId;
 use offload::OffloadError;
 use once_cell::sync::Lazy;
 use pageserver_api::{
-    config::tenant_conf_defaults::DEFAULT_COMPACTION_THRESHOLD,
    key::{
        KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE,
        NON_INHERITED_SPARSE_RANGE,
@@ -853,10 +852,6 @@ pub(crate) enum ShutdownMode {
    /// While we are flushing, we continue to accept read I/O for LSNs ingested before
    /// the call to [`Timeline::shutdown`].
    FreezeAndFlush,
-    /// Only flush the layers to the remote storage without freezing any open layers. This is the
-    /// mode used by ancestor detach and any other operations that reloads a tenant but not increasing
-    /// the generation number.
-    Flush,
    /// Shut down immediately, without waiting for any open layers to flush.
    Hard,
 }
@@ -1570,16 +1565,12 @@ impl Timeline {
    ///
    /// This is neccessary but not sufficient for offloading of the timeline as it might have
    /// child timelines that are not offloaded yet.
-    pub(crate) fn can_offload(&self) -> (bool, &'static str) {
+    pub(crate) fn can_offload(&self) -> bool {
        if self.remote_client.is_archived() != Some(true) {
-            return (false, "the timeline is not archived");
-        }
-        if !self.remote_client.no_pending_work() {
-            // if the remote client is still processing some work, we can't offload
-            return (false, "the upload queue is not drained yet");
+            return false;
        }

-        (true, "ok")
+        true
    }

    /// Outermost timeline compaction operation; downloads needed layers. Returns whether we have pending
@@ -1687,6 +1678,11 @@ impl Timeline {
    pub(crate) async fn shutdown(&self, mode: ShutdownMode) {
        debug_assert_current_span_has_tenant_and_timeline_id();

+        let try_freeze_and_flush = match mode {
+            ShutdownMode::FreezeAndFlush => true,
+            ShutdownMode::Hard => false,
+        };
+
        // Regardless of whether we're going to try_freeze_and_flush
        // or not, stop ingesting any more data. Walreceiver only provides
        // cancellation but no "wait until gone", because it uses the Timeline::gate.
@@ -1708,7 +1704,7 @@ impl Timeline {
        // ... and inform any waiters for newer LSNs that there won't be any.
        self.last_record_lsn.shutdown();

-        if let ShutdownMode::FreezeAndFlush = mode {
+        if try_freeze_and_flush {
            if let Some((open, frozen)) = self
                .layers
                .read()
@@ -1750,20 +1746,6 @@ impl Timeline {
                    warn!("failed to freeze and flush: {e:#}");
                }
            }
-
-            // `self.remote_client.shutdown().await` above should have already flushed everything from the queue, but
-            // we also do a final check here to ensure that the queue is empty.
-            if !self.remote_client.no_pending_work() {
-                warn!("still have pending work in remote upload queue, but continuing shutting down anyways");
-            }
-        }
-
-        if let ShutdownMode::Flush = mode {
-            // drain the upload queue
-            self.remote_client.shutdown().await;
-            if !self.remote_client.no_pending_work() {
-                warn!("still have pending work in remote upload queue, but continuing shutting down anyways");
-            }
        }

        // Signal any subscribers to our cancellation token to drop out
@@ -3506,37 +3488,18 @@ impl Timeline {

                let timer = self.metrics.flush_time_histo.start_timer();

-                let num_frozen_layers;
-                let frozen_layer_total_size;
                let layer_to_flush = {
                    let guard = self.layers.read().await;
                    let Ok(lm) = guard.layer_map() else {
                        info!("dropping out of flush loop for timeline shutdown");
                        return;
                    };
-                    num_frozen_layers = lm.frozen_layers.len();
-                    frozen_layer_total_size = lm
-                        .frozen_layers
-                        .iter()
-                        .map(|l| l.estimated_in_mem_size())
-                        .sum::<u64>();
                    lm.frozen_layers.front().cloned()
                    // drop 'layers' lock to allow concurrent reads and writes
                };
                let Some(layer_to_flush) = layer_to_flush else {
                    break Ok(());
                };
-                if num_frozen_layers
-                    > std::cmp::max(
-                        self.get_compaction_threshold(),
-                        DEFAULT_COMPACTION_THRESHOLD,
-                    )
-                    && frozen_layer_total_size >= /* 128 MB */ 128000000
-                {
-                    tracing::warn!(
-                        "too many frozen layers: {num_frozen_layers} layers with estimated in-mem size of {frozen_layer_total_size} bytes",
-                    );
-                }
                match self.flush_frozen_layer(layer_to_flush, ctx).await {
                    Ok(this_layer_to_lsn) => {
                        flushed_to_lsn = std::cmp::max(flushed_to_lsn, this_layer_to_lsn);
@@ -4127,7 +4090,6 @@ impl Timeline {
    ) -> Result<ImageLayerCreationOutcome, CreateImageLayersError> {
        // Metadata keys image layer creation.
        let mut reconstruct_state = ValuesReconstructState::default();
-        let begin = Instant::now();
        let data = self
            .get_vectored_impl(partition.clone(), lsn, &mut reconstruct_state, ctx)
            .await?;
@@ -4144,11 +4106,14 @@ impl Timeline {
            (new_data, total_kb_retrieved / 1024, total_keys_retrieved)
        };
        let delta_files_accessed = reconstruct_state.get_delta_layers_visited();
-        let elapsed = begin.elapsed();

        let trigger_generation = delta_files_accessed as usize >= MAX_AUX_FILE_V2_DELTAS;
-        info!(
-            "metadata key compaction: trigger_generation={trigger_generation}, delta_files_accessed={delta_files_accessed}, total_kb_retrieved={total_kb_retrieved}, total_keys_retrieved={total_keys_retrieved}, read_time={}s", elapsed.as_secs_f64()
+        debug!(
+            trigger_generation,
+            delta_files_accessed,
+            total_kb_retrieved,
+            total_keys_retrieved,
+            "generate metadata images"
        );

        if !trigger_generation && mode == ImageLayerCreationMode::Try {
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -4,7 +4,7 @@
 //!
 //! The old legacy algorithm is implemented directly in `timeline.rs`.

-use std::collections::{BinaryHeap, HashMap, HashSet};
+use std::collections::{BinaryHeap, HashSet};
 use std::ops::{Deref, Range};
 use std::sync::Arc;

@@ -56,7 +56,7 @@ use pageserver_api::value::Value;

 use utils::lsn::Lsn;

-use pageserver_compaction::helpers::{fully_contains, overlaps_with};
+use pageserver_compaction::helpers::overlaps_with;
 use pageserver_compaction::interface::*;

 use super::CompactionError;
@@ -64,23 +64,6 @@ use super::CompactionError;
 /// Maximum number of deltas before generating an image layer in bottom-most compaction.
 const COMPACTION_DELTA_THRESHOLD: usize = 5;

-pub struct GcCompactionJobDescription {
-    /// All layers to read in the compaction job
-    selected_layers: Vec<Layer>,
-    /// GC cutoff of the job
-    gc_cutoff: Lsn,
-    /// LSNs to retain for the job
-    retain_lsns_below_horizon: Vec<Lsn>,
-    /// Maximum layer LSN processed in this compaction
-    max_layer_lsn: Lsn,
-    /// Only compact layers overlapping with this range
-    compaction_key_range: Range<Key>,
-    /// When partial compaction is enabled, these layers need to be rewritten to ensure no overlap.
-    /// This field is here solely for debugging. The field will not be read once the compaction
-    /// description is generated.
-    rewrite_layers: Vec<Arc<PersistentLayerDesc>>,
-}
-
 /// The result of bottom-most compaction for a single key at each LSN.
 #[derive(Debug)]
 #[cfg_attr(test, derive(PartialEq))]
@@ -1739,8 +1722,7 @@ impl Timeline {
        flags: EnumSet<CompactFlags>,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
-        self.partial_compact_with_gc(Key::MIN..Key::MAX, cancel, flags, ctx)
-            .await
+        self.partial_compact_with_gc(None, cancel, flags, ctx).await
    }

    /// An experimental compaction building block that combines compaction with garbage collection.
@@ -1750,15 +1732,12 @@ impl Timeline {
    /// layers and image layers, which generates image layers on the gc horizon, drop deltas below gc horizon,
    /// and create delta layers with all deltas >= gc horizon.
    ///
-    /// If `key_range` is provided, it will only compact the keys within the range, aka partial compaction.
-    /// Partial compaction will read and process all layers overlapping with the key range, even if it might
-    /// contain extra keys. After the gc-compaction phase completes, delta layers that are not fully contained
-    /// within the key range will be rewritten to ensure they do not overlap with the delta layers. Providing
-    /// Key::MIN..Key..MAX to the function indicates a full compaction, though technically, `Key::MAX` is not
-    /// part of the range.
+    /// If `compaction_key_range` is provided, only the keys within that range are compacted (aka partial compaction).
+    /// This functionality is not complete yet: when a range is set, only image layers will be generated.
+    ///
    pub(crate) async fn partial_compact_with_gc(
        self: &Arc<Self>,
-        compaction_key_range: Range<Key>,
+        compaction_key_range: Option<Range<Key>>,
        cancel: &CancellationToken,
        flags: EnumSet<CompactFlags>,
        ctx: &RequestContext,
@@ -1783,8 +1762,9 @@ impl Timeline {
        .await?;

        let dry_run = flags.contains(CompactFlags::DryRun);
+        let partial_compaction = compaction_key_range.is_some();

-        if compaction_key_range == (Key::MIN..Key::MAX) {
+        if let Some(ref compaction_key_range) = compaction_key_range {
            info!("running enhanced gc bottom-most compaction, dry_run={dry_run}, compaction_key_range={}..{}", compaction_key_range.start, compaction_key_range.end);
        } else {
            info!("running enhanced gc bottom-most compaction, dry_run={dry_run}");
@@ -1800,7 +1780,7 @@ impl Timeline {
        // The layer selection has the following properties:
        // 1. If a layer is in the selection, all layers below it are in the selection.
        // 2. Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection.
-        let job_desc = {
+        let (layer_selection, gc_cutoff, retain_lsns_below_horizon) = if !partial_compaction {
            let guard = self.layers.read().await;
            let layers = guard.layer_map()?;
            let gc_info = self.gc_info.read().unwrap();
@@ -1830,21 +1810,9 @@ impl Timeline {
            };
            // Then, pick all the layers that are below the max_layer_lsn. This is to ensure we can pick all single-key
            // layers to compact.
-            let mut rewrite_layers = Vec::new();
            for desc in layers.iter_historic_layers() {
-                if desc.get_lsn_range().end <= max_layer_lsn
-                    && overlaps_with(&desc.get_key_range(), &compaction_key_range)
-                {
-                    // If the layer overlaps with the compaction key range, we need to read it to obtain all keys within the range,
-                    // even if it might contain extra keys
+                if desc.get_lsn_range().end <= max_layer_lsn {
                    selected_layers.push(guard.get_from_desc(&desc));
-                    // If the layer is not fully contained within the key range, we need to rewrite it if it's a delta layer (it's fine
-                    // to overlap image layers)
-                    if desc.is_delta()
-                        && !fully_contains(&compaction_key_range, &desc.get_key_range())
-                    {
-                        rewrite_layers.push(desc);
-                    }
                }
            }
            if selected_layers.is_empty() {
@@ -1852,59 +1820,82 @@ impl Timeline {
                return Ok(());
            }
            retain_lsns_below_horizon.sort();
-            GcCompactionJobDescription {
-                selected_layers,
-                gc_cutoff,
-                retain_lsns_below_horizon,
-                max_layer_lsn,
-                compaction_key_range,
-                rewrite_layers,
+            (selected_layers, gc_cutoff, retain_lsns_below_horizon)
+        } else {
+            // In case of partial compaction, we currently only support generating image layers, and therefore,
+            // we pick only the layers that end at or below `min_lsn` (the minimum of the GC cutoffs, retain_lsns and leases) and overlap with the compaction key range.
+            let guard = self.layers.read().await;
+            let layers = guard.layer_map()?;
+            let gc_info = self.gc_info.read().unwrap();
+            let mut min_lsn = gc_info.cutoffs.select_min();
+            for (lsn, _, _) in &gc_info.retain_lsns {
+                if lsn < &min_lsn {
+                    min_lsn = *lsn;
+                }
            }
+            for lsn in gc_info.leases.keys() {
+                if lsn < &min_lsn {
+                    min_lsn = *lsn;
+                }
+            }
+            let mut selected_layers = Vec::new();
+            drop(gc_info);
+            // |-------| |-------| |-------|
+            // | Delta | | Delta | | Delta | -- min_lsn could be intersecting with the layers
+            // |-------| |-------| |-------| <- we want to pick all the layers below min_lsn, so that
+            // | Delta | | Delta | | Delta |    ...we can remove them after compaction
+            // |-------| |-------| |-------|
+            // Pick all the layers whose LSN range ends at or below min_lsn and whose key range overlaps the compaction key range.
+            let Some(compaction_key_range) = compaction_key_range.as_ref() else {
+                unreachable!()
+            };
+            for desc in layers.iter_historic_layers() {
+                if desc.get_lsn_range().end <= min_lsn
+                    && overlaps_with(&desc.key_range, compaction_key_range)
+                {
+                    selected_layers.push(guard.get_from_desc(&desc));
+                }
+            }
+            if selected_layers.is_empty() {
+                info!("no layers to compact with gc");
+                return Ok(());
+            }
+            (selected_layers, min_lsn, Vec::new())
        };
        let lowest_retain_lsn = if self.ancestor_timeline.is_some() {
+            if partial_compaction {
+                warn!("partial compaction cannot run on child branches (for now)");
+                return Ok(());
+            }
            Lsn(self.ancestor_lsn.0 + 1)
        } else {
-            let res = job_desc
-                .retain_lsns_below_horizon
+            let res = retain_lsns_below_horizon
                .first()
                .copied()
-                .unwrap_or(job_desc.gc_cutoff);
+                .unwrap_or(gc_cutoff);
            if cfg!(debug_assertions) {
                assert_eq!(
                    res,
-                    job_desc
-                        .retain_lsns_below_horizon
+                    retain_lsns_below_horizon
                        .iter()
                        .min()
                        .copied()
-                        .unwrap_or(job_desc.gc_cutoff)
+                        .unwrap_or(gc_cutoff)
                );
            }
            res
        };
        info!(
-            "picked {} layers for compaction ({} layers need rewriting) with max_layer_lsn={} gc_cutoff={} lowest_retain_lsn={}, key_range={}..{}",
-            job_desc.selected_layers.len(),
-            job_desc.rewrite_layers.len(),
-            job_desc.max_layer_lsn,
-            job_desc.gc_cutoff,
-            lowest_retain_lsn,
-            job_desc.compaction_key_range.start,
-            job_desc.compaction_key_range.end
+            "picked {} layers for compaction with gc_cutoff={} lowest_retain_lsn={}",
+            layer_selection.len(),
+            gc_cutoff,
+            lowest_retain_lsn
        );

-        for layer in &job_desc.selected_layers {
-            debug!("read layer: {}", layer.layer_desc().key());
-        }
-        for layer in &job_desc.rewrite_layers {
-            debug!("rewrite layer: {}", layer.key());
-        }
-
-        self.check_compaction_space(&job_desc.selected_layers)
-            .await?;
+        self.check_compaction_space(&layer_selection).await?;

        // Generate statistics for the compaction
-        for layer in &job_desc.selected_layers {
+        for layer in &layer_selection {
            let desc = layer.layer_desc();
            if desc.is_delta() {
                stat.visit_delta_layer(desc.file_size());
@@ -1915,25 +1906,25 @@ impl Timeline {

        // Step 1: construct a k-merge iterator over all layers.
        // Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point.
-        let layer_names = job_desc
-            .selected_layers
+        let layer_names: Vec<crate::tenant::storage_layer::LayerName> = layer_selection
            .iter()
            .map(|layer| layer.layer_desc().layer_name())
            .collect_vec();
        if let Some(err) = check_valid_layermap(&layer_names) {
-            warn!("gc-compaction layer map check failed because {}, this is normal if partial compaction is not finished yet", err);
+            bail!("cannot run gc-compaction because {}", err);
        }
        // The maximum LSN we are processing in this compaction loop
-        let end_lsn = job_desc
-            .selected_layers
+        let end_lsn = layer_selection
            .iter()
            .map(|l| l.layer_desc().lsn_range.end)
            .max()
            .unwrap();
+        // We don't want any of the produced layers to cover the full key range (i.e., MIN..MAX) b/c it will then be recognized
+        // as an L0 layer.
        let mut delta_layers = Vec::new();
        let mut image_layers = Vec::new();
        let mut downloaded_layers = Vec::new();
-        for layer in &job_desc.selected_layers {
+        for layer in &layer_selection {
            let resident_layer = layer.download_and_keep_resident().await?;
            downloaded_layers.push(resident_layer);
        }
@@ -1952,8 +1943,8 @@ impl Timeline {
            dense_ks,
            sparse_ks,
        )?;
-
-        // Step 2: Produce images+deltas.
+        // Step 2: Produce images+deltas. TODO: ensure newly-produced delta does not overlap with other deltas.
+        // Data of the same key.
        let mut accumulated_values = Vec::new();
        let mut last_key: Option<Key> = None;

@@ -1965,7 +1956,10 @@ impl Timeline {
                    self.conf,
                    self.timeline_id,
                    self.tenant_shard_id,
-                    job_desc.compaction_key_range.start,
+                    compaction_key_range
+                        .as_ref()
+                        .map(|x| x.start)
+                        .unwrap_or(Key::MIN),
                    lowest_retain_lsn,
                    self.get_compaction_target_size(),
                    ctx,
@@ -1985,13 +1979,6 @@ impl Timeline {
        )
        .await?;

-        #[derive(Default)]
-        struct RewritingLayers {
-            before: Option<DeltaLayerWriter>,
-            after: Option<DeltaLayerWriter>,
-        }
-        let mut delta_layer_rewriters = HashMap::<Arc<PersistentLayerKey>, RewritingLayers>::new();
-
        /// Returns None if there is no ancestor branch. Throw an error when the key is not found.
        ///
        /// Currently, we always get the ancestor image for each key in the child branch no matter whether the image
@@ -2017,51 +2004,10 @@ impl Timeline {
        // the key and LSN range are determined. However, to keep things simple here, we still
        // create this writer, and discard the writer in the end.

-        while let Some(((key, lsn, val), desc)) = merge_iter.next_with_trace().await? {
+        while let Some((key, lsn, val)) = merge_iter.next().await? {
            if cancel.is_cancelled() {
                return Err(anyhow!("cancelled")); // TODO: refactor to CompactionError and pass cancel error
            }
-            if !job_desc.compaction_key_range.contains(&key) {
-                if !desc.is_delta {
-                    continue;
-                }
-                let rewriter = delta_layer_rewriters.entry(desc.clone()).or_default();
-                let rewriter = if key < job_desc.compaction_key_range.start {
-                    if rewriter.before.is_none() {
-                        rewriter.before = Some(
-                            DeltaLayerWriter::new(
-                                self.conf,
-                                self.timeline_id,
-                                self.tenant_shard_id,
-                                desc.key_range.start,
-                                desc.lsn_range.clone(),
-                                ctx,
-                            )
-                            .await?,
-                        );
-                    }
-                    rewriter.before.as_mut().unwrap()
-                } else if key >= job_desc.compaction_key_range.end {
-                    if rewriter.after.is_none() {
-                        rewriter.after = Some(
-                            DeltaLayerWriter::new(
-                                self.conf,
-                                self.timeline_id,
-                                self.tenant_shard_id,
-                                job_desc.compaction_key_range.end,
-                                desc.lsn_range.clone(),
-                                ctx,
-                            )
-                            .await?,
-                        );
-                    }
-                    rewriter.after.as_mut().unwrap()
-                } else {
-                    unreachable!()
-                };
-                rewriter.put_value(key, lsn, val, ctx).await?;
-                continue;
-            }
            match val {
                Value::Image(_) => stat.visit_image_key(&val),
                Value::WalRecord(_) => stat.visit_wal_key(&val),
@@ -2072,27 +2018,35 @@ impl Timeline {
                }
                accumulated_values.push((key, lsn, val));
            } else {
-                let last_key: &mut Key = last_key.as_mut().unwrap();
-                stat.on_unique_key_visited(); // TODO: adjust statistics for partial compaction
-                let retention = self
-                    .generate_key_retention(
-                        *last_key,
-                        &accumulated_values,
-                        job_desc.gc_cutoff,
-                        &job_desc.retain_lsns_below_horizon,
-                        COMPACTION_DELTA_THRESHOLD,
-                        get_ancestor_image(self, *last_key, ctx).await?,
-                    )
-                    .await?;
-                retention
-                    .pipe_to(
-                        *last_key,
-                        &mut delta_layer_writer,
-                        image_layer_writer.as_mut(),
-                        &mut stat,
-                        ctx,
-                    )
-                    .await?;
+                let last_key = last_key.as_mut().unwrap();
+                stat.on_unique_key_visited();
+                let skip_adding_key = if let Some(ref compaction_key_range) = compaction_key_range {
+                    !compaction_key_range.contains(last_key)
+                } else {
+                    false
+                };
+                if !skip_adding_key {
+                    let retention = self
+                        .generate_key_retention(
+                            *last_key,
+                            &accumulated_values,
+                            gc_cutoff,
+                            &retain_lsns_below_horizon,
+                            COMPACTION_DELTA_THRESHOLD,
+                            get_ancestor_image(self, *last_key, ctx).await?,
+                        )
+                        .await?;
+                    // Put the image into the image layer. Currently we have a single big layer for the compaction.
+                    retention
+                        .pipe_to(
+                            *last_key,
+                            &mut delta_layer_writer,
+                            image_layer_writer.as_mut(),
+                            &mut stat,
+                            ctx,
+                        )
+                        .await?;
+                }
                accumulated_values.clear();
                *last_key = key;
                accumulated_values.push((key, lsn, val));
@@ -2103,42 +2057,34 @@ impl Timeline {
        let last_key = last_key.expect("no keys produced during compaction");
        stat.on_unique_key_visited();

-        let retention = self
-            .generate_key_retention(
-                last_key,
-                &accumulated_values,
-                job_desc.gc_cutoff,
-                &job_desc.retain_lsns_below_horizon,
-                COMPACTION_DELTA_THRESHOLD,
-                get_ancestor_image(self, last_key, ctx).await?,
-            )
-            .await?;
-        retention
-            .pipe_to(
-                last_key,
-                &mut delta_layer_writer,
-                image_layer_writer.as_mut(),
-                &mut stat,
-                ctx,
-            )
-            .await?;
-        // end: move the above part to the loop body
-
-        let mut rewrote_delta_layers = Vec::new();
-        for (key, writers) in delta_layer_rewriters {
-            if let Some(delta_writer_before) = writers.before {
-                let (desc, path) = delta_writer_before
-                    .finish(job_desc.compaction_key_range.start, ctx)
-                    .await?;
-                let layer = Layer::finish_creating(self.conf, self, desc, &path)?;
-                rewrote_delta_layers.push(layer);
-            }
-            if let Some(delta_writer_after) = writers.after {
-                let (desc, path) = delta_writer_after.finish(key.key_range.end, ctx).await?;
-                let layer = Layer::finish_creating(self.conf, self, desc, &path)?;
-                rewrote_delta_layers.push(layer);
-            }
+        let skip_adding_key = if let Some(ref compaction_key_range) = compaction_key_range {
+            !compaction_key_range.contains(&last_key)
+        } else {
+            false
+        };
+        if !skip_adding_key {
+            let retention = self
+                .generate_key_retention(
+                    last_key,
+                    &accumulated_values,
+                    gc_cutoff,
+                    &retain_lsns_below_horizon,
+                    COMPACTION_DELTA_THRESHOLD,
+                    get_ancestor_image(self, last_key, ctx).await?,
+                )
+                .await?;
+            // Put the image into the image layer. Currently we have a single big layer for the compaction.
+            retention
+                .pipe_to(
+                    last_key,
+                    &mut delta_layer_writer,
+                    image_layer_writer.as_mut(),
+                    &mut stat,
+                    ctx,
+                )
+                .await?;
        }
+        // end: move the above part to the loop body

        let discard = |key: &PersistentLayerKey| {
            let key = key.clone();
@@ -2147,7 +2093,10 @@ impl Timeline {

        let produced_image_layers = if let Some(writer) = image_layer_writer {
            if !dry_run {
-                let end_key = job_desc.compaction_key_range.end;
+                let end_key = compaction_key_range
+                    .as_ref()
+                    .map(|x| x.end)
+                    .unwrap_or(Key::MAX);
                writer
                    .finish_with_discard_fn(self, ctx, end_key, discard)
                    .await?
@@ -2168,8 +2117,10 @@ impl Timeline {
            Vec::new()
        };

-        // TODO: make image/delta/rewrote_delta layers generation atomic. At this point, we already generated resident layers, and if
-        // compaction is cancelled at this point, we might have some layers that are not cleaned up.
+        if partial_compaction && !produced_delta_layers.is_empty() {
+            bail!("implementation error: partial compaction should not be producing delta layers (for now)");
+        }
+
        let mut compact_to = Vec::new();
        let mut keep_layers = HashSet::new();
        let produced_delta_layers_len = produced_delta_layers.len();
@@ -2177,83 +2128,51 @@ impl Timeline {
        for action in produced_delta_layers {
            match action {
                BatchWriterResult::Produced(layer) => {
-                    if cfg!(debug_assertions) {
-                        info!("produced delta layer: {}", layer.layer_desc().key());
-                    }
                    stat.produce_delta_layer(layer.layer_desc().file_size());
                    compact_to.push(layer);
                }
                BatchWriterResult::Discarded(l) => {
-                    if cfg!(debug_assertions) {
-                        info!("discarded delta layer: {}", l);
-                    }
                    keep_layers.insert(l);
                    stat.discard_delta_layer();
                }
            }
        }
-        for layer in &rewrote_delta_layers {
-            debug!(
-                "produced rewritten delta layer: {}",
-                layer.layer_desc().key()
-            );
-        }
-        compact_to.extend(rewrote_delta_layers);
        for action in produced_image_layers {
            match action {
                BatchWriterResult::Produced(layer) => {
-                    debug!("produced image layer: {}", layer.layer_desc().key());
                    stat.produce_image_layer(layer.layer_desc().file_size());
                    compact_to.push(layer);
                }
                BatchWriterResult::Discarded(l) => {
-                    debug!("discarded image layer: {}", l);
                    keep_layers.insert(l);
                    stat.discard_image_layer();
                }
            }
        }
-
-        let mut layer_selection = job_desc.selected_layers;
-
-        // Partial compaction might select more data than it processes, e.g., if
-        // the compaction_key_range only partially overlaps:
-        //
-        //         [---compaction_key_range---]
-        //   [---A----][----B----][----C----][----D----]
-        //
-        // For delta layers, we will rewrite the layers so that it is cut exactly at
-        // the compaction key range, so we can always discard them. However, for image
-        // layers, as we do not rewrite them for now, we need to handle them differently.
-        // Assume image layers  A, B, C, D are all in the `layer_selection`.
-        //
-        // The created image layers contain whatever is needed from B, C, and from
-        // `----]` of A, and from  `[---` of D.
-        //
-        // In contrast, `[---A` and `D----]` have not been processed, so, we must
-        // keep that data.
-        //
-        // The solution for now is to keep A and D completely if they are image layers.
-        // (layer_selection is what we'll remove from the layer map, so, retain what
-        // is _not_ fully covered by compaction_key_range).
-        for layer in &layer_selection {
-            if !layer.layer_desc().is_delta() {
-                if !overlaps_with(
-                    &layer.layer_desc().key_range,
-                    &job_desc.compaction_key_range,
-                ) {
-                    bail!("violated constraint: image layer outside of compaction key range");
-                }
-                if !fully_contains(
-                    &job_desc.compaction_key_range,
-                    &layer.layer_desc().key_range,
-                ) {
-                    keep_layers.insert(layer.layer_desc().key());
-                }
-            }
-        }
-
+        let mut layer_selection = layer_selection;
        layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key()));
+        if let Some(ref compaction_key_range) = compaction_key_range {
+            // Partial compaction might select more data than it processes, e.g., if
+            // the compaction_key_range only partially overlaps:
+            //
+            //         [---compaction_key_range---]
+            //   [---A----][----B----][----C----][----D----]
+            //
+            // A,B,C,D are all in the `layer_selection`. The created image layers contain
+            // whatever is needed from B, C, and from `----]` of A, and from  `[--` of D.
+            //
+            // In contrast, `[--A-` and `--D----]` have not been processed, so, we must
+            // keep that data.
+            //
+            // The solution for now is to keep A and D completely.
+            // (layer_selection is what we'll remove from the layer map, so,
+            //  retain what is _not_ fully covered by compaction_key_range).
+            layer_selection.retain(|x| {
+                let key_range = &x.layer_desc().key_range;
+                key_range.start >= compaction_key_range.start
+                    && key_range.end <= compaction_key_range.end
+            });
+        }

        info!(
            "gc-compaction statistics: {}",
@@ -2273,7 +2192,6 @@ impl Timeline {

        // Step 3: Place back to the layer map.
        {
-            // TODO: sanity check if the layer map is valid (i.e., should not have overlaps)
            let mut guard = self.layers.write().await;
            guard
                .open_mut()?
--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -12,7 +12,7 @@ use crate::{
    virtual_file::{MaybeFatalIo, VirtualFile},
 };
 use anyhow::Context;
-use pageserver_api::{models::detach_ancestor::AncestorDetached, shard::ShardIdentity};
+use pageserver_api::models::detach_ancestor::AncestorDetached;
 use tokio::sync::Semaphore;
 use tokio_util::sync::CancellationToken;
 use tracing::Instrument;
@@ -376,14 +376,8 @@ pub(super) async fn prepare(
        tasks.spawn(
            async move {
                let _permit = limiter.acquire().await;
-                let owned = remote_copy(
-                    &adopted,
-                    &timeline,
-                    timeline.generation,
-                    timeline.shard_identity,
-                    &timeline.cancel,
-                )
-                .await?;
+                let owned =
+                    remote_copy(&adopted, &timeline, timeline.generation, &timeline.cancel).await?;
                tracing::info!(layer=%owned, "remote copied");
                Ok(owned)
            }
@@ -635,7 +629,6 @@ async fn remote_copy(
    adopted: &Layer,
    adoptee: &Arc<Timeline>,
    generation: Generation,
-    shard_identity: ShardIdentity,
    cancel: &CancellationToken,
 ) -> Result<Layer, Error> {
    // depending if Layer::keep_resident we could hardlink
@@ -643,7 +636,6 @@ async fn remote_copy(
    let mut metadata = adopted.metadata();
    debug_assert!(metadata.generation <= generation);
    metadata.generation = generation;
-    metadata.shard = shard_identity.shard_index();

    let owned = crate::tenant::storage_layer::Layer::for_evicted(
        adoptee.conf,
--- a/pageserver/src/tenant/timeline/offload.rs
+++ b/pageserver/src/tenant/timeline/offload.rs
@@ -47,18 +47,21 @@ pub(crate) async fn offload_timeline(
    match is_archived {
        Some(true) => (),
        Some(false) => {
-            tracing::warn!("tried offloading a non-archived timeline");
+            tracing::warn!(?is_archived, "tried offloading a non-archived timeline");
            return Err(OffloadError::NotArchived);
        }
        None => {
            // This is legal: calls to this function can race with the timeline shutting down
-            tracing::info!("tried offloading a timeline whose remote storage is not initialized");
+            tracing::info!(
+                ?is_archived,
+                "tried offloading a timeline whose remote storage is not initialized"
+            );
            return Err(OffloadError::Cancelled);
        }
    }

    // Now that the Timeline is in Stopping state, request all the related tasks to shut down.
-    timeline.shutdown(super::ShutdownMode::Flush).await;
+    timeline.shutdown(super::ShutdownMode::Hard).await;

    // TODO extend guard mechanism above with method
    // to make deletions possible while offloading is in progress
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -36,7 +36,9 @@ use postgres_connection::PgConnectionConfig;
 use utils::backoff::{
    exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
 };
-use utils::postgres_client::wal_stream_connection_config;
+use utils::postgres_client::{
+    wal_stream_connection_config, ConnectionConfigArgs, PAGESERVER_SAFEKEEPER_PROTO_VERSION, POSTGRES_PROTO_VERSION,
+};
 use utils::{
    id::{NodeId, TenantTimelineId},
    lsn::Lsn,
@@ -984,15 +986,29 @@ impl ConnectionManagerState {
                if info.safekeeper_connstr.is_empty() {
                    return None; // no connection string, ignore sk
                }
-                match wal_stream_connection_config(
-                    self.id,
-                    info.safekeeper_connstr.as_ref(),
-                    match &self.conf.auth_token {
-                        None => None,
-                        Some(x) => Some(x),
-                    },
-                    self.conf.availability_zone.as_deref(),
-                ) {
+
+                let shard_identity = self.timeline.get_shard_identity();
+                let connection_conf_args = ConnectionConfigArgs {
+                    protocol_version: PAGESERVER_SAFEKEEPER_PROTO_VERSION,
+                    ttid: self.id,
+                    shard_number: Some(shard_identity.number.0),
+                    shard_count: Some(shard_identity.count.0),
+                    shard_stripe_size: Some(shard_identity.stripe_size.0),
+                    listen_pg_addr_str: info.safekeeper_connstr.as_ref(),
+                    auth_token: self.conf.auth_token.as_ref().map(|t| t.as_str()),
+                    availability_zone: self.conf.availability_zone.as_deref()
+                };
+                // let connection_conf_args = ConnectionConfigArgs {
+                //     protocol_version: POSTGRES_PROTO_VERSION,
+                //     ttid: self.id,
+                //     shard_number: None,
+                //     shard_count: None,
+                //     shard_stripe_size: None,
+                //     listen_pg_addr_str: info.safekeeper_connstr.as_ref(),
+                //     auth_token: self.conf.auth_token.as_ref().map(|t| t.as_str()),
+                //     availability_zone: self.conf.availability_zone.as_deref()
+                // };
+                match wal_stream_connection_config(connection_conf_args) {
                    Ok(connstr) => Some((*sk_id, info, connstr)),
                    Err(e) => {
                        error!("Failed to create wal receiver connection string from broker data of safekeeper node {}: {e:#}", sk_id);
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -36,7 +36,7 @@ use crate::{
 use postgres_backend::is_expected_io_error;
 use postgres_connection::PgConnectionConfig;
 use postgres_ffi::waldecoder::WalStreamDecoder;
-use utils::{id::NodeId, lsn::Lsn};
+use utils::{bin_ser::BeSer, id::NodeId, lsn::Lsn};
 use utils::{pageserver_feedback::PageserverFeedback, sync::gate::GateError};

 /// Status of the connection.
@@ -278,6 +278,7 @@ pub(super) async fn handle_walreceiver_connection(
        // fails (e.g. in walingest), we still want to know latests LSNs from the safekeeper.
        match &replication_message {
            ReplicationMessage::XLogData(xlog_data) => {
+                // TODO(vlad): is this connection status bookkeeping still needed here?
                connection_status.latest_connection_update = now;
                connection_status.commit_lsn = Some(Lsn::from(xlog_data.wal_end()));
                connection_status.streaming_lsn = Some(Lsn::from(
@@ -299,6 +300,24 @@ pub(super) async fn handle_walreceiver_connection(
        }

        let status_update = match replication_message {
+            ReplicationMessage::RawInterpretedWalRecord(raw) => {
+                connection_status.latest_connection_update = now;
+                connection_status.latest_wal_update = now;
+                connection_status.commit_lsn = Some(Lsn::from(raw.wal_end()));
+                // The payload comes off the wire: report a decode failure instead of panicking.
+                let interpreted = InterpretedWalRecord::des(raw.data())
+                    .context("could not deserialize interpreted WAL record")?;
+                let end_lsn = interpreted.end_lsn;
+                let mut modification = timeline.begin_modification(end_lsn);
+                walingest
+                    .ingest_record(interpreted, &mut modification, &ctx)
+                    .await
+                    .with_context(|| format!("could not ingest record at {}", end_lsn))?;
+                modification.commit(&ctx).await?;
+
+                Some(end_lsn)
+            }
+
            ReplicationMessage::XLogData(xlog_data) => {
                // Pass the WAL data to the decoder, and see if we can decode
                // more records as a result.
--- a/pageserver/src/walredo/apply_neon.rs
+++ b/pageserver/src/walredo/apply_neon.rs
@@ -253,10 +253,6 @@ pub(crate) fn apply_in_neon(
            use bytes::BufMut;
            if *will_init {
                assert!(*clear, "init record must be clear to ensure correctness");
-                assert!(
-                    page.is_empty(),
-                    "init record must be the first entry to ensure correctness"
-                );
            }
            if *clear {
                page.clear();
--- a/pgxn/neon/logical_replication_monitor.c
+++ b/pgxn/neon/logical_replication_monitor.c
@@ -1,8 +1,7 @@
-#include <dirent.h>
 #include <limits.h>
 #include <string.h>
+#include <dirent.h>
 #include <signal.h>
-#include <sys/stat.h>

 #include "postgres.h"

@@ -22,35 +21,17 @@

 static int	logical_replication_max_snap_files = 300;

-/*
- * According to Chi (shyzh), the pageserver _should_ be good with 10 MB worth of
- * snapshot files. Let's use 8 MB since 8 is a power of 2.
- */
-static int	logical_replication_max_logicalsnapdir_size = 8000;
-
-/*
- * A primitive description of a logical snapshot file including the LSN of the
- * file and its size.
- */
-typedef struct SnapDesc {
-	XLogRecPtr	lsn;
-	off_t		sz;
-} SnapDesc;
-
 PGDLLEXPORT void LogicalSlotsMonitorMain(Datum main_arg);

-/*
- * Sorts an array of snapshot descriptors by their LSN.
- */
 static int
-SnapDescComparator(const void *a, const void *b)
+LsnDescComparator(const void *a, const void *b)
 {
-	const SnapDesc	*desc1 = a;
-	const SnapDesc	*desc2 = b;
+	XLogRecPtr	lsn1 = *((const XLogRecPtr *) a);
+	XLogRecPtr	lsn2 = *((const XLogRecPtr *) b);

-	if (desc1->lsn < desc2->lsn)
+	if (lsn1 < lsn2)
 		return 1;
-	else if (desc1->lsn == desc2->lsn)
+	else if (lsn1 == lsn2)
 		return 0;
 	else
 		return -1;
@@ -62,39 +43,28 @@ SnapDescComparator(const void *a, const void *b)
 * slots having lower restart_lsn should be dropped.
 */
 static XLogRecPtr
-get_snapshots_cutoff_lsn(void)
+get_num_snap_files_lsn_threshold(void)
 {
-/* PG 18 has a constant defined for this, PG_LOGICAL_SNAPSHOTS_DIR */
-#define SNAPDIR "pg_logical/snapshots"
-
 	DIR		   *dirdesc;
-	int			dirdesc_fd;
 	struct dirent *de;
-	size_t		snapshot_index = 0;
-	SnapDesc   *snapshot_descriptors;
-	size_t		descriptors_allocated = 1024;
-	XLogRecPtr	cutoff = 0;
-	off_t		logicalsnapdir_size = 0;
-	const int	logical_replication_max_logicalsnapdir_size_bytes = logical_replication_max_logicalsnapdir_size * 1000;
+	char	   *snap_path = "pg_logical/snapshots/";
+	int			lsns_allocated = 1024;
+	int			lsns_num = 0;
+	XLogRecPtr *lsns;
+	XLogRecPtr	cutoff;

-	if (logical_replication_max_snap_files < 0 && logical_replication_max_logicalsnapdir_size < 0)
+	if (logical_replication_max_snap_files < 0)
 		return 0;

-	snapshot_descriptors = palloc(sizeof(*snapshot_descriptors) * descriptors_allocated);
-
-	dirdesc = AllocateDir(SNAPDIR);
-	dirdesc_fd = dirfd(dirdesc);
-	if (dirdesc_fd == -1)
-		ereport(ERROR, errmsg("failed to get a file descriptor for " SNAPDIR ": %m"));
+	lsns = palloc(sizeof(XLogRecPtr) * lsns_allocated);

 	/* find all .snap files and get their lsns */
-	while ((de = ReadDir(dirdesc, SNAPDIR)) != NULL)
+	dirdesc = AllocateDir(snap_path);
+	while ((de = ReadDir(dirdesc, snap_path)) != NULL)
 	{
+		XLogRecPtr	lsn;
 		uint32		hi;
 		uint32		lo;
-		struct stat	st;
-		XLogRecPtr	lsn;
-		SnapDesc   *desc;

 		if (strcmp(de->d_name, ".") == 0 ||
 			strcmp(de->d_name, "..") == 0)
@@ -109,69 +79,28 @@ get_snapshots_cutoff_lsn(void)

 		lsn = ((uint64) hi) << 32 | lo;
 		elog(DEBUG5, "found snap file %X/%X", LSN_FORMAT_ARGS(lsn));
-
-		if (fstatat(dirdesc_fd, de->d_name, &st, 0) == -1)
-			ereport(ERROR, errmsg("failed to get the size of " SNAPDIR "/%s: %m", de->d_name));
-
-		if (descriptors_allocated == snapshot_index)
+		if (lsns_allocated == lsns_num)
 		{
-			descriptors_allocated *= 2;
-			snapshot_descriptors = repalloc(snapshot_descriptors, sizeof(*snapshot_descriptors) * descriptors_allocated);
+			lsns_allocated *= 2;
+			lsns = repalloc(lsns, sizeof(XLogRecPtr) * lsns_allocated);
 		}
-
-		desc = &snapshot_descriptors[snapshot_index++];
-		desc->lsn = lsn;
-		desc->sz = st.st_size;
+		lsns[lsns_num++] = lsn;
 	}
-
-	qsort(snapshot_descriptors, snapshot_index, sizeof(*snapshot_descriptors), SnapDescComparator);
-
-	/* Are there more snapshot files than specified? */
-	if (logical_replication_max_snap_files <= snapshot_index)
+	/* sort by lsn desc */
+	qsort(lsns, lsns_num, sizeof(XLogRecPtr), LsnDescComparator);
+	/* and take cutoff at logical_replication_max_snap_files */
+	if (logical_replication_max_snap_files > lsns_num)
+		cutoff = 0;
+	/* (above: fewer .snap files than the limit, so no cutoff applies) */
+	else
 	{
-		cutoff = snapshot_descriptors[logical_replication_max_snap_files - 1].lsn;
-		elog(LOG,
-			"ls_monitor: dropping logical slots with restart_lsn lower %X/%X, found %zu snapshot files, limit is %d",
-			LSN_FORMAT_ARGS(cutoff), snapshot_index, logical_replication_max_snap_files);
+		cutoff = lsns[logical_replication_max_snap_files - 1];
+		elog(LOG, "ls_monitor: dropping logical slots with restart_lsn lower %X/%X, found %d .snap files, limit is %d",
+			 LSN_FORMAT_ARGS(cutoff), lsns_num, logical_replication_max_snap_files);
 	}
-
-	/* Is the size of the logical snapshots directory larger than specified?
-	 *
-	 * It's possible we could hit both thresholds, so remove any extra files
-	 * first, and then truncate based on size of the remaining files.
-	 */
-	if (logicalsnapdir_size > logical_replication_max_logicalsnapdir_size_bytes)
-	{
-		/* Unfortunately, iterating the directory does not guarantee any order
-		 * so we can't cache an index in the preceding loop.
-		 */
-
-		off_t		sz;
-		const XLogRecPtr original = cutoff;
-
-		sz = snapshot_descriptors[0].sz;
-		for (size_t i = 1; i < logical_replication_max_snap_files; ++i)
-		{
-			if (sz > logical_replication_max_logicalsnapdir_size_bytes)
-			{
-				cutoff = snapshot_descriptors[i - 1].lsn;
-				break;
-			}
-
-			sz += snapshot_descriptors[i].sz;
-		}
-
-		if (cutoff != original)
-			elog(LOG, "ls_monitor: dropping logical slots with restart_lsn lower than %X/%X, " SNAPDIR " is larger than %d KB",
-					LSN_FORMAT_ARGS(cutoff), logical_replication_max_logicalsnapdir_size);
-	}
-
-	pfree(snapshot_descriptors);
+	pfree(lsns);
 	FreeDir(dirdesc);
-
 	return cutoff;
-
-#undef SNAPDIR
 }

 void
@@ -189,16 +118,6 @@ InitLogicalReplicationMonitor(void)
 							0,
 							NULL, NULL, NULL);

-	DefineCustomIntVariable(
-							"neon.logical_replication_max_logicalsnapdir_size",
-							"Maximum allowed size of the pg_logical/snapshots directory (KB). When exceeded, slots are dropped until the limit is met. -1 disables the limit.",
-							NULL,
-							&logical_replication_max_logicalsnapdir_size,
-							8000, -1, INT_MAX,
-							PGC_SIGHUP,
-							GUC_UNIT_KB,
-							NULL, NULL, NULL);
-
 	memset(&bgw, 0, sizeof(bgw));
 	bgw.bgw_flags = BGWORKER_SHMEM_ACCESS;
 	bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
@@ -243,7 +162,7 @@ LogicalSlotsMonitorMain(Datum main_arg)
 		 * If there are too many .snap files, just drop all logical slots to
 		 * prevent aux files bloat.
 		 */
-		cutoff_lsn = get_snapshots_cutoff_lsn();
+		cutoff_lsn = get_num_snap_files_lsn_threshold();
 		if (cutoff_lsn > 0)
 		{
 			for (int i = 0; i < max_replication_slots; i++)
--- a/pgxn/neon/neon_walreader.c
+++ b/pgxn/neon/neon_walreader.c
@@ -611,17 +611,6 @@ NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size coun
 	recptr = startptr;
 	nbytes = count;

-/* Try to read directly from WAL buffers first. */
-#if PG_MAJORVERSION_NUM >= 17
-	{
-		Size	rbytes;
-		rbytes = WALReadFromBuffers(p, recptr, nbytes, tli);
-		recptr += rbytes;
-		nbytes -= rbytes;
-		p += rbytes;
-	}
-#endif
-
 	while (nbytes > 0)
 	{
 		uint32		startoff;
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -1361,35 +1361,29 @@ SendAppendRequests(Safekeeper *sk)
 		if (sk->active_state == SS_ACTIVE_READ_WAL)
 		{
 			char	   *errmsg;
-			int			req_len;

 			req = &sk->appendRequest;
-			req_len = req->endLsn - req->beginLsn;

-			/* We send zero sized AppenRequests as heartbeats; don't wal_read for these. */
-			if (req_len > 0)
+			switch (wp->api.wal_read(sk,
+									 &sk->outbuf.data[sk->outbuf.len],
+									 req->beginLsn,
+									 req->endLsn - req->beginLsn,
+									 &errmsg))
 			{
-				switch (wp->api.wal_read(sk,
-										&sk->outbuf.data[sk->outbuf.len],
-										req->beginLsn,
-										req_len,
-										&errmsg))
-				{
-					case NEON_WALREAD_SUCCESS:
-						break;
-					case NEON_WALREAD_WOULDBLOCK:
-						return true;
-					case NEON_WALREAD_ERROR:
-						wp_log(WARNING, "WAL reading for node %s:%s failed: %s",
-							sk->host, sk->port, errmsg);
-						ShutdownConnection(sk);
-						return false;
-					default:
-						Assert(false);
-				}
+				case NEON_WALREAD_SUCCESS:
+					break;
+				case NEON_WALREAD_WOULDBLOCK:
+					return true;
+				case NEON_WALREAD_ERROR:
+					wp_log(WARNING, "WAL reading for node %s:%s failed: %s",
+						   sk->host, sk->port, errmsg);
+					ShutdownConnection(sk);
+					return false;
+				default:
+					Assert(false);
 			}

-			sk->outbuf.len += req_len;
+			sk->outbuf.len += req->endLsn - req->beginLsn;

 			writeResult = wp->api.conn_async_write(sk, sk->outbuf.data, sk->outbuf.len);

--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -1489,11 +1489,33 @@ walprop_pg_wal_read(Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count,
 {
 	NeonWALReadResult res;

-	res = NeonWALRead(sk->xlogreader,
-					  buf,
-					  startptr,
-					  count,
-					  walprop_pg_get_timeline_id());
+#if PG_MAJORVERSION_NUM >= 17
+	if (!sk->wp->config->syncSafekeepers)
+	{
+		Size	rbytes;
+		rbytes = WALReadFromBuffers(buf, startptr, count,
+									walprop_pg_get_timeline_id());
+
+		startptr += rbytes;
+		count -= rbytes;
+	}
+#endif
+
+	if (count == 0)
+	{
+		res = NEON_WALREAD_SUCCESS;
+	}
+	else
+	{
+		Assert(count > 0);
+
+		/* Now read the remaining WAL from the WAL file */
+		res = NeonWALRead(sk->xlogreader,
+						  buf,
+						  startptr,
+						  count,
+						  walprop_pg_get_timeline_id());
+	}

 	if (res == NEON_WALREAD_SUCCESS)
 	{
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -60,7 +60,7 @@ prometheus.workspace = true
 rand.workspace = true
 regex.workspace = true
 remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
-reqwest = { workspace = true, features = ["rustls-tls-native-roots"] }
+reqwest.workspace = true
 reqwest-middleware = { workspace = true, features = ["json"] }
 reqwest-retry.workspace = true
 reqwest-tracing.workspace = true
--- a/proxy/src/auth/backend/jwt.rs
+++ b/proxy/src/auth/backend/jwt.rs
@@ -7,11 +7,8 @@ use arc_swap::ArcSwapOption;
 use dashmap::DashMap;
 use jose_jwk::crypto::KeyInfo;
 use reqwest::{redirect, Client};
-use reqwest_retry::policies::ExponentialBackoff;
-use reqwest_retry::RetryTransientMiddleware;
 use serde::de::Visitor;
 use serde::{Deserialize, Deserializer};
-use serde_json::value::RawValue;
 use signature::Verifier;
 use thiserror::Error;
 use tokio::time::Instant;
@@ -19,7 +16,7 @@ use tokio::time::Instant;
 use crate::auth::backend::ComputeCredentialKeys;
 use crate::context::RequestMonitoring;
 use crate::control_plane::errors::GetEndpointJwksError;
-use crate::http::read_body_with_limit;
+use crate::http::parse_json_body_with_limit;
 use crate::intern::RoleNameInt;
 use crate::types::{EndpointId, RoleName};

@@ -31,10 +28,6 @@ const MAX_RENEW: Duration = Duration::from_secs(3600);
 const MAX_JWK_BODY_SIZE: usize = 64 * 1024;
 const JWKS_USER_AGENT: &str = "neon-proxy";

-const JWKS_CONNECT_TIMEOUT: Duration = Duration::from_secs(2);
-const JWKS_FETCH_TIMEOUT: Duration = Duration::from_secs(5);
-const JWKS_FETCH_RETRIES: u32 = 3;
-
 /// How to get the JWT auth rules
 pub(crate) trait FetchAuthRules: Clone + Send + Sync + 'static {
    fn fetch_auth_rules(
@@ -62,7 +55,7 @@ pub(crate) struct AuthRule {
 }

 pub struct JwkCache {
-    client: reqwest_middleware::ClientWithMiddleware,
+    client: reqwest::Client,

    map: DashMap<(EndpointId, RoleName), Arc<JwkCacheEntryLock>>,
 }
@@ -124,14 +117,6 @@ impl Default for JwkCacheEntryLock {
    }
 }

-#[derive(Deserialize)]
-struct JwkSet<'a> {
-    /// we parse into raw-value because not all keys in a JWKS are ones
-    /// we can parse directly, so we parse them lazily.
-    #[serde(borrow)]
-    keys: Vec<&'a RawValue>,
-}
-
 impl JwkCacheEntryLock {
    async fn acquire_permit<'a>(self: &'a Arc<Self>) -> JwkRenewalPermit<'a> {
        JwkRenewalPermit::acquire_permit(self).await
@@ -145,7 +130,7 @@ impl JwkCacheEntryLock {
        &self,
        _permit: JwkRenewalPermit<'_>,
        ctx: &RequestMonitoring,
-        client: &reqwest_middleware::ClientWithMiddleware,
+        client: &reqwest::Client,
        endpoint: EndpointId,
        auth_rules: &F,
    ) -> Result<Arc<JwkCacheEntry>, JwtError> {
@@ -169,73 +154,22 @@ impl JwkCacheEntryLock {
            let req = client.get(rule.jwks_url.clone());
            // TODO(conrad): eventually switch to using reqwest_middleware/`new_client_with_timeout`.
            // TODO(conrad): We need to filter out URLs that point to local resources. Public internet only.
-            match req.send().await.and_then(|r| {
-                r.error_for_status()
-                    .map_err(reqwest_middleware::Error::Reqwest)
-            }) {
+            match req.send().await.and_then(|r| r.error_for_status()) {
                // todo: should we re-insert JWKs if we want to keep this JWKs URL?
                // I expect these failures would be quite sparse.
                Err(e) => tracing::warn!(url=?rule.jwks_url, error=?e, "could not fetch JWKs"),
                Ok(r) => {
                    let resp: http::Response<reqwest::Body> = r.into();
-
-                    let bytes = match read_body_with_limit(resp.into_body(), MAX_JWK_BODY_SIZE)
-                        .await
+                    match parse_json_body_with_limit::<jose_jwk::JwkSet>(
+                        resp.into_body(),
+                        MAX_JWK_BODY_SIZE,
+                    )
+                    .await
                    {
-                        Ok(bytes) => bytes,
-                        Err(e) => {
-                            tracing::warn!(url=?rule.jwks_url, error=?e, "could not decode JWKs");
-                            continue;
-                        }
-                    };
-
-                    match serde_json::from_slice::<JwkSet>(&bytes) {
                        Err(e) => {
                            tracing::warn!(url=?rule.jwks_url, error=?e, "could not decode JWKs");
                        }
                        Ok(jwks) => {
-                            // size_of::<&RawValue>() == 16
-                            // size_of::<jose_jwk::Jwk>() == 288
-                            // better to not pre-allocate this as it might be pretty large - especially if it has many
-                            // keys we don't want or need.
-                            // trivial 'attack': `{"keys":[` + repeat(`0`).take(30000).join(`,`) + `]}`
-                            // this would consume 8MiB just like that!
-                            let mut keys = vec![];
-                            let mut failed = 0;
-                            for key in jwks.keys {
-                                match serde_json::from_str::<jose_jwk::Jwk>(key.get()) {
-                                    Ok(key) => {
-                                        // if `use` (called `cls` in rust) is specified to be something other than signing,
-                                        // we can skip storing it.
-                                        if key
-                                            .prm
-                                            .cls
-                                            .as_ref()
-                                            .is_some_and(|c| *c != jose_jwk::Class::Signing)
-                                        {
-                                            continue;
-                                        }
-
-                                        keys.push(key);
-                                    }
-                                    Err(e) => {
-                                        tracing::debug!(url=?rule.jwks_url, failed=?e, "could not decode JWK");
-                                        failed += 1;
-                                    }
-                                }
-                            }
-                            keys.shrink_to_fit();
-
-                            if failed > 0 {
-                                tracing::warn!(url=?rule.jwks_url, failed, "could not decode JWKs");
-                            }
-
-                            if keys.is_empty() {
-                                tracing::warn!(url=?rule.jwks_url, "no valid JWKs found inside the response body");
-                                continue;
-                            }
-
-                            let jwks = jose_jwk::JwkSet { keys };
                            key_sets.insert(
                                rule.id,
                                KeySet {
@@ -245,7 +179,7 @@ impl JwkCacheEntryLock {
                                },
                            );
                        }
-                    };
+                    }
                }
            }
        }
@@ -262,7 +196,7 @@ impl JwkCacheEntryLock {
    async fn get_or_update_jwk_cache<F: FetchAuthRules>(
        self: &Arc<Self>,
        ctx: &RequestMonitoring,
-        client: &reqwest_middleware::ClientWithMiddleware,
+        client: &reqwest::Client,
        endpoint: EndpointId,
        fetch: &F,
    ) -> Result<Arc<JwkCacheEntry>, JwtError> {
@@ -316,7 +250,7 @@ impl JwkCacheEntryLock {
        self: &Arc<Self>,
        ctx: &RequestMonitoring,
        jwt: &str,
-        client: &reqwest_middleware::ClientWithMiddleware,
+        client: &reqwest::Client,
        endpoint: EndpointId,
        role_name: &RoleName,
        fetch: &F,
@@ -435,19 +369,8 @@ impl Default for JwkCache {
        let client = Client::builder()
            .user_agent(JWKS_USER_AGENT)
            .redirect(redirect::Policy::none())
-            .tls_built_in_native_certs(true)
-            .connect_timeout(JWKS_CONNECT_TIMEOUT)
-            .timeout(JWKS_FETCH_TIMEOUT)
            .build()
-            .expect("client config should be valid");
-
-        // Retry up to 3 times with increasing intervals between attempts.
-        let retry_policy = ExponentialBackoff::builder().build_with_max_retries(JWKS_FETCH_RETRIES);
-
-        let client = reqwest_middleware::ClientBuilder::new(client)
-            .with(RetryTransientMiddleware::new_with_policy(retry_policy))
-            .build();
-
+            .expect("using &str and standard redirect::Policy");
        JwkCache {
            client,
            map: DashMap::default(),
@@ -1286,63 +1209,4 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
            }
        }
    }
-
-    #[tokio::test]
-    async fn check_jwk_keycloak_regression() {
-        let (rs, valid_jwk) = new_rsa_jwk(RS1, "rs1".into());
-        let valid_jwk = serde_json::to_value(valid_jwk).unwrap();
-
-        // This is valid, but we cannot parse it as we have no support for encryption JWKs, only signature based ones.
-        // This is taken directly from keycloak.
-        let invalid_jwk = serde_json::json! {
-            {
-                "kid": "U-Jc9xRli84eNqRpYQoIPF-GNuRWV3ZvAIhziRW2sbQ",
-                "kty": "RSA",
-                "alg": "RSA-OAEP",
-                "use": "enc",
-                "n": "yypYWsEKmM_wWdcPnSGLSm5ytw1WG7P7EVkKSulcDRlrM6HWj3PR68YS8LySYM2D9Z-79oAdZGKhIfzutqL8rK1vS14zDuPpAM-RWY3JuQfm1O_-1DZM8-07PmVRegP5KPxsKblLf_My8ByH6sUOIa1p2rbe2q_b0dSTXYu1t0dW-cGL5VShc400YymvTwpc-5uYNsaVxZajnB7JP1OunOiuCJ48AuVp3PqsLzgoXqlXEB1ZZdch3xT3bxaTtNruGvG4xmLZY68O_T3yrwTCNH2h_jFdGPyXdyZToCMSMK2qSbytlfwfN55pT9Vv42Lz1YmoB7XRjI9aExKPc5AxFw",
-                "e": "AQAB",
-                "x5c": [
-                    "MIICmzCCAYMCBgGS41E6azANBgkqhkiG9w0BAQsFADARMQ8wDQYDVQQDDAZtYXN0ZXIwHhcNMjQxMDMxMTYwMTQ0WhcNMzQxMDMxMTYwMzI0WjARMQ8wDQYDVQQDDAZtYXN0ZXIwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDLKlhawQqYz/BZ1w+dIYtKbnK3DVYbs/sRWQpK6VwNGWszodaPc9HrxhLwvJJgzYP1n7v2gB1kYqEh/O62ovysrW9LXjMO4+kAz5FZjcm5B+bU7/7UNkzz7Ts+ZVF6A/ko/GwpuUt/8zLwHIfqxQ4hrWnatt7ar9vR1JNdi7W3R1b5wYvlVKFzjTRjKa9PClz7m5g2xpXFlqOcHsk/U66c6K4InjwC5Wnc+qwvOCheqVcQHVll1yHfFPdvFpO02u4a8bjGYtljrw79PfKvBMI0faH+MV0Y/Jd3JlOgIxIwrapJvK2V/B83nmlP1W/jYvPViagHtdGMj1oTEo9zkDEXAgMBAAEwDQYJKoZIhvcNAQELBQADggEBAECYX59+Q9v6c9sb6Q0/C6IgLWG2nVCgVE1YWwIzz+68WrhlmNCRuPjY94roB+tc2tdHbj+Nh3LMzJk7L1KCQoW1+LPK6A6E8W9ad0YPcuw8csV2pUA3+H56exQMH0fUAPQAU7tXWvnQ7otcpV1XA8afn/NTMTsnxi9mSkor8MLMYQ3aeRyh1+LAchHBthWiltqsSUqXrbJF59u5p0ghquuKcWR3TXsA7klGYBgGU5KAJifr9XT87rN0bOkGvbeWAgKvnQnjZwxdnLqTfp/pRY/PiJJHhgIBYPIA7STGnMPjmJ995i34zhnbnd8WHXJA3LxrIMqLW/l8eIdvtM1w8KI="
-                ],
-                "x5t": "QhfzMMnuAfkReTgZ1HtrfyOeeZs",
-                "x5t#S256": "cmHDUdKgLiRCEN28D5FBy9IJLFmR7QWfm77SLhGTCTU"
-            }
-        };
-
-        let jwks = serde_json::json! {{ "keys": [invalid_jwk, valid_jwk ] }};
-        let jwks_addr = jwks_server(move |path| match path {
-            "/" => Some(serde_json::to_vec(&jwks).unwrap()),
-            _ => None,
-        })
-        .await;
-
-        let role_name = RoleName::from("anonymous");
-        let role = RoleNameInt::from(&role_name);
-
-        let rules = vec![AuthRule {
-            id: "foo".to_owned(),
-            jwks_url: format!("http://{jwks_addr}/").parse().unwrap(),
-            audience: None,
-            role_names: vec![role],
-        }];
-
-        let fetch = Fetch(rules);
-        let jwk_cache = JwkCache::default();
-
-        let endpoint = EndpointId::from("ep");
-
-        let token = new_rsa_jwt("rs1".into(), rs);
-
-        jwk_cache
-            .check_jwt(
-                &RequestMonitoring::test(),
-                endpoint.clone(),
-                &role_name,
-                &fetch,
-                &token,
-            )
-            .await
-            .unwrap();
-    }
 }
--- a/proxy/src/cache/endpoints.rs
+++ b/proxy/src/cache/endpoints.rs
@@ -1,12 +1,13 @@
 use std::convert::Infallible;
 use std::future::pending;
 use std::sync::atomic::{AtomicBool, Ordering};
-use std::sync::{Arc, Mutex};
+use std::sync::Arc;

 use dashmap::DashSet;
 use redis::streams::{StreamReadOptions, StreamReadReply};
 use redis::{AsyncCommands, FromRedisValue, Value};
 use serde::Deserialize;
+use tokio::sync::Mutex;
 use tokio_util::sync::CancellationToken;
 use tracing::info;

@@ -31,17 +32,17 @@ struct ControlPlaneEvent {

 #[derive(Deserialize, Debug, Clone, PartialEq)]
 struct EndpointCreated {
-    endpoint_id: EndpointIdInt,
+    endpoint_id: String,
 }

 #[derive(Deserialize, Debug, Clone, PartialEq)]
 struct BranchCreated {
-    branch_id: BranchIdInt,
+    branch_id: String,
 }

 #[derive(Deserialize, Debug, Clone, PartialEq)]
 struct ProjectCreated {
-    project_id: ProjectIdInt,
+    project_id: String,
 }

 impl TryFrom<&Value> for ControlPlaneEvent {
@@ -75,72 +76,53 @@ impl EndpointsCache {
        }
    }

-    pub(crate) fn is_valid(&self, ctx: &RequestMonitoring, endpoint: &EndpointId) -> bool {
+    pub(crate) async fn is_valid(&self, ctx: &RequestMonitoring, endpoint: &EndpointId) -> bool {
        if !self.ready.load(Ordering::Acquire) {
-            // the endpoint cache is not yet fully initialised.
            return true;
        }
-
-        if !self.should_reject(endpoint) {
-            ctx.set_rejected(false);
+        let rejected = self.should_reject(endpoint);
+        ctx.set_rejected(rejected);
+        info!(?rejected, "check endpoint is valid, disabled cache");
+        // If the cache is disabled, just collect the metrics and return; likewise,
+        // if the limiter allows the request, accept it regardless of the cache result.
+        if self.config.disable_cache || self.limiter.lock().await.check() {
            return true;
        }
-
-        // report that we might want to reject this endpoint
-        ctx.set_rejected(true);
-
-        // If cache is disabled, just collect the metrics and return.
-        if self.config.disable_cache {
-            return true;
-        }
-
-        // If the limiter allows, we can pretend like it's valid
-        // (incase it is, due to redis channel lag).
-        if self.limiter.lock().unwrap().check() {
-            return true;
-        }
-
-        // endpoint not found, and there's too much load.
-        false
+        !rejected
    }

    fn should_reject(&self, endpoint: &EndpointId) -> bool {
        if endpoint.is_endpoint() {
-            let Some(endpoint) = EndpointIdInt::get(endpoint) else {
-                // if we haven't interned this endpoint, it's not in the cache.
-                return true;
-            };
-            !self.endpoints.contains(&endpoint)
+            !self.endpoints.contains(&EndpointIdInt::from(endpoint))
        } else if endpoint.is_branch() {
-            let Some(branch) = BranchIdInt::get(endpoint) else {
-                // if we haven't interned this branch, it's not in the cache.
-                return true;
-            };
-            !self.branches.contains(&branch)
+            !self
+                .branches
+                .contains(&BranchIdInt::from(&endpoint.as_branch()))
        } else {
-            let Some(project) = ProjectIdInt::get(endpoint) else {
-                // if we haven't interned this project, it's not in the cache.
-                return true;
-            };
-            !self.projects.contains(&project)
+            !self
+                .projects
+                .contains(&ProjectIdInt::from(&endpoint.as_project()))
        }
    }

    fn insert_event(&self, event: ControlPlaneEvent) {
        if let Some(endpoint_created) = event.endpoint_created {
-            self.endpoints.insert(endpoint_created.endpoint_id);
+            self.endpoints
+                .insert(EndpointIdInt::from(&endpoint_created.endpoint_id.into()));
            Metrics::get()
                .proxy
                .redis_events_count
                .inc(RedisEventsCount::EndpointCreated);
        } else if let Some(branch_created) = event.branch_created {
-            self.branches.insert(branch_created.branch_id);
+            self.branches
+                .insert(BranchIdInt::from(&branch_created.branch_id.into()));
            Metrics::get()
                .proxy
                .redis_events_count
                .inc(RedisEventsCount::BranchCreated);
        } else if let Some(project_created) = event.project_created {
-            self.projects.insert(project_created.project_id);
+            self.projects
+                .insert(ProjectIdInt::from(&project_created.project_id.into()));
            Metrics::get()
                .proxy
                .redis_events_count
@@ -265,13 +247,11 @@ mod tests {
    fn test_parse_control_plane_event() {
        let s = r#"{"branch_created":null,"endpoint_created":{"endpoint_id":"ep-rapid-thunder-w0qqw2q9"},"project_created":null,"type":"endpoint_created"}"#;

-        let endpoint_id: EndpointId = "ep-rapid-thunder-w0qqw2q9".into();
-
        assert_eq!(
            serde_json::from_str::<ControlPlaneEvent>(s).unwrap(),
            ControlPlaneEvent {
                endpoint_created: Some(EndpointCreated {
-                    endpoint_id: endpoint_id.into(),
+                    endpoint_id: "ep-rapid-thunder-w0qqw2q9".into()
                }),
                branch_created: None,
                project_created: None,
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -316,6 +316,7 @@ impl ConnCfg {
        let client_config = client_config.with_no_client_auth();

        let mut mk_tls = tokio_postgres_rustls::MakeRustlsConnect::new(client_config);
+        // TODO(vlad): what is going on here?
        let tls = <MakeRustlsConnect as MakeTlsConnect<tokio::net::TcpStream>>::make_tls_connect(
            &mut mk_tls,
            host,
--- a/proxy/src/control_plane/client/neon.rs
+++ b/proxy/src/control_plane/client/neon.rs
@@ -72,6 +72,7 @@ impl NeonControlPlaneClient {
            .caches
            .endpoints_cache
            .is_valid(ctx, &user_info.endpoint.normalize())
+            .await
        {
            info!("endpoint is not valid, skipping the request");
            return Ok(AuthInfo::default());
@@ -144,6 +145,7 @@ impl NeonControlPlaneClient {
            .caches
            .endpoints_cache
            .is_valid(ctx, &endpoint.normalize())
+            .await
        {
            return Err(GetEndpointJwksError::EndpointNotFound);
        }
--- a/proxy/src/http/mod.rs
+++ b/proxy/src/http/mod.rs
@@ -6,6 +6,7 @@ pub mod health_server;

 use std::time::Duration;

+use anyhow::bail;
 use bytes::Bytes;
 use http::Method;
 use http_body_util::BodyExt;
@@ -15,7 +16,7 @@ use reqwest_middleware::RequestBuilder;
 pub(crate) use reqwest_middleware::{ClientWithMiddleware, Error};
 pub(crate) use reqwest_retry::policies::ExponentialBackoff;
 pub(crate) use reqwest_retry::RetryTransientMiddleware;
-use thiserror::Error;
+use serde::de::DeserializeOwned;

 use crate::metrics::{ConsoleRequest, Metrics};
 use crate::url::ApiUrl;
@@ -121,19 +122,10 @@ impl Endpoint {
    }
 }

-#[derive(Error, Debug)]
-pub(crate) enum ReadBodyError {
-    #[error("Content length exceeds limit of {limit} bytes")]
-    BodyTooLarge { limit: usize },
-
-    #[error(transparent)]
-    Read(#[from] reqwest::Error),
-}
-
-pub(crate) async fn read_body_with_limit(
+pub(crate) async fn parse_json_body_with_limit<D: DeserializeOwned>(
    mut b: impl Body<Data = Bytes, Error = reqwest::Error> + Unpin,
    limit: usize,
-) -> Result<Vec<u8>, ReadBodyError> {
+) -> anyhow::Result<D> {
    // We could use `b.limited().collect().await.to_bytes()` here
    // but this ends up being slightly more efficient as far as I can tell.

@@ -141,20 +133,20 @@ pub(crate) async fn read_body_with_limit(
    // in reqwest, this value is influenced by the Content-Length header.
    let lower_bound = match usize::try_from(b.size_hint().lower()) {
        Ok(bound) if bound <= limit => bound,
-        _ => return Err(ReadBodyError::BodyTooLarge { limit }),
+        _ => bail!("Content length exceeds limit of {limit} bytes"),
    };
    let mut bytes = Vec::with_capacity(lower_bound);

    while let Some(frame) = b.frame().await.transpose()? {
        if let Ok(data) = frame.into_data() {
            if bytes.len() + data.len() > limit {
-                return Err(ReadBodyError::BodyTooLarge { limit });
+                bail!("Content length exceeds limit of {limit} bytes")
            }
            bytes.extend_from_slice(&data);
        }
    }

-    Ok(bytes)
+    Ok(serde_json::from_slice::<D>(&bytes)?)
 }

 #[cfg(test)]
--- a/proxy/src/lib.rs
+++ b/proxy/src/lib.rs
@@ -1,6 +1,12 @@
 // rustc lints/lint groups
 // https://doc.rust-lang.org/rustc/lints/groups.html
-#![deny(deprecated, future_incompatible, let_underscore, nonstandard_style)]
+#![deny(
+    deprecated,
+    future_incompatible,
+    let_underscore,
+    nonstandard_style,
+    rust_2024_compatibility
+)]
 #![warn(clippy::all, clippy::pedantic, clippy::cargo)]
 // List of denied lints from the clippy::restriction group.
 // https://rust-lang.github.io/rust-clippy/master/index.html#?groups=restriction
--- a/proxy/src/serverless/conn_pool_lib.rs
+++ b/proxy/src/serverless/conn_pool_lib.rs
@@ -16,7 +16,8 @@ use super::http_conn_pool::ClientDataHttp;
 use super::local_conn_pool::ClientDataLocal;
 use crate::auth::backend::ComputeUserInfo;
 use crate::context::RequestMonitoring;
-use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo};
+use crate::control_plane::messages::ColdStartInfo;
+use crate::control_plane::messages::MetricsAuxInfo;
 use crate::metrics::{HttpEndpointPoolsGuard, Metrics};
 use crate::types::{DbName, EndpointCacheKey, RoleName};
 use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS};
--- a/proxy/src/serverless/http_conn_pool.rs
+++ b/proxy/src/serverless/http_conn_pool.rs
@@ -7,6 +7,7 @@ use hyper::client::conn::http2;
 use hyper_util::rt::{TokioExecutor, TokioIo};
 use parking_lot::RwLock;
 use rand::Rng;
+use std::result::Result::Ok;
 use tokio::net::TcpStream;
 use tracing::{debug, error, info, info_span, Instrument};

--- a/proxy/src/types.rs
+++ b/proxy/src/types.rs
@@ -64,28 +64,24 @@ macro_rules! smol_str_wrapper {
 }

 const POOLER_SUFFIX: &str = "-pooler";
-pub(crate) const LOCAL_PROXY_SUFFIX: &str = "-local-proxy";

 impl EndpointId {
    #[must_use]
-    fn normalize_str(&self) -> &str {
+    pub fn normalize(&self) -> Self {
        if let Some(stripped) = self.as_ref().strip_suffix(POOLER_SUFFIX) {
-            stripped
-        } else if let Some(stripped) = self.as_ref().strip_suffix(LOCAL_PROXY_SUFFIX) {
-            stripped
+            stripped.into()
        } else {
-            self
+            self.clone()
        }
    }

-    #[must_use]
-    pub fn normalize(&self) -> Self {
-        self.normalize_str().into()
-    }
-
    #[must_use]
    pub fn normalize_intern(&self) -> EndpointIdInt {
-        EndpointIdTag::get_interner().get_or_intern(self.normalize_str())
+        if let Some(stripped) = self.as_ref().strip_suffix(POOLER_SUFFIX) {
+            EndpointIdTag::get_interner().get_or_intern(stripped)
+        } else {
+            self.into()
+        }
    }
 }

@@ -114,4 +110,13 @@ impl EndpointId {
    pub(crate) fn is_branch(&self) -> bool {
        self.0.starts_with("br-")
    }
+    // pub(crate) fn is_project(&self) -> bool {
+    //     !self.is_endpoint() && !self.is_branch()
+    // }
+    pub(crate) fn as_branch(&self) -> BranchId {
+        BranchId(self.0.clone())
+    }
+    pub(crate) fn as_project(&self) -> ProjectId {
+        ProjectId(self.0.clone())
+    }
 }
--- a/safekeeper/Cargo.toml
+++ b/safekeeper/Cargo.toml
@@ -28,6 +28,7 @@ hyper0.workspace = true
 futures.workspace = true
 once_cell.workspace = true
 parking_lot.workspace = true
+pageserver_api.workspace = true
 postgres.workspace = true
 postgres-protocol.workspace = true
 rand.workspace = true
@@ -57,18 +58,13 @@ sd-notify.workspace = true
 storage_broker.workspace = true
 tokio-stream.workspace = true
 utils.workspace = true
+wal_decoder.workspace = true

 workspace_hack.workspace = true

 [dev-dependencies]
-criterion.workspace = true
-itertools.workspace = true
 walproposer.workspace = true
 rand.workspace = true
 desim.workspace = true
 tracing.workspace = true
 tracing-subscriber = { workspace = true, features = ["json"] }
-
-[[bench]]
-name = "receive_wal"
-harness = false
--- a/safekeeper/benches/README.md
+++ b/safekeeper/benches/README.md
@@ -1,22 +0,0 @@
-## Safekeeper Benchmarks
-
-To run benchmarks:
-
-```sh
-# All benchmarks.
-cargo bench --package safekeeper
-
-# Specific file.
-cargo bench --package safekeeper --bench receive_wal
-
-# Specific benchmark.
-cargo bench --package safekeeper --bench receive_wal process_msg/fsync=false
-
-# List available benchmarks.
-cargo bench --package safekeeper --benches -- --list
-```
-
-Additional charts and statistics are available in `target/criterion/report/index.html`.
-
-Benchmarks are automatically compared against the previous run. To compare against other runs, see
-`--baseline` and `--save-baseline`.
--- a/safekeeper/benches/benchutils.rs
+++ b/safekeeper/benches/benchutils.rs
@@ -1,102 +0,0 @@
-use std::sync::Arc;
-
-use camino_tempfile::Utf8TempDir;
-use safekeeper::rate_limit::RateLimiter;
-use safekeeper::safekeeper::{ProposerAcceptorMessage, ProposerElected, SafeKeeper, TermHistory};
-use safekeeper::state::{TimelinePersistentState, TimelineState};
-use safekeeper::timeline::{get_timeline_dir, SharedState, StateSK, Timeline};
-use safekeeper::timelines_set::TimelinesSet;
-use safekeeper::wal_backup::remote_timeline_path;
-use safekeeper::{control_file, wal_storage, SafeKeeperConf};
-use tokio::fs::create_dir_all;
-use utils::id::{NodeId, TenantTimelineId};
-use utils::lsn::Lsn;
-
-/// A Safekeeper benchmarking environment. Uses a tempdir for storage, removed on drop.
-pub struct Env {
-    /// Whether to enable fsync.
-    pub fsync: bool,
-    /// Benchmark directory. Deleted when dropped.
-    pub tempdir: Utf8TempDir,
-}
-
-impl Env {
-    /// Creates a new benchmarking environment in a temporary directory. fsync controls whether to
-    /// enable fsyncing.
-    pub fn new(fsync: bool) -> anyhow::Result<Self> {
-        let tempdir = camino_tempfile::tempdir()?;
-        Ok(Self { fsync, tempdir })
-    }
-
-    /// Constructs a Safekeeper config for the given node ID.
-    fn make_conf(&self, node_id: NodeId) -> SafeKeeperConf {
-        let mut conf = SafeKeeperConf::dummy();
-        conf.my_id = node_id;
-        conf.no_sync = !self.fsync;
-        conf.workdir = self.tempdir.path().join(format!("safekeeper-{node_id}"));
-        conf
-    }
-
-    /// Constructs a Safekeeper with the given node and tenant/timeline ID.
-    ///
-    /// TODO: we should support using in-memory storage, to measure non-IO costs. This would be
-    /// easier if SafeKeeper used trait objects for storage rather than generics. It's also not
-    /// currently possible to construct a timeline using non-file storage since StateSK only accepts
-    /// SafeKeeper<control_file::FileStorage, wal_storage::PhysicalStorage>.
-    pub async fn make_safekeeper(
-        &self,
-        node_id: NodeId,
-        ttid: TenantTimelineId,
-    ) -> anyhow::Result<SafeKeeper<control_file::FileStorage, wal_storage::PhysicalStorage>> {
-        let conf = self.make_conf(node_id);
-
-        let timeline_dir = get_timeline_dir(&conf, &ttid);
-        create_dir_all(&timeline_dir).await?;
-
-        let mut pstate = TimelinePersistentState::empty();
-        pstate.tenant_id = ttid.tenant_id;
-        pstate.timeline_id = ttid.timeline_id;
-
-        let wal = wal_storage::PhysicalStorage::new(&ttid, &timeline_dir, &pstate, conf.no_sync)?;
-        let ctrl =
-            control_file::FileStorage::create_new(&timeline_dir, pstate, conf.no_sync).await?;
-        let state = TimelineState::new(ctrl);
-        let mut safekeeper = SafeKeeper::new(state, wal, conf.my_id)?;
-
-        // Emulate an initial election.
-        safekeeper
-            .process_msg(&ProposerAcceptorMessage::Elected(ProposerElected {
-                term: 1,
-                start_streaming_at: Lsn(0),
-                term_history: TermHistory(vec![(1, Lsn(0)).into()]),
-                timeline_start_lsn: Lsn(0),
-            }))
-            .await?;
-
-        Ok(safekeeper)
-    }
-
-    /// Constructs a timeline, including a new Safekeeper with the given node ID, and spawns its
-    /// manager task.
-    pub async fn make_timeline(
-        &self,
-        node_id: NodeId,
-        ttid: TenantTimelineId,
-    ) -> anyhow::Result<Arc<Timeline>> {
-        let conf = self.make_conf(node_id);
-        let timeline_dir = get_timeline_dir(&conf, &ttid);
-        let remote_path = remote_timeline_path(&ttid)?;
-
-        let safekeeper = self.make_safekeeper(node_id, ttid).await?;
-        let shared_state = SharedState::new(StateSK::Loaded(safekeeper));
-
-        let timeline = Timeline::new(ttid, &timeline_dir, &remote_path, shared_state);
-        timeline.bootstrap(
-            &mut timeline.write_shared_state().await,
-            &conf,
-            Arc::new(TimelinesSet::default()), // ignored for now
-            RateLimiter::new(0, 0),
-        );
-        Ok(timeline)
-    }
-}
--- a/safekeeper/benches/receive_wal.rs
+++ b/safekeeper/benches/receive_wal.rs
@@ -1,341 +0,0 @@
-//! WAL ingestion benchmarks.
-
-#[path = "benchutils.rs"]
-mod benchutils;
-
-use std::io::Write as _;
-
-use benchutils::Env;
-use camino_tempfile::tempfile;
-use criterion::{criterion_group, criterion_main, BatchSize, Bencher, Criterion};
-use itertools::Itertools as _;
-use postgres_ffi::v17::wal_generator::{LogicalMessageGenerator, WalGenerator};
-use safekeeper::receive_wal::{self, WalAcceptor};
-use safekeeper::safekeeper::{
-    AcceptorProposerMessage, AppendRequest, AppendRequestHeader, ProposerAcceptorMessage,
-};
-use tokio::io::AsyncWriteExt as _;
-use utils::id::{NodeId, TenantTimelineId};
-use utils::lsn::Lsn;
-
-const KB: usize = 1024;
-const MB: usize = 1024 * KB;
-const GB: usize = 1024 * MB;
-
-// Register benchmarks with Criterion.
-criterion_group!(
-    benches,
-    bench_process_msg,
-    bench_wal_acceptor,
-    bench_wal_acceptor_throughput,
-    bench_file_write
-);
-criterion_main!(benches);
-
-/// Benchmarks SafeKeeper::process_msg() as time per message and throughput. Each message is an
-/// AppendRequest with a single WAL record containing an XlLogicalMessage of varying size. When
-/// measuring throughput, only the logical message payload is considered, excluding
-/// segment/page/record headers.
-fn bench_process_msg(c: &mut Criterion) {
-    let mut g = c.benchmark_group("process_msg");
-    for fsync in [false, true] {
-        for commit in [false, true] {
-            for size in [8, KB, 8 * KB, 128 * KB, MB] {
-                // Kind of weird to change the group throughput per benchmark, but it's the only way
-                // to vary it per benchmark. It works.
-                g.throughput(criterion::Throughput::Bytes(size as u64));
-                g.bench_function(format!("fsync={fsync}/commit={commit}/size={size}"), |b| {
-                    run_bench(b, size, fsync, commit).unwrap()
-                });
-            }
-        }
-    }
-
-    // The actual benchmark. If commit is true, advance the commit LSN on every message.
-    fn run_bench(b: &mut Bencher, size: usize, fsync: bool, commit: bool) -> anyhow::Result<()> {
-        let runtime = tokio::runtime::Builder::new_current_thread() // single is fine, sync IO only
-            .enable_all()
-            .build()?;
-
-        // Construct the payload. The prefix counts towards the payload (including NUL terminator).
-        let prefix = c"p";
-        let prefixlen = prefix.to_bytes_with_nul().len();
-        assert!(size >= prefixlen);
-        let message = vec![0; size - prefixlen];
-
-        let walgen = &mut WalGenerator::new(LogicalMessageGenerator::new(prefix, &message));
-
-        // Set up the Safekeeper.
-        let env = Env::new(fsync)?;
-        let mut safekeeper =
-            runtime.block_on(env.make_safekeeper(NodeId(1), TenantTimelineId::generate()))?;
-
-        b.iter_batched_ref(
-            // Pre-construct WAL records and requests. Criterion will batch them.
-            || {
-                let (lsn, record) = walgen.next().expect("endless WAL");
-                ProposerAcceptorMessage::AppendRequest(AppendRequest {
-                    h: AppendRequestHeader {
-                        term: 1,
-                        term_start_lsn: Lsn(0),
-                        begin_lsn: lsn,
-                        end_lsn: lsn + record.len() as u64,
-                        commit_lsn: if commit { lsn } else { Lsn(0) }, // commit previous record
-                        truncate_lsn: Lsn(0),
-                        proposer_uuid: [0; 16],
-                    },
-                    wal_data: record,
-                })
-            },
-            // Benchmark message processing (time per message).
-            |msg| {
-                runtime
-                    .block_on(safekeeper.process_msg(msg))
-                    .expect("message failed")
-            },
-            BatchSize::SmallInput, // automatically determine a batch size
-        );
-        Ok(())
-    }
-}
-
-/// Benchmarks WalAcceptor message processing time by sending it a batch of WAL records and waiting
-/// for it to confirm that the last LSN has been flushed to storage. We pipeline a bunch of messages
-/// instead of measuring each individual message to amortize costs (e.g. fsync), which is more
-/// realistic. Records are XlLogicalMessage with a tiny payload (~64 bytes per record including
-/// headers). Records are pre-constructed to avoid skewing the benchmark.
-///
-/// TODO: add benchmarks with in-memory storage, see comment on `Env::make_safekeeper()`:
-fn bench_wal_acceptor(c: &mut Criterion) {
-    let mut g = c.benchmark_group("wal_acceptor");
-    for fsync in [false, true] {
-        for n in [1, 100, 10000] {
-            g.bench_function(format!("fsync={fsync}/n={n}"), |b| {
-                run_bench(b, n, fsync).unwrap()
-            });
-        }
-    }
-
-    /// The actual benchmark. n is the number of WAL records to send in a pipelined batch.
-    fn run_bench(b: &mut Bencher, n: usize, fsync: bool) -> anyhow::Result<()> {
-        let runtime = tokio::runtime::Runtime::new()?; // needs multithreaded
-
-        let env = Env::new(fsync)?;
-        let walgen = &mut WalGenerator::new(LogicalMessageGenerator::new(c"prefix", b"message"));
-
-        // Create buffered channels that can fit all requests, to avoid blocking on channels.
-        let (msg_tx, msg_rx) = tokio::sync::mpsc::channel(n);
-        let (reply_tx, mut reply_rx) = tokio::sync::mpsc::channel(n);
-
-        // Spawn the WalAcceptor task.
-        runtime.block_on(async {
-            // TODO: WalAcceptor doesn't actually need a full timeline, only
-            // Safekeeper::process_msg(). Consider decoupling them to simplify the setup.
-            let tli = env
-                .make_timeline(NodeId(1), TenantTimelineId::generate())
-                .await?
-                .wal_residence_guard()
-                .await?;
-            WalAcceptor::spawn(tli, msg_rx, reply_tx, Some(0));
-            anyhow::Ok(())
-        })?;
-
-        b.iter_batched(
-            // Pre-construct a batch of WAL records and requests.
-            || {
-                walgen
-                    .take(n)
-                    .map(|(lsn, record)| AppendRequest {
-                        h: AppendRequestHeader {
-                            term: 1,
-                            term_start_lsn: Lsn(0),
-                            begin_lsn: lsn,
-                            end_lsn: lsn + record.len() as u64,
-                            commit_lsn: Lsn(0),
-                            truncate_lsn: Lsn(0),
-                            proposer_uuid: [0; 16],
-                        },
-                        wal_data: record,
-                    })
-                    .collect_vec()
-            },
-            // Benchmark batch ingestion (time per batch).
-            |reqs| {
-                runtime.block_on(async {
-                    let final_lsn = reqs.last().unwrap().h.end_lsn;
-                    // Stuff all the messages into the buffered channel to pipeline them.
-                    for req in reqs {
-                        let msg = ProposerAcceptorMessage::AppendRequest(req);
-                        msg_tx.send(msg).await.expect("send failed");
-                    }
-                    // Wait for the last message to get flushed.
-                    while let Some(reply) = reply_rx.recv().await {
-                        if let AcceptorProposerMessage::AppendResponse(resp) = reply {
-                            if resp.flush_lsn >= final_lsn {
-                                return;
-                            }
-                        }
-                    }
-                    panic!("disconnected")
-                })
-            },
-            BatchSize::PerIteration, // only run one request batch at a time
-        );
-        Ok(())
-    }
-}
-
-/// Benchmarks WalAcceptor throughput by sending 1 GB of data with varying message sizes and waiting
-/// for the last LSN to be flushed to storage. Only the actual message payload counts towards
-/// throughput, headers are excluded and considered overhead. Records are XlLogicalMessage.
-///
-/// To avoid running out of memory, messages are constructed during the benchmark.
-fn bench_wal_acceptor_throughput(c: &mut Criterion) {
-    const VOLUME: usize = GB; // NB: excludes message/page/segment headers and padding
-
-    let mut g = c.benchmark_group("wal_acceptor_throughput");
-    g.sample_size(10);
-    g.throughput(criterion::Throughput::Bytes(VOLUME as u64));
-
-    for fsync in [false, true] {
-        for commit in [false, true] {
-            for size in [KB, 8 * KB, 128 * KB, MB] {
-                assert_eq!(VOLUME % size, 0, "volume must be divisible by size");
-                let count = VOLUME / size;
-                g.bench_function(format!("fsync={fsync}/commit={commit}/size={size}"), |b| {
-                    run_bench(b, count, size, fsync, commit).unwrap()
-                });
-            }
-        }
-    }
-
-    /// The actual benchmark. size is the payload size per message, count is the number of messages.
-    /// If commit is true, advance the commit LSN on each message.
-    fn run_bench(
-        b: &mut Bencher,
-        count: usize,
-        size: usize,
-        fsync: bool,
-        commit: bool,
-    ) -> anyhow::Result<()> {
-        let runtime = tokio::runtime::Runtime::new()?; // needs multithreaded
-
-        // Construct the payload. The prefix counts towards the payload (including NUL terminator).
-        let prefix = c"p";
-        let prefixlen = prefix.to_bytes_with_nul().len();
-        assert!(size >= prefixlen);
-        let message = vec![0; size - prefixlen];
-
-        let walgen = &mut WalGenerator::new(LogicalMessageGenerator::new(prefix, &message));
-
-        // Construct and spawn the WalAcceptor task.
-        let env = Env::new(fsync)?;
-
-        let (msg_tx, msg_rx) = tokio::sync::mpsc::channel(receive_wal::MSG_QUEUE_SIZE);
-        let (reply_tx, mut reply_rx) = tokio::sync::mpsc::channel(receive_wal::REPLY_QUEUE_SIZE);
-
-        runtime.block_on(async {
-            let tli = env
-                .make_timeline(NodeId(1), TenantTimelineId::generate())
-                .await?
-                .wal_residence_guard()
-                .await?;
-            WalAcceptor::spawn(tli, msg_rx, reply_tx, Some(0));
-            anyhow::Ok(())
-        })?;
-
-        // Ingest the WAL.
-        b.iter(|| {
-            runtime.block_on(async {
-                let reqgen = walgen.take(count).map(|(lsn, record)| AppendRequest {
-                    h: AppendRequestHeader {
-                        term: 1,
-                        term_start_lsn: Lsn(0),
-                        begin_lsn: lsn,
-                        end_lsn: lsn + record.len() as u64,
-                        commit_lsn: if commit { lsn } else { Lsn(0) }, // commit previous record
-                        truncate_lsn: Lsn(0),
-                        proposer_uuid: [0; 16],
-                    },
-                    wal_data: record,
-                });
-
-                // Send requests.
-                for req in reqgen {
-                    _ = reply_rx.try_recv(); // discard any replies, to avoid blocking
-                    let msg = ProposerAcceptorMessage::AppendRequest(req);
-                    msg_tx.send(msg).await.expect("send failed");
-                }
-
-                // Wait for last message to get flushed.
-                while let Some(reply) = reply_rx.recv().await {
-                    if let AcceptorProposerMessage::AppendResponse(resp) = reply {
-                        if resp.flush_lsn >= walgen.lsn {
-                            return;
-                        }
-                    }
-                }
-                panic!("disconnected")
-            })
-        });
-        Ok(())
-    }
-}
-
-/// Benchmarks OS write throughput by appending blocks of a given size to a file. This is intended
-/// to compare Tokio and stdlib writes, and give a baseline for optimal WAL throughput.
-fn bench_file_write(c: &mut Criterion) {
-    let mut g = c.benchmark_group("file_write");
-
-    for kind in ["stdlib", "tokio"] {
-        for fsync in [false, true] {
-            for size in [8, KB, 8 * KB, 128 * KB, MB] {
-                // Kind of weird to change the group throughput per benchmark, but it's the only way to
-                // vary it per benchmark. It works.
-                g.throughput(criterion::Throughput::Bytes(size as u64));
-                g.bench_function(
-                    format!("{kind}/fsync={fsync}/size={size}"),
-                    |b| match kind {
-                        "stdlib" => run_bench_stdlib(b, size, fsync).unwrap(),
-                        "tokio" => run_bench_tokio(b, size, fsync).unwrap(),
-                        name => panic!("unknown kind {name}"),
-                    },
-                );
-            }
-        }
-    }
-
-    fn run_bench_stdlib(b: &mut Bencher, size: usize, fsync: bool) -> anyhow::Result<()> {
-        let mut file = tempfile()?;
-        let buf = vec![0u8; size];
-
-        b.iter(|| {
-            file.write_all(&buf).unwrap();
-            file.flush().unwrap();
-            if fsync {
-                file.sync_data().unwrap();
-            }
-        });
-
-        Ok(())
-    }
-
-    fn run_bench_tokio(b: &mut Bencher, size: usize, fsync: bool) -> anyhow::Result<()> {
-        let runtime = tokio::runtime::Runtime::new()?; // needs multithreaded
-
-        let mut file = tokio::fs::File::from_std(tempfile()?);
-        let buf = vec![0u8; size];
-
-        b.iter(|| {
-            runtime.block_on(async {
-                file.write_all(&buf).await.unwrap();
-                file.flush().await.unwrap();
-                if fsync {
-                    file.sync_data().await.unwrap();
-                }
-            })
-        });
-
-        Ok(())
-    }
-}
--- a/safekeeper/src/auth.rs
+++ b/safekeeper/src/auth.rs
@@ -20,8 +20,7 @@ pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<
            | Scope::PageServerApi
            | Scope::GenerationsApi
            | Scope::Infra
-            | Scope::Scrubber
-            | Scope::ControllerPeer,
+            | Scope::Scrubber,
            _,
        ) => Err(AuthError(
            format!(
--- a/safekeeper/src/handler.rs
+++ b/safekeeper/src/handler.rs
@@ -2,11 +2,14 @@
 //! protocol commands.

 use anyhow::Context;
+use pageserver_api::shard::{ShardIdentity, ShardStripeSize};
 use std::future::Future;
 use std::str::{self, FromStr};
 use std::sync::Arc;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::{debug, info, info_span, Instrument};
+use utils::postgres_client::PAGESERVER_SAFEKEEPER_PROTO_VERSION;
+use utils::shard::{ShardCount, ShardNumber};

 use crate::auth::check_permission;
 use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage};
@@ -35,6 +38,8 @@ pub struct SafekeeperPostgresHandler {
    pub tenant_id: Option<TenantId>,
    pub timeline_id: Option<TimelineId>,
    pub ttid: TenantTimelineId,
+    pub shard: Option<ShardIdentity>,
+    pub protocol_version: Option<u8>,
    /// Unique connection id is logged in spans for observability.
    pub conn_id: ConnectionId,
    /// Auth scope allowed on the connections and public key used to check auth tokens. None if auth is not configured.
@@ -107,11 +112,21 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
    ) -> Result<(), QueryError> {
        if let FeStartupPacket::StartupMessage { params, .. } = sm {
            if let Some(options) = params.options_raw() {
+                let mut shard_count: Option<u8> = None;
+                let mut shard_number: Option<u8> = None;
+                let mut shard_stripe_size: Option<u32> = None;
+
                for opt in options {
                    // FIXME `ztenantid` and `ztimelineid` left for compatibility during deploy,
                    // remove these after the PR gets deployed:
                    // https://github.com/neondatabase/neon/pull/2433#discussion_r970005064
                    match opt.split_once('=') {
+                        Some(("protocol_version", value)) => {
+                            self.protocol_version =
+                                Some(value.parse::<u8>().with_context(|| {
+                                    format!("Failed to parse {value} as protocol_version")
+                                })?);
+                        }
                        Some(("ztenantid", value)) | Some(("tenant_id", value)) => {
                            self.tenant_id = Some(value.parse().with_context(|| {
                                format!("Failed to parse {value} as tenant id")
@@ -127,9 +142,44 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
                                metrics.set_client_az(client_az)
                            }
                        }
+                        Some(("shard_count", value)) => {
+                            shard_count = Some(value.parse::<u8>().with_context(|| {
+                                format!("Failed to parse {value} as shard count")
+                            })?);
+                        }
+                        Some(("shard_number", value)) => {
+                            shard_number = Some(value.parse::<u8>().with_context(|| {
+                                format!("Failed to parse {value} as shard number")
+                            })?);
+                        }
+                        Some(("shard_stripe_size", value)) => {
+                            shard_stripe_size = Some(value.parse::<u32>().with_context(|| {
+                                format!("Failed to parse {value} as shard stripe size")
+                            })?);
+                        }
                        _ => continue,
                    }
                }
+
+                if self.protocol_version == Some(PAGESERVER_SAFEKEEPER_PROTO_VERSION) {
+                    match (shard_count, shard_number, shard_stripe_size) {
+                        (Some(count), Some(number), Some(stripe_size)) => {
+                            self.shard = Some(
+                                ShardIdentity::new(
+                                    ShardNumber(number),
+                                    ShardCount(count),
+                                    ShardStripeSize(stripe_size),
+                                )
+                                .with_context(|| "Failed to create shard identity")?,
+                            );
+                        }
+                        _ => {
+                            return Err(QueryError::Other(anyhow::anyhow!(
+                                "Shard params were not specified"
+                            )));
+                        }
+                    }
+                }
            }

            if let Some(app_name) = params.get("application_name") {
@@ -150,6 +200,11 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
                    tracing::field::debug(self.appname.clone()),
                );

+            if let Some(shard) = self.shard.as_ref() {
+                tracing::Span::current()
+                    .record("shard", tracing::field::display(shard.shard_slug()));
+            }
+
            Ok(())
        } else {
            Err(QueryError::Other(anyhow::anyhow!(
@@ -258,6 +313,8 @@ impl SafekeeperPostgresHandler {
            tenant_id: None,
            timeline_id: None,
            ttid: TenantTimelineId::empty(),
+            shard: None,
+            protocol_version: None,
            conn_id,
            claims: None,
            auth,
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -112,7 +112,9 @@ impl SafeKeeperConf {
 }

 impl SafeKeeperConf {
-    pub fn dummy() -> Self {
+    #[cfg(test)]
+    #[allow(unused)]
+    fn dummy() -> Self {
        SafeKeeperConf {
            workdir: Utf8PathBuf::from("./"),
            no_sync: false,
--- a/safekeeper/src/metrics.rs
+++ b/safekeeper/src/metrics.rs
@@ -55,7 +55,7 @@ pub static WRITE_WAL_SECONDS: Lazy<Histogram> = Lazy::new(|| {
 pub static FLUSH_WAL_SECONDS: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "safekeeper_flush_wal_seconds",
-        "Seconds spent syncing WAL to a disk (excluding segment initialization)",
+        "Seconds spent syncing WAL to a disk",
        DISK_FSYNC_SECONDS_BUCKETS.to_vec()
    )
    .expect("Failed to register safekeeper_flush_wal_seconds histogram")
--- a/safekeeper/src/recovery.rs
+++ b/safekeeper/src/recovery.rs
@@ -17,6 +17,7 @@ use tokio::{
 use tokio_postgres::replication::ReplicationStream;
 use tokio_postgres::types::PgLsn;
 use tracing::*;
+use utils::postgres_client::{ConnectionConfigArgs, POSTGRES_PROTO_VERSION};
 use utils::{id::NodeId, lsn::Lsn, postgres_client::wal_stream_connection_config};

 use crate::receive_wal::{WalAcceptor, REPLY_QUEUE_SIZE};
@@ -325,7 +326,17 @@ async fn recovery_stream(
    conf: &SafeKeeperConf,
 ) -> anyhow::Result<String> {
    // TODO: pass auth token
-    let cfg = wal_stream_connection_config(tli.ttid, &donor.pg_connstr, None, None)?;
+    let connection_conf_args = ConnectionConfigArgs {
+        protocol_version: POSTGRES_PROTO_VERSION,
+        ttid: tli.ttid,
+        shard_number: None,
+        shard_count: None,
+        shard_stripe_size: None,
+        listen_pg_addr_str: &donor.pg_connstr,
+        auth_token: None,
+        availability_zone: None,
+    };
+    let cfg = wal_stream_connection_config(connection_conf_args)?;
    let mut cfg = cfg.to_tokio_postgres_config();
    // It will make safekeeper give out not committed WAL (up to flush_lsn).
    cfg.application_name(&format!("safekeeper_{}", conf.my_id));
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -979,8 +979,7 @@ where
            self.wal_store.flush_wal().await?;
        }

-        // Update commit_lsn. It will be flushed to the control file regularly by the timeline
-        // manager, off of the WAL ingest hot path.
+        // Update commit_lsn.
        if msg.h.commit_lsn != Lsn(0) {
            self.update_commit_lsn(msg.h.commit_lsn).await?;
        }
@@ -993,6 +992,15 @@ where
        self.state.inmem.peer_horizon_lsn =
            max(self.state.inmem.peer_horizon_lsn, msg.h.truncate_lsn);

+        // Update truncate and commit LSN in control file.
+        // To avoid negative impact on performance of extra fsync, do it only
+        // when commit_lsn delta exceeds WAL segment size.
+        if self.state.commit_lsn + (self.state.server.wal_seg_size as u64)
+            < self.state.inmem.commit_lsn
+        {
+            self.state.flush().await?;
+        }
+
        trace!(
            "processed AppendRequest of len {}, begin_lsn={}, end_lsn={:?}, commit_lsn={:?}, truncate_lsn={:?}, flushed={:?}",
            msg.wal_data.len(),
--- a/safekeeper/src/send_wal.rs
+++ b/safekeeper/src/send_wal.rs
@@ -11,17 +11,21 @@ use crate::wal_storage::WalReader;
 use crate::GlobalTimelines;
 use anyhow::{bail, Context as AnyhowContext};
 use bytes::Bytes;
+use pageserver_api::shard::ShardIdentity;
 use parking_lot::Mutex;
 use postgres_backend::PostgresBackend;
 use postgres_backend::{CopyStreamHandlerEnd, PostgresBackendReader, QueryError};
 use postgres_ffi::get_current_timestamp;
+use postgres_ffi::waldecoder::WalStreamDecoder;
 use postgres_ffi::{TimestampTz, MAX_SEND_SIZE};
-use pq_proto::{BeMessage, WalSndKeepAlive, XLogDataBody};
+use pq_proto::{BeMessage, InterpretedWalRecordBody, WalSndKeepAlive, XLogDataBody};
 use serde::{Deserialize, Serialize};
 use tokio::io::{AsyncRead, AsyncWrite};
 use utils::failpoint_support;
 use utils::id::TenantTimelineId;
 use utils::pageserver_feedback::PageserverFeedback;
+use utils::postgres_client::{PAGESERVER_SAFEKEEPER_PROTO_VERSION, POSTGRES_PROTO_VERSION};
+use wal_decoder::models::InterpretedWalRecord;

 use std::cmp::{max, min};
 use std::net::SocketAddr;
@@ -377,6 +381,10 @@ impl Drop for WalSenderGuard {
 }

 impl SafekeeperPostgresHandler {
+    pub fn protocol_version(&self) -> u8 {
+        self.protocol_version.unwrap_or(POSTGRES_PROTO_VERSION)
+    }
+
    /// Wrapper around handle_start_replication_guts handling result. Error is
    /// handled here while we're still in walsender ttid span; with API
    /// extension, this can probably be moved into postgres_backend.
@@ -412,6 +420,7 @@ impl SafekeeperPostgresHandler {
        let appname = self.appname.clone();

        // Use a guard object to remove our entry from the timeline when we are done.
+        // TODO(vlad): maybe thread shard stuff into here
        let ws_guard = Arc::new(tli.get_walsenders().register(
            self.ttid,
            *pgb.get_peer_addr(),
@@ -475,9 +484,10 @@ impl SafekeeperPostgresHandler {
            tli,
        };

+        let protocol_version = self.protocol_version();
        let res = tokio::select! {
            // todo: add read|write .context to these errors
-            r = sender.run() => r,
+            r = sender.run(protocol_version, self.shard.as_ref()) => r,
            r = reply_reader.run() => r,
        };

@@ -560,7 +570,35 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
    ///
    /// Err(CopyStreamHandlerEnd) is always returned; Result is used only for ?
    /// convenience.
-    async fn run(&mut self) -> Result<(), CopyStreamHandlerEnd> {
+    /// TODO(vlad): add a run variant which accumulates a full WAL record
+    /// and interprets it.
+    async fn run(
+        &mut self,
+        protocol_version: u8,
+        shard: Option<&ShardIdentity>,
+    ) -> Result<(), CopyStreamHandlerEnd> {
+        match protocol_version {
+            POSTGRES_PROTO_VERSION => self.run_wal_sender().await,
+            PAGESERVER_SAFEKEEPER_PROTO_VERSION => {
+                self.run_interpreted_record_sender(shard.context("missing shard identity")?).await
+            }
+            // TODO: make the proto version an enum. Client-chosen value: error, never panic.
+            other => Err(anyhow::anyhow!("unsupported protocol version {other}").into()),
+        }
+    }
+
+    async fn run_interpreted_record_sender(
+        &mut self,
+        shard: &ShardIdentity,
+    ) -> Result<(), CopyStreamHandlerEnd> {
+        let mut last_logged_at = std::time::Instant::now();
+        let mut interpreted_records = 0;
+        let mut interpreted_bytes = 0;
+        let mut useful_bytes = 0;
+
+        let pg_version = self.tli.tli.get_state().await.1.server.pg_version / 10000;
+        let mut wal_decoder = WalStreamDecoder::new(self.start_pos, pg_version);
+
        loop {
            // Wait for the next portion if it is not there yet, or just
            // update our end of WAL available for sending value, we
@@ -601,6 +639,141 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
            };
            let send_buf = &send_buf[..send_size];

+            wal_decoder.feed_bytes(send_buf);
+
+            // How fast or slow is this? Write a little benchmark
+            // to see how quickly we can decode 1GiB of WAL.
+            // If this is slow, then we have a problem since it bottlenecks
+            // the whole affair. SK can send about 60-70MiB of raw WAL and
+            // about 13-17MiB of useful interpreted WAL per second (these
+            // numbers are for one shard).
+            while let Some((record_end_lsn, recdata)) = wal_decoder
+                .poll_decode()
+                .with_context(|| "Failed to decode WAL")?
+            {
+                assert!(record_end_lsn.is_aligned());
+
+                // Deserialize and interpret WAL record
+                let interpreted = InterpretedWalRecord::from_bytes_filtered(
+                    recdata,
+                    shard,
+                    record_end_lsn,
+                    pg_version,
+                )
+                .with_context(|| "Failed to interpret WAL")?;
+
+                let useful_size = interpreted.batch.buffer_size();
+
+                let mut buf = Vec::new();
+                interpreted
+                    .ser_into(&mut buf)
+                    .with_context(|| "Failed to serialize interpreted WAL")?;
+
+                let size = buf.len();
+
+                self.pgb
+                    .write_message(&BeMessage::InterpretedWalRecord(InterpretedWalRecordBody {
+                        wal_end: self.end_pos.0,
+                        data: buf.as_slice(),
+                    }))
+                    .await?;
+
+                interpreted_records += 1;
+                interpreted_bytes += size;
+                useful_bytes += useful_size;
+            }
+
+            // and send it
+            // self.pgb
+            //     .write_message(&BeMessage::XLogData(XLogDataBody {
+            //         wal_start: self.start_pos.0,
+            //         wal_end: self.end_pos.0,
+            //         timestamp: get_current_timestamp(),
+            //         data: send_buf,
+            //     }))
+            //     .await?;
+
+            // if let Some(appname) = &self.appname {
+            //     if appname == "replica" {
+            //         failpoint_support::sleep_millis_async!("sk-send-wal-replica-sleep");
+            //     }
+            // }
+            // trace!(
+            //     "sent {} bytes of WAL {}-{}",
+            //     send_size,
+            //     self.start_pos,
+            //     self.start_pos + send_size as u64
+            // );
+
+            self.start_pos += send_size as u64;
+
+            let elapsed = last_logged_at.elapsed();
+            if elapsed >= Duration::from_secs(5) {
+                let records_rate = interpreted_records / elapsed.as_millis() * 1000;
+                let bytes_rate = interpreted_bytes / elapsed.as_millis() as usize * 1000;
+                let useful_bytes_rate = useful_bytes / elapsed.as_millis() as usize * 1000;
+                tracing::info!(
+                    "Shard {} sender rate: rps={} bps={} ubps={}",
+                    shard.number.0,
+                    records_rate,
+                    bytes_rate,
+                    useful_bytes_rate
+                );
+
+                last_logged_at = std::time::Instant::now();
+                interpreted_records = 0;
+                interpreted_bytes = 0;
+                useful_bytes = 0;
+            }
+        }
+    }
+
+    async fn run_wal_sender(&mut self) -> Result<(), CopyStreamHandlerEnd> {
+        let mut useful_bytes = 0;
+        let mut last_logged_at = std::time::Instant::now();
+
+        loop {
+            // Wait for the next portion if it is not there yet, or just
+            // update our end of WAL available for sending value, we
+            // communicate it to the receiver.
+            self.wait_wal().await?;
+            assert!(
+                self.end_pos > self.start_pos,
+                "nothing to send after waiting for WAL"
+            );
+
+            // try to send as much as available, capped by MAX_SEND_SIZE
+            let mut chunk_end_pos = self.start_pos + MAX_SEND_SIZE as u64;
+            // if we went behind available WAL, back off
+            if chunk_end_pos >= self.end_pos {
+                chunk_end_pos = self.end_pos;
+            } else {
+                // If sending not up to end pos, round down to page boundary to
+                // avoid breaking WAL record not at page boundary, as protocol
+                // demands. See walsender.c (XLogSendPhysical).
+                chunk_end_pos = chunk_end_pos
+                    .checked_sub(chunk_end_pos.block_offset())
+                    .unwrap();
+            }
+            let send_size = (chunk_end_pos.0 - self.start_pos.0) as usize;
+            let send_buf = &mut self.send_buf[..send_size];
+            let send_size: usize;
+            {
+                // If uncommitted part is being pulled, check that the term is
+                // still the expected one.
+                let _term_guard = if let Some(t) = self.term {
+                    Some(self.tli.acquire_term(t).await?)
+                } else {
+                    None
+                };
+                // Read WAL into buffer. send_size can be additionally capped to
+                // segment boundary here.
+                send_size = self.wal_reader.read(send_buf).await?
+            };
+            let send_buf = &send_buf[..send_size];
+
+            useful_bytes += send_buf.len();
+
            // and send it
            self.pgb
                .write_message(&BeMessage::XLogData(XLogDataBody {
@@ -623,6 +796,18 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
                self.start_pos + send_size as u64
            );
            self.start_pos += send_size as u64;
+
+            let elapsed = last_logged_at.elapsed();
+            if elapsed >= Duration::from_secs(5) {
+                // Bytes per second over the logging window. Multiply before
+                // dividing (in u128, the type of as_millis()) so that integer
+                // division does not truncate the rate to a multiple of 1000.
+                let useful_bytes_rate = useful_bytes as u128 * 1000 / elapsed.as_millis();
+                tracing::info!("Sender rate: ubps={}", useful_bytes_rate);
+
+                last_logged_at = std::time::Instant::now();
+                useful_bytes = 0;
+            }
        }
    }

--- a/safekeeper/src/state.rs
+++ b/safekeeper/src/state.rs
@@ -4,7 +4,6 @@
 use std::{cmp::max, ops::Deref};

 use anyhow::{bail, Result};
-use postgres_ffi::WAL_SEGMENT_SIZE;
 use safekeeper_api::models::TimelineTermBumpResponse;
 use serde::{Deserialize, Serialize};
 use utils::{
@@ -139,13 +138,14 @@ impl TimelinePersistentState {
        })
    }

+    #[cfg(test)]
    pub fn empty() -> Self {
        TimelinePersistentState::new(
            &TenantTimelineId::empty(),
            ServerInfo {
                pg_version: 170000, /* Postgres server version (major * 10000) */
                system_id: 0,       /* Postgres system identifier */
-                wal_seg_size: WAL_SEGMENT_SIZE as u32,
+                wal_seg_size: 16 * 1024 * 1024,
            },
            vec![],
            Lsn::INVALID,
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -2,7 +2,7 @@
 //! to glue together SafeKeeper and all other background services.

 use anyhow::{anyhow, bail, Result};
-use camino::{Utf8Path, Utf8PathBuf};
+use camino::Utf8PathBuf;
 use remote_storage::RemotePath;
 use safekeeper_api::models::TimelineTermBumpResponse;
 use serde::{Deserialize, Serialize};
@@ -108,11 +108,16 @@ pub type ReadGuardSharedState<'a> = RwLockReadGuard<'a, SharedState>;
 pub struct WriteGuardSharedState<'a> {
    tli: Arc<Timeline>,
    guard: RwLockWriteGuard<'a, SharedState>,
+    skip_update: bool, // when true, Drop does not bump shared_state_version_tx
 }

 impl<'a> WriteGuardSharedState<'a> {
    fn new(tli: Arc<Timeline>, guard: RwLockWriteGuard<'a, SharedState>) -> Self {
-        WriteGuardSharedState { tli, guard }
+        WriteGuardSharedState {
+            tli,
+            guard,
+            skip_update: false,
+        }
    }
 }

@@ -154,10 +159,12 @@ impl Drop for WriteGuardSharedState<'_> {
            }
        });

-        // send notification about shared state update
-        self.tli.shared_state_version_tx.send_modify(|old| {
-            *old += 1;
-        });
+        if !self.skip_update {
+            // send notification about shared state update
+            self.tli.shared_state_version_tx.send_modify(|old| {
+                *old += 1;
+            });
+        }
    }
 }

@@ -318,17 +325,8 @@ pub struct SharedState {
 }

 impl SharedState {
-    /// Creates a new SharedState.
-    pub fn new(sk: StateSK) -> Self {
-        Self {
-            sk,
-            peers_info: PeersInfo(vec![]),
-            wal_removal_on_hold: false,
-        }
-    }
-
    /// Restore SharedState from control file. If file doesn't exist, bails out.
-    pub fn restore(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Result<Self> {
+    fn restore(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Result<Self> {
        let timeline_dir = get_timeline_dir(conf, ttid);
        let control_store = control_file::FileStorage::restore_new(&timeline_dir, conf.no_sync)?;
        if control_store.server.wal_seg_size == 0 {
@@ -354,7 +352,11 @@ impl SharedState {
            }
        };

-        Ok(Self::new(sk))
+        Ok(Self {
+            sk,
+            peers_info: PeersInfo(vec![]),
+            wal_removal_on_hold: false,
+        })
    }

    pub(crate) fn get_wal_seg_size(&self) -> usize {
@@ -478,13 +480,11 @@ pub struct Timeline {
 }

 impl Timeline {
-    /// Constructs a new timeline.
-    pub fn new(
-        ttid: TenantTimelineId,
-        timeline_dir: &Utf8Path,
-        remote_path: &RemotePath,
-        shared_state: SharedState,
-    ) -> Arc<Self> {
+    /// Load existing timeline from disk.
+    pub fn load_timeline(conf: &SafeKeeperConf, ttid: TenantTimelineId) -> Result<Arc<Timeline>> {
+        let _enter = info_span!("load_timeline", timeline = %ttid.timeline_id).entered();
+
+        let shared_state = SharedState::restore(conf, &ttid)?;
        let (commit_lsn_watch_tx, commit_lsn_watch_rx) =
            watch::channel(shared_state.sk.state().commit_lsn);
        let (term_flush_lsn_watch_tx, term_flush_lsn_watch_rx) = watch::channel(TermLsn::from((
@@ -494,11 +494,10 @@ impl Timeline {
        let (shared_state_version_tx, shared_state_version_rx) = watch::channel(0);

        let walreceivers = WalReceivers::new();
-
-        Arc::new(Self {
+        let remote_path = remote_timeline_path(&ttid)?;
+        Ok(Arc::new(Timeline {
            ttid,
-            remote_path: remote_path.to_owned(),
-            timeline_dir: timeline_dir.to_owned(),
+            remote_path,
            commit_lsn_watch_tx,
            commit_lsn_watch_rx,
            term_flush_lsn_watch_tx,
@@ -509,28 +508,13 @@ impl Timeline {
            walsenders: WalSenders::new(walreceivers.clone()),
            walreceivers,
            cancel: CancellationToken::default(),
+            timeline_dir: get_timeline_dir(conf, &ttid),
            manager_ctl: ManagerCtl::new(),
            broker_active: AtomicBool::new(false),
            wal_backup_active: AtomicBool::new(false),
            last_removed_segno: AtomicU64::new(0),
            mgr_status: AtomicStatus::new(),
-        })
-    }
-
-    /// Load existing timeline from disk.
-    pub fn load_timeline(conf: &SafeKeeperConf, ttid: TenantTimelineId) -> Result<Arc<Timeline>> {
-        let _enter = info_span!("load_timeline", timeline = %ttid.timeline_id).entered();
-
-        let shared_state = SharedState::restore(conf, &ttid)?;
-        let timeline_dir = get_timeline_dir(conf, &ttid);
-        let remote_path = remote_timeline_path(&ttid)?;
-
-        Ok(Timeline::new(
-            ttid,
-            &timeline_dir,
-            &remote_path,
-            shared_state,
-        ))
+        }))
    }

    /// Initialize fresh timeline on disk and start background tasks. If init
@@ -1144,13 +1128,13 @@ async fn delete_dir(path: &Utf8PathBuf) -> Result<bool> {

 /// Get a path to the tenant directory. If you just need to get a timeline directory,
 /// use WalResidentTimeline::get_timeline_dir instead.
-pub fn get_tenant_dir(conf: &SafeKeeperConf, tenant_id: &TenantId) -> Utf8PathBuf {
+pub(crate) fn get_tenant_dir(conf: &SafeKeeperConf, tenant_id: &TenantId) -> Utf8PathBuf {
    conf.workdir.join(tenant_id.to_string())
 }

 /// Get a path to the timeline directory. If you need to read WAL files from disk,
 /// use WalResidentTimeline::get_timeline_dir instead. This function does not check
 /// timeline eviction status and WAL files might not be present on disk.
-pub fn get_timeline_dir(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Utf8PathBuf {
+pub(crate) fn get_timeline_dir(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Utf8PathBuf {
    get_tenant_dir(conf, &ttid.tenant_id).join(ttid.timeline_id.to_string())
 }
--- a/safekeeper/src/timeline_manager.rs
+++ b/safekeeper/src/timeline_manager.rs
@@ -515,12 +515,7 @@ impl Manager {
            return;
        }

-        if state.cfile_last_persist_at.elapsed() > self.conf.control_file_save_interval
-            // If the control file's commit_lsn lags more than one segment behind the current
-            // commit_lsn, flush immediately to limit recovery time in case of a crash. We don't do
-            // this on the WAL ingest hot path since it incurs fsync latency.
-            || state.commit_lsn.saturating_sub(state.cfile_commit_lsn).0 >= self.wal_seg_size as u64
-        {
+        if state.cfile_last_persist_at.elapsed() > self.conf.control_file_save_interval {
            let mut write_guard = self.tli.write_shared_state().await;
            // it should be done in the background because it blocks manager task, but flush() should
            // be fast enough not to be a problem now
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Vlad Lazar	b3ef315041	more wip	2024-11-06 19:41:22 +01:00
Vlad Lazar	f0044b8651	wip	2024-11-06 16:13:14 +01:00
Vlad Lazar	b7ff993df6	wal_decoder: make InterpretedWalRecord serde	2024-11-06 16:13:14 +01:00
Vlad Lazar	5d096f127e	safekeeper: parse new connection configs	2024-11-06 16:13:14 +01:00
Vlad Lazar	70cdd56294	pageserver: include shard id when subscribing to safekeeper	2024-11-06 16:13:14 +01:00
				`@@ -0,0 +1 @@`
				`SELECT neon.backpressure_throttling_time() AS throttled;`
				`@@ -1 +0,0 @@`
				`SELECT (neon.backpressure_throttling_time()::float8 / 1000000) AS throttled;`