Restore check for FSM/VM fork in neon_wallog_page

Do not write pages to the local disk during unlogged build
Rebase with main
2026-06-18 21:00:38 +00:00 · 2024-06-04 14:08:45 +03:00 · 2024-06-04 09:20:51 +03:00 · 2024-06-03 21:36:37 +03:00 · 2024-06-03 21:16:04 +03:00 · 2024-06-03 21:16:02 +03:00
190 changed files with 2651 additions and 7087 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -8,7 +8,6 @@
 !scripts/combine_control_files.py
 !scripts/ninstall.sh
 !vm-cgconfig.conf
-!docker-compose/run-tests.sh

 # Directories
 !.cargo/
@@ -21,7 +20,7 @@
 !patches/
 !pgxn/
 !proxy/
-!storage_scrubber/
+!s3_scrubber/
 !safekeeper/
 !storage_broker/
 !storage_controller/
--- a/.github/workflows/approved-for-ci-run.yml
+++ b/.github/workflows/approved-for-ci-run.yml
@@ -69,41 +69,15 @@ jobs:
        with:
          ref: main
          token: ${{ secrets.CI_ACCESS_TOKEN }}
-      
-      - name: Look for existing PR
-        id: get-pr
-        env:
-          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
-        run: |
-          ALREADY_CREATED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${BRANCH} --base main --json number --jq '.[].number')"
-          echo "ALREADY_CREATED=${ALREADY_CREATED}" >> ${GITHUB_OUTPUT}
-      
-      - name: Get changed labels
-        id: get-labels
-        if: steps.get-pr.outputs.ALREADY_CREATED != ''
-        env:
-          ALREADY_CREATED: ${{ steps.get-pr.outputs.ALREADY_CREATED }}
-          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
-        run: |
-          LABELS_TO_REMOVE=$(comm -23 <(gh pr --repo ${GITHUB_REPOSITORY} view ${ALREADY_CREATED} --json labels --jq '.labels.[].name'| ( grep -E '^run' || true ) | sort) \
-          <(gh pr --repo ${GITHUB_REPOSITORY} view ${PR_NUMBER} --json labels --jq '.labels.[].name' | ( grep -E '^run' || true ) | sort ) |\
-          ( grep -v run-e2e-tests-in-draft || true ) | paste -sd , -)
-          LABELS_TO_ADD=$(comm -13 <(gh pr --repo ${GITHUB_REPOSITORY} view ${ALREADY_CREATED} --json labels --jq '.labels.[].name'| ( grep -E '^run' || true ) |sort) \
-          <(gh pr --repo ${GITHUB_REPOSITORY} view ${PR_NUMBER} --json labels --jq '.labels.[].name' |  ( grep -E '^run' || true ) | sort ) |\
-          paste -sd , -)
-          echo "LABELS_TO_ADD=${LABELS_TO_ADD}" >> ${GITHUB_OUTPUT}
-          echo "LABELS_TO_REMOVE=${LABELS_TO_REMOVE}" >> ${GITHUB_OUTPUT}

      - run: gh pr checkout "${PR_NUMBER}"

      - run: git checkout -b "${BRANCH}"

      - run: git push --force origin "${BRANCH}"
-        if: steps.get-pr.outputs.ALREADY_CREATED == ''

      - name: Create a Pull Request for CI run (if required)
-        if: steps.get-pr.outputs.ALREADY_CREATED == ''
-        env: 
+        env:
          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
        run: |
          cat << EOF > body.md
@@ -114,33 +88,16 @@ jobs:
            Feel free to review/comment/discuss the original PR #${PR_NUMBER}.
          EOF

-          LABELS=$( (gh pr --repo "${GITHUB_REPOSITORY}" view ${PR_NUMBER}  --json labels --jq '.labels.[].name'; echo run-e2e-tests-in-draft  )| \
-          grep -E '^run' | paste -sd , -)
-          gh pr --repo "${GITHUB_REPOSITORY}" create --title "CI run for PR #${PR_NUMBER}" \
+          ALREADY_CREATED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${BRANCH} --base main --json number --jq '.[].number')"
+          if [ -z "${ALREADY_CREATED}" ]; then
+            gh pr --repo "${GITHUB_REPOSITORY}" create --title "CI run for PR #${PR_NUMBER}" \
                                                       --body-file "body.md" \
                                                       --head "${BRANCH}" \
                                                       --base "main" \
-                                                       --label ${LABELS} \
+                                                       --label "run-e2e-tests-in-draft" \
                                                       --draft
-      - name: Modify the existing pull request (if required)
-        if: steps.get-pr.outputs.ALREADY_CREATED != ''
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          LABELS_TO_ADD: ${{ steps.get-labels.outputs.LABELS_TO_ADD }}
-          LABELS_TO_REMOVE: ${{ steps.get-labels.outputs.LABELS_TO_REMOVE }}
-          ALREADY_CREATED: ${{ steps.get-pr.outputs.ALREADY_CREATED }}
-        run: |
-          ADD_CMD=
-          REMOVE_CMD=
-          [ -z "${LABELS_TO_ADD}" ] || ADD_CMD="--add-label ${LABELS_TO_ADD}"
-          [ -z "${LABELS_TO_REMOVE}" ] || REMOVE_CMD="--remove-label ${LABELS_TO_REMOVE}"
-          if [ -n "${ADD_CMD}" ] || [ -n "${REMOVE_CMD}" ]; then
-            gh pr --repo "${GITHUB_REPOSITORY}" edit ${ALREADY_CREATED} ${ADD_CMD} ${REMOVE_CMD}
          fi

-      - run: git push --force origin "${BRANCH}"
-        if: steps.get-pr.outputs.ALREADY_CREATED != ''
-             
  cleanup:
    # Close PRs and delete branchs if the original PR is closed.

--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -99,7 +99,7 @@ jobs:
        # Set --sparse-ordering option of pytest-order plugin
        # to ensure tests are running in order of appears in the file.
        # It's important for test_perf_pgbench.py::test_pgbench_remote_* tests
-        extra_params: -m remote_cluster --sparse-ordering --timeout 5400 --ignore test_runner/performance/test_perf_olap.py --ignore test_runner/performance/test_perf_pgvector_queries.py
+        extra_params: -m remote_cluster --sparse-ordering --timeout 5400 --ignore test_runner/performance/test_perf_olap.py
      env:
        BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -410,14 +410,14 @@ jobs:
        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}

-    - name: Benchmark pgvector queries
+    - name: Benchmark pgvector hnsw queries
      uses: ./.github/actions/run-python-test-set
      with:
        build_type: ${{ env.BUILD_TYPE }}
-        test_selection: performance/test_perf_pgvector_queries.py
+        test_selection: performance
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
-        extra_params: -m remote_cluster --timeout 21600 
+        extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_pgvector
      env:
        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
--- a/.github/workflows/build-build-tools-image.yml
+++ b/.github/workflows/build-build-tools-image.yml
@@ -30,6 +30,7 @@ jobs:
  check-image:
    uses: ./.github/workflows/check-build-tools-image.yml

+  # This job uses older version of GitHub Actions because it's run on gen2 runners, which don't support node 20 (for newer versions)
  build-image:
    needs: [ check-image ]
    if: needs.check-image.outputs.found == 'false'
@@ -54,7 +55,7 @@ jobs:
            exit 1
          fi

-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v3

      # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
      # The default value is ~/.docker
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -299,21 +299,21 @@ jobs:
        uses: actions/cache@v4
        with:
          path: pg_install/v14
-          key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
+          key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}

      - name: Cache postgres v15 build
        id: cache_pg_15
        uses: actions/cache@v4
        with:
          path: pg_install/v15
-          key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
+          key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}

      - name: Cache postgres v16 build
        id: cache_pg_16
        uses: actions/cache@v4
        with:
          path: pg_install/v16
-          key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
+          key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}

      - name: Build postgres v14
        if: steps.cache_pg_14.outputs.cache-hit != 'true'
@@ -337,8 +337,34 @@ jobs:
        run: |
          ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests

-      # Do install *before* running rust tests because they might recompile the
-      # binaries with different features/flags.
+      - name: Run rust tests
+        env:
+          NEXTEST_RETRIES: 3
+        run: |
+          #nextest does not yet support running doctests
+          cargo test --doc $CARGO_FLAGS $CARGO_FEATURES
+
+          for io_engine in std-fs tokio-epoll-uring ; do
+            NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
+          done
+
+          # Run separate tests for real S3
+          export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
+          export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
+          export REMOTE_STORAGE_S3_REGION=eu-central-1
+          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
+          ${cov_prefix} cargo nextest run $CARGO_FLAGS -E 'package(remote_storage)' -E 'test(test_real_s3)'
+
+          # Run separate tests for real Azure Blob Storage
+          # XXX: replace region with `eu-central-1`-like region
+          export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
+          export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
+          export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
+          export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
+          export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
+          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
+          ${cov_prefix} cargo nextest run $CARGO_FLAGS -E 'package(remote_storage)' -E 'test(test_real_azure)'
+
      - name: Install rust binaries
        run: |
          # Install target binaries
@@ -379,32 +405,6 @@ jobs:
            done
          fi

-      - name: Run rust tests
-        env:
-          NEXTEST_RETRIES: 3
-        run: |
-          #nextest does not yet support running doctests
-          cargo test --doc $CARGO_FLAGS $CARGO_FEATURES
-
-          for io_engine in std-fs tokio-epoll-uring ; do
-            NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
-          done
-
-          # Run separate tests for real S3
-          export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
-          export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
-          export REMOTE_STORAGE_S3_REGION=eu-central-1
-          ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_s3)'
-
-          # Run separate tests for real Azure Blob Storage
-          # XXX: replace region with `eu-central-1`-like region
-          export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
-          export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
-          export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
-          export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
-          export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
-          ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_azure)'
-
      - name: Install postgres binaries
        run: cp -a pg_install /tmp/neon/pg_install

@@ -859,26 +859,6 @@ jobs:
          tags: |
            neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}

-      - name: Build neon extensions test image
-        if: matrix.version == 'v16'
-        uses: docker/build-push-action@v5
-        with:
-          context: .
-          build-args: |
-            GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
-            PG_VERSION=${{ matrix.version }}
-            BUILD_TAG=${{ needs.tag.outputs.build-tag }}
-            TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
-          provenance: false
-          push: true
-          pull: true
-          file: Dockerfile.compute-node
-          target: neon-pg-ext-test
-          cache-from: type=registry,ref=neondatabase/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }}
-          cache-to: type=registry,ref=neondatabase/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }},mode=max
-          tags: |
-            neondatabase/neon-test-extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}-${{ matrix.arch }}
-
      - name: Build compute-tools image
        # compute-tools are Postgres independent, so build it only once
        if: matrix.version == 'v16'
@@ -922,13 +902,6 @@ jobs:
                                             neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \
                                             neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64

-      - name: Create multi-arch neon-test-extensions image
-        if: matrix.version == 'v16'
-        run: |
-          docker buildx imagetools create -t neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
-                                             neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \
-                                             neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64
-
      - name: Create multi-arch compute-tools image
        if: matrix.version == 'v16'
        run: |
@@ -965,7 +938,7 @@ jobs:

    steps:
      - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v1
        with:
          fetch-depth: 0

@@ -1047,7 +1020,7 @@ jobs:
            exit 1
          fi

-      - name: Verify docker-compose example and test extensions
+      - name: Verify docker-compose example
        timeout-minutes: 20
        run: env TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh

@@ -1101,8 +1074,6 @@ jobs:
                                                 $repo/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
            done
          done
-          docker buildx imagetools create -t neondatabase/neon-test-extensions-v16:latest \
-                                             neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }}

  trigger-custom-extensions-build-and-wait:
    needs: [ check-permissions, tag ]
--- a/.github/workflows/check-build-tools-image.yml
+++ b/.github/workflows/check-build-tools-image.yml
@@ -25,17 +25,26 @@ jobs:
      found: ${{ steps.check-image.outputs.found }}

    steps:
-      - uses: actions/checkout@v4
-
      - name: Get build-tools image tag for the current commit
        id: get-build-tools-tag
        env:
-          IMAGE_TAG: |
-            ${{ hashFiles('Dockerfile.build-tools',
-                          '.github/workflows/check-build-tools-image.yml',
-                          '.github/workflows/build-build-tools-image.yml') }}
+          # Usually, for COMMIT_SHA, we use `github.event.pull_request.head.sha || github.sha`, but here, even for PRs,
+          # we want to use `github.sha` i.e. point to a phantom merge commit to determine the image tag correctly.
+          COMMIT_SHA: ${{ github.sha }}
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
-          echo "image-tag=${IMAGE_TAG}" | tee -a $GITHUB_OUTPUT
+          LAST_BUILD_TOOLS_SHA=$(
+            gh api \
+              -H "Accept: application/vnd.github+json" \
+              -H "X-GitHub-Api-Version: 2022-11-28" \
+              --method GET \
+              --field path=Dockerfile.build-tools \
+              --field sha=${COMMIT_SHA} \
+              --field per_page=1 \
+              --jq ".[0].sha" \
+              "/repos/${GITHUB_REPOSITORY}/commits"
+          )
+          echo "image-tag=${LAST_BUILD_TOOLS_SHA}" | tee -a $GITHUB_OUTPUT

      - name: Check if such tag found in the registry
        id: check-image
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5109,6 +5109,53 @@ version = "1.0.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041"

+[[package]]
+name = "s3_scrubber"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "async-stream",
+ "aws-config",
+ "aws-sdk-s3",
+ "aws-smithy-async",
+ "bincode",
+ "bytes",
+ "camino",
+ "chrono",
+ "clap",
+ "crc32c",
+ "either",
+ "futures",
+ "futures-util",
+ "hex",
+ "histogram",
+ "itertools",
+ "once_cell",
+ "pageserver",
+ "pageserver_api",
+ "postgres_ffi",
+ "rand 0.8.5",
+ "remote_storage",
+ "reqwest 0.12.4",
+ "rustls 0.22.4",
+ "rustls-native-certs 0.7.0",
+ "serde",
+ "serde_json",
+ "serde_with",
+ "thiserror",
+ "tokio",
+ "tokio-postgres",
+ "tokio-postgres-rustls",
+ "tokio-rustls 0.25.0",
+ "tokio-stream",
+ "tokio-util",
+ "tracing",
+ "tracing-appender",
+ "tracing-subscriber",
+ "utils",
+ "workspace_hack",
+]
+
 [[package]]
 name = "safekeeper"
 version = "0.1.0"
@@ -5158,7 +5205,6 @@ dependencies = [
 "tokio-io-timeout",
 "tokio-postgres",
 "tokio-stream",
- "tokio-tar",
 "tokio-util",
 "toml_edit",
 "tracing",
@@ -5766,54 +5812,6 @@ dependencies = [
 "workspace_hack",
 ]

-[[package]]
-name = "storage_scrubber"
-version = "0.1.0"
-dependencies = [
- "anyhow",
- "async-stream",
- "aws-config",
- "aws-sdk-s3",
- "aws-smithy-async",
- "bincode",
- "bytes",
- "camino",
- "chrono",
- "clap",
- "crc32c",
- "either",
- "futures",
- "futures-util",
- "hex",
- "histogram",
- "humantime",
- "itertools",
- "once_cell",
- "pageserver",
- "pageserver_api",
- "postgres_ffi",
- "rand 0.8.5",
- "remote_storage",
- "reqwest 0.12.4",
- "rustls 0.22.4",
- "rustls-native-certs 0.7.0",
- "serde",
- "serde_json",
- "serde_with",
- "thiserror",
- "tokio",
- "tokio-postgres",
- "tokio-postgres-rustls",
- "tokio-rustls 0.25.0",
- "tokio-stream",
- "tokio-util",
- "tracing",
- "tracing-appender",
- "tracing-subscriber",
- "utils",
- "workspace_hack",
-]
-
 [[package]]
 name = "storcon_cli"
 version = "0.1.0"
@@ -5821,8 +5819,6 @@ dependencies = [
 "anyhow",
 "clap",
 "comfy-table",
- "futures",
- "humantime",
 "hyper 0.14.26",
 "pageserver_api",
 "pageserver_client",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -13,7 +13,7 @@ members = [
    "safekeeper",
    "storage_broker",
    "storage_controller",
-    "storage_scrubber",
+    "s3_scrubber",
    "workspace_hack",
    "trace",
    "libs/compute_api",
@@ -120,7 +120,7 @@ num_cpus = "1.15"
 num-traits = "0.2.15"
 once_cell = "1.13"
 opentelemetry = "0.20.0"
-opentelemetry-otlp = { version = "0.13.0", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
+opentelemetry-otlp = { version = "0.13.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
 opentelemetry-semantic-conventions = "0.12.0"
 parking_lot = "0.12"
 parquet = { version = "51.0.0", default-features = false, features = ["zstd"] }
@@ -128,7 +128,7 @@ parquet_derive = "51.0.0"
 pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
 pin-project-lite = "0.2"
 procfs = "0.14"
-prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency
+prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
 prost = "0.11"
 rand = "0.8"
 redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] }
@@ -184,7 +184,7 @@ tower-service = "0.3.2"
 tracing = "0.1"
 tracing-error = "0.2.0"
 tracing-opentelemetry = "0.21.0"
-tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] }
+tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] }
 twox-hash = { version = "1.6.3", default-features = false }
 url = "2.2"
 urlencoding = "2.1"
--- a/2
+++ b/2
@@ -69,6 +69,8 @@ RUN set -e \
    && apt install -y \
        libreadline-dev \
        libseccomp-dev \
+        libicu67 \
+        openssl \
        ca-certificates \
    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \
    && useradd -d /data neon \
--- a/Dockerfile.build-tools
+++ b/Dockerfile.build-tools
@@ -112,45 +112,6 @@ RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JS
    && make install \
    && rm -rf ../lcov.tar.gz

-# Compile and install the static OpenSSL library
-ENV OPENSSL_VERSION=3.2.2
-ENV OPENSSL_PREFIX=/usr/local/openssl
-RUN wget -O /tmp/openssl-${OPENSSL_VERSION}.tar.gz https://www.openssl.org/source/openssl-${OPENSSL_VERSION}.tar.gz && \
-    echo "197149c18d9e9f292c43f0400acaba12e5f52cacfe050f3d199277ea738ec2e7 /tmp/openssl-${OPENSSL_VERSION}.tar.gz" | sha256sum --check && \
-    cd /tmp && \
-    tar xzvf /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \
-    rm /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \
-    cd /tmp/openssl-${OPENSSL_VERSION} && \
-    ./config --prefix=${OPENSSL_PREFIX}  -static --static no-shared -fPIC && \
-    make -j "$(nproc)" && \
-    make install && \
-    cd /tmp && \
-    rm -rf /tmp/openssl-${OPENSSL_VERSION}
-
-# Use the same version of libicu as the compute nodes so that
-# clusters created using inidb on pageserver can be used by computes.
-#
-# TODO: at this time, Dockerfile.compute-node uses the debian bullseye libicu
-# package, which is 67.1. We're duplicating that knowledge here, and also, technically,
-# Debian has a few patches on top of 67.1 that we're not adding here.
-ENV ICU_VERSION=67.1
-ENV ICU_PREFIX=/usr/local/icu
-
-# Download and build static ICU
-RUN wget -O /tmp/libicu-${ICU_VERSION}.tgz https://github.com/unicode-org/icu/releases/download/release-${ICU_VERSION//./-}/icu4c-${ICU_VERSION//./_}-src.tgz && \
-    echo "94a80cd6f251a53bd2a997f6f1b5ac6653fe791dfab66e1eb0227740fb86d5dc /tmp/libicu-${ICU_VERSION}.tgz" | sha256sum --check && \
-    mkdir /tmp/icu && \
-    pushd /tmp/icu && \
-    tar -xzf /tmp/libicu-${ICU_VERSION}.tgz && \
-    pushd icu/source && \
-    ./configure --prefix=${ICU_PREFIX}  --enable-static --enable-shared=no CXXFLAGS="-fPIC" CFLAGS="-fPIC" && \
-    make -j "$(nproc)" && \
-    make install && \
-    popd && \
-    rm -rf icu && \
-    rm -f /tmp/libicu-${ICU_VERSION}.tgz && \
-    popd
-
 # Switch to nonroot user
 USER nonroot:nonroot
 WORKDIR /home/nonroot
@@ -180,7 +141,7 @@ WORKDIR /home/nonroot

 # Rust
 # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.79.0
+ENV RUSTC_VERSION=1.78.0
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
@@ -209,6 +170,3 @@ RUN whoami \
    && rustup --version --verbose \
    && rustc --version --verbose \
    && clang --version
-
-# Set following flag to check in Makefile if its running in Docker
-RUN touch /home/nonroot/.docker_build
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -89,7 +89,7 @@ RUN apt update && \
 # SFCGAL > 1.3 requires CGAL > 5.2, Bullseye's libcgal-dev is 5.2
 RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \
    echo "4e39b3b2adada6254a7bdba6d297bb28e1a9835a9f879b74f37e2dab70203232 SFCGAL.tar.gz" | sha256sum --check && \
-    mkdir sfcgal-src && cd sfcgal-src && tar xzf ../SFCGAL.tar.gz --strip-components=1 -C . && \
+    mkdir sfcgal-src && cd sfcgal-src && tar xvzf ../SFCGAL.tar.gz --strip-components=1 -C . && \
    cmake -DCMAKE_BUILD_TYPE=Release . && make -j $(getconf _NPROCESSORS_ONLN) && \
    DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \
    make clean && cp -R /sfcgal/* /
@@ -98,7 +98,7 @@ ENV PATH "/usr/local/pgsql/bin:$PATH"

 RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \
    echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \
-    mkdir postgis-src && cd postgis-src && tar xzf ../postgis.tar.gz --strip-components=1 -C . && \
+    mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . && \
    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
    ./autogen.sh && \
    ./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \
@@ -124,7 +124,7 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postg

 RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \
    echo "cac297c07d34460887c4f3b522b35c470138760fe358e351ad1db4edb6ee306e pgrouting.tar.gz" | sha256sum --check && \
-    mkdir pgrouting-src && cd pgrouting-src && tar xzf ../pgrouting.tar.gz --strip-components=1 -C . && \
+    mkdir pgrouting-src && cd pgrouting-src && tar xvzf ../pgrouting.tar.gz --strip-components=1 -C . && \
    mkdir build && cd build && \
    cmake -DCMAKE_BUILD_TYPE=Release .. && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -149,7 +149,7 @@ RUN apt update && \

 RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \
    echo "7096c3290928561f0d4901b7a52794295dc47f6303102fae3f8e42dd575ad97d plv8.tar.gz" | sha256sum --check && \
-    mkdir plv8-src && cd plv8-src && tar xzf ../plv8.tar.gz --strip-components=1 -C . && \
+    mkdir plv8-src && cd plv8-src && tar xvzf ../plv8.tar.gz --strip-components=1 -C . && \
    # generate and copy upgrade scripts
    mkdir -p upgrade && ./generate_upgrade.sh 3.1.10 && \
    cp upgrade/* /usr/local/pgsql/share/extension/ && \
@@ -194,7 +194,7 @@ RUN case "$(uname -m)" in \

 RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \
    echo "ec99f1f5974846bde64f4513cf8d2ea1b8d172d2218ab41803bf6a63532272bc h3.tar.gz" | sha256sum --check && \
-    mkdir h3-src && cd h3-src && tar xzf ../h3.tar.gz --strip-components=1 -C . && \
+    mkdir h3-src && cd h3-src && tar xvzf ../h3.tar.gz --strip-components=1 -C . && \
    mkdir build && cd build && \
    cmake .. -DCMAKE_BUILD_TYPE=Release && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -204,7 +204,7 @@ RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz

 RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \
    echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \
-    mkdir h3-pg-src && cd h3-pg-src && tar xzf ../h3-pg.tar.gz --strip-components=1 -C . && \
+    mkdir h3-pg-src && cd h3-pg-src && tar xvzf ../h3-pg.tar.gz --strip-components=1 -C . && \
    export PATH="/usr/local/pgsql/bin:$PATH" && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -222,7 +222,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -O postgresql-unit.tar.gz && \
    echo "411d05beeb97e5a4abf17572bfcfbb5a68d98d1018918feff995f6ee3bb03e79 postgresql-unit.tar.gz" | sha256sum --check && \
-    mkdir postgresql-unit-src && cd postgresql-unit-src && tar xzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \
+    mkdir postgresql-unit-src && cd postgresql-unit-src && tar xvzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    # unit extension's "create extension" script relies on absolute install path to fill some reference tables.
@@ -243,12 +243,12 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 COPY patches/pgvector.patch /pgvector.patch

-# By default, pgvector Makefile uses `-march=native`. We don't want that,
+# By default, pgvector Makefile uses `-march=native`. We don't want that, 
 # because we build the images on different machines than where we run them.
 # Pass OPTFLAGS="" to remove it.
-RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.2.tar.gz -O pgvector.tar.gz && \
-    echo "617fba855c9bcb41a2a9bc78a78567fd2e147c72afd5bf9d37b31b9591632b30 pgvector.tar.gz" | sha256sum --check && \
-    mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \
+RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.0.tar.gz -O pgvector.tar.gz && \
+    echo "1b5503a35c265408b6eb282621c5e1e75f7801afc04eecb950796cfee2e3d1d8 pgvector.tar.gz" | sha256sum --check && \
+    mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
    patch -p1 < /pgvector.patch && \
    make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -266,7 +266,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 # 9742dab1b2f297ad3811120db7b21451bca2d3c9 made on 13/11/2021
 RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b21451bca2d3c9.tar.gz -O pgjwt.tar.gz && \
    echo "cfdefb15007286f67d3d45510f04a6a7a495004be5b3aecb12cda667e774203f pgjwt.tar.gz" | sha256sum --check && \
-    mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . && \
+    mkdir pgjwt-src && cd pgjwt-src && tar xvzf ../pgjwt.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgjwt.control

@@ -281,7 +281,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypopg.tar.gz && \
    echo "0821011743083226fc9b813c1f2ef5897a91901b57b6bea85a78e466187c6819 hypopg.tar.gz" | sha256sum --check && \
-    mkdir hypopg-src && cd hypopg-src && tar xzf ../hypopg.tar.gz --strip-components=1 -C . && \
+    mkdir hypopg-src && cd hypopg-src && tar xvzf ../hypopg.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/hypopg.control
@@ -297,7 +297,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \
    echo "74576b992d9277c92196dd8d816baa2cc2d8046fe102f3dcd7f3c3febed6822a pg_hashids.tar.gz" | sha256sum --check && \
-    mkdir pg_hashids-src && cd pg_hashids-src && tar xzf ../pg_hashids.tar.gz --strip-components=1 -C . && \
+    mkdir pg_hashids-src && cd pg_hashids-src && tar xvzf ../pg_hashids.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_hashids.control
@@ -313,7 +313,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \
    echo "6ab370532c965568df6210bd844ac6ba649f53055e48243525b0b7e5c4d69a7d rum.tar.gz" | sha256sum --check && \
-    mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . && \
+    mkdir rum-src && cd rum-src && tar xvzf ../rum.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control
@@ -329,7 +329,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgtap.tar.gz && \
    echo "9c7c3de67ea41638e14f06da5da57bac6f5bd03fea05c165a0ec862205a5c052 pgtap.tar.gz" | sha256sum --check && \
-    mkdir pgtap-src && cd pgtap-src && tar xzf ../pgtap.tar.gz --strip-components=1 -C . && \
+    mkdir pgtap-src && cd pgtap-src && tar xvzf ../pgtap.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgtap.control
@@ -345,7 +345,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \
    echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \
-    mkdir ip4r-src && cd ip4r-src && tar xzf ../ip4r.tar.gz --strip-components=1 -C . && \
+    mkdir ip4r-src && cd ip4r-src && tar xvzf ../ip4r.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/ip4r.control
@@ -361,7 +361,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \
    echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \
-    mkdir prefix-src && cd prefix-src && tar xzf ../prefix.tar.gz --strip-components=1 -C . && \
+    mkdir prefix-src && cd prefix-src && tar xvzf ../prefix.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/prefix.control
@@ -377,7 +377,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \
    echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \
-    mkdir hll-src && cd hll-src && tar xzf ../hll.tar.gz --strip-components=1 -C . && \
+    mkdir hll-src && cd hll-src && tar xvzf ../hll.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/hll.control
@@ -393,7 +393,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.5.3.tar.gz -O plpgsql_check.tar.gz && \
    echo "6631ec3e7fb3769eaaf56e3dfedb829aa761abf163d13dba354b4c218508e1c0 plpgsql_check.tar.gz" | sha256sum --check && \
-    mkdir plpgsql_check-src && cd plpgsql_check-src && tar xzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \
+    mkdir plpgsql_check-src && cd plpgsql_check-src && tar xvzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/plpgsql_check.control
@@ -424,7 +424,7 @@ RUN case "${PG_VERSION}" in \
    apt-get install -y cmake && \
    wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \
    echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \
-    mkdir timescaledb-src && cd timescaledb-src && tar xzf ../timescaledb.tar.gz --strip-components=1 -C . && \
+    mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \
    ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \
    cd build && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -462,7 +462,7 @@ RUN case "${PG_VERSION}" in \
    esac && \
    wget https://github.com/ossc-db/pg_hint_plan/archive/refs/tags/REL${PG_HINT_PLAN_VERSION}.tar.gz -O pg_hint_plan.tar.gz && \
    echo "${PG_HINT_PLAN_CHECKSUM} pg_hint_plan.tar.gz" | sha256sum --check && \
-    mkdir pg_hint_plan-src && cd pg_hint_plan-src && tar xzf ../pg_hint_plan.tar.gz --strip-components=1 -C . && \
+    mkdir pg_hint_plan-src && cd pg_hint_plan-src && tar xvzf ../pg_hint_plan.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make install -j $(getconf _NPROCESSORS_ONLN) && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_hint_plan.control
@@ -481,7 +481,7 @@ RUN apt-get update && \
    apt-get install -y git libgtk2.0-dev libpq-dev libpam-dev libxslt-dev libkrb5-dev cmake && \
    wget https://github.com/ketteq-neon/postgres-exts/archive/e0bd1a9d9313d7120c1b9c7bb15c48c0dede4c4e.tar.gz -O kq_imcx.tar.gz && \
    echo "dc93a97ff32d152d32737ba7e196d9687041cda15e58ab31344c2f2de8855336 kq_imcx.tar.gz" | sha256sum --check && \
-    mkdir kq_imcx-src && cd kq_imcx-src && tar xzf ../kq_imcx.tar.gz --strip-components=1 -C . && \
+    mkdir kq_imcx-src && cd kq_imcx-src && tar xvzf ../kq_imcx.tar.gz --strip-components=1 -C . && \
    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
    mkdir build && cd build && \
    cmake -DCMAKE_BUILD_TYPE=Release .. && \
@@ -505,7 +505,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \
    echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \
-    mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . && \
+    mkdir pg_cron-src && cd pg_cron-src && tar xvzf ../pg_cron.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_cron.control
@@ -531,7 +531,7 @@ RUN apt-get update && \
 ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
 RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \
    echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \
-    mkdir rdkit-src && cd rdkit-src && tar xzf ../rdkit.tar.gz --strip-components=1 -C . && \
+    mkdir rdkit-src && cd rdkit-src && tar xvzf ../rdkit.tar.gz --strip-components=1 -C . && \
    cmake \
        -D RDK_BUILD_CAIRO_SUPPORT=OFF \
        -D RDK_BUILD_INCHI_SUPPORT=ON \
@@ -571,7 +571,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \
    echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \
-    mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \
+    mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xvzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_uuidv7.control
@@ -588,7 +588,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \
    echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \
-    mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \
+    mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xvzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/roaringbitmap.control
@@ -605,7 +605,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \
    echo "fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 pg_semver.tar.gz" | sha256sum --check && \
-    mkdir pg_semver-src && cd pg_semver-src && tar xzf ../pg_semver.tar.gz --strip-components=1 -C . && \
+    mkdir pg_semver-src && cd pg_semver-src && tar xvzf ../pg_semver.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/semver.control
@@ -631,7 +631,7 @@ RUN case "${PG_VERSION}" in \
    esac && \
    wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/${PG_EMBEDDING_VERSION}.tar.gz -O pg_embedding.tar.gz && \
    echo "${PG_EMBEDDING_CHECKSUM} pg_embedding.tar.gz" | sha256sum --check && \
-    mkdir pg_embedding-src && cd pg_embedding-src && tar xzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
+    mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install

@@ -647,7 +647,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget  https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \
    echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9  pg_anon.tar.gz" | sha256sum --check && \
-    mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \
+    mkdir pg_anon-src && cd pg_anon-src && tar xvzf ../pg_anon.tar.gz --strip-components=1 -C . && \
    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \
@@ -696,7 +696,7 @@ ARG PG_VERSION

 RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.2.0.tar.gz -O pg_jsonschema.tar.gz && \
    echo "9118fc508a6e231e7a39acaa6f066fcd79af17a5db757b47d2eefbe14f7794f0 pg_jsonschema.tar.gz" | sha256sum --check && \
-    mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \
+    mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xvzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \
    sed -i 's/pgrx = "0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
    cargo pgrx install --release && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control
@@ -713,7 +713,7 @@ ARG PG_VERSION

 RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.4.0.tar.gz -O pg_graphql.tar.gz && \
    echo "bd8dc7230282b3efa9ae5baf053a54151ed0e66881c7c53750e2d0c765776edc pg_graphql.tar.gz" | sha256sum --check && \
-    mkdir pg_graphql-src && cd pg_graphql-src && tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . && \
+    mkdir pg_graphql-src && cd pg_graphql-src && tar xvzf ../pg_graphql.tar.gz --strip-components=1 -C . && \
    sed -i 's/pgrx = "=0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
    cargo pgrx install --release && \
    # it's needed to enable extension because it uses untrusted C language
@@ -733,7 +733,7 @@ ARG PG_VERSION
 # 26806147b17b60763039c6a6878884c41a262318 made on 26/09/2023
 RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \
    echo "e64e55aaa38c259512d3e27c572da22c4637418cf124caba904cd50944e5004e pg_tiktoken.tar.gz" | sha256sum --check && \
-    mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \
+    mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xvzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \
    cargo pgrx install --release && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control

@@ -749,7 +749,7 @@ ARG PG_VERSION

 RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.3.tar.gz -O pgx_ulid.tar.gz && \
    echo "ee5db82945d2d9f2d15597a80cf32de9dca67b897f605beb830561705f12683c pgx_ulid.tar.gz" | sha256sum --check && \
-    mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
+    mkdir pgx_ulid-src && cd pgx_ulid-src && tar xvzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
    echo "******************* Apply a patch for Postgres 16 support; delete in the next release ******************" && \
    wget https://github.com/pksunkara/pgx_ulid/commit/f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \
    patch -p1 < f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \
@@ -771,7 +771,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \
    echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \
-    mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \
+    mkdir wal2json-src && cd wal2json-src && tar xvzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install

@@ -787,7 +787,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \
    echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \
-    mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . && \
+    mkdir pg_ivm-src && cd pg_ivm-src && tar xvzf ../pg_ivm.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_ivm.control
@@ -804,7 +804,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \
    echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \
-    mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . && \
+    mkdir pg_partman-src && cd pg_partman-src && tar xvzf ../pg_partman.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_partman.control
@@ -928,69 +928,6 @@ RUN rm -r /usr/local/pgsql/include
 # if they were to be used by other libraries.
 RUN rm /usr/local/pgsql/lib/lib*.a

-
-#########################################################################################
-#
-# Layer neon-pg-ext-test
-#
-#########################################################################################
-
-FROM neon-pg-ext-build AS neon-pg-ext-test
-ARG PG_VERSION
-RUN mkdir /ext-src
-
-#COPY --from=postgis-build /postgis.tar.gz /ext-src/
-#COPY --from=postgis-build /sfcgal/* /usr
-COPY --from=plv8-build /plv8.tar.gz /ext-src/
-COPY --from=h3-pg-build /h3-pg.tar.gz /ext-src/
-COPY --from=unit-pg-build /postgresql-unit.tar.gz /ext-src/
-COPY --from=vector-pg-build /pgvector.tar.gz /ext-src/
-COPY --from=vector-pg-build /pgvector.patch /ext-src/
-COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src
-#COPY --from=pg-jsonschema-pg-build /home/nonroot/pg_jsonschema.tar.gz /ext-src
-#COPY --from=pg-graphql-pg-build /home/nonroot/pg_graphql.tar.gz /ext-src
-#COPY --from=pg-tiktoken-pg-build /home/nonroot/pg_tiktoken.tar.gz /ext-src
-COPY --from=hypopg-pg-build /hypopg.tar.gz /ext-src
-COPY --from=pg-hashids-pg-build /pg_hashids.tar.gz /ext-src
-#COPY --from=rum-pg-build /rum.tar.gz /ext-src
-#COPY --from=pgtap-pg-build /pgtap.tar.gz /ext-src
-COPY --from=ip4r-pg-build /ip4r.tar.gz /ext-src
-COPY --from=prefix-pg-build /prefix.tar.gz /ext-src
-COPY --from=hll-pg-build /hll.tar.gz /ext-src
-COPY --from=plpgsql-check-pg-build /plpgsql_check.tar.gz /ext-src
-#COPY --from=timescaledb-pg-build /timescaledb.tar.gz /ext-src
-COPY --from=pg-hint-plan-pg-build /pg_hint_plan.tar.gz /ext-src
-COPY patches/pg_hintplan.patch /ext-src
-#COPY --from=kq-imcx-pg-build /kq_imcx.tar.gz /ext-src
-COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src
-COPY patches/pg_cron.patch /ext-src
-#COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src
-COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src
-COPY --from=pg-uuidv7-pg-build /pg_uuidv7.tar.gz /ext-src
-COPY --from=pg-roaringbitmap-pg-build /pg_roaringbitmap.tar.gz /ext-src
-COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src
-#COPY --from=pg-embedding-pg-build /home/nonroot/pg_embedding-src/ /ext-src
-#COPY --from=wal2json-pg-build /wal2json_2_5.tar.gz /ext-src
-COPY --from=pg-anon-pg-build /pg_anon.tar.gz /ext-src
-COPY patches/pg_anon.patch /ext-src
-COPY --from=pg-ivm-build /pg_ivm.tar.gz /ext-src
-COPY --from=pg-partman-build /pg_partman.tar.gz /ext-src
-RUN cd /ext-src/ && for f in *.tar.gz; \
-    do echo $f; dname=$(echo $f | sed 's/\.tar.*//')-src; \
-    rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \
-    || exit 1; rm -f $f; done
-RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch
-# cmake is required for the h3 test
-RUN apt-get update && apt-get install -y cmake
-RUN patch -p1 < /ext-src/pg_hintplan.patch
-COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh
-RUN patch -p1 </ext-src/pg_anon.patch
-RUN patch -p1 </ext-src/pg_cron.patch
-ENV PATH=/usr/local/pgsql/bin:$PATH
-ENV PGHOST=compute
-ENV PGPORT=55433
-ENV PGUSER=cloud_admin
-ENV PGDATABASE=postgres
 #########################################################################################
 #
 # Final layer
--- a/17
+++ b/17
@@ -3,9 +3,6 @@ ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
 # Where to install Postgres, default is ./pg_install, maybe useful for package managers
 POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/pg_install/

-OPENSSL_PREFIX_DIR := /usr/local/openssl
-ICU_PREFIX_DIR := /usr/local/icu
-
 #
 # We differentiate between release / debug build types using the BUILD_TYPE
 # environment variable.
@@ -23,16 +20,6 @@ else
 	$(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
 endif

-ifeq ($(shell test -e /home/nonroot/.docker_build && echo -n yes),yes)
-	# Exclude static build openssl, icu for local build (MacOS, Linux)
-	# Only keep for build type release and debug
-	PG_CFLAGS += -I$(OPENSSL_PREFIX_DIR)/include
-	PG_CONFIGURE_OPTS += --with-icu
-	PG_CONFIGURE_OPTS += ICU_CFLAGS='-I/$(ICU_PREFIX_DIR)/include -DU_STATIC_IMPLEMENTATION'
-	PG_CONFIGURE_OPTS += ICU_LIBS='-L$(ICU_PREFIX_DIR)/lib -L$(ICU_PREFIX_DIR)/lib64 -licui18n -licuuc -licudata -lstdc++ -Wl,-Bdynamic -lm'
-	PG_CONFIGURE_OPTS += LDFLAGS='-L$(OPENSSL_PREFIX_DIR)/lib -L$(OPENSSL_PREFIX_DIR)/lib64 -L$(ICU_PREFIX_DIR)/lib -L$(ICU_PREFIX_DIR)/lib64 -Wl,-Bstatic -lssl -lcrypto -Wl,-Bdynamic -lrt -lm -ldl -lpthread'
-endif
-
 UNAME_S := $(shell uname -s)
 ifeq ($(UNAME_S),Linux)
 	# Seccomp BPF is only available for Linux
@@ -41,7 +28,7 @@ else ifeq ($(UNAME_S),Darwin)
 	ifndef DISABLE_HOMEBREW
 		# macOS with brew-installed openssl requires explicit paths
 		# It can be configured with OPENSSL_PREFIX variable
-		OPENSSL_PREFIX := $(shell brew --prefix openssl@3)
+		OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
 		PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
 		PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig
 		# macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure
@@ -137,8 +124,6 @@ postgres-%: postgres-configure-% \
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install
 	+@echo "Compiling amcheck $*"
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/amcheck install
-	+@echo "Compiling test_decoding $*"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/test_decoding install

 .PHONY: postgres-clean-%
 postgres-clean-%:
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -735,7 +735,7 @@ fn cli() -> clap::Command {
            Arg::new("filecache-connstr")
                .long("filecache-connstr")
                .default_value(
-                    "host=localhost port=5432 dbname=postgres user=cloud_admin sslmode=disable application_name=vm-monitor",
+                    "host=localhost port=5432 dbname=postgres user=cloud_admin sslmode=disable",
                )
                .value_name("FILECACHE_CONNSTR"),
        )
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -918,39 +918,38 @@ impl ComputeNode {
        // temporarily reset max_cluster_size in config
        // to avoid the possibility of hitting the limit, while we are reconfiguring:
        // creating new extensions, roles, etc...
-        config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || {
-            self.pg_reload_conf()?;
+        config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?;
+        self.pg_reload_conf()?;

-            let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
+        let mut client = Client::connect(self.connstr.as_str(), NoTls)?;

-            // Proceed with post-startup configuration. Note, that order of operations is important.
-            // Disable DDL forwarding because control plane already knows about these roles/databases.
-            if spec.mode == ComputeMode::Primary {
-                client.simple_query("SET neon.forward_ddl = false")?;
-                cleanup_instance(&mut client)?;
-                handle_roles(&spec, &mut client)?;
-                handle_databases(&spec, &mut client)?;
-                handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?;
-                handle_grants(
-                    &spec,
-                    &mut client,
-                    self.connstr.as_str(),
-                    self.has_feature(ComputeFeature::AnonExtension),
-                )?;
-                handle_extensions(&spec, &mut client)?;
-                handle_extension_neon(&mut client)?;
-                // We can skip handle_migrations here because a new migration can only appear
-                // if we have a new version of the compute_ctl binary, which can only happen
-                // if compute got restarted, in which case we'll end up inside of apply_config
-                // instead of reconfigure.
-            }
+        // Proceed with post-startup configuration. Note, that order of operations is important.
+        // Disable DDL forwarding because control plane already knows about these roles/databases.
+        if spec.mode == ComputeMode::Primary {
+            client.simple_query("SET neon.forward_ddl = false")?;
+            cleanup_instance(&mut client)?;
+            handle_roles(&spec, &mut client)?;
+            handle_databases(&spec, &mut client)?;
+            handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?;
+            handle_grants(
+                &spec,
+                &mut client,
+                self.connstr.as_str(),
+                self.has_feature(ComputeFeature::AnonExtension),
+            )?;
+            handle_extensions(&spec, &mut client)?;
+            handle_extension_neon(&mut client)?;
+            // We can skip handle_migrations here because a new migration can only appear
+            // if we have a new version of the compute_ctl binary, which can only happen
+            // if compute got restarted, in which case we'll end up inside of apply_config
+            // instead of reconfigure.
+        }

-            // 'Close' connection
-            drop(client);
-
-            Ok(())
-        })?;
+        // 'Close' connection
+        drop(client);

+        // Remove the temporary override so max_cluster_size reverts to its configured value, then reload the config.
+        config::compute_ctl_temp_override_remove(pgdata_path)?;
        self.pg_reload_conf()?;

        let unknown_op = "unknown".to_string();
@@ -1041,17 +1040,12 @@ impl ComputeNode {
                // temporarily reset max_cluster_size in config
                // to avoid the possibility of hitting the limit, while we are applying config:
                // creating new extensions, roles, etc...
-                config::with_compute_ctl_tmp_override(
-                    pgdata_path,
-                    "neon.max_cluster_size=-1",
-                    || {
-                        self.pg_reload_conf()?;
+                config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?;
+                self.pg_reload_conf()?;

-                        self.apply_config(&compute_state)?;
+                self.apply_config(&compute_state)?;

-                        Ok(())
-                    },
-                )?;
+                config::compute_ctl_temp_override_remove(pgdata_path)?;
                self.pg_reload_conf()?;
            }
            self.post_apply_config()?;
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -131,17 +131,18 @@ pub fn write_postgres_conf(
    Ok(())
 }

-pub fn with_compute_ctl_tmp_override<F>(pgdata_path: &Path, options: &str, exec: F) -> Result<()>
-where
-    F: FnOnce() -> Result<()>,
-{
+/// Create (or truncate) the file compute_ctl_temp_override.conf in pgdata_path
+/// and write the provided options to it.
+pub fn compute_ctl_temp_override_create(pgdata_path: &Path, options: &str) -> Result<()> {
    let path = pgdata_path.join("compute_ctl_temp_override.conf");
    let mut file = File::create(path)?;
    write!(file, "{}", options)?;
-
-    let res = exec();
-
-    file.set_len(0)?;
-
-    res
+    Ok(())
+}
+
+/// Remove the file compute_ctl_temp_override.conf from pgdata_path; fails if it does not exist.
+pub fn compute_ctl_temp_override_remove(pgdata_path: &Path) -> Result<()> {
+    let path = pgdata_path.join("compute_ctl_temp_override.conf");
+    std::fs::remove_file(path)?;
+    Ok(())
 }
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -17,7 +17,7 @@ use hyper::header::CONTENT_TYPE;
 use hyper::service::{make_service_fn, service_fn};
 use hyper::{Body, Method, Request, Response, Server, StatusCode};
 use tokio::task;
-use tracing::{debug, error, info, warn};
+use tracing::{error, info, warn};
 use tracing_utils::http::OtelName;
 use utils::http::request::must_get_query_param;

@@ -48,7 +48,7 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
    match (req.method(), req.uri().path()) {
        // Serialized compute state.
        (&Method::GET, "/status") => {
-            debug!("serving /status GET request");
+            info!("serving /status GET request");
            let state = compute.state.lock().unwrap();
            let status_response = status_response_from_state(&state);
            Response::new(Body::from(serde_json::to_string(&status_response).unwrap()))
--- a/compute_tools/src/migrations/0000-neon_superuser_bypass_rls.sql
+++ b/compute_tools/src/migrations/0000-neon_superuser_bypass_rls.sql
@@ -1 +0,0 @@
-ALTER ROLE neon_superuser BYPASSRLS;
--- a/compute_tools/src/migrations/0001-alter_roles.sql
+++ b/compute_tools/src/migrations/0001-alter_roles.sql
@@ -1,18 +0,0 @@
-DO $$
-DECLARE
-    role_name text;
-BEGIN
-    FOR role_name IN SELECT rolname FROM pg_roles WHERE pg_has_role(rolname, 'neon_superuser', 'member')
-    LOOP
-        RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', quote_ident(role_name);
-        EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' INHERIT';
-    END LOOP;
-
-    FOR role_name IN SELECT rolname FROM pg_roles
-        WHERE
-            NOT pg_has_role(rolname, 'neon_superuser', 'member') AND NOT starts_with(rolname, 'pg_')
-    LOOP
-        RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', quote_ident(role_name);
-        EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOBYPASSRLS';
-    END LOOP;
-END $$;
--- a/compute_tools/src/migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql
@@ -1,6 +0,0 @@
-DO $$
-BEGIN
-    IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN
-        EXECUTE 'GRANT pg_create_subscription TO neon_superuser';
-    END IF;
-END $$;
--- a/compute_tools/src/migrations/0003-grant_pg_monitor_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/0003-grant_pg_monitor_to_neon_superuser.sql
@@ -1 +0,0 @@
-GRANT pg_monitor TO neon_superuser WITH ADMIN OPTION;
--- a/compute_tools/src/migrations/0004-grant_all_on_tables_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/0004-grant_all_on_tables_to_neon_superuser.sql
@@ -1,4 +0,0 @@
-- SKIP: Deemed insufficient for allowing relations created by extensions to be
--       interacted with by neon_superuser without permission issues.
-
-ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser;
--- a/compute_tools/src/migrations/0005-grant_all_on_sequences_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/0005-grant_all_on_sequences_to_neon_superuser.sql
@@ -1,4 +0,0 @@
-- SKIP: Deemed insufficient for allowing relations created by extensions to be
--       interacted with by neon_superuser without permission issues.
-
-ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser;
--- a/compute_tools/src/migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql
+++ b/compute_tools/src/migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql
@@ -1,3 +0,0 @@
-- SKIP: Moved inline to the handle_grants() functions.
-
-ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION;
--- a/compute_tools/src/migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql
+++ b/compute_tools/src/migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql
@@ -1,3 +0,0 @@
-- SKIP: Moved inline to the handle_grants() functions.
-
-ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser WITH GRANT OPTION;
--- a/compute_tools/src/migrations/0008-revoke_replication_for_previously_allowed_roles.sql
+++ b/compute_tools/src/migrations/0008-revoke_replication_for_previously_allowed_roles.sql
@@ -1,13 +0,0 @@
-- SKIP: The original goal of this migration was to prevent creating
--       subscriptions, but this migration was insufficient.
-
-DO $$
-DECLARE
-    role_name TEXT;
-BEGIN
-    FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE
-    LOOP
-        RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name);
-        EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION';
-    END LOOP;
-END $$;
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -774,21 +774,44 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> {
    // !BE SURE TO ONLY ADD MIGRATIONS TO THE END OF THIS ARRAY. IF YOU DO NOT, VERY VERY BAD THINGS MAY HAPPEN!
    // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

-    // Add new migrations in numerical order.
    let migrations = [
-        include_str!("./migrations/0000-neon_superuser_bypass_rls.sql"),
-        include_str!("./migrations/0001-alter_roles.sql"),
-        include_str!("./migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql"),
-        include_str!("./migrations/0003-grant_pg_monitor_to_neon_superuser.sql"),
-        include_str!("./migrations/0004-grant_all_on_tables_to_neon_superuser.sql"),
-        include_str!("./migrations/0005-grant_all_on_sequences_to_neon_superuser.sql"),
-        include_str!(
-            "./migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql"
-        ),
-        include_str!(
-            "./migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql"
-        ),
-        include_str!("./migrations/0008-revoke_replication_for_previously_allowed_roles.sql"),
+        "ALTER ROLE neon_superuser BYPASSRLS",
+        r#"
+DO $$
+DECLARE
+    role_name text;
+BEGIN
+    FOR role_name IN SELECT rolname FROM pg_roles WHERE pg_has_role(rolname, 'neon_superuser', 'member')
+    LOOP
+        RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', quote_ident(role_name);
+        EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' INHERIT';
+    END LOOP;
+
+    FOR role_name IN SELECT rolname FROM pg_roles
+        WHERE
+            NOT pg_has_role(rolname, 'neon_superuser', 'member') AND NOT starts_with(rolname, 'pg_')
+    LOOP
+        RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', quote_ident(role_name);
+        EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOBYPASSRLS';
+    END LOOP;
+END $$;
+"#,
+        r#"
+DO $$
+BEGIN
+    IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN
+        EXECUTE 'GRANT pg_create_subscription TO neon_superuser';
+    END IF;
+END
+$$;"#,
+        "GRANT pg_monitor TO neon_superuser WITH ADMIN OPTION",
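+        // Don't remove these empty strings: they are placeholders for SQL that was originally applied as migrations but is now executed elsewhere; they are skipped at run time and keep the ids of the other migrations unchanged.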
+        "",
+        "",
+        "",
+        "",
+        "",
+        // Add new migrations below.
    ];

    let mut func = || {
@@ -824,13 +847,10 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> {

    while current_migration < migrations.len() {
        let migration = &migrations[current_migration];
-        if migration.starts_with("-- SKIP") {
-            info!("Skipping migration id={}", current_migration);
+        if migration.is_empty() {
+            info!("Skip migration id={}", current_migration);
        } else {
-            info!(
-                "Running migration id={}:\n{}\n",
-                current_migration, migration
-            );
+            info!("Running migration:\n{}\n", migration);
            client.simple_query(migration).with_context(|| {
                format!("handle_migrations current_migration={}", current_migration)
            })?;
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -862,13 +862,20 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re

            let allow_multiple = sub_args.get_flag("allow-multiple");

-            // If --safekeepers argument is given, use only the listed
-            // safekeeper nodes; otherwise all from the env.
-            let safekeepers = if let Some(safekeepers) = parse_safekeepers(&sub_args)? {
-                safekeepers
-            } else {
-                env.safekeepers.iter().map(|sk| sk.id).collect()
-            };
+            // If --safekeepers argument is given, use only the listed safekeeper nodes; otherwise use all safekeepers from the env.
+            let safekeepers =
+                if let Some(safekeepers_str) = sub_args.get_one::<String>("safekeepers") {
+                    let mut safekeepers: Vec<NodeId> = Vec::new();
+                    for sk_id in safekeepers_str.split(',').map(str::trim) {
+                        let sk_id = NodeId(u64::from_str(sk_id).map_err(|_| {
+                            anyhow!("invalid node ID \"{sk_id}\" in --safekeepers list")
+                        })?);
+                        safekeepers.push(sk_id);
+                    }
+                    safekeepers
+                } else {
+                    env.safekeepers.iter().map(|sk| sk.id).collect()
+                };

            let endpoint = cplane
                .endpoints
@@ -972,10 +979,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
                        })
                        .collect::<Vec<_>>()
                };
-            // If --safekeepers argument is given, use only the listed
-            // safekeeper nodes; otherwise all from the env.
-            let safekeepers = parse_safekeepers(&sub_args)?;
-            endpoint.reconfigure(pageservers, None, safekeepers).await?;
+            endpoint.reconfigure(pageservers, None).await?;
        }
        "stop" => {
            let endpoint_id = sub_args
@@ -997,23 +1001,6 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
    Ok(())
 }

-/// Parse --safekeepers as list of safekeeper ids.
-fn parse_safekeepers(sub_args: &ArgMatches) -> Result<Option<Vec<NodeId>>> {
-    if let Some(safekeepers_str) = sub_args.get_one::<String>("safekeepers") {
-        let mut safekeepers: Vec<NodeId> = Vec::new();
-        for sk_id in safekeepers_str.split(',').map(str::trim) {
-            let sk_id = NodeId(
-                u64::from_str(sk_id)
-                    .map_err(|_| anyhow!("invalid node ID \"{sk_id}\" in --safekeepers list"))?,
-            );
-            safekeepers.push(sk_id);
-        }
-        Ok(Some(safekeepers))
-    } else {
-        Ok(None)
-    }
-}
-
 fn handle_mappings(sub_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> {
    let (sub_name, sub_args) = match sub_match.subcommand() {
        Some(ep_subcommand_data) => ep_subcommand_data,
@@ -1586,7 +1573,7 @@ fn cli() -> Command {
                    .about("Start postgres.\n If the endpoint doesn't exist yet, it is created.")
                    .arg(endpoint_id_arg.clone())
                    .arg(endpoint_pageserver_id_arg.clone())
-                    .arg(safekeepers_arg.clone())
+                    .arg(safekeepers_arg)
                    .arg(remote_ext_config_args)
                    .arg(create_test_user)
                    .arg(allow_multiple.clone())
@@ -1594,7 +1581,6 @@ fn cli() -> Command {
                .subcommand(Command::new("reconfigure")
                            .about("Reconfigure the endpoint")
                            .arg(endpoint_pageserver_id_arg)
-                            .arg(safekeepers_arg)
                            .arg(endpoint_id_arg.clone())
                            .arg(tenant_id_arg.clone())
                )
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -499,23 +499,6 @@ impl Endpoint {
            .join(",")
    }

-    /// Map safekeepers ids to the actual connection strings.
-    fn build_safekeepers_connstrs(&self, sk_ids: Vec<NodeId>) -> Result<Vec<String>> {
-        let mut safekeeper_connstrings = Vec::new();
-        if self.mode == ComputeMode::Primary {
-            for sk_id in sk_ids {
-                let sk = self
-                    .env
-                    .safekeepers
-                    .iter()
-                    .find(|node| node.id == sk_id)
-                    .ok_or_else(|| anyhow!("safekeeper {sk_id} does not exist"))?;
-                safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.get_compute_port()));
-            }
-        }
-        Ok(safekeeper_connstrings)
-    }
-
    pub async fn start(
        &self,
        auth_token: &Option<String>,
@@ -540,7 +523,18 @@ impl Endpoint {
        let pageserver_connstring = Self::build_pageserver_connstr(&pageservers);
        assert!(!pageserver_connstring.is_empty());

-        let safekeeper_connstrings = self.build_safekeepers_connstrs(safekeepers)?;
+        let mut safekeeper_connstrings = Vec::new();
+        if self.mode == ComputeMode::Primary {
+            for sk_id in safekeepers {
+                let sk = self
+                    .env
+                    .safekeepers
+                    .iter()
+                    .find(|node| node.id == sk_id)
+                    .ok_or_else(|| anyhow!("safekeeper {sk_id} does not exist"))?;
+                safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.get_compute_port()));
+            }
+        }

        // check for file remote_extensions_spec.json
        // if it is present, read it and pass to compute_ctl
@@ -747,7 +741,6 @@ impl Endpoint {
        &self,
        mut pageservers: Vec<(Host, u16)>,
        stripe_size: Option<ShardStripeSize>,
-        safekeepers: Option<Vec<NodeId>>,
    ) -> Result<()> {
        let mut spec: ComputeSpec = {
            let spec_path = self.endpoint_path().join("spec.json");
@@ -782,12 +775,6 @@ impl Endpoint {
            spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize);
        }

-        // If safekeepers are not specified, don't change them.
-        if let Some(safekeepers) = safekeepers {
-            let safekeeper_connstrings = self.build_safekeepers_connstrs(safekeepers)?;
-            spec.safekeeper_connstrings = safekeeper_connstrings;
-        }
-
        let client = reqwest::Client::builder()
            .timeout(Duration::from_secs(30))
            .build()
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -14,7 +14,6 @@ use camino::Utf8PathBuf;
 use postgres_connection::PgConnectionConfig;
 use reqwest::{IntoUrl, Method};
 use thiserror::Error;
-use utils::auth::{Claims, Scope};
 use utils::{http::error::HttpErrorBody, id::NodeId};

 use crate::{
@@ -198,7 +197,7 @@ impl SafekeeperNode {
            &datadir,
            &self.env.safekeeper_bin(),
            &args,
-            self.safekeeper_env_variables()?,
+            [],
            background_process::InitialPidFile::Expect(self.pid_file()),
            || async {
                match self.check_status().await {
@@ -211,18 +210,6 @@ impl SafekeeperNode {
        .await
    }

-    fn safekeeper_env_variables(&self) -> anyhow::Result<Vec<(String, String)>> {
-        // Generate a token to connect from safekeeper to peers
-        if self.conf.auth_enabled {
-            let token = self
-                .env
-                .generate_auth_token(&Claims::new(None, Scope::SafekeeperData))?;
-            Ok(vec![("SAFEKEEPER_AUTH_TOKEN".to_owned(), token)])
-        } else {
-            Ok(Vec::new())
-        }
-    }
-
    ///
    /// Stop the server.
    ///
--- a/control_plane/storcon_cli/Cargo.toml
+++ b/control_plane/storcon_cli/Cargo.toml
@@ -9,8 +9,6 @@ license.workspace = true
 anyhow.workspace = true
 clap.workspace = true
 comfy-table.workspace = true
-futures.workspace = true
-humantime.workspace = true
 hyper.workspace = true
 pageserver_api.workspace = true
 pageserver_client.workspace = true
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -1,4 +1,3 @@
-use futures::StreamExt;
 use std::{collections::HashMap, str::FromStr, time::Duration};

 use clap::{Parser, Subcommand};
@@ -8,9 +7,8 @@ use pageserver_api::{
        TenantDescribeResponse, TenantPolicyRequest,
    },
    models::{
-        EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
-        ShardParameters, TenantConfig, TenantConfigRequest, TenantCreateRequest,
-        TenantShardSplitRequest, TenantShardSplitResponse,
+        LocationConfigSecondary, ShardParameters, TenantConfig, TenantConfigRequest,
+        TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
    },
    shard::{ShardStripeSize, TenantShardId},
 };
@@ -127,44 +125,6 @@ enum Command {
        #[arg(long)]
        tenant_id: TenantId,
    },
-    /// Uncleanly drop a tenant from the storage controller: this doesn't delete anything from pageservers. Appropriate
-    /// if you e.g. used `tenant-warmup` by mistake on a tenant ID that doesn't really exist, or is in some other region.
-    TenantDrop {
-        #[arg(long)]
-        tenant_id: TenantId,
-        #[arg(long)]
-        unclean: bool,
-    },
-    NodeDrop {
-        #[arg(long)]
-        node_id: NodeId,
-        #[arg(long)]
-        unclean: bool,
-    },
-    TenantSetTimeBasedEviction {
-        #[arg(long)]
-        tenant_id: TenantId,
-        #[arg(long)]
-        period: humantime::Duration,
-        #[arg(long)]
-        threshold: humantime::Duration,
-    },
-    // Drain a set of specified pageservers by moving the primary attachments to pageservers
-    // outside of the specified set.
-    Drain {
-        // Set of pageserver node ids to drain.
-        #[arg(long)]
-        nodes: Vec<NodeId>,
-        // Optional: migration concurrency (default is 8)
-        #[arg(long)]
-        concurrency: Option<usize>,
-        // Optional: maximum number of shards to migrate
-        #[arg(long)]
-        max_shards: Option<usize>,
-        // Optional: when set to true, nothing is migrated, but the plan is printed to stdout
-        #[arg(long)]
-        dry_run: Option<bool>,
-    },
 }

 #[derive(Parser)]
@@ -714,234 +674,6 @@ async fn main() -> anyhow::Result<()> {
                }
            }
        }
-        Command::TenantDrop { tenant_id, unclean } => {
-            if !unclean {
-                anyhow::bail!("This command is not a tenant deletion, and uncleanly drops all controller state for the tenant.  If you know what you're doing, add `--unclean` to proceed.")
-            }
-            storcon_client
-                .dispatch::<(), ()>(
-                    Method::POST,
-                    format!("debug/v1/tenant/{tenant_id}/drop"),
-                    None,
-                )
-                .await?;
-        }
-        Command::NodeDrop { node_id, unclean } => {
-            if !unclean {
-                anyhow::bail!("This command is not a clean node decommission, and uncleanly drops all controller state for the node, without checking if any tenants still refer to it.  If you know what you're doing, add `--unclean` to proceed.")
-            }
-            storcon_client
-                .dispatch::<(), ()>(Method::POST, format!("debug/v1/node/{node_id}/drop"), None)
-                .await?;
-        }
-        Command::TenantSetTimeBasedEviction {
-            tenant_id,
-            period,
-            threshold,
-        } => {
-            vps_client
-                .tenant_config(&TenantConfigRequest {
-                    tenant_id,
-                    config: TenantConfig {
-                        eviction_policy: Some(EvictionPolicy::LayerAccessThreshold(
-                            EvictionPolicyLayerAccessThreshold {
-                                period: period.into(),
-                                threshold: threshold.into(),
-                            },
-                        )),
-                        ..Default::default()
-                    },
-                })
-                .await?;
-        }
-        Command::Drain {
-            nodes,
-            concurrency,
-            max_shards,
-            dry_run,
-        } => {
-            // Load the list of nodes, split them up into the drained and filled sets,
-            // and validate that draining is possible.
-            let node_descs = storcon_client
-                .dispatch::<(), Vec<NodeDescribeResponse>>(
-                    Method::GET,
-                    "control/v1/node".to_string(),
-                    None,
-                )
-                .await?;
-
-            let mut node_to_drain_descs = Vec::new();
-            let mut node_to_fill_descs = Vec::new();
-
-            for desc in node_descs {
-                let to_drain = nodes.iter().any(|id| *id == desc.id);
-                if to_drain {
-                    node_to_drain_descs.push(desc);
-                } else {
-                    node_to_fill_descs.push(desc);
-                }
-            }
-
-            if nodes.len() != node_to_drain_descs.len() {
-                anyhow::bail!("Drain requested for node which doesn't exist.")
-            }
-
-            node_to_fill_descs.retain(|desc| {
-                matches!(desc.availability, NodeAvailabilityWrapper::Active)
-                    && matches!(
-                        desc.scheduling,
-                        NodeSchedulingPolicy::Active | NodeSchedulingPolicy::Filling
-                    )
-            });
-
-            if node_to_fill_descs.is_empty() {
-                anyhow::bail!("There are no nodes to drain to")
-            }
-
-            // Set the node scheduling policy to draining for the nodes which
-            // we plan to drain.
-            for node_desc in node_to_drain_descs.iter() {
-                let req = NodeConfigureRequest {
-                    node_id: node_desc.id,
-                    availability: None,
-                    scheduling: Some(NodeSchedulingPolicy::Draining),
-                };
-
-                storcon_client
-                    .dispatch::<_, ()>(
-                        Method::PUT,
-                        format!("control/v1/node/{}/config", node_desc.id),
-                        Some(req),
-                    )
-                    .await?;
-            }
-
-            // Perform the drain: move each tenant shard scheduled on a node to
-            // be drained to a node which is being filled. A simple round robin
-            // strategy is used to pick the new node.
-            let tenants = storcon_client
-                .dispatch::<(), Vec<TenantDescribeResponse>>(
-                    Method::GET,
-                    "control/v1/tenant".to_string(),
-                    None,
-                )
-                .await?;
-
-            let mut selected_node_idx = 0;
-
-            struct DrainMove {
-                tenant_shard_id: TenantShardId,
-                from: NodeId,
-                to: NodeId,
-            }
-
-            let mut moves: Vec<DrainMove> = Vec::new();
-
-            let shards = tenants
-                .into_iter()
-                .flat_map(|tenant| tenant.shards.into_iter());
-            for shard in shards {
-                if let Some(max_shards) = max_shards {
-                    if moves.len() >= max_shards {
-                        println!(
-                            "Stop planning shard moves since the requested maximum was reached"
-                        );
-                        break;
-                    }
-                }
-
-                let should_migrate = {
-                    if let Some(attached_to) = shard.node_attached {
-                        node_to_drain_descs
-                            .iter()
-                            .map(|desc| desc.id)
-                            .any(|id| id == attached_to)
-                    } else {
-                        false
-                    }
-                };
-
-                if !should_migrate {
-                    continue;
-                }
-
-                moves.push(DrainMove {
-                    tenant_shard_id: shard.tenant_shard_id,
-                    from: shard
-                        .node_attached
-                        .expect("We only migrate attached tenant shards"),
-                    to: node_to_fill_descs[selected_node_idx].id,
-                });
-                selected_node_idx = (selected_node_idx + 1) % node_to_fill_descs.len();
-            }
-
-            let total_moves = moves.len();
-
-            if dry_run == Some(true) {
-                println!("Dryrun requested. Planned {total_moves} moves:");
-                for mv in &moves {
-                    println!("{}: {} -> {}", mv.tenant_shard_id, mv.from, mv.to)
-                }
-
-                return Ok(());
-            }
-
-            const DEFAULT_MIGRATE_CONCURRENCY: usize = 8;
-            let mut stream = futures::stream::iter(moves)
-                .map(|mv| {
-                    let client = Client::new(cli.api.clone(), cli.jwt.clone());
-                    async move {
-                        client
-                            .dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
-                                Method::PUT,
-                                format!("control/v1/tenant/{}/migrate", mv.tenant_shard_id),
-                                Some(TenantShardMigrateRequest {
-                                    tenant_shard_id: mv.tenant_shard_id,
-                                    node_id: mv.to,
-                                }),
-                            )
-                            .await
-                            .map_err(|e| (mv.tenant_shard_id, mv.from, mv.to, e))
-                    }
-                })
-                .buffered(concurrency.unwrap_or(DEFAULT_MIGRATE_CONCURRENCY));
-
-            let mut success = 0;
-            let mut failure = 0;
-
-            while let Some(res) = stream.next().await {
-                match res {
-                    Ok(_) => {
-                        success += 1;
-                    }
-                    Err((tenant_shard_id, from, to, error)) => {
-                        failure += 1;
-                        println!(
-                            "Failed to migrate {} from node {} to node {}: {}",
-                            tenant_shard_id, from, to, error
-                        );
-                    }
-                }
-
-                if (success + failure) % 20 == 0 {
-                    println!(
-                        "Processed {}/{} shards: {} succeeded, {} failed",
-                        success + failure,
-                        total_moves,
-                        success,
-                        failure
-                    );
-                }
-            }
-
-            println!(
-                "Processed {}/{} shards: {} succeeded, {} failed",
-                success + failure,
-                total_moves,
-                success,
-                failure
-            );
-        }
    }

    Ok(())
--- a/docker-compose/compute_wrapper/Dockerfile
+++ b/docker-compose/compute_wrapper/Dockerfile
@@ -8,11 +8,6 @@ USER root
 RUN apt-get update &&       \
    apt-get install -y curl \
                       jq   \
-                       python3-pip \
                       netcat
-#Faker is required for the pg_anon test
-RUN pip3 install Faker
-#This is required for the pg_hintplan test
-RUN mkdir -p /ext-src/pg_hint_plan-src && chown postgres /ext-src/pg_hint_plan-src 

-USER postgres
+USER postgres
--- a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json
+++ b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json
@@ -95,7 +95,7 @@
            },
            {
                "name": "shared_preload_libraries",
-                "value": "neon,pg_cron,timescaledb,pg_stat_statements",
+                "value": "neon",
                "vartype": "string"
            },
            {
@@ -127,16 +127,6 @@
                "name": "max_replication_flush_lag",
                "value": "10GB",
                "vartype": "string"
-            },
-            {
-                "name": "cron.database",
-                "value": "postgres",
-                "vartype": "string"
-            },
-            {
-                "name": "session_preload_libraries",
-                "value": "anon",
-                "vartype": "string"
            }
        ]
    },
--- a/docker-compose/docker-compose.yml
+++ b/docker-compose/docker-compose.yml
@@ -1,3 +1,5 @@
+version: '3'
+
 services:
  minio:
    restart: always
@@ -159,12 +161,12 @@ services:
      context: ./compute_wrapper/
      args:
        - REPOSITORY=${REPOSITORY:-neondatabase}
-        - COMPUTE_IMAGE=compute-node-v${PG_VERSION:-16}
+        - COMPUTE_IMAGE=compute-node-v${PG_VERSION:-14}
        - TAG=${TAG:-latest}
        - http_proxy=$http_proxy
        - https_proxy=$https_proxy
    environment:
-      - PG_VERSION=${PG_VERSION:-16}
+      - PG_VERSION=${PG_VERSION:-14}
      #- RUST_BACKTRACE=1
    # Mount the test files directly, for faster editing cycle.
    volumes:
@@ -192,14 +194,3 @@ services:
         done"
    depends_on:
      - compute
-
-  neon-test-extensions:
-    profiles: ["test-extensions"]
-    image: ${REPOSITORY:-neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-16}:${TAG:-latest}
-    entrypoint:
-      - "/bin/bash"
-      - "-c"
-    command:
-      - sleep 1800
-    depends_on:
-      - compute
--- a/docker-compose/docker_compose_test.sh
+++ b/docker-compose/docker_compose_test.sh
@@ -7,94 +7,52 @@
 # Implicitly accepts `REPOSITORY` and `TAG` env vars that are passed into the compose file
 # Their defaults point at DockerHub `neondatabase/neon:latest` image.`,
 # to verify custom image builds (e.g pre-published ones).
-#
-# A test script for postgres extensions
-# Currently supports only v16
-#
+
 set -eux -o pipefail

-COMPOSE_FILE='docker-compose.yml'
-cd $(dirname $0)
+SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
+COMPOSE_FILE=$SCRIPT_DIR/docker-compose.yml
+
 COMPUTE_CONTAINER_NAME=docker-compose-compute-1
-TEST_CONTAINER_NAME=docker-compose-neon-test-extensions-1
-PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -d postgres"
-: ${http_proxy:=}
-: ${https_proxy:=}
-export http_proxy https_proxy
+SQL="CREATE TABLE t(key int primary key, value text); insert into t values(1,1); select * from t;"
+PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -c '$SQL' postgres"

 cleanup() {
    echo "show container information"
    docker ps
-    docker compose --profile test-extensions -f $COMPOSE_FILE logs
+    docker compose -f $COMPOSE_FILE logs
    echo "stop containers..."
-    docker compose --profile test-extensions -f $COMPOSE_FILE down
+    docker compose -f $COMPOSE_FILE down
 }

+echo "clean up containers if exists"
+cleanup
+
 for pg_version in 14 15 16; do
-    echo "clean up containers if exists"
-    cleanup
-    PG_TEST_VERSION=$(($pg_version < 16 ? 16 : $pg_version))
-    PG_VERSION=$pg_version PG_TEST_VERSION=$PG_TEST_VERSION docker compose --profile test-extensions -f $COMPOSE_FILE up --build -d
+    echo "start containers (pg_version=$pg_version)."
+    PG_VERSION=$pg_version docker compose -f $COMPOSE_FILE up --build -d

    echo "wait until the compute is ready. timeout after 60s. "
    cnt=0
-    while sleep 3; do
+    while sleep 1; do
        # check timeout
-        cnt=`expr $cnt + 3`
+        cnt=`expr $cnt + 1`
        if [ $cnt -gt 60 ]; then
            echo "timeout before the compute is ready."
            cleanup
            exit 1
        fi
-        if docker compose --profile test-extensions -f $COMPOSE_FILE logs "compute_is_ready" | grep -q "accepting connections"; then
+
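+        # check if the compute is ready (pipefail is turned off around the check: grep exits non-zero while nothing matches yet, which would abort the script under "set -e")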
+        set +o pipefail
+        result=`docker compose -f $COMPOSE_FILE logs "compute_is_ready" | grep "accepting connections" | wc -l`
+        set -o pipefail
+        if [ $result -eq 1 ]; then
            echo "OK. The compute is ready to connect."
            echo "execute simple queries."
            docker exec $COMPUTE_CONTAINER_NAME /bin/bash -c "psql $PSQL_OPTION"
+            cleanup
            break
        fi
    done
-
-    if [ $pg_version -ge 16 ]
-    then
-        echo Enabling trust connection
-        docker exec $COMPUTE_CONTAINER_NAME bash -c "sed -i '\$d' /var/db/postgres/compute/pg_hba.conf && echo -e 'host\t all\t all\t all\t trust' >> /var/db/postgres/compute/pg_hba.conf && psql $PSQL_OPTION -c 'select pg_reload_conf()' "
-        echo Adding postgres role
-        docker exec $COMPUTE_CONTAINER_NAME psql $PSQL_OPTION -c "CREATE ROLE postgres SUPERUSER LOGIN"
-        # This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail
-        # It cannot be moved to Dockerfile now because the database directory is created after the start of the container
-        echo Adding dummy config
-        docker exec $COMPUTE_CONTAINER_NAME touch /var/db/postgres/compute/compute_ctl_temp_override.conf
-        # This block is required for the pg_anon extension test.
-        # The test assumes that it is running on the same host with the postgres engine.
-        # In our case it's not true, that's why we are copying files to the compute node
-        TMPDIR=$(mktemp -d)
-        docker cp $TEST_CONTAINER_NAME:/ext-src/pg_anon-src/data $TMPDIR/data
-        echo -e '1\t too \t many \t tabs' > $TMPDIR/data/bad.csv
-        docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/tmp/tmp_anon_alternate_data
-        rm -rf $TMPDIR
-        TMPDIR=$(mktemp -d)
-        # The following block does the same for the pg_hintplan test
-        docker cp $TEST_CONTAINER_NAME:/ext-src/pg_hint_plan-src/data $TMPDIR/data
-        docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/ext-src/pg_hint_plan-src/
-        rm -rf $TMPDIR
-        # We are running tests now
-        if docker exec -e SKIP=rum-src,timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \
-            $TEST_CONTAINER_NAME /run-tests.sh | tee testout.txt
-        then
-            cleanup
-        else
-            FAILED=$(tail -1 testout.txt)
-            for d in $FAILED
-            do
-                mkdir $d
-                docker cp $TEST_CONTAINER_NAME:/ext-src/$d/regression.diffs $d || true
-                docker cp $TEST_CONTAINER_NAME:/ext-src/$d/regression.out $d || true
-                cat $d/regression.out $d/regression.diffs || true
-            done
-        rm -rf $FAILED
-        cleanup
-        exit 1
-        fi
-    fi
-    cleanup
 done
--- a/docker-compose/run-tests.sh
+++ b/docker-compose/run-tests.sh
@@ -1,15 +0,0 @@
-#!/bin/bash
-set -x
-
-cd /ext-src
-FAILED=
-LIST=$((echo ${SKIP} | sed 's/,/\n/g'; ls -d *-src) | sort | uniq -u)
-for d in ${LIST}
-do
-       [ -d ${d} ] || continue
-    psql -c "select 1" >/dev/null || break
-       make -C ${d} installcheck || FAILED="${d} ${FAILED}"
-done
-[ -z "${FAILED}" ] && exit 0
-echo ${FAILED}
-exit 1
--- a/docs/docker.md
+++ b/docs/docker.md
@@ -4,18 +4,18 @@

 Currently we build two main images:

- [neondatabase/neon](https://hub.docker.com/repository/docker/neondatabase/neon) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
- [neondatabase/compute-node-v16](https://hub.docker.com/repository/docker/neondatabase/compute-node-v16) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres). Similar images exist for v15 and v14.
+- [neondatabase/neon](https://hub.docker.com/repository/docker/neondatabase/neon) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
+- [neondatabase/compute-node](https://hub.docker.com/repository/docker/neondatabase/compute-node) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres).

 And additional intermediate image:

 - [neondatabase/compute-tools](https://hub.docker.com/repository/docker/neondatabase/compute-tools) — compute node configuration management tools.

-## Build pipeline
+## Build pipeline

 We build all images after a successful `release` tests run and push automatically to Docker Hub with two parallel CI jobs

-1. `neondatabase/compute-tools` and `neondatabase/compute-node-v16` (and -v15 and -v14)
+1. `neondatabase/compute-tools` and `neondatabase/compute-node`

 2. `neondatabase/neon`

@@ -34,12 +34,12 @@ You can see a [docker compose](https://docs.docker.com/compose/) example to crea
 1. create containers

 You can specify version of neon cluster using following environment values.
- PG_VERSION: postgres version for compute (default is 16 as of this writing)
- TAG: the tag version of [docker image](https://registry.hub.docker.com/r/neondatabase/neon/tags), which is tagged in [CI test](/.github/workflows/build_and_test.yml). Default is 'latest'
+- PG_VERSION: postgres version for compute (default is 14)
+- TAG: the tag version of [docker image](https://registry.hub.docker.com/r/neondatabase/neon/tags) (default is latest), which is tagged in [CI test](/.github/workflows/build_and_test.yml)
 ```
 $ cd docker-compose/
 $ docker-compose down   # remove the containers if exists
-$ PG_VERSION=16 TAG=latest docker-compose up --build -d  # You can specify the postgres and image version
+$ PG_VERSION=15 TAG=2937 docker-compose up --build -d  # You can specify the postgres and image version
 Creating network "dockercompose_default" with the default driver
 Creating docker-compose_storage_broker_1       ... done
 (...omit...)
@@ -47,31 +47,29 @@ Creating docker-compose_storage_broker_1       ... done

 2. connect compute node
 ```
-$ psql postgresql://cloud_admin:cloud_admin@localhost:55433/postgres
-psql (16.3)
-Type "help" for help.
-
+$ echo "localhost:55433:postgres:cloud_admin:cloud_admin" >> ~/.pgpass
+$ chmod 600 ~/.pgpass
+$ psql -h localhost -p 55433 -U cloud_admin
 postgres=# CREATE TABLE t(key int primary key, value text);
 CREATE TABLE
-postgres=# insert into t values(1, 1);
+postgres=# insert into t values(1,1);
 INSERT 0 1
 postgres=# select * from t;
- key | value 
+ key | value
 -----+-------
   1 | 1
 (1 row)
-
 ```

 3. If you want to see the log, you can use `docker-compose logs` command.
 ```
 # check the container name you want to see
 $ docker ps
-CONTAINER ID   IMAGE                                              COMMAND                  CREATED         STATUS         PORTS                                                                                      NAMES
-3582f6d76227   docker-compose_compute                             "/shell/compute.sh"      2 minutes ago   Up 2 minutes   0.0.0.0:3080->3080/tcp, :::3080->3080/tcp, 0.0.0.0:55433->55433/tcp, :::55433->55433/tcp   docker-compose_compute_1
+CONTAINER ID   IMAGE                                              COMMAND                  CREATED         STATUS         PORTS                                                                                                                                  NAMES
+d6968a5ae912   dockercompose_compute                              "/shell/compute.sh"      5 minutes ago   Up 5 minutes   0.0.0.0:3080->3080/tcp, 0.0.0.0:55433->55433/tcp                                                                                       dockercompose_compute_1
 (...omit...)

-$ docker logs -f docker-compose_compute_1
+$ docker logs -f dockercompose_compute_1
 2022-10-21 06:15:48.757 GMT [56] LOG:  connection authorized: user=cloud_admin database=postgres application_name=psql
 2022-10-21 06:17:00.307 GMT [56] LOG:  [NEON_SMGR] libpagestore: connected to 'host=pageserver port=6400'
 (...omit...)
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -1,7 +1,6 @@
 use anyhow::{bail, Result};
 use byteorder::{ByteOrder, BE};
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
-use postgres_ffi::RepOriginId;
 use postgres_ffi::{Oid, TransactionId};
 use serde::{Deserialize, Serialize};
 use std::{fmt, ops::Range};
@@ -39,9 +38,6 @@ pub const RELATION_SIZE_PREFIX: u8 = 0x61;
 /// The key prefix of AUX file keys.
 pub const AUX_KEY_PREFIX: u8 = 0x62;

-/// The key prefix of ReplOrigin keys.
-pub const REPL_ORIGIN_KEY_PREFIX: u8 = 0x63;
-
 /// Check if the key falls in the range of metadata keys.
 pub const fn is_metadata_key_slice(key: &[u8]) -> bool {
    key[0] >= METADATA_KEY_BEGIN_PREFIX && key[0] < METADATA_KEY_END_PREFIX
@@ -591,37 +587,6 @@ pub const AUX_FILES_KEY: Key = Key {
    field6: 2,
 };

-#[inline(always)]
-pub fn repl_origin_key(origin_id: RepOriginId) -> Key {
-    Key {
-        field1: REPL_ORIGIN_KEY_PREFIX,
-        field2: 0,
-        field3: 0,
-        field4: 0,
-        field5: 0,
-        field6: origin_id as u32,
-    }
-}
-
-/// Get the range of replorigin keys.
-pub fn repl_origin_key_range() -> Range<Key> {
-    Key {
-        field1: REPL_ORIGIN_KEY_PREFIX,
-        field2: 0,
-        field3: 0,
-        field4: 0,
-        field5: 0,
-        field6: 0,
-    }..Key {
-        field1: REPL_ORIGIN_KEY_PREFIX,
-        field2: 0,
-        field3: 0,
-        field4: 0,
-        field5: 0,
-        field6: 0x10000,
-    }
-}
-
 // Reverse mappings for a few Keys.
 // These are needed by WAL redo manager.

--- a/libs/pageserver_api/src/keyspace.rs
+++ b/libs/pageserver_api/src/keyspace.rs
@@ -558,12 +558,6 @@ impl KeySpaceRandomAccum {
        self.ranges.push(range);
    }

-    pub fn add_keyspace(&mut self, keyspace: KeySpace) {
-        for range in keyspace.ranges {
-            self.add_range(range);
-        }
-    }
-
    pub fn to_keyspace(mut self) -> KeySpace {
        let mut ranges = Vec::new();
        if !self.ranges.is_empty() {
--- a/libs/postgres_ffi/build.rs
+++ b/libs/postgres_ffi/build.rs
@@ -126,7 +126,6 @@ fn main() -> anyhow::Result<()> {
            .allowlist_type("PageHeaderData")
            .allowlist_type("DBState")
            .allowlist_type("RelMapFile")
-            .allowlist_type("RepOriginId")
            // Because structs are used for serialization, tell bindgen to emit
            // explicit padding fields.
            .explicit_padding(true)
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -110,7 +110,6 @@ pub mod pg_constants;
 pub mod relfile_utils;

 // Export some widely used datatypes that are unlikely to change across Postgres versions
-pub use v14::bindings::RepOriginId;
 pub use v14::bindings::{uint32, uint64, Oid};
 pub use v14::bindings::{BlockNumber, OffsetNumber};
 pub use v14::bindings::{MultiXactId, TransactionId};
--- a/libs/postgres_ffi/src/pg_constants.rs
+++ b/libs/postgres_ffi/src/pg_constants.rs
@@ -102,7 +102,7 @@ pub const XACT_XINFO_HAS_SUBXACTS: u32 = 1u32 << 1;
 pub const XACT_XINFO_HAS_RELFILENODES: u32 = 1u32 << 2;
 pub const XACT_XINFO_HAS_INVALS: u32 = 1u32 << 3;
 pub const XACT_XINFO_HAS_TWOPHASE: u32 = 1u32 << 4;
-pub const XACT_XINFO_HAS_ORIGIN: u32 = 1u32 << 5;
+// pub const XACT_XINFO_HAS_ORIGIN: u32 = 1u32 << 5;
 // pub const XACT_XINFO_HAS_AE_LOCKS: u32 = 1u32 << 6;
 // pub const XACT_XINFO_HAS_GID: u32 = 1u32 << 7;

@@ -167,7 +167,6 @@ pub const RM_RELMAP_ID: u8 = 7;
 pub const RM_STANDBY_ID: u8 = 8;
 pub const RM_HEAP2_ID: u8 = 9;
 pub const RM_HEAP_ID: u8 = 10;
-pub const RM_REPLORIGIN_ID: u8 = 19;
 pub const RM_LOGICALMSG_ID: u8 = 21;

 // from neon_rmgr.h
@@ -224,10 +223,6 @@ pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10;
 pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001;
 pub const XLP_LONG_HEADER: u16 = 0x0002;

-/* From xlog.h */
-pub const XLOG_REPLORIGIN_SET: u8 = 0x00;
-pub const XLOG_REPLORIGIN_DROP: u8 = 0x10;
-
 /* From replication/slot.h */
 pub const REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN: usize = 4*4  /* offset of `slotdata` in ReplicationSlotOnDisk  */
   + 64 /* NameData */  + 4*4;
@@ -242,9 +237,6 @@ pub const SLOTS_PER_FSM_PAGE: u32 = FSM_LEAF_NODES_PER_PAGE as u32;
 pub const VM_HEAPBLOCKS_PER_PAGE: u32 =
    (BLCKSZ as usize - SIZEOF_PAGE_HEADER_DATA) as u32 * (8 / 2); // MAPSIZE * (BITS_PER_BYTE / BITS_PER_HEAPBLOCK)

-/* From origin.c */
-pub const REPLICATION_STATE_MAGIC: u32 = 0x1257DADE;
-
 // List of subdirectories inside pgdata.
 // Copied from src/bin/initdb/initdb.c
 pub const PGDATA_SUBDIRS: [&str; 22] = [
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -3,7 +3,6 @@
 use std::borrow::Cow;
 use std::collections::HashMap;
 use std::env;
-use std::fmt::Display;
 use std::io;
 use std::num::NonZeroU32;
 use std::pin::Pin;
@@ -30,7 +29,6 @@ use http_types::{StatusCode, Url};
 use scopeguard::ScopeGuard;
 use tokio_util::sync::CancellationToken;
 use tracing::debug;
-use utils::backoff;

 use crate::metrics::{start_measuring_requests, AttemptOutcome, RequestKind};
 use crate::{
@@ -453,58 +451,26 @@ impl RemoteStorage for AzureBlobStorage {
            // TODO batch requests are not supported by the SDK
            // https://github.com/Azure/azure-sdk-for-rust/issues/1068
            for path in paths {
-                #[derive(Debug)]
-                enum AzureOrTimeout {
-                    AzureError(azure_core::Error),
-                    Timeout,
-                    Cancel,
-                }
-                impl Display for AzureOrTimeout {
-                    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-                        write!(f, "{self:?}")
-                    }
-                }
-                let warn_threshold = 3;
-                let max_retries = 5;
-                backoff::retry(
-                    || async {
-                        let blob_client = self.client.blob_client(self.relative_path_to_name(path));
+                let blob_client = self.client.blob_client(self.relative_path_to_name(path));

-                        let request = blob_client.delete().into_future();
+                let request = blob_client.delete().into_future();

-                        let res = tokio::time::timeout(self.timeout, request).await;
+                let res = tokio::time::timeout(self.timeout, request).await;

-                        match res {
-                            Ok(Ok(_v)) => Ok(()),
-                            Ok(Err(azure_err)) => {
-                                if let Some(http_err) = azure_err.as_http_error() {
-                                    if http_err.status() == StatusCode::NotFound {
-                                        return Ok(());
-                                    }
-                                }
-                                Err(AzureOrTimeout::AzureError(azure_err))
+                match res {
+                    Ok(Ok(_response)) => continue,
+                    Ok(Err(e)) => {
+                        if let Some(http_err) = e.as_http_error() {
+                            if http_err.status() == StatusCode::NotFound {
+                                continue;
                            }
-                            Err(_elapsed) => Err(AzureOrTimeout::Timeout),
                        }
-                    },
-                    |err| match err {
-                        AzureOrTimeout::AzureError(_) | AzureOrTimeout::Timeout => false,
-                        AzureOrTimeout::Cancel => true,
-                    },
-                    warn_threshold,
-                    max_retries,
-                    "deleting remote object",
-                    cancel,
-                )
-                .await
-                .ok_or_else(|| AzureOrTimeout::Cancel)
-                .and_then(|x| x)
-                .map_err(|e| match e {
-                    AzureOrTimeout::AzureError(err) => anyhow::Error::from(err),
-                    AzureOrTimeout::Timeout => TimeoutOrCancel::Timeout.into(),
-                    AzureOrTimeout::Cancel => TimeoutOrCancel::Cancel.into(),
-                })?;
+                        return Err(e.into());
+                    }
+                    Err(_elapsed) => return Err(TimeoutOrCancel::Timeout.into()),
+                }
            }
+
            Ok(())
        };

--- a/libs/remote_storage/src/support.rs
+++ b/libs/remote_storage/src/support.rs
@@ -78,10 +78,6 @@ where
                let e = Err(std::io::Error::from(e));
                return Poll::Ready(Some(e));
            }
-        } else {
-            // this would be perfectly valid behaviour for doing a graceful completion on the
-            // download for example, but not one we expect to do right now.
-            tracing::warn!("continuing polling after having cancelled or timeouted");
        }

        this.inner.poll_next(cx)
@@ -93,22 +89,13 @@ where
 }

 /// Fires only on the first cancel or timeout, not on both.
-pub(crate) fn cancel_or_timeout(
+pub(crate) async fn cancel_or_timeout(
     timeout: Duration,
     cancel: CancellationToken,
-) -> impl std::future::Future<Output = TimeoutOrCancel> + 'static {
-    // futures are lazy, they don't do anything before being polled.
-    //
-    // "precalculate" the wanted deadline before returning the future, so that we can use pause
-    // failpoint to trigger a timeout in test.
-    let deadline = tokio::time::Instant::now() + timeout;
-    async move {
-        tokio::select! {
-            _ = tokio::time::sleep_until(deadline) => TimeoutOrCancel::Timeout,
-            _ = cancel.cancelled() => {
-                TimeoutOrCancel::Cancel
-            },
-        }
+) -> TimeoutOrCancel {
+    tokio::select! {
+        _ = tokio::time::sleep(timeout) => TimeoutOrCancel::Timeout, // NOTE(review): the sleep is created on first poll, so the timeout counts from first poll, not from the call
+        _ = cancel.cancelled() => TimeoutOrCancel::Cancel,
     }
 }

@@ -185,31 +172,4 @@ mod tests {
            _ = tokio::time::sleep(Duration::from_secs(121)) => {},
        }
    }
-
-    #[tokio::test]
-    async fn notified_but_pollable_after() {
-        let inner = futures::stream::once(futures::future::ready(Ok(bytes::Bytes::from_static(
-            b"hello world",
-        ))));
-        let timeout = Duration::from_secs(120);
-        let cancel = CancellationToken::new();
-
-        cancel.cancel();
-        let stream = DownloadStream::new(cancel_or_timeout(timeout, cancel.clone()), inner);
-        let mut stream = std::pin::pin!(stream);
-
-        let next = stream.next().await;
-        let ioe = next.unwrap().unwrap_err();
-        assert!(
-            matches!(
-                ioe.get_ref().unwrap().downcast_ref::<DownloadError>(),
-                Some(&DownloadError::Cancelled)
-            ),
-            "{ioe:?}"
-        );
-
-        let next = stream.next().await;
-        let bytes = next.unwrap().unwrap();
-        assert_eq!(&b"hello world"[..], bytes);
-    }
 }
--- a/libs/tracing-utils/Cargo.toml
+++ b/libs/tracing-utils/Cargo.toml
@@ -7,7 +7,7 @@ license.workspace = true
 [dependencies]
 hyper.workspace = true
 opentelemetry = { workspace = true, features=["rt-tokio"] }
-opentelemetry-otlp = { workspace = true, default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
+opentelemetry-otlp = { workspace = true, default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
 opentelemetry-semantic-conventions.workspace = true
 reqwest = { workspace = true, default-features = false, features = ["rustls-tls"] }
 tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
--- a/libs/utils/src/fs_ext.rs
+++ b/libs/utils/src/fs_ext.rs
@@ -3,9 +3,6 @@ use std::{fs, io, path::Path};

 use anyhow::Context;

-mod rename_noreplace;
-pub use rename_noreplace::rename_noreplace;
-
 pub trait PathExt {
    /// Returns an error if `self` is not a directory.
    fn is_empty_dir(&self) -> io::Result<bool>;
--- a/libs/utils/src/fs_ext/rename_noreplace.rs
+++ b/libs/utils/src/fs_ext/rename_noreplace.rs
@@ -1,109 +0,0 @@
-use nix::NixPath;
-
-/// Rename a file without replacing an existing file.
-///
-/// This is a wrapper around platform-specific APIs.
-pub fn rename_noreplace<P1: ?Sized + NixPath, P2: ?Sized + NixPath>(
-    src: &P1,
-    dst: &P2,
-) -> nix::Result<()> {
-    {
-        #[cfg(target_os = "linux")]
-        {
-            nix::fcntl::renameat2(
-                None,
-                src,
-                None,
-                dst,
-                nix::fcntl::RenameFlags::RENAME_NOREPLACE,
-            )
-        }
-        #[cfg(target_os = "macos")]
-        {
-            let res = src.with_nix_path(|src| {
-                dst.with_nix_path(|dst|
-                    // SAFETY: `src` and `dst` are valid C strings as per the NixPath trait and they outlive the call to renamex_np.
-                    unsafe {
-                        nix::libc::renamex_np(src.as_ptr(), dst.as_ptr(), nix::libc::RENAME_EXCL)
-                })
-            })??;
-            nix::errno::Errno::result(res).map(drop)
-        }
-        #[cfg(not(any(target_os = "linux", target_os = "macos")))]
-        {
-            std::compile_error!("OS does not support no-replace renames");
-        }
-    }
-}
-
-#[cfg(test)]
-mod test {
-    use std::{fs, path::PathBuf};
-
-    use super::*;
-
-    fn testdir() -> camino_tempfile::Utf8TempDir {
-        match crate::env::var("NEON_UTILS_RENAME_NOREPLACE_TESTDIR") {
-            Some(path) => {
-                let path: camino::Utf8PathBuf = path;
-                camino_tempfile::tempdir_in(path).unwrap()
-            }
-            None => camino_tempfile::tempdir().unwrap(),
-        }
-    }
-
-    #[test]
-    fn test_absolute_paths() {
-        let testdir = testdir();
-        println!("testdir: {}", testdir.path());
-
-        let src = testdir.path().join("src");
-        let dst = testdir.path().join("dst");
-
-        fs::write(&src, b"").unwrap();
-        fs::write(&dst, b"").unwrap();
-
-        let src = src.canonicalize().unwrap();
-        assert!(src.is_absolute());
-        let dst = dst.canonicalize().unwrap();
-        assert!(dst.is_absolute());
-
-        let result = rename_noreplace(&src, &dst);
-        assert_eq!(result.unwrap_err(), nix::Error::EEXIST);
-    }
-
-    #[test]
-    fn test_relative_paths() {
-        let testdir = testdir();
-        println!("testdir: {}", testdir.path());
-
-        // this is fine because we run in nextest => process per test
-        std::env::set_current_dir(testdir.path()).unwrap();
-
-        let src = PathBuf::from("src");
-        let dst = PathBuf::from("dst");
-
-        fs::write(&src, b"").unwrap();
-        fs::write(&dst, b"").unwrap();
-
-        let result = rename_noreplace(&src, &dst);
-        assert_eq!(result.unwrap_err(), nix::Error::EEXIST);
-    }
-
-    #[test]
-    fn test_works_when_not_exists() {
-        let testdir = testdir();
-        println!("testdir: {}", testdir.path());
-
-        let src = testdir.path().join("src");
-        let dst = testdir.path().join("dst");
-
-        fs::write(&src, b"content").unwrap();
-
-        rename_noreplace(src.as_std_path(), dst.as_std_path()).unwrap();
-        assert_eq!(
-            "content",
-            String::from_utf8(std::fs::read(&dst).unwrap()).unwrap()
-        );
-    }
-}
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -34,9 +34,6 @@ pub enum ApiError {
    #[error("Timeout")]
    Timeout(Cow<'static, str>),

-    #[error("Request cancelled")]
-    Cancelled,
-
    #[error(transparent)]
    InternalServerError(anyhow::Error),
 }
@@ -77,10 +74,6 @@ impl ApiError {
                err.to_string(),
                StatusCode::REQUEST_TIMEOUT,
            ),
-            ApiError::Cancelled => HttpErrorBody::response_from_msg_and_status(
-                self.to_string(),
-                StatusCode::INTERNAL_SERVER_ERROR,
-            ),
            ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
                err.to_string(),
                StatusCode::INTERNAL_SERVER_ERROR,
@@ -140,7 +133,6 @@ pub fn api_error_handler(api_error: ApiError) -> Response<Body> {
        ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"),
        ApiError::ShuttingDown => info!("Shut down while processing HTTP request"),
        ApiError::Timeout(_) => info!("Timeout while processing HTTP request: {api_error:#}"),
-        ApiError::Cancelled => info!("Request cancelled while processing HTTP request"),
        _ => info!("Error processing HTTP request: {api_error:#}"),
    }

--- a/libs/vm_monitor/src/cgroup.rs
+++ b/libs/vm_monitor/src/cgroup.rs
@@ -25,8 +25,6 @@ pub struct Config {
    ///
    /// For simplicity, this value must be greater than or equal to `memory_history_len`.
    memory_history_log_interval: usize,
-    /// The max number of iterations to skip before logging the next iteration
-    memory_history_log_noskip_interval: Duration,
 }

 impl Default for Config {
@@ -35,7 +33,6 @@ impl Default for Config {
            memory_poll_interval: Duration::from_millis(100),
            memory_history_len: 5, // use 500ms of history for decision-making
            memory_history_log_interval: 20, // but only log every ~2s (otherwise it's spammy)
-            memory_history_log_noskip_interval: Duration::from_secs(15), // but only if it's changed, or 60 seconds have passed
        }
    }
 }
@@ -88,12 +85,7 @@ impl CgroupWatcher {

        // buffer for samples that will be logged. once full, it remains so.
        let history_log_len = self.config.memory_history_log_interval;
-        let max_skip = self.config.memory_history_log_noskip_interval;
        let mut history_log_buf = vec![MemoryStatus::zeroed(); history_log_len];
-        let mut last_logged_memusage = MemoryStatus::zeroed();
-
-        // Ensure that we're tracking a value that's definitely in the past, as Instant::now is only guaranteed to be non-decreasing on Rust's T1-supported systems.
-        let mut can_skip_logs_until = Instant::now() - max_skip;

        for t in 0_u64.. {
            ticker.tick().await;
@@ -123,24 +115,12 @@ impl CgroupWatcher {
            // equal to the logging interval, we can just log the entire buffer every time we set
            // the last entry, which also means that for this log line, we can ignore that it's a
            // ring buffer (because all the entries are in order of increasing time).
-            //
-            // We skip logging the data if data hasn't meaningfully changed in a while, unless
-            // we've already ignored previous iterations for the last max_skip period.
-            if i == history_log_len - 1
-                && (now > can_skip_logs_until
-                    || !history_log_buf
-                        .iter()
-                        .all(|usage| last_logged_memusage.status_is_close_or_similar(usage)))
-            {
+            if i == history_log_len - 1 {
                info!(
                    history = ?MemoryStatus::debug_slice(&history_log_buf),
                    summary = ?summary,
                    "Recent cgroup memory statistics history"
                );
-
-                can_skip_logs_until = now + max_skip;
-
-                last_logged_memusage = *history_log_buf.last().unwrap();
            }

            updates
@@ -252,24 +232,6 @@ impl MemoryStatus {

        DS(slice)
    }
-
-    /// Check if the other memory status is a close or similar result.
-    /// Returns true if the larger value is not larger than the smaller value
-    /// by 1/8 of the smaller value, and within 128MiB.
-    /// See tests::check_similarity_behaviour for examples of behaviour
-    fn status_is_close_or_similar(&self, other: &MemoryStatus) -> bool {
-        let margin;
-        let diff;
-        if self.non_reclaimable >= other.non_reclaimable {
-            margin = other.non_reclaimable / 8;
-            diff = self.non_reclaimable - other.non_reclaimable;
-        } else {
-            margin = self.non_reclaimable / 8;
-            diff = other.non_reclaimable - self.non_reclaimable;
-        }
-
-        diff < margin && diff < 128 * 1024 * 1024
-    }
 }

 #[cfg(test)]
@@ -299,65 +261,4 @@ mod tests {
        assert_eq!(values(2, 4), [9, 0, 1, 2]);
        assert_eq!(values(2, 10), [3, 4, 5, 6, 7, 8, 9, 0, 1, 2]);
    }
-
-    #[test]
-    fn check_similarity_behaviour() {
-        // This all accesses private methods, so we can't actually run this
-        // as doctests, because doctests run as an external crate.
-        let mut small = super::MemoryStatus {
-            non_reclaimable: 1024,
-        };
-        let mut large = super::MemoryStatus {
-            non_reclaimable: 1024 * 1024 * 1024 * 1024,
-        };
-
-        // objects are self-similar, no matter the size
-        assert!(small.status_is_close_or_similar(&small));
-        assert!(large.status_is_close_or_similar(&large));
-
-        // inequality is symmetric
-        assert!(!small.status_is_close_or_similar(&large));
-        assert!(!large.status_is_close_or_similar(&small));
-
-        small.non_reclaimable = 64;
-        large.non_reclaimable = (small.non_reclaimable / 8) * 9;
-
-        // objects are self-similar, no matter the size
-        assert!(small.status_is_close_or_similar(&small));
-        assert!(large.status_is_close_or_similar(&large));
-
-        // values are similar if the larger value is larger by less than
-        // 12.5%, i.e. 1/8 of the smaller value.
-        // In the example above, large is exactly 12.5% larger, so this doesn't
-        // match.
-        assert!(!small.status_is_close_or_similar(&large));
-        assert!(!large.status_is_close_or_similar(&small));
-
-        large.non_reclaimable -= 1;
-        assert!(large.status_is_close_or_similar(&large));
-
-        assert!(small.status_is_close_or_similar(&large));
-        assert!(large.status_is_close_or_similar(&small));
-
-        // The 1/8 rule only applies up to 128MiB of difference
-        small.non_reclaimable = 1024 * 1024 * 1024 * 1024;
-        large.non_reclaimable = small.non_reclaimable / 8 * 9;
-        assert!(small.status_is_close_or_similar(&small));
-        assert!(large.status_is_close_or_similar(&large));
-
-        assert!(!small.status_is_close_or_similar(&large));
-        assert!(!large.status_is_close_or_similar(&small));
-        // the large value is put just above the threshold
-        large.non_reclaimable = small.non_reclaimable + 128 * 1024 * 1024;
-        assert!(large.status_is_close_or_similar(&large));
-
-        assert!(!small.status_is_close_or_similar(&large));
-        assert!(!large.status_is_close_or_similar(&small));
-        // now below
-        large.non_reclaimable -= 1;
-        assert!(large.status_is_close_or_similar(&large));
-
-        assert!(small.status_is_close_or_similar(&large));
-        assert!(large.status_is_close_or_similar(&small));
-    }
 }
--- a/libs/vm_monitor/src/dispatcher.rs
+++ b/libs/vm_monitor/src/dispatcher.rs
@@ -12,11 +12,11 @@ use futures::{
    stream::{SplitSink, SplitStream},
    SinkExt, StreamExt,
 };
-use tracing::{debug, info};
+use tracing::info;

 use crate::protocol::{
-    OutboundMsg, OutboundMsgKind, ProtocolRange, ProtocolResponse, ProtocolVersion,
-    PROTOCOL_MAX_VERSION, PROTOCOL_MIN_VERSION,
+    OutboundMsg, ProtocolRange, ProtocolResponse, ProtocolVersion, PROTOCOL_MAX_VERSION,
+    PROTOCOL_MIN_VERSION,
 };

 /// The central handler for all communications in the monitor.
@@ -118,12 +118,7 @@ impl Dispatcher {
    /// serialize the wrong thing and send it, since `self.sink.send` will take
    /// any string.
    pub async fn send(&mut self, message: OutboundMsg) -> anyhow::Result<()> {
-        if matches!(&message.inner, OutboundMsgKind::HealthCheck { .. }) {
-            debug!(?message, "sending message");
-        } else {
-            info!(?message, "sending message");
-        }
-
+        info!(?message, "sending message");
        let json = serde_json::to_string(&message).context("failed to serialize message")?;
        self.sink
            .send(Message::Text(json))
--- a/libs/vm_monitor/src/runner.rs
+++ b/libs/vm_monitor/src/runner.rs
@@ -12,7 +12,7 @@ use axum::extract::ws::{Message, WebSocket};
 use futures::StreamExt;
 use tokio::sync::{broadcast, watch};
 use tokio_util::sync::CancellationToken;
-use tracing::{debug, error, info, warn};
+use tracing::{error, info, warn};

 use crate::cgroup::{self, CgroupWatcher};
 use crate::dispatcher::Dispatcher;
@@ -474,29 +474,26 @@ impl Runner {
                // there is a message from the agent
                msg = self.dispatcher.source.next() => {
                    if let Some(msg) = msg {
-                        match &msg {
+                        // Don't use 'message' as a key as the string also uses
+                        // that for its key
+                        info!(?msg, "received message");
+                        match msg {
                            Ok(msg) => {
                                let message: InboundMsg = match msg {
                                    Message::Text(text) => {
-                                        serde_json::from_str(text).context("failed to deserialize text message")?
+                                        serde_json::from_str(&text).context("failed to deserialize text message")?
                                    }
                                    other => {
                                        warn!(
                                            // Don't use 'message' as a key as the
                                            // string also uses that for its key
                                            msg = ?other,
-                                            "problem processing incoming message: agent should only send text messages but received different type"
+                                            "agent should only send text messages but received different type"
                                        );
                                        continue
                                    },
                                };

-                                if matches!(&message.inner, InboundMsgKind::HealthCheck { .. }) {
-                                    debug!(?msg, "received message");
-                                } else {
-                                    info!(?msg, "received message");
-                                }
-
                                let out = match self.process_message(message.clone()).await {
                                    Ok(Some(out)) => out,
                                    Ok(None) => continue,
@@ -520,11 +517,7 @@ impl Runner {
                                    .await
                                    .context("failed to send message")?;
                            }
-                            Err(e) => warn!(
-                                error = format!("{e}"),
-                                msg = ?msg,
-                                "received error message"
-                            ),
+                            Err(e) => warn!("{e}"),
                        }
                    } else {
                        anyhow::bail!("dispatcher connection closed")
--- a/pageserver/ctl/src/index_part.rs
+++ b/pageserver/ctl/src/index_part.rs
@@ -1,6 +1,11 @@
+use std::collections::HashMap;
+
 use anyhow::Context;
 use camino::Utf8PathBuf;
-use pageserver::tenant::IndexPart;
+use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
+use pageserver::tenant::storage_layer::LayerName;
+use pageserver::tenant::{metadata::TimelineMetadata, IndexPart};
+use utils::lsn::Lsn;

 #[derive(clap::Subcommand)]
 pub(crate) enum IndexPartCmd {
@@ -12,7 +17,20 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
        IndexPartCmd::Dump { path } => {
            let bytes = tokio::fs::read(path).await.context("read file")?;
            let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?;
-            let output = serde_json::to_string_pretty(&des).context("serialize output")?;
+            #[derive(serde::Serialize)]
+            struct Output<'a> {
+                layer_metadata: &'a HashMap<LayerName, LayerFileMetadata>,
+                disk_consistent_lsn: Lsn,
+                timeline_metadata: &'a TimelineMetadata,
+            }
+
+            let output = Output {
+                layer_metadata: &des.layer_metadata,
+                disk_consistent_lsn: des.get_disk_consistent_lsn(),
+                timeline_metadata: &des.metadata,
+            };
+
+            let output = serde_json::to_string_pretty(&output).context("serialize output")?;
            println!("{output}");
            Ok(())
        }
--- a/pageserver/src/aux_file.rs
+++ b/pageserver/src/aux_file.rs
@@ -178,8 +178,7 @@ impl AuxFileSizeEstimator {
        }
    }

-    /// When generating base backup or doing initial logical size calculation
-    pub fn on_initial(&self, new_size: usize) {
+    pub fn on_base_backup(&self, new_size: usize) {
        let mut guard = self.size.lock().unwrap();
        *guard = Some(new_size as isize);
        self.report(new_size as isize);
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -362,13 +362,6 @@ where
                    ));
                    info!("Replication slot {} restart LSN={}", path, restart_lsn);
                    min_restart_lsn = Lsn::min(min_restart_lsn, restart_lsn);
-                } else if path == "pg_logical/replorigin_checkpoint" {
-                    // replorigin_checkoint is written only on compute shutdown, so it contains
-                    // deteriorated values. So we generate our own version of this file for the particular LSN
-                    // based on information about replorigins extracted from transaction commit records.
-                    // In future we will not generate AUX record for "pg_logical/replorigin_checkpoint" at all,
-                    // but now we should handle (skip) it for backward compatibility.
-                    continue;
                }
                let header = new_tar_header(&path, content.len() as u64)?;
                self.ar
@@ -397,32 +390,6 @@ where
        {
            self.add_twophase_file(xid).await?;
        }
-        let repl_origins = self
-            .timeline
-            .get_replorigins(self.lsn, self.ctx)
-            .await
-            .map_err(|e| BasebackupError::Server(e.into()))?;
-        let n_origins = repl_origins.len();
-        if n_origins != 0 {
-            //
-            // Construct "pg_logical/replorigin_checkpoint" file based on information about replication origins
-            // extracted from transaction commit record. We are using this file to pass information about replication
-            // origins to compute to allow logical replication to restart from proper point.
-            //
-            let mut content = Vec::with_capacity(n_origins * 16 + 8);
-            content.extend_from_slice(&pg_constants::REPLICATION_STATE_MAGIC.to_le_bytes());
-            for (origin_id, origin_lsn) in repl_origins {
-                content.extend_from_slice(&origin_id.to_le_bytes());
-                content.extend_from_slice(&[0u8; 6]); // align to 8 bytes
-                content.extend_from_slice(&origin_lsn.0.to_le_bytes());
-            }
-            let crc32 = crc32c::crc32c(&content);
-            content.extend_from_slice(&crc32.to_le_bytes());
-            let header = new_tar_header("pg_logical/replorigin_checkpoint", content.len() as u64)?;
-            self.ar.append(&header, &*content).await.context(
-                "could not add pg_logical/replorigin_checkpoint file to basebackup tarball",
-            )?;
-        }

        fail_point!("basebackup-before-control-file", |_| {
            Err(BasebackupError::Server(anyhow!(
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -99,6 +99,8 @@ pub mod defaults {

    pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;

+    pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "async";
+
    ///
    /// Default built-in configuration file.
    ///
@@ -144,6 +146,8 @@ pub mod defaults {

 #validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}'

+#walredo_process_kind = '{DEFAULT_WALREDO_PROCESS_KIND}'
+
 [tenant_config]
 #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
 #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -296,6 +300,8 @@ pub struct PageServerConf {
    ///
    /// Setting this to zero disables limits on total ephemeral layer size.
    pub ephemeral_bytes_per_memory_kb: usize,
+
+    pub walredo_process_kind: crate::walredo::ProcessKind,
 }

 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -401,6 +407,8 @@ struct PageServerConfigBuilder {
    validate_vectored_get: BuilderValue<bool>,

    ephemeral_bytes_per_memory_kb: BuilderValue<usize>,
+
+    walredo_process_kind: BuilderValue<crate::walredo::ProcessKind>,
 }

 impl PageServerConfigBuilder {
@@ -489,6 +497,8 @@ impl PageServerConfigBuilder {
            )),
            validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
            ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
+
+            walredo_process_kind: Set(DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap()),
        }
    }
 }
@@ -676,6 +686,10 @@ impl PageServerConfigBuilder {
        self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value);
    }

+    pub fn get_walredo_process_kind(&mut self, value: crate::walredo::ProcessKind) {
+        self.walredo_process_kind = BuilderValue::Set(value);
+    }
+
    pub fn build(self) -> anyhow::Result<PageServerConf> {
        let default = Self::default_values();

@@ -733,6 +747,7 @@ impl PageServerConfigBuilder {
                max_vectored_read_bytes,
                validate_vectored_get,
                ephemeral_bytes_per_memory_kb,
+                walredo_process_kind,
            }
            CUSTOM LOGIC
            {
@@ -1029,6 +1044,9 @@ impl PageServerConf {
                "ephemeral_bytes_per_memory_kb" => {
                    builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize)
                }
+                "walredo_process_kind" => {
+                    builder.get_walredo_process_kind(parse_toml_from_str("walredo_process_kind", item)?)
+                }
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
        }
@@ -1112,6 +1130,7 @@ impl PageServerConf {
            ),
            validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
            ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
+            walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
        }
    }
 }
@@ -1351,6 +1370,7 @@ background_task_maximum_delay = '334 s'
                ),
                validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
                ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
+                walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
            },
            "Correct defaults should be used when no config values are provided"
        );
@@ -1424,6 +1444,7 @@ background_task_maximum_delay = '334 s'
                ),
                validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
                ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
+                walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
            },
            "Should be able to parse all basic config values correctly"
        );
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -2,9 +2,10 @@
 //! and push them to a HTTP endpoint.
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
-use crate::tenant::size::CalculateSyntheticSizeError;
 use crate::tenant::tasks::BackgroundLoopKind;
-use crate::tenant::{mgr::TenantManager, LogicalSizeCalculationCause, Tenant};
+use crate::tenant::{
+    mgr::TenantManager, LogicalSizeCalculationCause, PageReconstructError, Tenant,
+};
 use camino::Utf8PathBuf;
 use consumption_metrics::EventType;
 use pageserver_api::models::TenantState;
@@ -349,12 +350,19 @@ async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &Re
    // Same for the loop that fetches computed metrics.
    // By using the same limiter, we centralize metrics collection for "start" and "finished" counters,
    // which turns out is really handy to understand the system.
-    match tenant.calculate_synthetic_size(CAUSE, cancel, ctx).await {
-        Ok(_) => {}
-        Err(CalculateSyntheticSizeError::Cancelled) => {}
-        Err(e) => {
-            let tenant_shard_id = tenant.tenant_shard_id();
-            error!("failed to calculate synthetic size for tenant {tenant_shard_id}: {e:#}");
-        }
+    let Err(e) = tenant.calculate_synthetic_size(CAUSE, cancel, ctx).await else {
+        return;
+    };
+
+    // This error can be returned if a timeline is shutting down; in that case it is not
+    // logged, and it does not mean the synthetic size worker should terminate.
+    let shutting_down = matches!(
+        e.downcast_ref::<PageReconstructError>(),
+        Some(PageReconstructError::Cancelled)
+    );
+
+    if !shutting_down {
+        let tenant_shard_id = tenant.tenant_shard_id();
+        error!("failed to calculate synthetic size for tenant {tenant_shard_id}: {e:#}");
    }
 }
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -81,10 +81,8 @@ paths:
        Attempts to delete specified tenant. 500, 503 and 409 errors should be retried until 404 is retrieved.
        404 means that deletion successfully finished"
      responses:
-        "200":
-          description: Tenant was successfully deleted, or was already not found.
        "404":
-          description: Tenant not found. This is a success result, equivalent to 200.
+          description: Tenant not found. This is the success path.
          content:
            application/json:
              schema:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -181,7 +181,9 @@ impl From<PageReconstructError> for ApiError {
            PageReconstructError::MissingKey(e) => {
                ApiError::InternalServerError(anyhow::anyhow!("{e}"))
            }
-            PageReconstructError::Cancelled => ApiError::Cancelled,
+            PageReconstructError::Cancelled => {
+                ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
+            }
            PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()),
            PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre),
        }
@@ -1071,7 +1073,7 @@ async fn tenant_delete_handler(

    let state = get_state(&request);

-    let status = state
+    state
        .tenant_manager
        .delete_tenant(tenant_shard_id, ACTIVE_TENANT_TIMEOUT)
        .instrument(info_span!("tenant_delete_handler",
@@ -1080,14 +1082,7 @@ async fn tenant_delete_handler(
        ))
        .await?;

-    // Callers use 404 as success for deletions, for historical reasons.
-    if status == StatusCode::NOT_FOUND {
-        return Err(ApiError::NotFound(
-            anyhow::anyhow!("Deletion complete").into(),
-        ));
-    }
-
-    json_response(status, ())
+    json_response(StatusCode::ACCEPTED, ())
 }

 /// HTTP endpoint to query the current tenant_size of a tenant.
@@ -1135,10 +1130,7 @@ async fn tenant_size_handler(
            &ctx,
        )
        .await
-        .map_err(|e| match e {
-            crate::tenant::size::CalculateSyntheticSizeError::Cancelled => ApiError::ShuttingDown,
-            other => ApiError::InternalServerError(anyhow::anyhow!(other)),
-        })?;
+        .map_err(ApiError::InternalServerError)?;

    let mut sizes = None;
    let accepts_html = headers
@@ -1146,7 +1138,9 @@ async fn tenant_size_handler(
        .map(|v| v == "text/html")
        .unwrap_or_default();
    if !inputs_only.unwrap_or(false) {
-        let storage_model = inputs.calculate_model();
+        let storage_model = inputs
+            .calculate_model()
+            .map_err(ApiError::InternalServerError)?;
        let size = storage_model.calculate();

        // If request header expects html, return html
@@ -2188,7 +2182,7 @@ async fn tenant_scan_remote_handler(
            {
                Ok((index_part, index_generation)) => {
                    tracing::info!("Found timeline {tenant_shard_id}/{timeline_id} metadata (gen {index_generation:?}, {} layers, {} consistent LSN)",
-                        index_part.layer_metadata.len(), index_part.metadata.disk_consistent_lsn());
+                        index_part.layer_metadata.len(), index_part.get_disk_consistent_lsn());
                    generation = std::cmp::max(generation, index_generation);
                }
                Err(DownloadError::NotFound) => {
@@ -2430,25 +2424,6 @@ async fn list_aux_files(
    json_response(StatusCode::OK, files)
 }

-async fn perf_info(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
-    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
-    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
-
-    let state = get_state(&request);
-
-    let timeline =
-        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
-            .await?;
-
-    let result = timeline.perf_info().await;
-
-    json_response(StatusCode::OK, result)
-}
-
 async fn ingest_aux_files(
    mut request: Request<Body>,
    _cancel: CancellationToken,
@@ -2876,9 +2851,5 @@ pub fn make_router(
            |r| testing_api_handler("list_aux_files", r, list_aux_files),
        )
        .post("/v1/top_tenants", |r| api_handler(r, post_top_tenants))
-        .post(
-            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/perf_info",
-            |r| testing_api_handler("perf_info", r, perf_info),
-        )
        .any(handler_404))
 }
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -2108,7 +2108,6 @@ pub(crate) struct TimelineMetrics {
    pub directory_entries_count_gauge: Lazy<UIntGauge, Box<dyn Send + Fn() -> UIntGauge>>,
    pub evictions: IntCounter,
    pub evictions_with_low_residence_duration: std::sync::RwLock<EvictionsWithLowResidenceDuration>,
-    shutdown: std::sync::atomic::AtomicBool,
 }

 impl TimelineMetrics {
@@ -2228,7 +2227,6 @@ impl TimelineMetrics {
            evictions_with_low_residence_duration: std::sync::RwLock::new(
                evictions_with_low_residence_duration,
            ),
-            shutdown: std::sync::atomic::AtomicBool::default(),
        }
    }

@@ -2251,17 +2249,6 @@ impl TimelineMetrics {
    }

    pub(crate) fn shutdown(&self) {
-        let was_shutdown = self
-            .shutdown
-            .swap(true, std::sync::atomic::Ordering::Relaxed);
-
-        if was_shutdown {
-            // this happens on tenant deletion because tenant first shuts down timelines, then
-            // invokes timeline deletion which first shuts down the timeline again.
-            // TODO: this can be removed once https://github.com/neondatabase/neon/issues/5080
-            return;
-        }
-
        let tenant_id = &self.tenant_id;
        let timeline_id = &self.timeline_id;
        let shard_id = &self.shard_id;
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -18,16 +18,16 @@ use enum_map::Enum;
 use itertools::Itertools;
 use pageserver_api::key::{
    dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key,
-    relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
-    slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
-    AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
+    relmap_file_key, slru_block_to_key, slru_dir_to_key, slru_segment_key_range,
+    slru_segment_size_to_key, twophase_file_key, twophase_key_range, AUX_FILES_KEY, CHECKPOINT_KEY,
+    CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
 };
 use pageserver_api::keyspace::SparseKeySpace;
 use pageserver_api::models::AuxFilePolicy;
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::BLCKSZ;
-use postgres_ffi::{Oid, RepOriginId, TimestampTz, TransactionId};
+use postgres_ffi::{Oid, TimestampTz, TransactionId};
 use serde::{Deserialize, Serialize};
 use std::collections::{hash_map, HashMap, HashSet};
 use std::ops::ControlFlow;
@@ -36,7 +36,6 @@ use strum::IntoEnumIterator;
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, info, trace, warn};
 use utils::bin_ser::DeserializeError;
-use utils::pausable_failpoint;
 use utils::vec_map::{VecMap, VecMapOrdering};
 use utils::{bin_ser::BeSer, lsn::Lsn};

@@ -410,8 +409,6 @@ impl Timeline {
        cancel: &CancellationToken,
        ctx: &RequestContext,
    ) -> Result<LsnForTimestamp, PageReconstructError> {
-        pausable_failpoint!("find-lsn-for-timestamp-pausable");
-
        let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn();
        // We use this method to figure out the branching LSN for the new branch, but the
        // GC cutoff could be before the branching point and we cannot create a new branch
@@ -427,7 +424,6 @@ impl Timeline {

        let mut found_smaller = false;
        let mut found_larger = false;
-
        while low < high {
            if cancel.is_cancelled() {
                return Err(PageReconstructError::Cancelled);
@@ -722,22 +718,10 @@ impl Timeline {
                result.insert(fname, content);
            }
        }
-        self.aux_file_size_estimator.on_initial(sz);
+        self.aux_file_size_estimator.on_base_backup(sz);
        Ok(result)
    }

-    pub(crate) async fn trigger_aux_file_size_computation(
-        &self,
-        lsn: Lsn,
-        ctx: &RequestContext,
-    ) -> Result<(), PageReconstructError> {
-        let current_policy = self.last_aux_file_policy.load();
-        if let Some(AuxFilePolicy::V2) | Some(AuxFilePolicy::CrossValidation) = current_policy {
-            self.list_aux_files_v2(lsn, ctx).await?;
-        }
-        Ok(())
-    }
-
    pub(crate) async fn list_aux_files(
        &self,
        lsn: Lsn,
@@ -776,27 +760,6 @@ impl Timeline {
        }
    }

-    pub(crate) async fn get_replorigins(
-        &self,
-        lsn: Lsn,
-        ctx: &RequestContext,
-    ) -> Result<HashMap<RepOriginId, Lsn>, PageReconstructError> {
-        let kv = self
-            .scan(KeySpace::single(repl_origin_key_range()), lsn, ctx)
-            .await
-            .context("scan")?;
-        let mut result = HashMap::new();
-        for (k, v) in kv {
-            let v = v.context("get value")?;
-            let origin_id = k.field6 as RepOriginId;
-            let origin_lsn = Lsn::des(&v).unwrap();
-            if origin_lsn != Lsn::INVALID {
-                result.insert(origin_id, origin_lsn);
-            }
-        }
-        Ok(result)
-    }
-
    /// Does the same as get_current_logical_size but counted on demand.
    /// Used to initialize the logical size tracking on startup.
    ///
@@ -919,20 +882,10 @@ impl Timeline {
            result.add_key(AUX_FILES_KEY);
        }

-        #[cfg(test)]
-        {
-            let guard = self.extra_test_dense_keyspace.load();
-            for kr in &guard.ranges {
-                result.add_range(kr.clone());
-            }
-        }
-
        Ok((
            result.to_keyspace(),
            /* AUX sparse key space */
-            SparseKeySpace(KeySpace {
-                ranges: vec![repl_origin_key_range(), Key::metadata_aux_key_range()],
-            }),
+            SparseKeySpace(KeySpace::single(Key::metadata_aux_key_range())),
        ))
    }

@@ -1201,20 +1154,6 @@ impl<'a> DatadirModification<'a> {
        Ok(())
    }

-    pub async fn set_replorigin(
-        &mut self,
-        origin_id: RepOriginId,
-        origin_lsn: Lsn,
-    ) -> anyhow::Result<()> {
-        let key = repl_origin_key(origin_id);
-        self.put(key, Value::Image(origin_lsn.ser().unwrap().into()));
-        Ok(())
-    }
-
-    pub async fn drop_replorigin(&mut self, origin_id: RepOriginId) -> anyhow::Result<()> {
-        self.set_replorigin(origin_id, Lsn::INVALID).await
-    }
-
    pub fn put_control_file(&mut self, img: Bytes) -> anyhow::Result<()> {
        self.put(CONTROLFILE_KEY, Value::Image(img));
        Ok(())
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -509,24 +509,11 @@ pub(crate) enum GcError {
    #[error(transparent)]
    Remote(anyhow::Error),

-    // An error reading while calculating GC cutoffs
-    #[error(transparent)]
-    GcCutoffs(PageReconstructError),
-
    // If GC was invoked for a particular timeline, this error means it didn't exist
    #[error("timeline not found")]
    TimelineNotFound,
 }

-impl From<PageReconstructError> for GcError {
-    fn from(value: PageReconstructError) -> Self {
-        match value {
-            PageReconstructError::Cancelled => Self::TimelineCancelled,
-            other => Self::GcCutoffs(other),
-        }
-    }
-}
-
 impl Tenant {
    /// Yet another helper for timeline initialization.
    ///
@@ -1046,6 +1033,7 @@ impl Tenant {
                remote_metadata,
                TimelineResources {
                    remote_client,
+                    deletion_queue_client: self.deletion_queue_client.clone(),
                    timeline_get_throttle: self.timeline_get_throttle.clone(),
                },
                ctx,
@@ -1071,6 +1059,7 @@ impl Tenant {
                timeline_id,
                &index_part.metadata,
                remote_timeline_client,
+                self.deletion_queue_client.clone(),
            )
            .instrument(tracing::info_span!("timeline_delete", %timeline_id))
            .await
@@ -2932,9 +2921,17 @@ impl Tenant {
                .checked_sub(horizon)
                .unwrap_or(Lsn(0));

-            let cutoffs = timeline.find_gc_cutoffs(cutoff, pitr, cancel, ctx).await?;
-            let old = gc_cutoffs.insert(timeline.timeline_id, cutoffs);
-            assert!(old.is_none());
+            let res = timeline.find_gc_cutoffs(cutoff, pitr, cancel, ctx).await;
+
+            match res {
+                Ok(cutoffs) => {
+                    let old = gc_cutoffs.insert(timeline.timeline_id, cutoffs);
+                    assert!(old.is_none());
+                }
+                Err(e) => {
+                    tracing::warn!(timeline_id = %timeline.timeline_id, "ignoring failure to find gc cutoffs: {e:#}");
+                }
+            }
        }

        if !self.is_active() || self.cancel.is_cancelled() {
@@ -3398,12 +3395,6 @@ impl Tenant {
        let tenant_shard_id = raw_timeline.owning_tenant.tenant_shard_id;
        let unfinished_timeline = raw_timeline.raw_timeline()?;

-        // Flush the new layer files to disk, before we make the timeline as available to
-        // the outside world.
-        //
-        // Flush loop needs to be spawned in order to be able to flush.
-        unfinished_timeline.maybe_spawn_flush_loop();
-
        import_datadir::import_timeline_from_postgres_datadir(
            unfinished_timeline,
            &pgdata_path,
@@ -3415,6 +3406,12 @@ impl Tenant {
            format!("Failed to import pgdatadir for timeline {tenant_shard_id}/{timeline_id}")
        })?;

+        // Flush the new layer files to disk, before we mark the timeline as available to
+        // the outside world.
+        //
+        // Flush loop needs to be spawned in order to be able to flush.
+        unfinished_timeline.maybe_spawn_flush_loop();
+
        fail::fail_point!("before-checkpoint-new-timeline", |_| {
            anyhow::bail!("failpoint before-checkpoint-new-timeline");
        });
@@ -3446,6 +3443,7 @@ impl Tenant {
        );
        TimelineResources {
            remote_client,
+            deletion_queue_client: self.deletion_queue_client.clone(),
            timeline_get_throttle: self.timeline_get_throttle.clone(),
        }
    }
@@ -3555,7 +3553,7 @@ impl Tenant {
        cause: LogicalSizeCalculationCause,
        cancel: &CancellationToken,
        ctx: &RequestContext,
-    ) -> Result<size::ModelInputs, size::CalculateSyntheticSizeError> {
+    ) -> anyhow::Result<size::ModelInputs> {
        let logical_sizes_at_once = self
            .conf
            .concurrent_tenant_size_logical_size_queries
@@ -3570,8 +3568,8 @@ impl Tenant {
        // See more for on the issue #2748 condenced out of the initial PR review.
        let mut shared_cache = tokio::select! {
            locked = self.cached_logical_sizes.lock() => locked,
-            _ = cancel.cancelled() => return Err(size::CalculateSyntheticSizeError::Cancelled),
-            _ = self.cancel.cancelled() => return Err(size::CalculateSyntheticSizeError::Cancelled),
+            _ = cancel.cancelled() => anyhow::bail!("cancelled"),
+            _ = self.cancel.cancelled() => anyhow::bail!("tenant is shutting down"),
        };

        size::gather_inputs(
@@ -3595,10 +3593,10 @@ impl Tenant {
        cause: LogicalSizeCalculationCause,
        cancel: &CancellationToken,
        ctx: &RequestContext,
-    ) -> Result<u64, size::CalculateSyntheticSizeError> {
+    ) -> anyhow::Result<u64> {
        let inputs = self.gather_size_inputs(None, cause, cancel, ctx).await?;

-        let size = inputs.calculate();
+        let size = inputs.calculate()?;

        self.set_cached_synthetic_size(size);

@@ -3867,9 +3865,6 @@ pub(crate) mod harness {
        pub fn create_custom(
            test_name: &'static str,
            tenant_conf: TenantConf,
-            tenant_id: TenantId,
-            shard_identity: ShardIdentity,
-            generation: Generation,
        ) -> anyhow::Result<Self> {
            setup_logging();

@@ -3882,12 +3877,8 @@ pub(crate) mod harness {
            // OK in a test.
            let conf: &'static PageServerConf = Box::leak(Box::new(conf));

-            let shard = shard_identity.shard_index();
-            let tenant_shard_id = TenantShardId {
-                tenant_id,
-                shard_number: shard.shard_number,
-                shard_count: shard.shard_count,
-            };
+            let tenant_id = TenantId::generate();
+            let tenant_shard_id = TenantShardId::unsharded(tenant_id);
            fs::create_dir_all(conf.tenant_path(&tenant_shard_id))?;
            fs::create_dir_all(conf.timelines_path(&tenant_shard_id))?;

@@ -3905,8 +3896,8 @@ pub(crate) mod harness {
                conf,
                tenant_conf,
                tenant_shard_id,
-                generation,
-                shard,
+                generation: Generation::new(0xdeadbeef),
+                shard: ShardIndex::unsharded(),
                remote_storage,
                remote_fs_dir,
                deletion_queue,
@@ -3921,15 +3912,8 @@ pub(crate) mod harness {
                compaction_period: Duration::ZERO,
                ..TenantConf::default()
            };
-            let tenant_id = TenantId::generate();
-            let shard = ShardIdentity::unsharded();
-            Self::create_custom(
-                test_name,
-                tenant_conf,
-                tenant_id,
-                shard,
-                Generation::new(0xdeadbeef),
-            )
+
+            Self::create_custom(test_name, tenant_conf)
        }

        pub fn span(&self) -> tracing::Span {
@@ -4008,8 +3992,8 @@ pub(crate) mod harness {
                let base_img = base_img.expect("Neon WAL redo requires base image").1;
                let mut page = BytesMut::new();
                page.extend_from_slice(&base_img);
-                for (record_lsn, record) in records {
-                    apply_neon::apply_in_neon(&record, record_lsn, key, &mut page)?;
+                for (_record_lsn, record) in records {
+                    apply_neon::apply_in_neon(&record, key, &mut page)?;
                }
                Ok(page.freeze())
            } else {
@@ -4043,20 +4027,16 @@ mod tests {
    use crate::repository::{Key, Value};
    use crate::tenant::harness::*;
    use crate::tenant::timeline::CompactFlags;
-    use crate::walrecord::NeonWalRecord;
    use crate::DEFAULT_PG_VERSION;
    use bytes::{Bytes, BytesMut};
    use hex_literal::hex;
-    use itertools::Itertools;
    use pageserver_api::key::{AUX_FILES_KEY, AUX_KEY_PREFIX, NON_INHERITED_RANGE};
    use pageserver_api::keyspace::KeySpace;
    use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings};
    use rand::{thread_rng, Rng};
-    use storage_layer::PersistentLayerKey;
    use tests::storage_layer::ValuesReconstructState;
    use tests::timeline::{GetVectoredError, ShutdownMode};
    use utils::bin_ser::BeSer;
-    use utils::id::TenantId;

    static TEST_KEY: Lazy<Key> =
        Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));
@@ -4956,13 +4936,7 @@ mod tests {
            ..TenantConf::default()
        };

-        let harness = TenantHarness::create_custom(
-            "test_get_vectored_key_gap",
-            tenant_conf,
-            TenantId::generate(),
-            ShardIdentity::unsharded(),
-            Generation::new(0xdeadbeef),
-        )?;
+        let harness = TenantHarness::create_custom("test_get_vectored_key_gap", tenant_conf)?;
        let (tenant, ctx) = harness.load().await;

        let mut current_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
@@ -5267,9 +5241,6 @@ mod tests {
        let cancel = CancellationToken::new();

        let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
-        let mut test_key_end = test_key;
-        test_key_end.field6 = NUM_KEYS as u32;
-        tline.add_extra_test_dense_keyspace(KeySpace::single(test_key..test_key_end));

        let mut keyspace = KeySpaceAccum::new();

@@ -6229,8 +6200,8 @@ mod tests {

        let cancel = CancellationToken::new();

-        let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap();
-        assert_eq!(base_key.field1, AUX_KEY_PREFIX); // in case someone accidentally changed the prefix...
+        let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
+        base_key.field1 = AUX_KEY_PREFIX;
        let mut test_key = base_key;
        let mut lsn = Lsn(0x10);

@@ -6335,7 +6306,6 @@ mod tests {
                Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN
            )
            .await?;
-        tline.add_extra_test_dense_keyspace(KeySpace::single(base_key..(base_key_nonexist.next())));

        let child = tenant
            .branch_timeline_test_with_layers(
@@ -6593,8 +6563,8 @@ mod tests {
    }

    #[tokio::test]
-    async fn test_metadata_tombstone_image_creation() {
-        let harness = TenantHarness::create("test_metadata_tombstone_image_creation").unwrap();
+    async fn test_metadata_tombstone_image_creation() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_metadata_tombstone_image_creation")?;
        let (tenant, ctx) = harness.load().await;

        let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap();
@@ -6622,8 +6592,7 @@ mod tests {
                vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])],
                Lsn(0x30),
            )
-            .await
-            .unwrap();
+            .await?;

        let cancel = CancellationToken::new();

@@ -6638,24 +6607,23 @@ mod tests {
                },
                &ctx,
            )
-            .await
-            .unwrap();
+            .await?;

        // Image layers are created at last_record_lsn
        let images = tline
            .inspect_image_layers(Lsn(0x30), &ctx)
-            .await
-            .unwrap()
+            .await?
            .into_iter()
            .filter(|(k, _)| k.is_metadata_key())
            .collect::<Vec<_>>();
        assert_eq!(images.len(), 2); // the image layer should only contain two existing keys, tombstones should be removed.
+
+        Ok(())
    }

    #[tokio::test]
-    async fn test_metadata_tombstone_empty_image_creation() {
-        let harness =
-            TenantHarness::create("test_metadata_tombstone_empty_image_creation").unwrap();
+    async fn test_metadata_tombstone_empty_image_creation() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_metadata_tombstone_image_creation")?;
        let (tenant, ctx) = harness.load().await;

        let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap();
@@ -6677,8 +6645,7 @@ mod tests {
                vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])],
                Lsn(0x30),
            )
-            .await
-            .unwrap();
+            .await?;

        let cancel = CancellationToken::new();

@@ -6693,249 +6660,16 @@ mod tests {
                },
                &ctx,
            )
-            .await
-            .unwrap();
+            .await?;

        // Image layers are created at last_record_lsn
        let images = tline
            .inspect_image_layers(Lsn(0x30), &ctx)
-            .await
-            .unwrap()
+            .await?
            .into_iter()
            .filter(|(k, _)| k.is_metadata_key())
            .collect::<Vec<_>>();
        assert_eq!(images.len(), 0); // the image layer should not contain tombstones, or it is not created
-    }
-
-    #[tokio::test]
-    async fn test_simple_bottom_most_compaction_images() -> anyhow::Result<()> {
-        let harness = TenantHarness::create("test_simple_bottom_most_compaction_images")?;
-        let (tenant, ctx) = harness.load().await;
-
-        fn get_key(id: u32) -> Key {
-            // using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
-            let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
-            key.field6 = id;
-            key
-        }
-
-        // We create one bottom-most image layer, a delta layer D1 crossing the GC horizon, D2 below the horizon, and D3 above the horizon.
-        //
-        //  | D1 |                       | D3 |
-        // -|    |-- gc horizon -----------------
-        //  |    |                | D2 |
-        // --------- img layer ------------------
-        //
-        // What we should expact from this compaction is:
-        //  | Part of D1 |               | D3 |
-        // --------- img layer with D1+D2 at GC horizon------------------
-
-        // img layer at 0x10
-        let img_layer = (0..10)
-            .map(|id| (get_key(id), test_img(&format!("value {id}@0x10"))))
-            .collect_vec();
-
-        let delta1 = vec![
-            // TODO: we should test a real delta record here, which requires us to add a variant of NeonWalRecord for testing purpose.
-            (
-                get_key(1),
-                Lsn(0x20),
-                Value::Image(test_img("value 1@0x20")),
-            ),
-            (
-                get_key(2),
-                Lsn(0x30),
-                Value::Image(test_img("value 2@0x30")),
-            ),
-            (
-                get_key(3),
-                Lsn(0x40),
-                Value::Image(test_img("value 3@0x40")),
-            ),
-        ];
-        let delta2 = vec![
-            (
-                get_key(5),
-                Lsn(0x20),
-                Value::Image(test_img("value 5@0x20")),
-            ),
-            (
-                get_key(6),
-                Lsn(0x20),
-                Value::Image(test_img("value 6@0x20")),
-            ),
-        ];
-        let delta3 = vec![
-            (
-                get_key(8),
-                Lsn(0x40),
-                Value::Image(test_img("value 8@0x40")),
-            ),
-            (
-                get_key(9),
-                Lsn(0x40),
-                Value::Image(test_img("value 9@0x40")),
-            ),
-        ];
-
-        let tline = tenant
-            .create_test_timeline_with_layers(
-                TIMELINE_ID,
-                Lsn(0x10),
-                DEFAULT_PG_VERSION,
-                &ctx,
-                vec![delta1, delta2, delta3], // delta layers
-                vec![(Lsn(0x10), img_layer)], // image layers
-                Lsn(0x50),
-            )
-            .await?;
-        {
-            // Update GC info
-            let mut guard = tline.gc_info.write().unwrap();
-            guard.cutoffs.pitr = Lsn(0x30);
-            guard.cutoffs.horizon = Lsn(0x30);
-        }
-
-        let cancel = CancellationToken::new();
-        tline.compact_with_gc(&cancel, &ctx).await.unwrap();
-
-        // Check if the image layer at the GC horizon contains exactly what we want
-        let image_at_gc_horizon = tline
-            .inspect_image_layers(Lsn(0x30), &ctx)
-            .await
-            .unwrap()
-            .into_iter()
-            .filter(|(k, _)| k.is_metadata_key())
-            .collect::<Vec<_>>();
-
-        assert_eq!(image_at_gc_horizon.len(), 10);
-        let expected_lsn = [0x10, 0x20, 0x30, 0x10, 0x10, 0x20, 0x20, 0x10, 0x10, 0x10];
-        for idx in 0..10 {
-            assert_eq!(
-                image_at_gc_horizon[idx],
-                (
-                    get_key(idx as u32),
-                    test_img(&format!("value {idx}@{:#x}", expected_lsn[idx]))
-                )
-            );
-        }
-
-        // Check if old layers are removed / new layers have the expected LSN
-        let mut all_layers = tline.inspect_historic_layers().await.unwrap();
-        all_layers.sort_by(|k1, k2| {
-            (
-                k1.is_delta,
-                k1.key_range.start,
-                k1.key_range.end,
-                k1.lsn_range.start,
-                k1.lsn_range.end,
-            )
-                .cmp(&(
-                    k2.is_delta,
-                    k2.key_range.start,
-                    k2.key_range.end,
-                    k2.lsn_range.start,
-                    k2.lsn_range.end,
-                ))
-        });
-        assert_eq!(
-            all_layers,
-            vec![
-                // Image layer at GC horizon
-                PersistentLayerKey {
-                    key_range: Key::MIN..get_key(10),
-                    lsn_range: Lsn(0x30)..Lsn(0x31),
-                    is_delta: false
-                },
-                // The delta layer that is cut in the middle
-                PersistentLayerKey {
-                    key_range: Key::MIN..get_key(9),
-                    lsn_range: Lsn(0x30)..Lsn(0x41),
-                    is_delta: true
-                },
-                // The delta layer we created and should not be picked for the compaction
-                PersistentLayerKey {
-                    key_range: get_key(8)..get_key(10),
-                    lsn_range: Lsn(0x40)..Lsn(0x41),
-                    is_delta: true
-                }
-            ]
-        );
-
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn test_neon_test_record() -> anyhow::Result<()> {
-        let harness = TenantHarness::create("test_neon_test_record")?;
-        let (tenant, ctx) = harness.load().await;
-
-        fn get_key(id: u32) -> Key {
-            // using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
-            let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
-            key.field6 = id;
-            key
-        }
-
-        let delta1 = vec![
-            (
-                get_key(1),
-                Lsn(0x20),
-                Value::WalRecord(NeonWalRecord::wal_append(",0x20")),
-            ),
-            (
-                get_key(1),
-                Lsn(0x30),
-                Value::WalRecord(NeonWalRecord::wal_append(",0x30")),
-            ),
-            (get_key(2), Lsn(0x10), Value::Image("0x10".into())),
-            (
-                get_key(2),
-                Lsn(0x20),
-                Value::WalRecord(NeonWalRecord::wal_append(",0x20")),
-            ),
-            (
-                get_key(2),
-                Lsn(0x30),
-                Value::WalRecord(NeonWalRecord::wal_append(",0x30")),
-            ),
-            (get_key(3), Lsn(0x10), Value::Image("0x10".into())),
-            (
-                get_key(3),
-                Lsn(0x20),
-                Value::WalRecord(NeonWalRecord::wal_clear()),
-            ),
-            (get_key(4), Lsn(0x10), Value::Image("0x10".into())),
-            (
-                get_key(4),
-                Lsn(0x20),
-                Value::WalRecord(NeonWalRecord::wal_init()),
-            ),
-        ];
-        let image1 = vec![(get_key(1), "0x10".into())];
-
-        let tline = tenant
-            .create_test_timeline_with_layers(
-                TIMELINE_ID,
-                Lsn(0x10),
-                DEFAULT_PG_VERSION,
-                &ctx,
-                vec![delta1],              // delta layers
-                vec![(Lsn(0x10), image1)], // image layers
-                Lsn(0x50),
-            )
-            .await?;
-
-        assert_eq!(
-            tline.get(get_key(1), Lsn(0x50), &ctx).await?,
-            Bytes::from_static(b"0x10,0x20,0x30")
-        );
-        assert_eq!(
-            tline.get(get_key(2), Lsn(0x50), &ctx).await?,
-            Bytes::from_static(b"0x10,0x20,0x30")
-        );
-        // assert_eq!(tline.get(get_key(3), Lsn(0x50), &ctx).await?, Bytes::new());
-        // assert_eq!(tline.get(get_key(4), Lsn(0x50), &ctx).await?, Bytes::new());

        Ok(())
    }
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -16,7 +16,6 @@ use crate::{
    task_mgr::{self, TaskKind},
    tenant::{
        mgr::{TenantSlot, TenantsMapRemoveResult},
-        remote_timeline_client::remote_heatmap_path,
        timeline::ShutdownMode,
    },
 };
@@ -532,25 +531,6 @@ impl DeleteTenantFlow {
            }
        }

-        // Remove top-level tenant objects that don't belong to a timeline, such as heatmap
-        let heatmap_path = remote_heatmap_path(&tenant.tenant_shard_id());
-        if let Some(Err(e)) = backoff::retry(
-            || async {
-                remote_storage
-                    .delete(&heatmap_path, &task_mgr::shutdown_token())
-                    .await
-            },
-            TimeoutOrCancel::caused_by_cancel,
-            FAILED_UPLOAD_WARN_THRESHOLD,
-            FAILED_REMOTE_OP_RETRIES,
-            "remove_remote_tenant_heatmap",
-            &task_mgr::shutdown_token(),
-        )
-        .await
-        {
-            tracing::warn!("Failed to delete heatmap at {heatmap_path}: {e}");
-        }
-
        let timelines_path = conf.timelines_path(&tenant.tenant_shard_id);
        // May not exist if we fail in cleanup_remaining_fs_traces after removing it
        if timelines_path.exists() {
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -1,23 +1,15 @@
-//! Describes the legacy now hopefully no longer modified per-timeline metadata stored in
-//! `index_part.json` managed by [`remote_timeline_client`]. For many tenants and their timelines,
-//! this struct and it's original serialization format is still needed because they were written a
-//! long time ago.
+//! Every image of a certain timeline from [`crate::tenant::Tenant`]
+//! has metadata that needs to be stored persistently.
 //!
-//! Instead of changing and adding versioning to this, just change [`IndexPart`] with soft json
-//! versioning.
+//! Later, the file gets used in [`remote_timeline_client`] as a part of
+//! external storage import and export operations.
 //!
-//! To clean up this module we need to migrate all index_part.json files to a later version.
-//! While doing this, we need to be mindful about s3 based recovery as well, so it might take
-//! however long we keep the old versions to be able to delete the old code. After that, we can
-//! remove everything else than [`TimelineMetadataBodyV2`], rename it as `TimelineMetadata` and
-//! move it to `index.rs`. Before doing all of this, we need to keep the structures for backwards
-//! compatibility.
+//! The module contains all structs and helper methods related to timeline metadata.
 //!
 //! [`remote_timeline_client`]: super::remote_timeline_client
-//! [`IndexPart`]: super::remote_timeline_client::index::IndexPart

 use anyhow::ensure;
-use serde::{Deserialize, Serialize};
+use serde::{de::Error, Deserialize, Serialize, Serializer};
 use utils::bin_ser::SerializeError;
 use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn};

@@ -25,37 +17,17 @@ use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn};
 const METADATA_FORMAT_VERSION: u16 = 4;

 /// Previous supported format versions.
-///
-/// In practice, none of these should remain, all are [`METADATA_FORMAT_VERSION`], but confirming
-/// that requires a scrubber run which is yet to be done.
 const METADATA_OLD_FORMAT_VERSION: u16 = 3;

-/// When the file existed on disk we assumed that a write of up to METADATA_MAX_SIZE bytes is atomic.
+/// We assume that a write of up to METADATA_MAX_SIZE bytes is atomic.
 ///
 /// This is the same assumption that PostgreSQL makes with the control file,
-///
 /// see PG_CONTROL_MAX_SAFE_SIZE
 const METADATA_MAX_SIZE: usize = 512;

-/// Legacy metadata stored as a component of `index_part.json` per timeline.
+/// Metadata stored on disk for each timeline
 ///
-/// Do not make new changes to this type or the module. In production, we have two different kinds
-/// of serializations of this type: bincode and json. Bincode version reflects what used to be
-/// stored on disk in earlier versions and does internal crc32 checksumming.
-///
-/// This type should not implement `serde::Serialize` or `serde::Deserialize` because there would
-/// be a confusion whether you want the old version ([`TimelineMetadata::from_bytes`]) or the modern
-/// as-exists in `index_part.json` ([`self::modern_serde`]).
-///
-/// ```compile_fail
-/// #[derive(serde::Serialize)]
-/// struct DoNotDoThis(pageserver::tenant::metadata::TimelineMetadata);
-/// ```
-///
-/// ```compile_fail
-/// #[derive(serde::Deserialize)]
-/// struct NeitherDoThis(pageserver::tenant::metadata::TimelineMetadata);
-/// ```
+/// The fields correspond to the values we hold in memory, in Timeline.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct TimelineMetadata {
    hdr: TimelineMetadataHeader,
@@ -68,49 +40,6 @@ struct TimelineMetadataHeader {
    size: u16,           // size of serialized metadata
    format_version: u16, // metadata format version (used for compatibility checks)
 }
-
-impl TryFrom<&TimelineMetadataBodyV2> for TimelineMetadataHeader {
-    type Error = Crc32CalculationFailed;
-
-    fn try_from(value: &TimelineMetadataBodyV2) -> Result<Self, Self::Error> {
-        #[derive(Default)]
-        struct Crc32Sink {
-            crc: u32,
-            count: usize,
-        }
-
-        impl std::io::Write for Crc32Sink {
-            fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
-                self.crc = crc32c::crc32c_append(self.crc, buf);
-                self.count += buf.len();
-                Ok(buf.len())
-            }
-
-            fn flush(&mut self) -> std::io::Result<()> {
-                Ok(())
-            }
-        }
-
-        // jump through hoops to calculate the crc32 so that TimelineMetadata::ne works
-        // across serialization versions
-        let mut sink = Crc32Sink::default();
-        <TimelineMetadataBodyV2 as utils::bin_ser::BeSer>::ser_into(value, &mut sink)
-            .map_err(Crc32CalculationFailed)?;
-
-        let size = METADATA_HDR_SIZE + sink.count;
-
-        Ok(TimelineMetadataHeader {
-            checksum: sink.crc,
-            size: size as u16,
-            format_version: METADATA_FORMAT_VERSION,
-        })
-    }
-}
-
-#[derive(thiserror::Error, Debug)]
-#[error("re-serializing for crc32 failed")]
-struct Crc32CalculationFailed(#[source] utils::bin_ser::SerializeError);
-
 const METADATA_HDR_SIZE: usize = std::mem::size_of::<TimelineMetadataHeader>();

 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -182,12 +111,6 @@ impl TimelineMetadata {
        }
    }

-    #[cfg(test)]
-    pub(crate) fn with_recalculated_checksum(mut self) -> anyhow::Result<Self> {
-        self.hdr = TimelineMetadataHeader::try_from(&self.body)?;
-        Ok(self)
-    }
-
    fn upgrade_timeline_metadata(metadata_bytes: &[u8]) -> anyhow::Result<Self> {
        let mut hdr = TimelineMetadataHeader::des(&metadata_bytes[0..METADATA_HDR_SIZE])?;

@@ -338,93 +261,25 @@ impl TimelineMetadata {
    }
 }

-pub(crate) mod modern_serde {
-    use super::{TimelineMetadata, TimelineMetadataBodyV2, TimelineMetadataHeader};
-    use serde::{Deserialize, Serialize};
-
-    pub(crate) fn deserialize<'de, D>(deserializer: D) -> Result<TimelineMetadata, D::Error>
+impl<'de> Deserialize<'de> for TimelineMetadata {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
-        D: serde::de::Deserializer<'de>,
+        D: serde::Deserializer<'de>,
    {
-        // for legacy reasons versions 1-5 had TimelineMetadata serialized as a Vec<u8> field with
-        // BeSer.
-        struct Visitor;
-
-        impl<'d> serde::de::Visitor<'d> for Visitor {
-            type Value = TimelineMetadata;
-
-            fn expecting(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
-                f.write_str("BeSer bytes or json structure")
-            }
-
-            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
-            where
-                A: serde::de::SeqAccess<'d>,
-            {
-                use serde::de::Error;
-                let de = serde::de::value::SeqAccessDeserializer::new(seq);
-                Vec::<u8>::deserialize(de)
-                    .map(|v| TimelineMetadata::from_bytes(&v).map_err(A::Error::custom))?
-            }
-
-            fn visit_map<A>(self, map: A) -> Result<Self::Value, A::Error>
-            where
-                A: serde::de::MapAccess<'d>,
-            {
-                use serde::de::Error;
-
-                let de = serde::de::value::MapAccessDeserializer::new(map);
-                let body = TimelineMetadataBodyV2::deserialize(de)?;
-                let hdr = TimelineMetadataHeader::try_from(&body).map_err(A::Error::custom)?;
-
-                Ok(TimelineMetadata { hdr, body })
-            }
-        }
-
-        deserializer.deserialize_any(Visitor)
+        let bytes = Vec::<u8>::deserialize(deserializer)?;
+        Self::from_bytes(bytes.as_slice()).map_err(|e| D::Error::custom(format!("{e}")))
    }
+}

-    pub(crate) fn serialize<S>(
-        metadata: &TimelineMetadata,
-        serializer: S,
-    ) -> Result<S::Ok, S::Error>
+impl Serialize for TimelineMetadata {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
-        S: serde::Serializer,
+        S: Serializer,
    {
-        // header is not needed, upon reading we've upgraded all v1 to v2
-        metadata.body.serialize(serializer)
-    }
-
-    #[test]
-    fn deserializes_bytes_as_well_as_equivalent_body_v2() {
-        #[derive(serde::Deserialize, serde::Serialize)]
-        struct Wrapper(
-            #[serde(deserialize_with = "deserialize", serialize_with = "serialize")]
-            TimelineMetadata,
-        );
-
-        let too_many_bytes = "[216,111,252,208,0,54,0,4,0,0,0,0,1,73,253,144,1,0,0,0,0,1,73,253,24,0,0,0,0,0,0,0,0,0,0,0,0,0,1,73,253,24,0,0,0,0,1,73,253,24,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]";
-
-        let wrapper_from_bytes = serde_json::from_str::<Wrapper>(too_many_bytes).unwrap();
-
-        let serialized = serde_json::to_value(&wrapper_from_bytes).unwrap();
-
-        assert_eq!(
-            serialized,
-            serde_json::json! {{
-                "disk_consistent_lsn": "0/149FD90",
-                "prev_record_lsn": "0/149FD18",
-                "ancestor_timeline": null,
-                "ancestor_lsn": "0/0",
-                "latest_gc_cutoff_lsn": "0/149FD18",
-                "initdb_lsn": "0/149FD18",
-                "pg_version": 15
-            }}
-        );
-
-        let wrapper_from_json = serde_json::value::from_value::<Wrapper>(serialized).unwrap();
-
-        assert_eq!(wrapper_from_bytes.0, wrapper_from_json.0);
+        let bytes = self
+            .to_bytes()
+            .map_err(|e| serde::ser::Error::custom(format!("{e}")))?;
+        bytes.serialize(serializer)
    }
 }

@@ -548,6 +403,59 @@ mod tests {
        );
    }

+    #[test]
+    fn test_metadata_bincode_serde() {
+        let original_metadata = TimelineMetadata::new(
+            Lsn(0x200),
+            Some(Lsn(0x100)),
+            Some(TIMELINE_ID),
+            Lsn(0),
+            Lsn(0),
+            Lsn(0),
+            // Any version will do here, so use the default
+            crate::DEFAULT_PG_VERSION,
+        );
+        let metadata_bytes = original_metadata
+            .to_bytes()
+            .expect("Cannot create bytes array from metadata");
+
+        let metadata_bincode_be_bytes = original_metadata
+            .ser()
+            .expect("Cannot serialize the metadata");
+
+        // 8 bytes for the length of the vector
+        assert_eq!(metadata_bincode_be_bytes.len(), 8 + metadata_bytes.len());
+
+        let expected_bincode_bytes = {
+            let mut temp = vec![];
+            let len_bytes = metadata_bytes.len().to_be_bytes();
+            temp.extend_from_slice(&len_bytes);
+            temp.extend_from_slice(&metadata_bytes);
+            temp
+        };
+        assert_eq!(metadata_bincode_be_bytes, expected_bincode_bytes);
+
+        let deserialized_metadata = TimelineMetadata::des(&metadata_bincode_be_bytes).unwrap();
+        // Deserialized metadata has the metadata header, which is different from the serialized one.
+        //   Reference: TimelineMetadata::to_bytes()
+        let expected_metadata = {
+            let mut temp_metadata = original_metadata;
+            let body_bytes = temp_metadata
+                .body
+                .ser()
+                .expect("Cannot serialize the metadata body");
+            let metadata_size = METADATA_HDR_SIZE + body_bytes.len();
+            let hdr = TimelineMetadataHeader {
+                size: metadata_size as u16,
+                format_version: METADATA_FORMAT_VERSION,
+                checksum: crc32c::crc32c(&body_bytes),
+            };
+            temp_metadata.hdr = hdr;
+            temp_metadata
+        };
+        assert_eq!(deserialized_metadata, expected_metadata);
+    }
+
    #[test]
    fn test_metadata_bincode_serde_ensure_roundtrip() {
        let original_metadata = TimelineMetadata::new(
@@ -561,6 +469,8 @@ mod tests {
            crate::DEFAULT_PG_VERSION,
        );
        let expected_bytes = vec![
+            /* bincode length encoding bytes */
+            0, 0, 0, 0, 0, 0, 2, 0, // 8 bytes for the length of the serialized vector
            /* TimelineMetadataHeader */
            4, 37, 101, 34, 0, 70, 0, 4, // checksum, size, format_version (4 + 2 + 2)
            /* TimelineMetadataBodyV2 */
@@ -590,7 +500,7 @@ mod tests {
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0,
        ];
-        let metadata_ser_bytes = original_metadata.to_bytes().unwrap();
+        let metadata_ser_bytes = original_metadata.ser().unwrap();
        assert_eq!(metadata_ser_bytes, expected_bytes);

        let expected_metadata = {
@@ -608,7 +518,7 @@ mod tests {
            temp_metadata.hdr = hdr;
            temp_metadata
        };
-        let des_metadata = TimelineMetadata::from_bytes(&metadata_ser_bytes).unwrap();
+        let des_metadata = TimelineMetadata::des(&metadata_ser_bytes).unwrap();
        assert_eq!(des_metadata, expected_metadata);
    }
 }
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -3,7 +3,6 @@

 use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
 use futures::StreamExt;
-use hyper::StatusCode;
 use itertools::Itertools;
 use pageserver_api::key::Key;
 use pageserver_api::models::LocationConfigMode;
@@ -55,7 +54,6 @@ use utils::generation::Generation;
 use utils::id::{TenantId, TimelineId};

 use super::delete::DeleteTenantError;
-use super::remote_timeline_client::remote_tenant_path;
 use super::secondary::SecondaryTenant;
 use super::timeline::detach_ancestor::PreparedTimelineDetach;
 use super::TenantSharedResources;
@@ -1371,7 +1369,7 @@ impl TenantManager {
        &self,
        tenant_shard_id: TenantShardId,
        activation_timeout: Duration,
-    ) -> Result<StatusCode, DeleteTenantError> {
+    ) -> Result<(), DeleteTenantError> {
        super::span::debug_assert_current_span_has_tenant_id();
        // We acquire a SlotGuard during this function to protect against concurrent
        // changes while the ::prepare phase of DeleteTenantFlow executes, but then
@@ -1384,79 +1382,18 @@ impl TenantManager {
        //
        // See https://github.com/neondatabase/neon/issues/5080

-        // Tenant deletion can happen two ways:
-        // - Legacy: called on an attached location. The attached Tenant object stays alive in Stopping
-        //   state until deletion is complete.
-        // - New: called on a pageserver without an attached location.  We proceed with deletion from
-        //   remote storage.
-        //
-        // See https://github.com/neondatabase/neon/issues/5080 for more context on this transition.
+        let slot_guard =
+            tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?;

-        let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
-        match &slot_guard.old_value {
-            Some(TenantSlot::Attached(tenant)) => {
-                // Legacy deletion flow: the tenant remains attached, goes to Stopping state, and
-                // deletion will be resumed across restarts.
-                let tenant = tenant.clone();
-                return self
-                    .delete_tenant_attached(slot_guard, tenant, activation_timeout)
-                    .await;
+        // unwrap is safe because we used MustExist mode when acquiring
+        let tenant = match slot_guard.get_old_value().as_ref().unwrap() {
+            TenantSlot::Attached(tenant) => tenant.clone(),
+            _ => {
+                // Express "not attached" as equivalent to "not found"
+                return Err(DeleteTenantError::NotAttached);
            }
-            Some(TenantSlot::Secondary(secondary_tenant)) => {
-                secondary_tenant.shutdown().await;
-                let local_tenant_directory = self.conf.tenant_path(&tenant_shard_id);
-                let tmp_dir = safe_rename_tenant_dir(&local_tenant_directory)
-                    .await
-                    .with_context(|| {
-                        format!("local tenant directory {local_tenant_directory:?} rename")
-                    })?;
-                spawn_background_purge(tmp_dir);
-            }
-            Some(TenantSlot::InProgress(_)) => unreachable!(),
-            None => {}
        };

-        // Fall through: local state for this tenant is no longer present, proceed with remote delete
-        let remote_path = remote_tenant_path(&tenant_shard_id);
-        let keys = match self
-            .resources
-            .remote_storage
-            .list(
-                Some(&remote_path),
-                remote_storage::ListingMode::NoDelimiter,
-                None,
-                &self.cancel,
-            )
-            .await
-        {
-            Ok(listing) => listing.keys,
-            Err(remote_storage::DownloadError::Cancelled) => {
-                return Err(DeleteTenantError::Cancelled)
-            }
-            Err(remote_storage::DownloadError::NotFound) => return Ok(StatusCode::NOT_FOUND),
-            Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))),
-        };
-
-        if keys.is_empty() {
-            tracing::info!("Remote storage already deleted");
-        } else {
-            tracing::info!("Deleting {} keys from remote storage", keys.len());
-            self.resources
-                .remote_storage
-                .delete_objects(&keys, &self.cancel)
-                .await?;
-        }
-
-        // Callers use 404 as success for deletions, for historical reasons.
-        Ok(StatusCode::NOT_FOUND)
-    }
-
-    async fn delete_tenant_attached(
-        &self,
-        slot_guard: SlotGuard,
-        tenant: Arc<Tenant>,
-        activation_timeout: Duration,
-    ) -> Result<StatusCode, DeleteTenantError> {
        match tenant.current_state() {
            TenantState::Broken { .. } | TenantState::Stopping { .. } => {
                // If deletion is already in progress, return success (the semantics of this
@@ -1466,7 +1403,7 @@ impl TenantManager {
                    // The `delete_progress` lock is held: deletion is already happening
                    // in the bacckground
                    slot_guard.revert();
-                    return Ok(StatusCode::ACCEPTED);
+                    return Ok(());
                }
            }
            _ => {
@@ -1499,8 +1436,7 @@ impl TenantManager {

        // The Tenant goes back into the map in Stopping state, it will eventually be removed by DeleteTenantFLow
        slot_guard.revert();
-        let () = result?;
-        Ok(StatusCode::ACCEPTED)
+        result
    }

    #[instrument(skip_all, fields(tenant_id=%tenant.get_tenant_shard_id().tenant_id, shard_id=%tenant.get_tenant_shard_id().shard_slug(), new_shard_count=%new_shard_count.literal()))]
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -91,7 +91,8 @@
 //!
 //! The *actual* remote state lags behind the *desired* remote state while
 //! there are in-flight operations.
-//! We keep track of the desired remote state in [`UploadQueueInitialized::dirty`].
+//! We keep track of the desired remote state in
+//! [`UploadQueueInitialized::latest_files`] and [`UploadQueueInitialized::latest_metadata`].
 //! It is initialized based on the [`IndexPart`] that was passed during init
 //! and updated with every `schedule_*` function call.
 //! All this is necessary necessary to compute the future [`IndexPart`]s
@@ -114,7 +115,8 @@
 //!
 //! # Completion
 //!
-//! Once an operation has completed, we update [`UploadQueueInitialized::clean`] immediately,
+//! Once an operation has completed, we update
+//! [`UploadQueueInitialized::projected_remote_consistent_lsn`] immediately,
 //! and submit a request through the DeletionQueue to update
 //! [`UploadQueueInitialized::visible_remote_consistent_lsn`] after it has
 //! validated that our generation is not stale.  It is this visible value
@@ -414,7 +416,6 @@ impl RemoteTimelineClient {
        Ok(())
    }

-    /// Returns `None` if nothing is yet uplodaded, `Some(disk_consistent_lsn)` otherwise.
    pub fn remote_consistent_lsn_projected(&self) -> Option<Lsn> {
        match &mut *self.upload_queue.lock().unwrap() {
            UploadQueue::Uninitialized => None,
@@ -441,11 +442,13 @@ impl RemoteTimelineClient {
    /// Returns true if this timeline was previously detached at this Lsn and the remote timeline
    /// client is currently initialized.
    pub(crate) fn is_previous_ancestor_lsn(&self, lsn: Lsn) -> bool {
+        // technically this is a dirty read, but given how timeline detach ancestor is implemented
+        // via tenant restart, the lineage has always been uploaded.
        self.upload_queue
            .lock()
            .unwrap()
            .initialized_mut()
-            .map(|uq| uq.clean.0.lineage.is_previous_ancestor_lsn(lsn))
+            .map(|uq| uq.latest_lineage.is_previous_ancestor_lsn(lsn))
            .unwrap_or(false)
    }

@@ -454,6 +457,7 @@ impl RemoteTimelineClient {
            current_remote_index_part
                .layer_metadata
                .values()
+                // Sum the recorded file size of every layer listed in the index part.
                .map(|ilmd| ilmd.file_size)
                .sum()
        } else {
@@ -581,9 +585,9 @@ impl RemoteTimelineClient {

        // As documented in the struct definition, it's ok for latest_metadata to be
        // ahead of what's _actually_ on the remote during index upload.
-        upload_queue.dirty.metadata = metadata.clone();
+        upload_queue.latest_metadata = metadata.clone();

-        self.schedule_index_upload(upload_queue)?;
+        self.schedule_index_upload(upload_queue);

        Ok(())
    }
@@ -602,9 +606,9 @@ impl RemoteTimelineClient {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = guard.initialized_mut()?;

-        upload_queue.dirty.metadata.apply(update);
+        upload_queue.latest_metadata.apply(update);

-        self.schedule_index_upload(upload_queue)?;
+        self.schedule_index_upload(upload_queue);

        Ok(())
    }
@@ -616,8 +620,8 @@ impl RemoteTimelineClient {
    ) -> anyhow::Result<()> {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = guard.initialized_mut()?;
-        upload_queue.dirty.last_aux_file_policy = last_aux_file_policy;
-        self.schedule_index_upload(upload_queue)?;
+        upload_queue.last_aux_file_policy = last_aux_file_policy;
+        self.schedule_index_upload(upload_queue);
        Ok(())
    }
    ///
@@ -635,44 +639,30 @@ impl RemoteTimelineClient {
        let upload_queue = guard.initialized_mut()?;

        if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
-            self.schedule_index_upload(upload_queue)?;
+            self.schedule_index_upload(upload_queue);
        }

        Ok(())
    }

    /// Launch an index-file upload operation in the background (internal function)
-    fn schedule_index_upload(
-        self: &Arc<Self>,
-        upload_queue: &mut UploadQueueInitialized,
-    ) -> anyhow::Result<()> {
-        let disk_consistent_lsn = upload_queue.dirty.metadata.disk_consistent_lsn();
-        // fix up the duplicated field
-        upload_queue.dirty.disk_consistent_lsn = disk_consistent_lsn;
-
-        // make sure it serializes before doing it in perform_upload_task so that it doesn't
-        // look like a retryable error
-        let void = std::io::sink();
-        serde_json::to_writer(void, &upload_queue.dirty).context("serialize index_part.json")?;
-
-        let index_part = &upload_queue.dirty;
+    fn schedule_index_upload(self: &Arc<Self>, upload_queue: &mut UploadQueueInitialized) {
+        let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();

        info!(
            "scheduling metadata upload up to consistent LSN {disk_consistent_lsn} with {} files ({} changed)",
-            index_part.layer_metadata.len(),
+            upload_queue.latest_files.len(),
            upload_queue.latest_files_changes_since_metadata_upload_scheduled,
        );

-        let op = UploadOp::UploadMetadata {
-            uploaded: Box::new(index_part.clone()),
-        };
+        let index_part = IndexPart::from(&*upload_queue);
+        let op = UploadOp::UploadMetadata(Box::new(index_part), disk_consistent_lsn);
        self.metric_begin(&op);
        upload_queue.queued_operations.push_back(op);
        upload_queue.latest_files_changes_since_metadata_upload_scheduled = 0;

        // Launch the task immediately, if possible
        self.launch_queued_tasks(upload_queue);
-        Ok(())
    }

    pub(crate) async fn schedule_reparenting_and_wait(
@@ -685,16 +675,16 @@ impl RemoteTimelineClient {
            let mut guard = self.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut()?;

-            let Some(prev) = upload_queue.dirty.metadata.ancestor_timeline() else {
+            let Some(prev) = upload_queue.latest_metadata.ancestor_timeline() else {
                return Err(anyhow::anyhow!(
                    "cannot reparent without a current ancestor"
                ));
            };

-            upload_queue.dirty.metadata.reparent(new_parent);
-            upload_queue.dirty.lineage.record_previous_ancestor(&prev);
+            upload_queue.latest_metadata.reparent(new_parent);
+            upload_queue.latest_lineage.record_previous_ancestor(&prev);

-            self.schedule_index_upload(upload_queue)?;
+            self.schedule_index_upload(upload_queue);

            self.schedule_barrier0(upload_queue)
        };
@@ -715,17 +705,16 @@ impl RemoteTimelineClient {
            let mut guard = self.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut()?;

-            upload_queue.dirty.metadata.detach_from_ancestor(&adopted);
-            upload_queue.dirty.lineage.record_detaching(&adopted);
+            upload_queue.latest_metadata.detach_from_ancestor(&adopted);
+            upload_queue.latest_lineage.record_detaching(&adopted);

            for layer in layers {
                upload_queue
-                    .dirty
-                    .layer_metadata
+                    .latest_files
                    .insert(layer.layer_desc().layer_name(), layer.metadata());
            }

-            self.schedule_index_upload(upload_queue)?;
+            self.schedule_index_upload(upload_queue);

            let barrier = self.schedule_barrier0(upload_queue);
            self.launch_queued_tasks(upload_queue);
@@ -757,8 +746,7 @@ impl RemoteTimelineClient {
        let metadata = layer.metadata();

        upload_queue
-            .dirty
-            .layer_metadata
+            .latest_files
            .insert(layer.layer_desc().layer_name(), metadata.clone());
        upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;

@@ -788,8 +776,8 @@ impl RemoteTimelineClient {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = guard.initialized_mut()?;

-        let with_metadata = self
-            .schedule_unlinking_of_layers_from_index_part0(upload_queue, names.iter().cloned())?;
+        let with_metadata =
+            self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names.iter().cloned());

        self.schedule_deletion_of_unlinked0(upload_queue, with_metadata);

@@ -813,7 +801,7 @@ impl RemoteTimelineClient {

        let names = gc_layers.iter().map(|x| x.layer_desc().layer_name());

-        self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names)?;
+        self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names);

        self.launch_queued_tasks(upload_queue);

@@ -826,7 +814,7 @@ impl RemoteTimelineClient {
        self: &Arc<Self>,
        upload_queue: &mut UploadQueueInitialized,
        names: I,
-    ) -> anyhow::Result<Vec<(LayerName, LayerFileMetadata)>>
+    ) -> Vec<(LayerName, LayerFileMetadata)>
    where
        I: IntoIterator<Item = LayerName>,
    {
@@ -836,7 +824,7 @@ impl RemoteTimelineClient {
        let with_metadata: Vec<_> = names
            .into_iter()
            .filter_map(|name| {
-                let meta = upload_queue.dirty.layer_metadata.remove(&name);
+                let meta = upload_queue.latest_files.remove(&name);

                if let Some(meta) = meta {
                    upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
@@ -868,10 +856,10 @@ impl RemoteTimelineClient {
        // index_part update, because that needs to be uploaded before we can actually delete the
        // files.
        if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
-            self.schedule_index_upload(upload_queue)?;
+            self.schedule_index_upload(upload_queue);
        }

-        Ok(with_metadata)
+        with_metadata
    }

    /// Schedules deletion for layer files which have previously been unlinked from the
@@ -962,7 +950,7 @@ impl RemoteTimelineClient {

        let names = compacted_from.iter().map(|x| x.layer_desc().layer_name());

-        self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names)?;
+        self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names);
        self.launch_queued_tasks(upload_queue);

        Ok(())
@@ -1097,7 +1085,7 @@ impl RemoteTimelineClient {
            let deleted_at = Utc::now().naive_utc();
            stopped.deleted_at = SetDeletedFlagProgress::InProgress(deleted_at);

-            let mut index_part = stopped.upload_queue_for_deletion.dirty.clone();
+            let mut index_part = IndexPart::from(&stopped.upload_queue_for_deletion);
            index_part.deleted_at = Some(deleted_at);
            index_part
        };
@@ -1308,8 +1296,7 @@ impl RemoteTimelineClient {

            stopped
                .upload_queue_for_deletion
-                .dirty
-                .layer_metadata
+                .latest_files
                .drain()
                .map(|(file_name, meta)| {
                    remote_layer_path(
@@ -1446,7 +1433,7 @@ impl RemoteTimelineClient {
                    // Can always be scheduled.
                    true
                }
-                UploadOp::UploadMetadata { .. } => {
+                UploadOp::UploadMetadata(_, _) => {
                    // These can only be performed after all the preceding operations
                    // have finished.
                    upload_queue.inprogress_tasks.is_empty()
@@ -1488,7 +1475,7 @@ impl RemoteTimelineClient {
                UploadOp::UploadLayer(_, _) => {
                    upload_queue.num_inprogress_layer_uploads += 1;
                }
-                UploadOp::UploadMetadata { .. } => {
+                UploadOp::UploadMetadata(_, _) => {
                    upload_queue.num_inprogress_metadata_uploads += 1;
                }
                UploadOp::Delete(_) => {
@@ -1597,13 +1584,22 @@ impl RemoteTimelineClient {
                    )
                    .await
                }
-                UploadOp::UploadMetadata { ref uploaded } => {
+                UploadOp::UploadMetadata(ref index_part, _lsn) => {
+                    let mention_having_future_layers = if cfg!(feature = "testing") {
+                        index_part
+                            .layer_metadata
+                            .keys()
+                            .any(|x| x.is_in_future(*_lsn))
+                    } else {
+                        false
+                    };
+
                    let res = upload::upload_index_part(
                        &self.storage_impl,
                        &self.tenant_shard_id,
                        &self.timeline_id,
                        self.generation,
-                        uploaded,
+                        index_part,
                        &self.cancel,
                    )
                    .measure_remote_op(
@@ -1613,21 +1609,10 @@ impl RemoteTimelineClient {
                    )
                    .await;
                    if res.is_ok() {
-                        self.update_remote_physical_size_gauge(Some(uploaded));
-                        let mention_having_future_layers = if cfg!(feature = "testing") {
-                            uploaded
-                                .layer_metadata
-                                .keys()
-                                .any(|x| x.is_in_future(uploaded.metadata.disk_consistent_lsn()))
-                        } else {
-                            false
-                        };
+                        self.update_remote_physical_size_gauge(Some(index_part));
                        if mention_having_future_layers {
                            // find rationale near crate::tenant::timeline::init::cleanup_future_layer
-                            tracing::info!(
-                                disk_consistent_lsn = %uploaded.metadata.disk_consistent_lsn(),
-                                "uploaded an index_part.json with future layers -- this is ok! if shutdown now, expect future layer cleanup"
-                            );
+                            tracing::info!(disk_consistent_lsn=%_lsn, "uploaded an index_part.json with future layers -- this is ok! if shutdown now, expect future layer cleanup");
                        }
                    }
                    res
@@ -1728,23 +1713,11 @@ impl RemoteTimelineClient {
                    upload_queue.num_inprogress_layer_uploads -= 1;
                    None
                }
-                UploadOp::UploadMetadata { ref uploaded } => {
+                UploadOp::UploadMetadata(_, lsn) => {
                    upload_queue.num_inprogress_metadata_uploads -= 1;
+                    // XXX: should we check that `lsn` never goes backwards compared to the previous projected value?

-                    // the task id is reused as a monotonicity check for storing the "clean"
-                    // IndexPart.
-                    let last_updater = upload_queue.clean.1;
-                    let is_later = last_updater.is_some_and(|task_id| task_id < task.task_id);
-                    let monotone = is_later || last_updater.is_none();
-
-                    assert!(monotone, "no two index uploads should be completing at the same time, prev={last_updater:?}, task.task_id={}", task.task_id);
-
-                    // not taking ownership is wasteful
-                    upload_queue.clean.0.clone_from(uploaded);
-                    upload_queue.clean.1 = Some(task.task_id);
-
-                    let lsn = upload_queue.clean.0.metadata.disk_consistent_lsn();
-
+                    upload_queue.projected_remote_consistent_lsn = Some(lsn);
                    if self.generation.is_none() {
                        // Legacy mode: skip validating generation
                        upload_queue.visible_remote_consistent_lsn.store(lsn);
@@ -1798,7 +1771,7 @@ impl RemoteTimelineClient {
                RemoteOpKind::Upload,
                RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size),
            ),
-            UploadOp::UploadMetadata { .. } => (
+            UploadOp::UploadMetadata(_, _) => (
                RemoteOpFileKind::Index,
                RemoteOpKind::Upload,
                DontTrackSize {
@@ -1874,9 +1847,11 @@ impl RemoteTimelineClient {
                    // Deletion is not really perf sensitive so there shouldnt be any problems with cloning a fraction of it.
                    let upload_queue_for_deletion = UploadQueueInitialized {
                        task_counter: 0,
-                        dirty: initialized.dirty.clone(),
-                        clean: initialized.clean.clone(),
+                        latest_files: initialized.latest_files.clone(),
                        latest_files_changes_since_metadata_upload_scheduled: 0,
+                        latest_metadata: initialized.latest_metadata.clone(),
+                        latest_lineage: initialized.latest_lineage.clone(),
+                        projected_remote_consistent_lsn: None,
                        visible_remote_consistent_lsn: initialized
                            .visible_remote_consistent_lsn
                            .clone(),
@@ -1889,6 +1864,7 @@ impl RemoteTimelineClient {
                        dangling_files: HashMap::default(),
                        shutting_down: false,
                        shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
+                        last_aux_file_policy: initialized.last_aux_file_policy,
                    };

                    let upload_queue = std::mem::replace(
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -28,7 +28,6 @@ use crate::TEMP_FILE_SUFFIX;
 use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode, RemotePath};
 use utils::crashsafe::path_with_suffix_extension;
 use utils::id::{TenantId, TimelineId};
-use utils::pausable_failpoint;

 use super::index::{IndexPart, LayerFileMetadata};
 use super::{
@@ -153,8 +152,6 @@ async fn download_object<'a>(

                let download = storage.download(src_path, cancel).await?;

-                pausable_failpoint!("before-downloading-layer-stream-pausable");
-
                let mut buf_writer =
                    tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file);

@@ -202,8 +199,6 @@ async fn download_object<'a>(

                let mut download = storage.download(src_path, cancel).await?;

-                pausable_failpoint!("before-downloading-layer-stream-pausable");
-
                // TODO: use vectored write (writev) once supported by tokio-epoll-uring.
                // There's chunks_vectored() on the stream.
                let (bytes_amount, destination_file) = async {
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -11,6 +11,7 @@ use utils::id::TimelineId;

 use crate::tenant::metadata::TimelineMetadata;
 use crate::tenant::storage_layer::LayerName;
+use crate::tenant::upload_queue::UploadQueueInitialized;
 use crate::tenant::Generation;
 use pageserver_api::shard::ShardIndex;

@@ -38,19 +39,12 @@ pub struct IndexPart {
    /// that latest version stores.
    pub layer_metadata: HashMap<LayerName, LayerFileMetadata>,

-    /// Because of the trouble of eyeballing the legacy "metadata" field, we copied the
-    /// "disk_consistent_lsn" out. After version 7 this is no longer needed, but the name cannot be
-    /// reused.
-    pub(super) disk_consistent_lsn: Lsn,
+    // 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata.
+    // It's duplicated for convenience when reading the serialized structure, but is
+    // private because internally we would read from metadata instead.
+    disk_consistent_lsn: Lsn,

-    // TODO: rename as "metadata" next week, keep the alias = "metadata_bytes", bump version Adding
-    // the "alias = metadata" was forgotten in #7693, so we have to use "rewrite = metadata_bytes"
-    // for backwards compatibility.
-    #[serde(
-        rename = "metadata_bytes",
-        alias = "metadata",
-        with = "crate::tenant::metadata::modern_serde"
-    )]
+    #[serde(rename = "metadata_bytes")]
    pub metadata: TimelineMetadata,

    #[serde(default)]
@@ -79,33 +73,40 @@ impl IndexPart {
    /// - 4: timeline_layers is fully removed.
    /// - 5: lineage was added
    /// - 6: last_aux_file_policy is added.
-    /// - 7: metadata_bytes is no longer written, but still read
-    const LATEST_VERSION: usize = 7;
+    const LATEST_VERSION: usize = 6;

    // Versions we may see when reading from a bucket.
-    pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7];
+    pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6];

    pub const FILE_NAME: &'static str = "index_part.json";

-    pub(crate) fn empty(metadata: TimelineMetadata) -> Self {
-        IndexPart {
+    fn new(
+        layers_and_metadata: &HashMap<LayerName, LayerFileMetadata>,
+        disk_consistent_lsn: Lsn,
+        metadata: TimelineMetadata,
+        lineage: Lineage,
+        last_aux_file_policy: Option<AuxFilePolicy>,
+    ) -> Self {
+        let layer_metadata = layers_and_metadata.clone();
+
+        Self {
            version: Self::LATEST_VERSION,
-            layer_metadata: Default::default(),
-            disk_consistent_lsn: metadata.disk_consistent_lsn(),
+            layer_metadata,
+            disk_consistent_lsn,
            metadata,
            deleted_at: None,
-            lineage: Default::default(),
-            last_aux_file_policy: None,
+            lineage,
+            last_aux_file_policy,
        }
    }

-    pub fn version(&self) -> usize {
+    pub fn get_version(&self) -> usize {
        self.version
    }

    /// If you want this under normal operations, read it from self.metadata:
    /// this method is just for the scrubber to use when validating an index.
-    pub fn duplicated_disk_consistent_lsn(&self) -> Lsn {
+    pub fn get_disk_consistent_lsn(&self) -> Lsn {
        self.disk_consistent_lsn
    }

@@ -119,7 +120,14 @@ impl IndexPart {

    #[cfg(test)]
    pub(crate) fn example() -> Self {
-        Self::empty(TimelineMetadata::example())
+        let example_metadata = TimelineMetadata::example();
+        Self::new(
+            &HashMap::new(),
+            example_metadata.disk_consistent_lsn(),
+            example_metadata,
+            Default::default(),
+            Some(AuxFilePolicy::V1),
+        )
    }

    pub(crate) fn last_aux_file_policy(&self) -> Option<AuxFilePolicy> {
@@ -127,6 +135,22 @@ impl IndexPart {
    }
 }

+impl From<&UploadQueueInitialized> for IndexPart {
+    fn from(uq: &UploadQueueInitialized) -> Self {
+        let disk_consistent_lsn = uq.latest_metadata.disk_consistent_lsn();
+        let metadata = uq.latest_metadata.clone();
+        let lineage = uq.latest_lineage.clone();
+
+        Self::new(
+            &uq.latest_files,
+            disk_consistent_lsn,
+            metadata,
+            lineage,
+            uq.last_aux_file_policy,
+        )
+    }
+}
+
 /// Metadata gathered for each of the layer files.
 ///
 /// Fields have to be `Option`s because remote [`IndexPart`]'s can be from different version, which
@@ -212,18 +236,19 @@ impl Lineage {
    /// The queried lsn is most likely the basebackup lsn, and this answers question "is it allowed
    /// to start a read/write primary at this lsn".
    ///
-    /// Returns true if the Lsn was previously our branch point.
+    /// Returns true if the Lsn was previously a branch point.
    pub(crate) fn is_previous_ancestor_lsn(&self, lsn: Lsn) -> bool {
        self.original_ancestor
-            .is_some_and(|(_, ancestor_lsn, _)| ancestor_lsn == lsn)
+            .as_ref()
+            .is_some_and(|(_, ancestor_lsn, _)| lsn == *ancestor_lsn)
    }
 }

 #[cfg(test)]
 mod tests {
-    use super::*;
    use std::str::FromStr;
-    use utils::id::TimelineId;
+
+    use super::*;

    #[test]
    fn v1_indexpart_is_parsed() {
@@ -342,7 +367,8 @@ mod tests {
            ]),
            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
            metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
-            deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
+            deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
+                "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()),
            lineage: Lineage::default(),
            last_aux_file_policy: None,
        };
@@ -518,7 +544,8 @@ mod tests {
            ]),
            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
            metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
-            deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
+            deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
+                "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()),
            lineage: Lineage {
                reparenting_history_truncated: false,
                reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
@@ -531,60 +558,6 @@ mod tests {
        assert_eq!(part, expected);
    }

-    #[test]
-    fn v7_indexpart_is_parsed() {
-        let example = r#"{
-            "version": 7,
-            "layer_metadata":{
-                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
-                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
-            },
-            "disk_consistent_lsn":"0/16960E8",
-            "metadata": {
-                "disk_consistent_lsn": "0/16960E8",
-                "prev_record_lsn": "0/1696070",
-                "ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e",
-                "ancestor_lsn": "0/0",
-                "latest_gc_cutoff_lsn": "0/1696070",
-                "initdb_lsn": "0/1696070",
-                "pg_version": 14
-            },
-            "deleted_at": "2023-07-31T09:00:00.123"
-        }"#;
-
-        let expected = IndexPart {
-            version: 7,
-            layer_metadata: HashMap::from([
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
-                    file_size: 25600000,
-                    generation: Generation::none(),
-                    shard: ShardIndex::unsharded()
-                }),
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
-                    file_size: 9007199254741001,
-                    generation: Generation::none(),
-                    shard: ShardIndex::unsharded()
-                })
-            ]),
-            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
-            metadata: TimelineMetadata::new(
-                Lsn::from_str("0/16960E8").unwrap(),
-                Some(Lsn::from_str("0/1696070").unwrap()),
-                Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()),
-                Lsn::INVALID,
-                Lsn::from_str("0/1696070").unwrap(),
-                Lsn::from_str("0/1696070").unwrap(),
-                14,
-            ).with_recalculated_checksum().unwrap(),
-            deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
-            lineage: Default::default(),
-            last_aux_file_policy: Default::default(),
-        };
-
-        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
-        assert_eq!(part, expected);
-    }
-
    fn parse_naive_datetime(s: &str) -> NaiveDateTime {
        chrono::NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S.%f").unwrap()
    }
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -1,7 +1,6 @@
 //! Helper functions to upload files to remote storage with a RemoteStorage

 use anyhow::{bail, Context};
-use bytes::Bytes;
 use camino::Utf8Path;
 use fail::fail_point;
 use pageserver_api::shard::TenantShardId;
@@ -12,10 +11,10 @@ use tokio::io::AsyncSeekExt;
 use tokio_util::sync::CancellationToken;
 use utils::{backoff, pausable_failpoint};

-use super::index::IndexPart;
 use super::Generation;
 use crate::tenant::remote_timeline_client::{
-    remote_index_path, remote_initdb_archive_path, remote_initdb_preserved_archive_path,
+    index::IndexPart, remote_index_path, remote_initdb_archive_path,
+    remote_initdb_preserved_archive_path,
 };
 use remote_storage::{GenericRemoteStorage, RemotePath, TimeTravelError};
 use utils::id::{TenantId, TimelineId};
@@ -28,7 +27,7 @@ pub(crate) async fn upload_index_part<'a>(
    tenant_shard_id: &TenantShardId,
    timeline_id: &TimelineId,
    generation: Generation,
-    index_part: &IndexPart,
+    index_part: &'a IndexPart,
    cancel: &CancellationToken,
 ) -> anyhow::Result<()> {
    tracing::trace!("uploading new index part");
@@ -38,16 +37,16 @@ pub(crate) async fn upload_index_part<'a>(
    });
    pausable_failpoint!("before-upload-index-pausable");

-    // FIXME: this error comes too late
-    let serialized = index_part.to_s3_bytes()?;
-    let serialized = Bytes::from(serialized);
-
-    let index_part_size = serialized.len();
+    let index_part_bytes = index_part
+        .to_s3_bytes()
+        .context("serialize index part file into bytes")?;
+    let index_part_size = index_part_bytes.len();
+    let index_part_bytes = bytes::Bytes::from(index_part_bytes);

    let remote_path = remote_index_path(tenant_shard_id, timeline_id, generation);
    storage
        .upload_storage_object(
-            futures::stream::once(futures::future::ready(Ok(serialized))),
+            futures::stream::once(futures::future::ready(Ok(index_part_bytes))),
            index_part_size,
            &remote_path,
            cancel,
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -513,7 +513,7 @@ impl<'a> TenantDownloader<'a> {
        // cover our access to local storage.
        let Ok(_guard) = self.secondary_state.gate.enter() else {
            // Shutting down
-            return Err(UpdateError::Cancelled);
+            return Ok(());
        };

        let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
@@ -846,7 +846,7 @@ impl<'a> TenantDownloader<'a> {
        for layer in timeline.layers {
            if self.secondary_state.cancel.is_cancelled() {
                tracing::debug!("Cancelled -- dropping out of layer loop");
-                return Err(UpdateError::Cancelled);
+                return Ok(());
            }

            // Existing on-disk layers: just update their access time.
@@ -1000,7 +1000,7 @@ impl<'a> TenantDownloader<'a> {
            layer.name,
            layer.metadata.file_size
        );
-        let downloaded_bytes = download_layer_file(
+        let downloaded_bytes = match download_layer_file(
            self.conf,
            self.remote_storage,
            *tenant_shard_id,
@@ -1011,9 +1011,8 @@ impl<'a> TenantDownloader<'a> {
            &self.secondary_state.cancel,
            ctx,
        )
-        .await;
-
-        let downloaded_bytes = match downloaded_bytes {
+        .await
+        {
            Ok(bytes) => bytes,
            Err(DownloadError::NotFound) => {
                // A heatmap might be out of date and refer to a layer that doesn't exist any more.
--- a/pageserver/src/tenant/secondary/scheduler.rs
+++ b/pageserver/src/tenant/secondary/scheduler.rs
@@ -334,11 +334,8 @@ where

        let tenant_shard_id = job.get_tenant_shard_id();
        let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) {
-            tracing::info!(
-                tenant_id=%tenant_shard_id.tenant_id,
-                shard_id=%tenant_shard_id.shard_slug(),
-                "Command already running, waiting for it"
-            );
+            tracing::info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
+                           "Command already running, waiting for it");
            barrier
        } else {
            let running = self.spawn_now(job);
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -3,6 +3,7 @@ use std::collections::hash_map::Entry;
 use std::collections::{HashMap, HashSet};
 use std::sync::Arc;

+use anyhow::{bail, Context};
 use tokio::sync::oneshot::error::RecvError;
 use tokio::sync::Semaphore;
 use tokio_util::sync::CancellationToken;
@@ -10,7 +11,7 @@ use tokio_util::sync::CancellationToken;
 use crate::context::RequestContext;
 use crate::pgdatadir_mapping::CalculateLogicalSizeError;

-use super::{GcError, LogicalSizeCalculationCause, Tenant};
+use super::{LogicalSizeCalculationCause, Tenant};
 use crate::tenant::Timeline;
 use utils::id::TimelineId;
 use utils::lsn::Lsn;
@@ -42,40 +43,6 @@ pub struct SegmentMeta {
    pub kind: LsnKind,
 }

-#[derive(thiserror::Error, Debug)]
-pub(crate) enum CalculateSyntheticSizeError {
-    /// Something went wrong internally to the calculation of logical size at a particular branch point
-    #[error("Failed to calculated logical size on timeline {timeline_id} at {lsn}: {error}")]
-    LogicalSize {
-        timeline_id: TimelineId,
-        lsn: Lsn,
-        error: CalculateLogicalSizeError,
-    },
-
-    /// Something went wrong internally when calculating GC parameters at start of size calculation
-    #[error(transparent)]
-    GcInfo(GcError),
-
-    /// Totally unexpected errors, like panics joining a task
-    #[error(transparent)]
-    Fatal(anyhow::Error),
-
-    /// Tenant shut down while calculating size
-    #[error("Cancelled")]
-    Cancelled,
-}
-
-impl From<GcError> for CalculateSyntheticSizeError {
-    fn from(value: GcError) -> Self {
-        match value {
-            GcError::TenantCancelled | GcError::TimelineCancelled => {
-                CalculateSyntheticSizeError::Cancelled
-            }
-            other => CalculateSyntheticSizeError::GcInfo(other),
-        }
-    }
-}
-
 impl SegmentMeta {
    fn size_needed(&self) -> bool {
        match self.kind {
@@ -149,9 +116,12 @@ pub(super) async fn gather_inputs(
    cause: LogicalSizeCalculationCause,
    cancel: &CancellationToken,
    ctx: &RequestContext,
-) -> Result<ModelInputs, CalculateSyntheticSizeError> {
+) -> anyhow::Result<ModelInputs> {
    // refresh is needed to update gc related pitr_cutoff and horizon_cutoff
-    tenant.refresh_gc_info(cancel, ctx).await?;
+    tenant
+        .refresh_gc_info(cancel, ctx)
+        .await
+        .context("Failed to refresh gc_info before gathering inputs")?;

    // Collect information about all the timelines
    let mut timelines = tenant.list_timelines();
@@ -357,12 +327,6 @@ pub(super) async fn gather_inputs(
    )
    .await?;

-    if tenant.cancel.is_cancelled() {
-        // If we're shutting down, return an error rather than a sparse result that might include some
-        // timelines from before we started shutting down
-        return Err(CalculateSyntheticSizeError::Cancelled);
-    }
-
    Ok(ModelInputs {
        segments,
        timeline_inputs,
@@ -371,8 +335,9 @@ pub(super) async fn gather_inputs(

 /// Augment 'segments' with logical sizes
 ///
-/// This will leave segments' sizes as None if the Timeline associated with the segment is deleted concurrently
-/// (i.e. we cannot read its logical size at a particular LSN).
+/// Calculating these logical sizes will probably conflict with on-demand downloaded layers,
+/// or at least force them all to be downloaded.
+///
 async fn fill_logical_sizes(
    timelines: &[Arc<Timeline>],
    segments: &mut [SegmentMeta],
@@ -380,7 +345,7 @@ async fn fill_logical_sizes(
    logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>,
    cause: LogicalSizeCalculationCause,
    ctx: &RequestContext,
-) -> Result<(), CalculateSyntheticSizeError> {
+) -> anyhow::Result<()> {
    let timeline_hash: HashMap<TimelineId, Arc<Timeline>> = HashMap::from_iter(
        timelines
            .iter()
@@ -422,7 +387,7 @@ async fn fill_logical_sizes(
    }

    // Perform the size lookups
-    let mut have_any_error = None;
+    let mut have_any_error = false;
    while let Some(res) = joinset.join_next().await {
        // each of these come with Result<anyhow::Result<_>, JoinError>
        // because of spawn + spawn_blocking
@@ -433,36 +398,21 @@ async fn fill_logical_sizes(
            Err(join_error) => {
                // cannot really do anything, as this panic is likely a bug
                error!("task that calls spawn_ondemand_logical_size_calculation panicked: {join_error:#}");
-
-                have_any_error = Some(CalculateSyntheticSizeError::Fatal(
-                    anyhow::anyhow!(join_error)
-                        .context("task that calls spawn_ondemand_logical_size_calculation"),
-                ));
+                have_any_error = true;
            }
            Ok(Err(recv_result_error)) => {
                // cannot really do anything, as this panic is likely a bug
                error!("failed to receive logical size query result: {recv_result_error:#}");
-                have_any_error = Some(CalculateSyntheticSizeError::Fatal(
-                    anyhow::anyhow!(recv_result_error)
-                        .context("Receiving logical size query result"),
-                ));
+                have_any_error = true;
            }
            Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Err(error)))) => {
-                if matches!(error, CalculateLogicalSizeError::Cancelled) {
-                    // Skip this: it's okay if one timeline among many is shutting down while we
-                    // calculate inputs for the overall tenant.
-                    continue;
-                } else {
+                if !matches!(error, CalculateLogicalSizeError::Cancelled) {
                    warn!(
                        timeline_id=%timeline.timeline_id,
                        "failed to calculate logical size at {lsn}: {error:#}"
                    );
-                    have_any_error = Some(CalculateSyntheticSizeError::LogicalSize {
-                        timeline_id: timeline.timeline_id,
-                        lsn,
-                        error,
-                    });
                }
+                have_any_error = true;
            }
            Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Ok(size)))) => {
                debug!(timeline_id=%timeline.timeline_id, %lsn, size, "size calculated");
@@ -476,10 +426,10 @@ async fn fill_logical_sizes(
    // prune any keys not needed anymore; we record every used key and added key.
    logical_size_cache.retain(|key, _| sizes_needed.contains_key(key));

-    if let Some(error) = have_any_error {
+    if have_any_error {
        // we cannot complete this round, because we are missing data.
        // we have however cached all we were able to request calculation on.
-        return Err(error);
+        anyhow::bail!("failed to calculate some logical_sizes");
    }

    // Insert the looked up sizes to the Segments
@@ -493,28 +443,33 @@ async fn fill_logical_sizes(

        if let Some(Some(size)) = sizes_needed.get(&(timeline_id, lsn)) {
            seg.segment.size = Some(*size);
+        } else {
+            bail!("could not find size at {} in timeline {}", lsn, timeline_id);
        }
    }
    Ok(())
 }

 impl ModelInputs {
-    pub fn calculate_model(&self) -> tenant_size_model::StorageModel {
+    pub fn calculate_model(&self) -> anyhow::Result<tenant_size_model::StorageModel> {
        // Convert SegmentMetas into plain Segments
-        StorageModel {
+        let storage = StorageModel {
            segments: self
                .segments
                .iter()
                .map(|seg| seg.segment.clone())
                .collect(),
-        }
+        };
+
+        Ok(storage)
    }

    // calculate total project size
-    pub fn calculate(&self) -> u64 {
-        let storage = self.calculate_model();
+    pub fn calculate(&self) -> anyhow::Result<u64> {
+        let storage = self.calculate_model()?;
        let sizes = storage.calculate();
-        sizes.total_size
+
+        Ok(sizes.total_size)
    }
 }

@@ -701,7 +656,7 @@ fn verify_size_for_multiple_branches() {
 "#;
    let inputs: ModelInputs = serde_json::from_str(doc).unwrap();

-    assert_eq!(inputs.calculate(), 37_851_408);
+    assert_eq!(inputs.calculate().unwrap(), 37_851_408);
 }

 #[test]
@@ -756,7 +711,7 @@ fn verify_size_for_one_branch() {

    let model: ModelInputs = serde_json::from_str(doc).unwrap();

-    let res = model.calculate_model().calculate();
+    let res = model.calculate_model().unwrap().calculate();

    println!("calculated synthetic size: {}", res.total_size);
    println!("result: {:?}", serde_json::to_string(&res.segments));
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -318,7 +318,7 @@ pub(crate) struct LayerFringe {
 #[derive(Debug)]
 struct LayerKeyspace {
    layer: ReadableLayer,
-    target_keyspace: KeySpaceRandomAccum,
+    target_keyspace: Vec<KeySpace>,
 }

 impl LayerFringe {
@@ -342,13 +342,17 @@ impl LayerFringe {
                _,
                LayerKeyspace {
                    layer,
-                    mut target_keyspace,
+                    target_keyspace,
                },
-            )) => Some((
-                layer,
-                target_keyspace.consume_keyspace(),
-                read_desc.lsn_range,
-            )),
+            )) => {
+                let mut keyspace = KeySpaceRandomAccum::new();
+                for ks in target_keyspace {
+                    for part in ks.ranges {
+                        keyspace.add_range(part);
+                    }
+                }
+                Some((layer, keyspace.consume_keyspace(), read_desc.lsn_range))
+            }
            None => unreachable!("fringe internals are always consistent"),
        }
    }
@@ -363,18 +367,16 @@ impl LayerFringe {
        let entry = self.layers.entry(layer_id.clone());
        match entry {
            Entry::Occupied(mut entry) => {
-                entry.get_mut().target_keyspace.add_keyspace(keyspace);
+                entry.get_mut().target_keyspace.push(keyspace);
            }
            Entry::Vacant(entry) => {
                self.planned_reads_by_lsn.push(ReadDesc {
                    lsn_range,
                    layer_id: layer_id.clone(),
                });
-                let mut accum = KeySpaceRandomAccum::new();
-                accum.add_keyspace(keyspace);
                entry.insert(LayerKeyspace {
                    layer,
-                    target_keyspace: accum,
+                    target_keyspace: vec![keyspace],
                });
            }
        }
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -219,6 +219,7 @@ pub struct DeltaLayerInner {
    // values copied from summary
    index_start_blk: u32,
    index_root_blk: u32,
+    lsn_range: Range<Lsn>,

    file: VirtualFile,
    file_id: FileId,
@@ -477,23 +478,6 @@ impl DeltaLayerWriterInner {
        key_end: Key,
        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<ResidentLayer> {
-        let temp_path = self.path.clone();
-        let result = self.finish0(key_end, timeline, ctx).await;
-        if result.is_err() {
-            tracing::info!(%temp_path, "cleaning up temporary file after error during writing");
-            if let Err(e) = std::fs::remove_file(&temp_path) {
-                tracing::warn!(error=%e, %temp_path, "error cleaning up temporary layer file after error during writing");
-            }
-        }
-        result
-    }
-
-    async fn finish0(
-        self,
-        key_end: Key,
-        timeline: &Arc<Timeline>,
-        ctx: &RequestContext,
    ) -> anyhow::Result<ResidentLayer> {
        let index_start_blk =
            ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
@@ -667,11 +651,19 @@ impl DeltaLayerWriter {
        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
    ) -> anyhow::Result<ResidentLayer> {
-        self.inner
-            .take()
-            .unwrap()
-            .finish(key_end, timeline, ctx)
-            .await
+        let inner = self.inner.take().unwrap();
+        let temp_path = inner.path.clone();
+        let result = inner.finish(key_end, timeline, ctx).await;
+        // Delta layer files can be really large, so remove the temporary file if writing failed.
+        if result.is_err() {
+            tracing::warn!(
+                "Cleaning up temporary delta file {temp_path} after error during writing"
+            );
+            if let Err(e) = std::fs::remove_file(&temp_path) {
+                tracing::warn!("Error cleaning up temporary delta layer file {temp_path}: {e:?}")
+            }
+        }
+        result
    }
 }

@@ -784,6 +776,7 @@ impl DeltaLayerInner {
            file_id,
            index_start_blk: actual_summary.index_start_blk,
            index_root_blk: actual_summary.index_root_blk,
+            lsn_range: actual_summary.lsn_range,
            max_vectored_read_bytes,
        }))
    }
@@ -909,7 +902,7 @@ impl DeltaLayerInner {

        let reads = Self::plan_reads(
            &keyspace,
-            lsn_range.clone(),
+            lsn_range,
            data_end_offset,
            index_reader,
            planner,
@@ -922,50 +915,11 @@ impl DeltaLayerInner {
        self.do_reads_and_update_state(reads, reconstruct_state, ctx)
            .await;

-        reconstruct_state.on_lsn_advanced(&keyspace, lsn_range.start);
+        reconstruct_state.on_lsn_advanced(&keyspace, self.lsn_range.start);

        Ok(())
    }

-    /// Load all key-values in the delta layer, should be replaced by an iterator-based interface in the future.
-    #[cfg(test)]
-    pub(super) async fn load_key_values(
-        &self,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<Vec<(Key, Lsn, Value)>> {
-        let block_reader = FileBlockReader::new(&self.file, self.file_id);
-        let index_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
-            self.index_start_blk,
-            self.index_root_blk,
-            block_reader,
-        );
-        let mut result = Vec::new();
-        let mut stream =
-            Box::pin(self.stream_index_forwards(&index_reader, &[0; DELTA_KEY_SIZE], ctx));
-        let block_reader = FileBlockReader::new(&self.file, self.file_id);
-        let cursor = block_reader.block_cursor();
-        let mut buf = Vec::new();
-        while let Some(item) = stream.next().await {
-            let (key, lsn, pos) = item?;
-            // TODO: dedup code with get_reconstruct_value
-            // TODO: ctx handling and sharding
-            cursor
-                .read_blob_into_buf(pos.pos(), &mut buf, ctx)
-                .await
-                .with_context(|| {
-                    format!("Failed to read blob from virtual file {}", self.file.path)
-                })?;
-            let val = Value::des(&buf).with_context(|| {
-                format!(
-                    "Failed to deserialize file blob from virtual file {}",
-                    self.file.path
-                )
-            })?;
-            result.push((key, lsn, val));
-        }
-        Ok(result)
-    }
-
    async fn plan_reads<Reader>(
        keyspace: &KeySpace,
        lsn_range: Range<Lsn>,
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -485,34 +485,6 @@ impl ImageLayerInner {
        Ok(())
    }

-    /// Load all key-values in the delta layer, should be replaced by an iterator-based interface in the future.
-    #[cfg(test)]
-    pub(super) async fn load_key_values(
-        &self,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<Vec<(Key, Lsn, Value)>> {
-        let block_reader = FileBlockReader::new(&self.file, self.file_id);
-        let tree_reader =
-            DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader);
-        let mut result = Vec::new();
-        let mut stream = Box::pin(tree_reader.get_stream_from(&[0; KEY_SIZE], ctx));
-        let block_reader = FileBlockReader::new(&self.file, self.file_id);
-        let cursor = block_reader.block_cursor();
-        while let Some(item) = stream.next().await {
-            // TODO: dedup code with get_reconstruct_value
-            let (raw_key, offset) = item?;
-            let key = Key::from_slice(&raw_key[..KEY_SIZE]);
-            // TODO: ctx handling and sharding
-            let blob = cursor
-                .read_blob(offset, ctx)
-                .await
-                .with_context(|| format!("failed to read value from offset {}", offset))?;
-            let value = Bytes::from(blob);
-            result.push((key, self.lsn, Value::Image(value)));
-        }
-        Ok(result)
-    }
-
    /// Traverse the layer's index to build read operations on the overlap of the input keyspace
    /// and the keys in this layer.
    ///
@@ -945,57 +917,26 @@ impl Drop for ImageLayerWriter {

 #[cfg(test)]
 mod test {
-    use std::time::Duration;
-
    use bytes::Bytes;
    use pageserver_api::{
        key::Key,
        shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize},
    };
-    use utils::{
-        generation::Generation,
-        id::{TenantId, TimelineId},
-        lsn::Lsn,
-    };
+    use utils::{id::TimelineId, lsn::Lsn};

-    use crate::{
-        tenant::{config::TenantConf, harness::TenantHarness},
-        DEFAULT_PG_VERSION,
-    };
+    use crate::{tenant::harness::TenantHarness, DEFAULT_PG_VERSION};

    use super::ImageLayerWriter;

    #[tokio::test]
    async fn image_layer_rewrite() {
-        let tenant_conf = TenantConf {
-            gc_period: Duration::ZERO,
-            compaction_period: Duration::ZERO,
-            ..TenantConf::default()
-        };
-        let tenant_id = TenantId::generate();
-        let mut gen = Generation::new(0xdead0001);
-        let mut get_next_gen = || {
-            let ret = gen;
-            gen = gen.next();
-            ret
-        };
+        let harness = TenantHarness::create("test_image_layer_rewrite").unwrap();
+        let (tenant, ctx) = harness.load().await;
+
        // The LSN at which we will create an image layer to filter
        let lsn = Lsn(0xdeadbeef0000);
+
        let timeline_id = TimelineId::generate();
-
-        //
-        // Create an unsharded parent with a layer.
-        //
-
-        let harness = TenantHarness::create_custom(
-            "test_image_layer_rewrite--parent",
-            tenant_conf.clone(),
-            tenant_id,
-            ShardIdentity::unsharded(),
-            get_next_gen(),
-        )
-        .unwrap();
-        let (tenant, ctx) = harness.load().await;
        let timeline = tenant
            .create_test_timeline(timeline_id, lsn, DEFAULT_PG_VERSION, &ctx)
            .await
@@ -1030,47 +971,9 @@ mod test {
        };
        let original_size = resident.metadata().file_size;

-        //
-        // Create child shards and do the rewrite, exercising filter().
-        // TODO: abstraction in TenantHarness for splits.
-        //
-
        // Filter for various shards: this exercises cases like values at start of key range, end of key
        // range, middle of key range.
-        let shard_count = ShardCount::new(4);
-        for shard_number in 0..shard_count.count() {
-            //
-            // mimic the shard split
-            //
-            let shard_identity = ShardIdentity::new(
-                ShardNumber(shard_number),
-                shard_count,
-                ShardStripeSize(0x8000),
-            )
-            .unwrap();
-            let harness = TenantHarness::create_custom(
-                Box::leak(Box::new(format!(
-                    "test_image_layer_rewrite--child{}",
-                    shard_identity.shard_slug()
-                ))),
-                tenant_conf.clone(),
-                tenant_id,
-                shard_identity,
-                // NB: in reality, the shards would each fork off their own gen number sequence from the parent.
-                // But here, all we care about is that the gen number is unique.
-                get_next_gen(),
-            )
-            .unwrap();
-            let (tenant, ctx) = harness.load().await;
-            let timeline = tenant
-                .create_test_timeline(timeline_id, lsn, DEFAULT_PG_VERSION, &ctx)
-                .await
-                .unwrap();
-
-            //
-            // use filter() and make assertions
-            //
-
+        for shard_number in 0..4 {
            let mut filtered_writer = ImageLayerWriter::new(
                harness.conf,
                timeline_id,
@@ -1082,6 +985,15 @@ mod test {
            .await
            .unwrap();

+            // TenantHarness gave us an unsharded tenant, but we'll use a sharded ShardIdentity
+            // to exercise filter()
+            let shard_identity = ShardIdentity::new(
+                ShardNumber(shard_number),
+                ShardCount::new(4),
+                ShardStripeSize(0x8000),
+            )
+            .unwrap();
+
            let wrote_keys = resident
                .filter(&shard_identity, &mut filtered_writer, &ctx)
                .await
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -52,7 +52,7 @@ pub struct InMemoryLayer {

    /// Frozen layers have an exclusive end LSN.
    /// Writes are only allowed when this is `None`.
-    pub(crate) end_lsn: OnceLock<Lsn>,
+    end_lsn: OnceLock<Lsn>,

    /// Used for traversal path. Cached representation of the in-memory layer before frozen.
    local_path_str: Arc<str>,
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -277,10 +277,9 @@ impl Layer {

        let downloaded = resident.expect("just initialized");

-        // We never want to overwrite an existing file, so we use `RENAME_NOREPLACE`.
-        // TODO: this leaves the temp file in place if the rename fails, risking us running
-        // out of space. Should we clean it up here or does the calling context deal with this?
-        utils::fs_ext::rename_noreplace(temp_path.as_std_path(), owner.local_path().as_std_path())
+        // If the rename succeeds, the file is at the owner's expected local path.
+        // TODO: sync system call (presumably an fsync to make the rename durable -- confirm)
+        std::fs::rename(temp_path, owner.local_path())
            .with_context(|| format!("rename temporary file as correct path for {owner}"))?;

        Ok(ResidentLayer { downloaded, owner })
@@ -388,23 +387,6 @@ impl Layer {
            })
    }

-    /// Get all key/values in the layer. Should be replaced with an iterator-based API in the future.
-    #[cfg(test)]
-    pub(crate) async fn load_key_values(
-        &self,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<Vec<(Key, Lsn, crate::repository::Value)>> {
-        let layer = self
-            .0
-            .get_or_maybe_download(true, Some(ctx))
-            .await
-            .map_err(|err| match err {
-                DownloadError::DownloadCancelled => GetVectoredError::Cancelled,
-                other => GetVectoredError::Other(anyhow::anyhow!(other)),
-            })?;
-        layer.load_key_values(&self.0, ctx).await
-    }
-
    /// Download the layer if evicted.
    ///
    /// Will not error when the layer is already downloaded.
@@ -1774,20 +1756,6 @@ impl DownloadedLayer {
        }
    }

-    #[cfg(test)]
-    async fn load_key_values(
-        &self,
-        owner: &Arc<LayerInner>,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<Vec<(Key, Lsn, crate::repository::Value)>> {
-        use LayerKind::*;
-
-        match self.get(owner, ctx).await? {
-            Delta(d) => d.load_key_values(ctx).await,
-            Image(i) => i.load_key_values(ctx).await,
-        }
-    }
-
    async fn dump(&self, owner: &Arc<LayerInner>, ctx: &RequestContext) -> anyhow::Result<()> {
        use LayerKind::*;
        match self.get(owner, ctx).await? {
--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -815,7 +815,6 @@ async fn eviction_cancellation_on_drop() {
 /// A test case to remind you the cost of these structures. You can bump the size limit
 /// below if it is really necessary to add more fields to the structures.
 #[test]
-#[cfg(target_arch = "x86_64")]
 fn layer_size() {
    assert_eq!(std::mem::size_of::<LayerAccessStats>(), 2040);
    assert_eq!(std::mem::size_of::<PersistentLayerDesc>(), 104);
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1,4 +1,3 @@
-pub(crate) mod analysis;
 mod compaction;
 pub mod delete;
 pub(crate) mod detach_ancestor;
@@ -62,7 +61,6 @@ use std::{
    ops::ControlFlow,
 };

-use crate::metrics::GetKind;
 use crate::pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS;
 use crate::{
    aux_file::AuxFileSizeEstimator,
@@ -76,6 +74,7 @@ use crate::{
    disk_usage_eviction_task::DiskUsageEvictionInfo,
    pgdatadir_mapping::CollectKeySpaceError,
 };
+use crate::{deletion_queue::DeletionQueueClient, metrics::GetKind};
 use crate::{
    disk_usage_eviction_task::finite_f32,
    tenant::storage_layer::{
@@ -205,6 +204,7 @@ fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
 /// The outward-facing resources required to build a Timeline
 pub struct TimelineResources {
    pub remote_client: RemoteTimelineClient,
+    pub deletion_queue_client: DeletionQueueClient,
    pub timeline_get_throttle: Arc<
        crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>,
    >,
@@ -321,8 +321,6 @@ pub struct Timeline {
    /// Locked automatically by [`TimelineWriter`] and checkpointer.
    /// Must always be acquired before the layer map/individual layer lock
    /// to avoid deadlock.
-    ///
-    /// The state is cleared upon freezing.
    write_lock: tokio::sync::Mutex<Option<TimelineWriterState>>,

    /// Used to avoid multiple `flush_loop` tasks running
@@ -425,14 +423,6 @@ pub struct Timeline {

    /// Indicate whether aux file v2 storage is enabled.
    pub(crate) last_aux_file_policy: AtomicAuxFilePolicy,
-
-    /// Some test cases directly place keys into the timeline without actually modifying the directory
-    /// keys (i.e., DB_DIR). The test cases creating such keys will put the keyspaces here, so that
-    /// these keys won't get garbage-collected during compaction/GC. This field only modifies the dense
-    /// keyspace return value of `collect_keyspace`. For sparse keyspaces, use AUX keys for testing, and
-    /// in the future, add `extra_test_sparse_keyspace` if necessary.
-    #[cfg(test)]
-    pub(crate) extra_test_dense_keyspace: ArcSwap<KeySpace>,
 }

 pub struct WalReceiverInfo {
@@ -1578,15 +1568,7 @@ impl Timeline {
    // This exists to provide a non-span creating version of `freeze_and_flush` we can call without
    // polluting the span hierarchy.
    pub(crate) async fn freeze_and_flush0(&self) -> Result<(), FlushLayerError> {
-        let to_lsn = {
-            // Freeze the current open in-memory layer. It will be written to disk on next
-            // iteration.
-            let mut g = self.write_lock.lock().await;
-
-            let to_lsn = self.get_last_record_lsn();
-            self.freeze_inmem_layer_at(to_lsn, &mut g).await;
-            to_lsn
-        };
+        let to_lsn = self.freeze_inmem_layer(false).await;
        self.flush_frozen_layers_and_wait(to_lsn).await
    }

@@ -1595,7 +1577,7 @@ impl Timeline {
    // an ephemeral layer open forever when idle.  It also freezes layers if the global limit on
    // ephemeral layer bytes has been breached.
    pub(super) async fn maybe_freeze_ephemeral_layer(&self) {
-        let Ok(mut write_guard) = self.write_lock.try_lock() else {
+        let Ok(_write_guard) = self.write_lock.try_lock() else {
            // If the write lock is held, there is an active wal receiver: rolling open layers
            // is their responsibility while they hold this lock.
            return;
@@ -1672,35 +1654,24 @@ impl Timeline {
            self.last_freeze_at.load(),
            open_layer.get_opened_at(),
        ) {
-            let at_lsn = match open_layer.info() {
+            match open_layer.info() {
                InMemoryLayerInfo::Frozen { lsn_start, lsn_end } => {
                    // We may reach this point if the layer was already frozen by not yet flushed: flushing
                    // happens asynchronously in the background.
                    tracing::debug!(
                        "Not freezing open layer, it's already frozen ({lsn_start}..{lsn_end})"
                    );
-                    None
                }
                InMemoryLayerInfo::Open { .. } => {
                    // Upgrade to a write lock and freeze the layer
                    drop(layers_guard);
                    let mut layers_guard = self.layers.write().await;
-                    let froze = layers_guard
-                        .try_freeze_in_memory_layer(
-                            current_lsn,
-                            &self.last_freeze_at,
-                            &mut write_guard,
-                        )
+                    layers_guard
+                        .try_freeze_in_memory_layer(current_lsn, &self.last_freeze_at)
                        .await;
-                    Some(current_lsn).filter(|_| froze)
-                }
-            };
-            if let Some(lsn) = at_lsn {
-                let res: Result<u64, _> = self.flush_frozen_layers(lsn);
-                if let Err(e) = res {
-                    tracing::info!("failed to flush frozen layer after background freeze: {e:#}");
                }
            }
+            self.flush_frozen_layers();
        }
    }

@@ -2064,11 +2035,11 @@ impl Timeline {
            true
        } else if distance > 0 && opened_at.elapsed() >= self.get_checkpoint_timeout() {
            info!(
-                "Will roll layer at {} with layer size {} due to time since first write to the layer ({:?})",
-                projected_lsn,
-                layer_size,
-                opened_at.elapsed()
-            );
+                    "Will roll layer at {} with layer size {} due to time since first write to the layer ({:?})",
+                    projected_lsn,
+                    layer_size,
+                    opened_at.elapsed()
+                );

            true
        } else {
@@ -2351,9 +2322,6 @@ impl Timeline {
                aux_file_size_estimator: AuxFileSizeEstimator::new(aux_file_metrics),

                last_aux_file_policy: AtomicAuxFilePolicy::new(aux_file_policy),
-
-                #[cfg(test)]
-                extra_test_dense_keyspace: ArcSwap::new(Arc::new(KeySpace::default())),
            };
            result.repartition_threshold =
                result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
@@ -2412,7 +2380,7 @@ impl Timeline {
                let background_ctx = RequestContext::todo_child(TaskKind::LayerFlushTask, DownloadBehavior::Error);
                self_clone.flush_loop(layer_flush_start_rx, &background_ctx).await;
                let mut flush_loop_state = self_clone.flush_loop_state.lock().unwrap();
-                assert!(matches!(*flush_loop_state, FlushLoopState::Running{..}));
+                assert!(matches!(*flush_loop_state, FlushLoopState::Running{ ..}));
                *flush_loop_state  = FlushLoopState::Exited;
                Ok(())
            }
@@ -2819,21 +2787,17 @@ impl Timeline {
                    crate::metrics::initial_logical_size::START_CALCULATION.retry(circumstances)
                };

-                let calculated_size = self_ref
+                match self_ref
                    .logical_size_calculation_task(
                        initial_part_end,
                        LogicalSizeCalculationCause::Initial,
                        background_ctx,
                    )
-                    .await?;
-
-                self_ref
-                    .trigger_aux_file_size_computation(initial_part_end, background_ctx)
-                    .await?;
-
-                // TODO: add aux file size to logical size
-
-                Ok((calculated_size, metrics_guard))
+                    .await
+                {
+                    Ok(calculated_size) => Ok((calculated_size, metrics_guard)),
+                    Err(e) => Err(e),
+                }
            }
        };

@@ -3675,21 +3639,28 @@ impl Timeline {
        self.last_record_lsn.advance(new_lsn);
    }

-    async fn freeze_inmem_layer_at(
-        &self,
-        at: Lsn,
-        write_lock: &mut tokio::sync::MutexGuard<'_, Option<TimelineWriterState>>,
-    ) {
-        let frozen = {
-            let mut guard = self.layers.write().await;
-            guard
-                .try_freeze_in_memory_layer(at, &self.last_freeze_at, write_lock)
-                .await
+    /// Whether there was a layer to freeze or not, return the value of get_last_record_lsn
+    /// before we attempted the freeze: this guarantees that ingested data is frozen up to this lsn (inclusive).
+    async fn freeze_inmem_layer(&self, write_lock_held: bool) -> Lsn {
+        // Freeze the current open in-memory layer. It will be written to disk on next
+        // iteration.
+
+        let _write_guard = if write_lock_held {
+            None
+        } else {
+            Some(self.write_lock.lock().await)
        };
-        if frozen {
-            let now = Instant::now();
-            *(self.last_freeze_ts.write().unwrap()) = now;
-        }
+
+        let to_lsn = self.get_last_record_lsn();
+        self.freeze_inmem_layer_at(to_lsn).await;
+        to_lsn
+    }
+
+    async fn freeze_inmem_layer_at(&self, at: Lsn) {
+        let mut guard = self.layers.write().await;
+        guard
+            .try_freeze_in_memory_layer(at, &self.last_freeze_at)
+            .await;
    }

    /// Layer flusher task's main loop.
@@ -3783,14 +3754,18 @@ impl Timeline {
        }
    }

-    /// Request the flush loop to write out all frozen layers up to `at_lsn` as Delta L0 files to disk.
-    /// The caller is responsible for the freezing, e.g., [`Self::freeze_inmem_layer_at`].
+    /// Request the flush loop to write out all frozen layers up to `last_record_lsn` as Delta L0 files to disk.
+    /// The caller is responsible for the freezing, e.g., [`Self::freeze_inmem_layer`].
    ///
-    /// `at_lsn` may be higher than the highest LSN of a frozen layer: if this is the
-    /// case, it means no data will be written between the top of the highest frozen layer and
-    /// to_lsn, e.g. because this tenant shard has ingested up to to_lsn and not written any data
-    /// locally for that part of the WAL.
-    fn flush_frozen_layers(&self, at_lsn: Lsn) -> Result<u64, FlushLayerError> {
+    /// `last_record_lsn` may be higher than the highest LSN of a frozen layer: if this is the case,
+    /// it means no data will be written between the top of the highest frozen layer and `last_record_lsn`,
+    /// e.g. because this tenant shard has ingested up to `last_record_lsn` and not written any data locally for that part of the WAL.
+    async fn flush_frozen_layers_and_wait(
+        &self,
+        last_record_lsn: Lsn,
+    ) -> Result<(), FlushLayerError> {
+        let mut rx = self.layer_flush_done_tx.subscribe();
+
        // Increment the flush cycle counter and wake up the flush task.
        // Remember the new value, so that when we listen for the flush
        // to finish, we know when the flush that we initiated has
@@ -3805,18 +3780,13 @@ impl Timeline {
        self.layer_flush_start_tx.send_modify(|(counter, lsn)| {
            my_flush_request = *counter + 1;
            *counter = my_flush_request;
-            *lsn = std::cmp::max(at_lsn, *lsn);
+            *lsn = std::cmp::max(last_record_lsn, *lsn);
        });

-        Ok(my_flush_request)
-    }
-
-    async fn wait_flush_completion(&self, request: u64) -> Result<(), FlushLayerError> {
-        let mut rx = self.layer_flush_done_tx.subscribe();
        loop {
            {
                let (last_result_counter, last_result) = &*rx.borrow();
-                if *last_result_counter >= request {
+                if *last_result_counter >= my_flush_request {
                    if let Err(err) = last_result {
                        // We already logged the original error in
                        // flush_loop. We cannot propagate it to the caller
@@ -3843,9 +3813,12 @@ impl Timeline {
        }
    }

-    async fn flush_frozen_layers_and_wait(&self, at_lsn: Lsn) -> Result<(), FlushLayerError> {
-        let token = self.flush_frozen_layers(at_lsn)?;
-        self.wait_flush_completion(token).await
+    fn flush_frozen_layers(&self) {
+        self.layer_flush_start_tx.send_modify(|(counter, lsn)| {
+            *counter += 1;
+
+            *lsn = std::cmp::max(*lsn, Lsn(self.last_freeze_at.load().0 - 1));
+        });
    }

    /// Flush one frozen in-memory layer to disk, as a new delta layer.
@@ -3906,25 +3879,22 @@ impl Timeline {
                return Err(FlushLayerError::Cancelled);
            }

-            // FIXME(auxfilesv2): support multiple metadata key partitions might need initdb support as well?
-            // This code path will not be hit during regression tests. After #7099 we have a single partition
-            // with two key ranges. If someone wants to fix initdb optimization in the future, this might need
-            // to be fixed.
-
            // For metadata, always create delta layers.
            let delta_layer = if !metadata_partition.parts.is_empty() {
                assert_eq!(
                    metadata_partition.parts.len(),
                    1,
-                    "currently sparse keyspace should only contain a single metadata keyspace"
+                    "currently sparse keyspace should only contain a single aux file keyspace"
                );
                let metadata_keyspace = &metadata_partition.parts[0];
+                assert_eq!(
+                    metadata_keyspace.0.ranges.len(),
+                    1,
+                    "aux file keyspace should be a single range"
+                );
                self.create_delta_layer(
                    &frozen_layer,
-                    Some(
-                        metadata_keyspace.0.ranges.first().unwrap().start
-                            ..metadata_keyspace.0.ranges.last().unwrap().end,
-                    ),
+                    Some(metadata_keyspace.0.ranges[0].clone()),
                    ctx,
                )
                .await
@@ -4469,12 +4439,6 @@ impl Timeline {
                if mode == ImageLayerCreationMode::Initial {
                    return Err(CreateImageLayersError::Other(anyhow::anyhow!("no image layer should be created for metadata keys when flushing frozen layers")));
                }
-                if mode == ImageLayerCreationMode::Try && !check_for_image_layers {
-                    // Skip compaction if there are not enough updates. Metadata compaction will do a scan and
-                    // might mess up with evictions.
-                    start = img_range.end;
-                    continue;
-                }
            } else if let ImageLayerCreationMode::Try = mode {
                // check_for_image_layers = false -> skip
                // check_for_image_layers = true -> check time_for_new_image_layer -> skip/generate
@@ -4822,7 +4786,7 @@ impl Timeline {
        pitr: Duration,
        cancel: &CancellationToken,
        ctx: &RequestContext,
-    ) -> Result<GcCutoffs, PageReconstructError> {
+    ) -> anyhow::Result<GcCutoffs> {
        let _timer = self
            .metrics
            .find_gc_cutoffs_histo
@@ -5559,33 +5523,10 @@ impl Timeline {
        all_data.sort();
        Ok(all_data)
    }
-
-    /// Get all historic layer descriptors in the layer map
-    #[cfg(test)]
-    pub(crate) async fn inspect_historic_layers(
-        self: &Arc<Timeline>,
-    ) -> anyhow::Result<Vec<super::storage_layer::PersistentLayerKey>> {
-        let mut layers = Vec::new();
-        let guard = self.layers.read().await;
-        for layer in guard.layer_map().iter_historic_layers() {
-            layers.push(layer.key());
-        }
-        Ok(layers)
-    }
-
-    #[cfg(test)]
-    pub(crate) fn add_extra_test_dense_keyspace(&self, ks: KeySpace) {
-        let mut keyspace = self.extra_test_dense_keyspace.load().as_ref().clone();
-        keyspace.merge(&ks);
-        self.extra_test_dense_keyspace.store(Arc::new(keyspace));
-    }
 }

 type TraversalPathItem = (ValueReconstructResult, Lsn, TraversalId);

-/// Tracking writes ingestion does to a particular in-memory layer.
-///
-/// Cleared upon freezing a layer.
 struct TimelineWriterState {
    open_layer: Arc<InMemoryLayer>,
    current_size: u64,
@@ -5626,6 +5567,12 @@ impl Deref for TimelineWriter<'_> {
    }
 }

+impl Drop for TimelineWriter<'_> {
+    fn drop(&mut self) {
+        self.write_guard.take();
+    }
+}
+
 #[derive(PartialEq)]
 enum OpenLayerAction {
    Roll,
@@ -5708,16 +5655,17 @@ impl<'a> TimelineWriter<'a> {
    }

    async fn roll_layer(&mut self, freeze_at: Lsn) -> anyhow::Result<()> {
+        assert!(self.write_guard.is_some());
+
+        self.tl.freeze_inmem_layer_at(freeze_at).await;
+
+        let now = Instant::now();
+        *(self.last_freeze_ts.write().unwrap()) = now;
+
+        self.tl.flush_frozen_layers();
+
        let current_size = self.write_guard.as_ref().unwrap().current_size;
-
-        // self.write_guard will be taken by the freezing
-        self.tl
-            .freeze_inmem_layer_at(freeze_at, &mut self.write_guard)
-            .await;
-
-        self.tl.flush_frozen_layers(freeze_at)?;
-
-        if current_size >= self.get_checkpoint_distance() * 2 {
+        if current_size > self.get_checkpoint_distance() {
            warn!("Flushed oversized open layer with size {}", current_size)
        }

@@ -5730,27 +5678,9 @@ impl<'a> TimelineWriter<'a> {
            return OpenLayerAction::Open;
        };

-        #[cfg(feature = "testing")]
-        if state.cached_last_freeze_at < self.tl.last_freeze_at.load() {
-            // this check and assertion are not really needed because
-            // LayerManager::try_freeze_in_memory_layer will always clear out the
-            // TimelineWriterState if something is frozen. however, we can advance last_freeze_at when there
-            // is no TimelineWriterState.
-            assert!(
-                state.open_layer.end_lsn.get().is_some(),
-                "our open_layer must be outdated"
-            );
-
-            // this would be a memory leak waiting to happen because the in-memory layer always has
-            // an index
-            panic!("BUG: TimelineWriterState held on to frozen in-memory layer.");
-        }
-
        if state.prev_lsn == Some(lsn) {
-            // Rolling mid LSN is not supported by [downstream code].
+            // Rolling mid LSN is not supported by downstream code.
            // Hence, only roll at LSN boundaries.
-            //
-            // [downstream code]: https://github.com/neondatabase/neon/pull/7993#discussion_r1633345422
            return OpenLayerAction::None;
        }

--- a/pageserver/src/tenant/timeline/analysis.rs
+++ b/pageserver/src/tenant/timeline/analysis.rs
@@ -1,90 +0,0 @@
-use std::{collections::BTreeSet, ops::Range};
-
-use utils::lsn::Lsn;
-
-use super::Timeline;
-
-#[derive(serde::Serialize)]
-pub(crate) struct RangeAnalysis {
-    start: String,
-    end: String,
-    has_image: bool,
-    num_of_deltas_above_image: usize,
-    total_num_of_deltas: usize,
-}
-
-impl Timeline {
-    pub(crate) async fn perf_info(&self) -> Vec<RangeAnalysis> {
-        // First, collect all split points of the layers.
-        let mut split_points = BTreeSet::new();
-        let mut delta_ranges = Vec::new();
-        let mut image_ranges = Vec::new();
-
-        let all_layer_files = {
-            let guard = self.layers.read().await;
-            guard.all_persistent_layers()
-        };
-        let lsn = self.get_last_record_lsn();
-
-        for key in all_layer_files {
-            split_points.insert(key.key_range.start);
-            split_points.insert(key.key_range.end);
-            if key.is_delta {
-                delta_ranges.push((key.key_range.clone(), key.lsn_range.clone()));
-            } else {
-                image_ranges.push((key.key_range.clone(), key.lsn_range.start));
-            }
-        }
-
-        // For each split range, compute the estimated read amplification.
-        let split_points = split_points.into_iter().collect::<Vec<_>>();
-
-        let mut result = Vec::new();
-
-        for i in 0..(split_points.len() - 1) {
-            let start = split_points[i];
-            let end = split_points[i + 1];
-            // Find the latest image layer that contains the information.
-            let mut maybe_image_layers = image_ranges
-                .iter()
-                // We insert split points for all image layers, and therefore a `contains` check for the start point should be enough.
-                .filter(|(key_range, img_lsn)| key_range.contains(&start) && img_lsn <= &lsn)
-                .cloned()
-                .collect::<Vec<_>>();
-            maybe_image_layers.sort_by(|a, b| a.1.cmp(&b.1));
-            let image_layer = maybe_image_layers.last().cloned();
-            let lsn_filter_start = image_layer
-                .as_ref()
-                .map(|(_, lsn)| *lsn)
-                .unwrap_or(Lsn::INVALID);
-
-            fn overlaps_with(lsn_range_a: &Range<Lsn>, lsn_range_b: &Range<Lsn>) -> bool {
-                !(lsn_range_a.end <= lsn_range_b.start || lsn_range_a.start >= lsn_range_b.end)
-            }
-
-            let maybe_delta_layers = delta_ranges
-                .iter()
-                .filter(|(key_range, lsn_range)| {
-                    key_range.contains(&start) && overlaps_with(&(lsn_filter_start..lsn), lsn_range)
-                })
-                .cloned()
-                .collect::<Vec<_>>();
-
-            let pitr_delta_layers = delta_ranges
-                .iter()
-                .filter(|(key_range, _)| key_range.contains(&start))
-                .cloned()
-                .collect::<Vec<_>>();
-
-            result.push(RangeAnalysis {
-                start: start.to_string(),
-                end: end.to_string(),
-                has_image: image_layer.is_some(),
-                num_of_deltas_above_image: maybe_delta_layers.len(),
-                total_num_of_deltas: pitr_delta_layers.len(),
-            });
-        }
-
-        result
-    }
-}
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -133,7 +133,8 @@ impl Timeline {
                        },
                        &image_ctx,
                    )
-                    .await?;
+                    .await
+                    .map_err(anyhow::Error::from)?;

                self.upload_new_image_layers(image_layers)?;
                partitioning.parts.len()
@@ -421,6 +422,48 @@ impl Timeline {
            return Ok(CompactLevel0Phase1Result::default());
        }

+        // This failpoint is used together with `test_duplicate_layers` integration test.
+        // It makes compaction return exactly the same layers as were given to it as input.
+        // We want to ensure that this will not cause any problem when updating the layer map
+        // after the compaction is finished.
+        //
+        // Currently, there are two rare edge cases that will cause duplicated layers being
+        // inserted.
+        // 1. The compaction job is interrupted / did not finish successfully. Assume we have file 1, 2, 3, 4, which
+        //    is compacted to 5, but the page server is shut down, next time we start page server we will get a layer
+        //    map containing 1, 2, 3, 4, and 5, whereas 5 has the same content as 4. If we trigger L0 compaction at this
+        //    point again, it is likely that we will get a file 6 which has the same content and the key range as 5,
+        //    and this causes an overwrite. This is acceptable because the content is the same, and we should do a
+        //    layer replace instead of the normal remove / upload process.
+        // 2. The input workload pattern creates exactly n files that are sorted, non-overlapping, and each of the target
+        //    file size. Compaction will likely create the same set of n files afterwards.
+        //
+        // This failpoint is a superset of both of the cases.
+        if cfg!(feature = "testing") {
+            let active = (|| {
+                ::fail::fail_point!("compact-level0-phase1-return-same", |_| true);
+                false
+            })();
+
+            if active {
+                let mut new_layers = Vec::with_capacity(level0_deltas.len());
+                for delta in &level0_deltas {
+                    // we are just faking these layers as being produced again for this failpoint
+                    new_layers.push(
+                        delta
+                            .download_and_keep_resident()
+                            .await
+                            .context("download layer for failpoint")?,
+                    );
+                }
+                tracing::info!("compact-level0-phase1-return-same"); // so that we can check if we hit the failpoint
+                return Ok(CompactLevel0Phase1Result {
+                    new_layers,
+                    deltas_to_compact: level0_deltas,
+                });
+            }
+        }
+
        // Gather the files to compact in this iteration.
        //
        // Start with the oldest Level 0 delta file, and collect any other
@@ -952,178 +995,6 @@ impl Timeline {
        adaptor.flush_updates().await?;
        Ok(())
    }
-
-    /// An experimental compaction building block that combines compaction with garbage collection.
-    ///
-    /// The current implementation picks all delta + image layers that are below or intersecting with
-    /// the GC horizon without considering retain_lsns. Then, it does a full compaction over all these delta
-    /// layers and image layers, which generates image layers on the gc horizon, drop deltas below gc horizon,
-    /// and create delta layers with all deltas >= gc horizon.
-    #[cfg(test)]
-    pub(crate) async fn compact_with_gc(
-        self: &Arc<Self>,
-        _cancel: &CancellationToken,
-        ctx: &RequestContext,
-    ) -> Result<(), CompactionError> {
-        use crate::tenant::storage_layer::ValueReconstructState;
-        // Step 0: pick all delta layers + image layers below/intersect with the GC horizon.
-        // The layer selection has the following properties:
-        // 1. If a layer is in the selection, all layers below it are in the selection.
-        // 2. Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection.
-        let (layer_selection, gc_cutoff) = {
-            let guard = self.layers.read().await;
-            let layers = guard.layer_map();
-            let gc_info = self.gc_info.read().unwrap();
-            let gc_cutoff = Lsn::min(gc_info.cutoffs.horizon, gc_info.cutoffs.pitr);
-            let mut selected_layers = Vec::new();
-            // TODO: consider retain_lsns
-            drop(gc_info);
-            for desc in layers.iter_historic_layers() {
-                if desc.get_lsn_range().start <= gc_cutoff {
-                    selected_layers.push(guard.get_from_desc(&desc));
-                }
-            }
-            (selected_layers, gc_cutoff)
-        };
-        // Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs.
-        let mut all_key_values = Vec::new();
-        for layer in &layer_selection {
-            all_key_values.extend(layer.load_key_values(ctx).await?);
-        }
-        // Key small to large, LSN low to high, if the same LSN has both image and delta due to the merge of delta layers and
-        // image layers, make image appear later than delta.
-        struct ValueWrapper<'a>(&'a crate::repository::Value);
-        impl Ord for ValueWrapper<'_> {
-            fn cmp(&self, other: &Self) -> std::cmp::Ordering {
-                use crate::repository::Value;
-                use std::cmp::Ordering;
-                match (self.0, other.0) {
-                    (Value::Image(_), Value::WalRecord(_)) => Ordering::Greater,
-                    (Value::WalRecord(_), Value::Image(_)) => Ordering::Less,
-                    _ => Ordering::Equal,
-                }
-            }
-        }
-        impl PartialOrd for ValueWrapper<'_> {
-            fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
-                Some(self.cmp(other))
-            }
-        }
-        impl PartialEq for ValueWrapper<'_> {
-            fn eq(&self, other: &Self) -> bool {
-                self.cmp(other) == std::cmp::Ordering::Equal
-            }
-        }
-        impl Eq for ValueWrapper<'_> {}
-        all_key_values.sort_by(|(k1, l1, v1), (k2, l2, v2)| {
-            (k1, l1, ValueWrapper(v1)).cmp(&(k2, l2, ValueWrapper(v2)))
-        });
-        let max_lsn = all_key_values
-            .iter()
-            .map(|(_, lsn, _)| lsn)
-            .max()
-            .copied()
-            .unwrap()
-            + 1;
-        // Step 2: Produce images+deltas. TODO: ensure newly-produced delta does not overlap with other deltas.
-        // Data of the same key.
-        let mut accumulated_values = Vec::new();
-        let mut last_key = all_key_values.first().unwrap().0; // TODO: assert all_key_values not empty
-
-        /// Take a list of images and deltas, produce an image at the GC horizon, and a list of deltas above the GC horizon.
-        async fn flush_accumulated_states(
-            tline: &Arc<Timeline>,
-            key: Key,
-            accumulated_values: &[&(Key, Lsn, crate::repository::Value)],
-            horizon: Lsn,
-        ) -> anyhow::Result<(Vec<(Key, Lsn, crate::repository::Value)>, bytes::Bytes)> {
-            let mut base_image = None;
-            let mut keys_above_horizon = Vec::new();
-            let mut delta_above_base_image = Vec::new();
-            // We have a list of deltas/images. We want to create image layers while collect garbages.
-            for (key, lsn, val) in accumulated_values.iter().rev() {
-                if *lsn > horizon {
-                    keys_above_horizon.push((*key, *lsn, val.clone())); // TODO: ensure one LSN corresponds to either delta or image instead of both
-                } else if *lsn <= horizon {
-                    match val {
-                        crate::repository::Value::Image(image) => {
-                            if lsn <= &horizon {
-                                base_image = Some((*lsn, image.clone()));
-                                break;
-                            }
-                        }
-                        crate::repository::Value::WalRecord(wal) => {
-                            delta_above_base_image.push((*lsn, wal.clone()));
-                        }
-                    }
-                }
-            }
-            delta_above_base_image.reverse();
-            keys_above_horizon.reverse();
-            let state = ValueReconstructState {
-                img: base_image,
-                records: delta_above_base_image,
-            };
-            let img = tline.reconstruct_value(key, horizon, state).await?;
-            Ok((keys_above_horizon, img))
-        }
-
-        let mut delta_layer_writer = DeltaLayerWriter::new(
-            self.conf,
-            self.timeline_id,
-            self.tenant_shard_id,
-            all_key_values.first().unwrap().0,
-            gc_cutoff..max_lsn, // TODO: off by one?
-            ctx,
-        )
-        .await?;
-        let mut image_layer_writer = ImageLayerWriter::new(
-            self.conf,
-            self.timeline_id,
-            self.tenant_shard_id,
-            &(all_key_values.first().unwrap().0..all_key_values.last().unwrap().0.next()),
-            gc_cutoff,
-            ctx,
-        )
-        .await?;
-
-        for item @ (key, _, _) in &all_key_values {
-            if &last_key == key {
-                accumulated_values.push(item);
-            } else {
-                let (deltas, image) =
-                    flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff)
-                        .await?;
-                image_layer_writer.put_image(last_key, image, ctx).await?;
-                for (key, lsn, val) in deltas {
-                    delta_layer_writer.put_value(key, lsn, val, ctx).await?;
-                }
-                accumulated_values.clear();
-                accumulated_values.push(item);
-                last_key = *key;
-            }
-        }
-        let (deltas, image) =
-            flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff).await?;
-        image_layer_writer.put_image(last_key, image, ctx).await?;
-        for (key, lsn, val) in deltas {
-            delta_layer_writer.put_value(key, lsn, val, ctx).await?;
-        }
-        accumulated_values.clear();
-        // TODO: split layers
-        let delta_layer = delta_layer_writer.finish(last_key, self, ctx).await?;
-        let image_layer = image_layer_writer.finish(self, ctx).await?;
-        // Step 3: Place back to the layer map.
-        {
-            let mut guard = self.layers.write().await;
-            guard.finish_gc_compaction(
-                &layer_selection,
-                &[delta_layer.clone(), image_layer.clone()],
-                &self.metrics,
-            )
-        };
-        Ok(())
-    }
 }

 struct TimelineAdaptor {
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -11,6 +11,7 @@ use utils::{crashsafe, fs_ext, id::TimelineId, pausable_failpoint};

 use crate::{
    config::PageServerConf,
+    deletion_queue::DeletionQueueClient,
    task_mgr::{self, TaskKind},
    tenant::{
        metadata::TimelineMetadata,
@@ -262,6 +263,7 @@ impl DeleteTimelineFlow {
        timeline_id: TimelineId,
        local_metadata: &TimelineMetadata,
        remote_client: RemoteTimelineClient,
+        deletion_queue_client: DeletionQueueClient,
    ) -> anyhow::Result<()> {
        // Note: here we even skip populating layer map. Timeline is essentially uninitialized.
        // RemoteTimelineClient is the only functioning part.
@@ -272,6 +274,7 @@ impl DeleteTimelineFlow {
                None, // Ancestor is not needed for deletion.
                TimelineResources {
                    remote_client,
+                    deletion_queue_client,
                    timeline_get_throttle: tenant.timeline_get_throttle.clone(),
                },
                // Important. We dont pass ancestor above because it can be missing.
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -1,5 +1,4 @@
 use anyhow::{bail, ensure, Context, Result};
-use itertools::Itertools;
 use pageserver_api::shard::TenantShardId;
 use std::{collections::HashMap, sync::Arc};
 use tracing::trace;
@@ -21,8 +20,6 @@ use crate::{
    },
 };

-use super::TimelineWriterState;
-
 /// Provides semantic APIs to manipulate the layer map.
 #[derive(Default)]
 pub(crate) struct LayerManager {
@@ -122,20 +119,18 @@ impl LayerManager {
        Ok(layer)
    }

-    /// Tries to freeze an open layer and also manages clearing the TimelineWriterState.
-    ///
-    /// Returns true if anything was frozen.
-    pub(super) async fn try_freeze_in_memory_layer(
+    /// Called from `freeze_inmem_layer`. Freezes the open layer, if there is one, and always advances `last_freeze_at`.
+    pub(crate) async fn try_freeze_in_memory_layer(
        &mut self,
        lsn: Lsn,
        last_freeze_at: &AtomicLsn,
-        write_lock: &mut tokio::sync::MutexGuard<'_, Option<TimelineWriterState>>,
-    ) -> bool {
+    ) {
        let Lsn(last_record_lsn) = lsn;
        let end_lsn = Lsn(last_record_lsn + 1);

-        let froze = if let Some(open_layer) = &self.layer_map.open_layer {
+        if let Some(open_layer) = &self.layer_map.open_layer {
            let open_layer_rc = Arc::clone(open_layer);
+            // Freeze the open layer at `end_lsn`; this is unconditional, no size or age check happens here.
            open_layer.freeze(end_lsn).await;

            // The layer is no longer open, update the layer map to reflect this.
@@ -143,25 +138,11 @@ impl LayerManager {
            self.layer_map.frozen_layers.push_back(open_layer_rc);
            self.layer_map.open_layer = None;
            self.layer_map.next_open_layer_at = Some(end_lsn);
-
-            true
-        } else {
-            false
-        };
+        }

        // Even if there was no layer to freeze, advance last_freeze_at to last_record_lsn+1: this
        // accounts for regions in the LSN range where we might have ingested no data due to sharding.
        last_freeze_at.store(end_lsn);
-
-        // the writer state must no longer have a reference to the frozen layer
-        let taken = write_lock.take();
-        assert_eq!(
-            froze,
-            taken.is_some(),
-            "should only had frozen a layer when TimelineWriterState existed"
-        );
-
-        froze
    }

    /// Add image layers to the layer map, called from `create_image_layers`.
@@ -226,18 +207,6 @@ impl LayerManager {
        updates.flush();
    }

-    /// Called when a GC-compaction is completed.
-    #[cfg(test)]
-    pub(crate) fn finish_gc_compaction(
-        &mut self,
-        compact_from: &[Layer],
-        compact_to: &[ResidentLayer],
-        metrics: &TimelineMetrics,
-    ) {
-        // We can simply reuse compact l0 logic. Use a different function name to indicate a different type of layer map modification.
-        self.finish_compact_l0(compact_from, compact_to, metrics)
-    }
-
    /// Called when compaction is completed.
    pub(crate) fn rewrite_layers(
        &mut self,
@@ -339,10 +308,6 @@ impl LayerManager {
    pub(crate) fn contains(&self, layer: &Layer) -> bool {
        self.layer_fmgr.contains(layer)
    }
-
-    pub(crate) fn all_persistent_layers(&self) -> Vec<PersistentLayerKey> {
-        self.layer_fmgr.0.keys().cloned().collect_vec()
-    }
 }

 pub(crate) struct LayerFileManager<T>(HashMap<PersistentLayerKey, T>);
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -3,10 +3,12 @@ use super::storage_layer::ResidentLayer;
 use crate::tenant::metadata::TimelineMetadata;
 use crate::tenant::remote_timeline_client::index::IndexPart;
 use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
+use crate::tenant::remote_timeline_client::index::Lineage;
 use std::collections::{HashMap, VecDeque};
 use std::fmt::Debug;

 use chrono::NaiveDateTime;
+use pageserver_api::models::AuxFilePolicy;
 use std::sync::Arc;
 use tracing::info;
 use utils::lsn::AtomicLsn;
@@ -43,25 +45,34 @@ pub(crate) struct UploadQueueInitialized {
    /// Counter to assign task IDs
    pub(crate) task_counter: u64,

-    /// The next uploaded index_part.json; assumed to be dirty.
-    ///
-    /// Should not be read, directly except for layer file updates. Instead you should add a
-    /// projected field.
-    pub(crate) dirty: IndexPart,
-
-    /// The latest remote persisted IndexPart.
-    ///
-    /// Each completed metadata upload will update this. The second item is the task_id which last
-    /// updated the value, used to ensure we never store an older value over a newer one.
-    pub(crate) clean: (IndexPart, Option<u64>),
+    /// All layer files stored in the remote storage, taking into account all
+    /// in-progress and queued operations
+    pub(crate) latest_files: HashMap<LayerName, LayerFileMetadata>,

    /// How many file uploads or deletions been scheduled, since the
    /// last (scheduling of) metadata index upload?
    pub(crate) latest_files_changes_since_metadata_upload_scheduled: u64,

-    /// The Lsn is only updated after our generation has been validated with
+    /// Metadata stored in the remote storage, taking into account all
+    /// in-progress and queued operations.
+    /// DANGER: do not return to outside world, e.g., safekeepers.
+    pub(crate) latest_metadata: TimelineMetadata,
+
+    /// Part of the flattened "next" `index_part.json`.
+    pub(crate) latest_lineage: Lineage,
+
+    /// The last aux file policy used on this timeline.
+    pub(crate) last_aux_file_policy: Option<AuxFilePolicy>,
+
+    /// `disk_consistent_lsn` from the last metadata file that was successfully
+    /// uploaded. If nothing was uploaded yet, the projected value is `None` and the visible value is `Lsn(0)`.
+    /// Unlike `latest_files` or `latest_metadata`, this value is never ahead.
+    /// Safekeeper can rely on it to make decisions for WAL storage.
+    ///
+    /// visible_remote_consistent_lsn is only updated after our generation has been validated with
    /// the control plane (unlesss a timeline's generation is None, in which case
    /// we skip validation)
+    pub(crate) projected_remote_consistent_lsn: Option<Lsn>,
    pub(crate) visible_remote_consistent_lsn: Arc<AtomicLsn>,

    // Breakdown of different kinds of tasks currently in-progress
@@ -107,8 +118,7 @@ impl UploadQueueInitialized {
    }

    pub(super) fn get_last_remote_consistent_lsn_projected(&self) -> Option<Lsn> {
-        let lsn = self.clean.0.metadata.disk_consistent_lsn();
-        self.clean.1.map(|_| lsn)
+        self.projected_remote_consistent_lsn
    }
 }

@@ -164,12 +174,13 @@ impl UploadQueue {

        info!("initializing upload queue for empty remote");

-        let index_part = IndexPart::empty(metadata.clone());
-
        let state = UploadQueueInitialized {
-            dirty: index_part.clone(),
-            clean: (index_part, None),
+            // As described in the doc comment, it's ok for `latest_files` and `latest_metadata` to be ahead.
+            latest_files: HashMap::new(),
            latest_files_changes_since_metadata_upload_scheduled: 0,
+            latest_metadata: metadata.clone(),
+            latest_lineage: Lineage::default(),
+            projected_remote_consistent_lsn: None,
            visible_remote_consistent_lsn: Arc::new(AtomicLsn::new(0)),
            // what follows are boring default initializations
            task_counter: 0,
@@ -182,6 +193,7 @@ impl UploadQueue {
            dangling_files: HashMap::new(),
            shutting_down: false,
            shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
+            last_aux_file_policy: Default::default(),
        };

        *self = UploadQueue::Initialized(state);
@@ -199,15 +211,22 @@ impl UploadQueue {
            }
        }

+        let mut files = HashMap::with_capacity(index_part.layer_metadata.len());
+        for (layer_name, layer_metadata) in &index_part.layer_metadata {
+            files.insert(layer_name.to_owned(), layer_metadata.clone());
+        }
+
        info!(
            "initializing upload queue with remote index_part.disk_consistent_lsn: {}",
            index_part.metadata.disk_consistent_lsn()
        );

        let state = UploadQueueInitialized {
-            dirty: index_part.clone(),
-            clean: (index_part.clone(), None),
+            latest_files: files,
            latest_files_changes_since_metadata_upload_scheduled: 0,
+            latest_metadata: index_part.metadata.clone(),
+            latest_lineage: index_part.lineage.clone(),
+            projected_remote_consistent_lsn: Some(index_part.metadata.disk_consistent_lsn()),
            visible_remote_consistent_lsn: Arc::new(
                index_part.metadata.disk_consistent_lsn().into(),
            ),
@@ -222,6 +241,7 @@ impl UploadQueue {
            dangling_files: HashMap::new(),
            shutting_down: false,
            shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
+            last_aux_file_policy: index_part.last_aux_file_policy(),
        };

        *self = UploadQueue::Initialized(state);
@@ -278,16 +298,13 @@ pub(crate) enum UploadOp {
    /// Upload a layer file
    UploadLayer(ResidentLayer, LayerFileMetadata),

-    /// Upload a index_part.json file
-    UploadMetadata {
-        /// The next [`UploadQueueInitialized::clean`] after this upload succeeds.
-        uploaded: Box<IndexPart>,
-    },
+    /// Upload the metadata file
+    UploadMetadata(Box<IndexPart>, Lsn),

    /// Delete layer files
    Delete(Delete),

-    /// Barrier. When the barrier operation is reached, the channel is closed.
+    /// Barrier. When the barrier operation is reached, the channel is closed.
    Barrier(tokio::sync::watch::Sender<()>),

    /// Shutdown; upon encountering this operation no new operations will be spawned, otherwise
@@ -305,12 +322,8 @@ impl std::fmt::Display for UploadOp {
                    layer, metadata.file_size, metadata.generation
                )
            }
-            UploadOp::UploadMetadata { uploaded, .. } => {
-                write!(
-                    f,
-                    "UploadMetadata(lsn: {})",
-                    uploaded.metadata.disk_consistent_lsn()
-                )
+            UploadOp::UploadMetadata(_, lsn) => {
+                write!(f, "UploadMetadata(lsn: {})", lsn)
            }
            UploadOp::Delete(delete) => {
                write!(f, "Delete({} layers)", delete.layers.len())
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -234,7 +234,6 @@ impl WalIngest {
                        modification,
                        &parsed_xact,
                        info == pg_constants::XLOG_XACT_COMMIT,
-                        decoded.origin_id,
                        ctx,
                    )
                    .await?;
@@ -247,7 +246,6 @@ impl WalIngest {
                        modification,
                        &parsed_xact,
                        info == pg_constants::XLOG_XACT_COMMIT_PREPARED,
-                        decoded.origin_id,
                        ctx,
                    )
                    .await?;
@@ -377,18 +375,6 @@ impl WalIngest {
                    self.checkpoint.oldestActiveXid = xlrec.oldest_running_xid;
                }
            }
-            pg_constants::RM_REPLORIGIN_ID => {
-                let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
-                if info == pg_constants::XLOG_REPLORIGIN_SET {
-                    let xlrec = crate::walrecord::XlReploriginSet::decode(&mut buf);
-                    modification
-                        .set_replorigin(xlrec.node_id, xlrec.remote_lsn)
-                        .await?
-                } else if info == pg_constants::XLOG_REPLORIGIN_DROP {
-                    let xlrec = crate::walrecord::XlReploriginDrop::decode(&mut buf);
-                    modification.drop_replorigin(xlrec.node_id).await?
-                }
-            }
            _x => {
                // TODO: should probably log & fail here instead of blindly
                // doing something without understanding the protocol
@@ -1192,7 +1178,6 @@ impl WalIngest {
        modification: &mut DatadirModification<'_>,
        parsed: &XlXactParsedRecord,
        is_commit: bool,
-        origin_id: u16,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        // Record update of CLOG pages
@@ -1258,11 +1243,6 @@ impl WalIngest {
                }
            }
        }
-        if origin_id != 0 {
-            modification
-                .set_replorigin(origin_id, parsed.origin_lsn)
-                .await?;
-        }
        Ok(())
    }

--- a/pageserver/src/walrecord.rs
+++ b/pageserver/src/walrecord.rs
@@ -9,10 +9,10 @@ use postgres_ffi::pg_constants;
 use postgres_ffi::BLCKSZ;
 use postgres_ffi::{BlockNumber, TimestampTz};
 use postgres_ffi::{MultiXactId, MultiXactOffset, MultiXactStatus, Oid, TransactionId};
-use postgres_ffi::{RepOriginId, XLogRecord, XLOG_SIZE_OF_XLOG_RECORD};
+use postgres_ffi::{XLogRecord, XLOG_SIZE_OF_XLOG_RECORD};
 use serde::{Deserialize, Serialize};
 use tracing::*;
-use utils::{bin_ser::DeserializeError, lsn::Lsn};
+use utils::bin_ser::DeserializeError;

 /// Each update to a page is represented by a NeonWalRecord. It can be a wrapper
 /// around a PostgreSQL WAL record, or a custom neon-specific "record".
@@ -49,19 +49,6 @@ pub enum NeonWalRecord {
        file_path: String,
        content: Option<Bytes>,
    },
-
-    /// A testing record for unit testing purposes. It supports append data to an existing image, or clear it.
-    #[cfg(test)]
-    Test {
-        /// Append a string to the image.
-        append: String,
-        /// Clear the image before appending.
-        clear: bool,
-        /// Treat this record as an init record. `clear` should be set to true if this field is set
-        /// to true. This record does not need the history WALs to reconstruct. See [`NeonWalRecord::will_init`] and
-        /// its references in `timeline.rs`.
-        will_init: bool,
-    },
 }

 impl NeonWalRecord {
@@ -71,39 +58,11 @@ impl NeonWalRecord {
        // If you change this function, you'll also need to change ValueBytes::will_init
        match self {
            NeonWalRecord::Postgres { will_init, rec: _ } => *will_init,
-            #[cfg(test)]
-            NeonWalRecord::Test { will_init, .. } => *will_init,
+
            // None of the special neon record types currently initialize the page
            _ => false,
        }
    }
-
-    #[cfg(test)]
-    pub(crate) fn wal_append(s: impl AsRef<str>) -> Self {
-        Self::Test {
-            append: s.as_ref().to_string(),
-            clear: false,
-            will_init: false,
-        }
-    }
-
-    #[cfg(test)]
-    pub(crate) fn wal_clear() -> Self {
-        Self::Test {
-            append: "".to_string(),
-            clear: true,
-            will_init: false,
-        }
-    }
-
-    #[cfg(test)]
-    pub(crate) fn wal_init() -> Self {
-        Self::Test {
-            append: "".to_string(),
-            clear: true,
-            will_init: true,
-        }
-    }
 }

 /// DecodedBkpBlock represents per-page data contained in a WAL record.
@@ -157,7 +116,6 @@ pub struct DecodedWALRecord {

    pub blocks: Vec<DecodedBkpBlock>,
    pub main_data_offset: usize,
-    pub origin_id: u16,
 }

 #[repr(C)]
@@ -615,7 +573,6 @@ pub struct XlXactParsedRecord {
    pub subxacts: Vec<TransactionId>,

    pub xnodes: Vec<RelFileNode>,
-    pub origin_lsn: Lsn,
 }

 impl XlXactParsedRecord {
@@ -694,11 +651,6 @@ impl XlXactParsedRecord {
            debug!("XLOG_XACT_COMMIT-XACT_XINFO_HAS_TWOPHASE xid {}", xid);
        }

-        let origin_lsn = if xinfo & pg_constants::XACT_XINFO_HAS_ORIGIN != 0 {
-            Lsn(buf.get_u64_le())
-        } else {
-            Lsn::INVALID
-        };
        XlXactParsedRecord {
            xid,
            info,
@@ -708,7 +660,6 @@ impl XlXactParsedRecord {
            ts_id,
            subxacts,
            xnodes,
-            origin_lsn,
        }
    }
 }
@@ -859,36 +810,6 @@ impl XlRunningXacts {
    }
 }

-#[repr(C)]
-#[derive(Debug)]
-pub struct XlReploriginDrop {
-    pub node_id: RepOriginId,
-}
-
-impl XlReploriginDrop {
-    pub fn decode(buf: &mut Bytes) -> XlReploriginDrop {
-        XlReploriginDrop {
-            node_id: buf.get_u16_le(),
-        }
-    }
-}
-
-#[repr(C)]
-#[derive(Debug)]
-pub struct XlReploriginSet {
-    pub remote_lsn: Lsn,
-    pub node_id: RepOriginId,
-}
-
-impl XlReploriginSet {
-    pub fn decode(buf: &mut Bytes) -> XlReploriginSet {
-        XlReploriginSet {
-            remote_lsn: Lsn(buf.get_u64_le()),
-            node_id: buf.get_u16_le(),
-        }
-    }
-}
-
 /// Main routine to decode a WAL record and figure out which blocks are modified
 //
 // See xlogrecord.h for details
@@ -923,7 +844,6 @@ pub fn decode_wal_record(
    let mut rnode_dbnode: u32 = 0;
    let mut rnode_relnode: u32 = 0;
    let mut got_rnode = false;
-    let mut origin_id: u16 = 0;

    let mut buf = record.clone();

@@ -971,7 +891,7 @@ pub fn decode_wal_record(

            pg_constants::XLR_BLOCK_ID_ORIGIN => {
                // RepOriginId is uint16
-                origin_id = buf.get_u16_le();
+                buf.advance(2);
            }

            pg_constants::XLR_BLOCK_ID_TOPLEVEL_XID => {
@@ -1168,7 +1088,6 @@ pub fn decode_wal_record(
    decoded.xl_info = xlogrec.xl_info;
    decoded.xl_rmid = xlogrec.xl_rmid;
    decoded.record = record;
-    decoded.origin_id = origin_id;
    decoded.main_data_offset = main_data_offset;

    Ok(())
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -20,6 +20,7 @@

 /// Process lifecycle and abstracction for the IPC protocol.
 mod process;
+pub use process::Kind as ProcessKind;

 /// Code to apply [`NeonWalRecord`]s.
 pub(crate) mod apply_neon;
@@ -53,7 +54,7 @@ pub struct PostgresRedoManager {
    tenant_shard_id: TenantShardId,
    conf: &'static PageServerConf,
    last_redo_at: std::sync::Mutex<Option<Instant>>,
-    /// The current [`process::WalRedoProcess`] that is used by new redo requests.
+    /// The current [`process::Process`] that is used by new redo requests.
    /// We use [`heavier_once_cell`] for coalescing the spawning, but the redo
    /// requests don't use the [`heavier_once_cell::Guard`] to keep ahold of the
    /// their process object; we use [`Arc::clone`] for that.
@@ -65,7 +66,7 @@ pub struct PostgresRedoManager {
    /// still be using the old redo process. But, those other tasks will most likely
    /// encounter an error as well, and errors are an unexpected condition anyway.
    /// So, probably we could get rid of the `Arc` in the future.
-    redo_process: heavier_once_cell::OnceCell<Arc<process::WalRedoProcess>>,
+    redo_process: heavier_once_cell::OnceCell<Arc<process::Process>>,
 }

 ///
@@ -210,31 +211,26 @@ impl PostgresRedoManager {
        const MAX_RETRY_ATTEMPTS: u32 = 1;
        let mut n_attempts = 0u32;
        loop {
-            let proc: Arc<process::WalRedoProcess> =
-                match self.redo_process.get_or_init_detached().await {
-                    Ok(guard) => Arc::clone(&guard),
-                    Err(permit) => {
-                        // don't hold poison_guard, the launch code can bail
-                        let start = Instant::now();
-                        let proc = Arc::new(
-                            process::WalRedoProcess::launch(
-                                self.conf,
-                                self.tenant_shard_id,
-                                pg_version,
-                            )
+            let proc: Arc<process::Process> = match self.redo_process.get_or_init_detached().await {
+                Ok(guard) => Arc::clone(&guard),
+                Err(permit) => {
+                    // don't hold poison_guard, the launch code can bail
+                    let start = Instant::now();
+                    let proc = Arc::new(
+                        process::Process::launch(self.conf, self.tenant_shard_id, pg_version)
                            .context("launch walredo process")?,
-                        );
-                        let duration = start.elapsed();
-                        WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
-                        info!(
-                            duration_ms = duration.as_millis(),
-                            pid = proc.id(),
-                            "launched walredo process"
-                        );
-                        self.redo_process.set(Arc::clone(&proc), permit);
-                        proc
-                    }
-                };
+                    );
+                    let duration = start.elapsed();
+                    WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
+                    info!(
+                        duration_ms = duration.as_millis(),
+                        pid = proc.id(),
+                        "launched walredo process"
+                    );
+                    self.redo_process.set(Arc::clone(&proc), permit);
+                    proc
+                }
+            };

            let started_at = std::time::Instant::now();

@@ -365,10 +361,10 @@ impl PostgresRedoManager {
        &self,
        key: Key,
        page: &mut BytesMut,
-        record_lsn: Lsn,
+        _record_lsn: Lsn,
        record: &NeonWalRecord,
    ) -> anyhow::Result<()> {
-        apply_neon::apply_in_neon(record, record_lsn, key, page)?;
+        apply_neon::apply_in_neon(record, key, page)?;

        Ok(())
    }
--- a/pageserver/src/walredo/apply_neon.rs
+++ b/pageserver/src/walredo/apply_neon.rs
@@ -14,7 +14,6 @@ use postgres_ffi::v14::nonrelfile_utils::{
 use postgres_ffi::BLCKSZ;
 use tracing::*;
 use utils::bin_ser::BeSer;
-use utils::lsn::Lsn;

 /// Can this request be served by neon redo functions
 /// or we need to pass it to wal-redo postgres process?
@@ -33,7 +32,6 @@ pub(crate) fn can_apply_in_neon(rec: &NeonWalRecord) -> bool {

 pub(crate) fn apply_in_neon(
    record: &NeonWalRecord,
-    lsn: Lsn,
    key: Key,
    page: &mut BytesMut,
 ) -> Result<(), anyhow::Error> {
@@ -69,7 +67,6 @@ pub(crate) fn apply_in_neon(
                let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..];

                map[map_byte as usize] &= !(flags << map_offset);
-                postgres_ffi::page_set_lsn(page, lsn);
            }

            // Repeat for 'old_heap_blkno', if any
@@ -83,7 +80,6 @@ pub(crate) fn apply_in_neon(
                let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..];

                map[map_byte as usize] &= !(flags << map_offset);
-                postgres_ffi::page_set_lsn(page, lsn);
            }
        }
        // Non-relational WAL records are handled here, with custom code that has the
@@ -244,20 +240,6 @@ pub(crate) fn apply_in_neon(
            let mut writer = page.writer();
            dir.ser_into(&mut writer)?;
        }
-        #[cfg(test)]
-        NeonWalRecord::Test {
-            append,
-            clear,
-            will_init,
-        } => {
-            if *will_init {
-                assert!(*clear, "init record must be clear to ensure correctness");
-            }
-            if *clear {
-                page.clear();
-            }
-            page.put_slice(append.as_bytes());
-        }
    }
    Ok(())
 }
@@ -303,7 +285,7 @@ mod test {
        let mut page = BytesMut::from_iter(base_image);

        for record in deltas {
-            apply_in_neon(&record, Lsn(8), file_path, &mut page)?;
+            apply_in_neon(&record, file_path, &mut page)?;
        }

        let reconstructed = AuxFilesDirectory::des(&page)?;
--- a/pageserver/src/walredo/process.rs
+++ b/pageserver/src/walredo/process.rs
@@ -1,187 +1,64 @@
+/// Layer of indirection previously used to support multiple implementations.
+/// Subject to removal: <https://github.com/neondatabase/neon/issues/7753>
+use std::time::Duration;
+
+use bytes::Bytes;
+use pageserver_api::{reltag::RelTag, shard::TenantShardId};
+use tracing::warn;
+use utils::lsn::Lsn;
+
+use crate::{config::PageServerConf, walrecord::NeonWalRecord};
+
 mod no_leak_child;
 /// The IPC protocol that pageserver and walredo process speak over their shared pipe.
 mod protocol;

-use self::no_leak_child::NoLeakChild;
-use crate::{
-    config::PageServerConf,
-    metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
-    span::debug_assert_current_span_has_tenant_id,
-    walrecord::NeonWalRecord,
-};
-use anyhow::Context;
-use bytes::Bytes;
-use pageserver_api::{reltag::RelTag, shard::TenantShardId};
-use postgres_ffi::BLCKSZ;
-#[cfg(feature = "testing")]
-use std::sync::atomic::AtomicUsize;
-use std::{
-    collections::VecDeque,
-    process::{Command, Stdio},
-    time::Duration,
-};
-use tokio::io::{AsyncReadExt, AsyncWriteExt};
-use tracing::{debug, error, instrument, Instrument};
-use utils::{lsn::Lsn, poison::Poison};
-
-pub struct WalRedoProcess {
-    #[allow(dead_code)]
-    conf: &'static PageServerConf,
-    #[cfg(feature = "testing")]
-    tenant_shard_id: TenantShardId,
-    // Some() on construction, only becomes None on Drop.
-    child: Option<NoLeakChild>,
-    stdout: tokio::sync::Mutex<Poison<ProcessOutput>>,
-    stdin: tokio::sync::Mutex<Poison<ProcessInput>>,
-    /// Counter to separate same sized walredo inputs failing at the same millisecond.
-    #[cfg(feature = "testing")]
-    dump_sequence: AtomicUsize,
+mod process_impl {
+    pub(super) mod process_async;
 }

-struct ProcessInput {
-    stdin: tokio::process::ChildStdin,
-    n_requests: usize,
+#[derive(
+    Clone,
+    Copy,
+    Debug,
+    PartialEq,
+    Eq,
+    strum_macros::EnumString,
+    strum_macros::Display,
+    strum_macros::IntoStaticStr,
+    serde_with::DeserializeFromStr,
+    serde_with::SerializeDisplay,
+)]
+#[strum(serialize_all = "kebab-case")]
+#[repr(u8)]
+pub enum Kind {
+    Sync,
+    Async,
 }

-struct ProcessOutput {
-    stdout: tokio::process::ChildStdout,
-    pending_responses: VecDeque<Option<Bytes>>,
-    n_processed_responses: usize,
-}
+pub(crate) struct Process(process_impl::process_async::WalRedoProcess);

-impl WalRedoProcess {
-    //
-    // Start postgres binary in special WAL redo mode.
-    //
-    #[instrument(skip_all,fields(pg_version=pg_version))]
-    pub(crate) fn launch(
+impl Process {
+    #[inline(always)]
+    pub fn launch(
        conf: &'static PageServerConf,
        tenant_shard_id: TenantShardId,
        pg_version: u32,
    ) -> anyhow::Result<Self> {
-        crate::span::debug_assert_current_span_has_tenant_id();
-
-        let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
-        let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;
-
-        use no_leak_child::NoLeakChildCommandExt;
-        // Start postgres itself
-        let child = Command::new(pg_bin_dir_path.join("postgres"))
-            // the first arg must be --wal-redo so the child process enters into walredo mode
-            .arg("--wal-redo")
-            // the child doesn't process this arg, but, having it in the argv helps indentify the
-            // walredo process for a particular tenant when debugging a pagserver
-            .args(["--tenant-shard-id", &format!("{tenant_shard_id}")])
-            .stdin(Stdio::piped())
-            .stderr(Stdio::piped())
-            .stdout(Stdio::piped())
-            .env_clear()
-            .env("LD_LIBRARY_PATH", &pg_lib_dir_path)
-            .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
-            // NB: The redo process is not trusted after we sent it the first
-            // walredo work. Before that, it is trusted. Specifically, we trust
-            // it to
-            // 1. close all file descriptors except stdin, stdout, stderr because
-            //    pageserver might not be 100% diligent in setting FD_CLOEXEC on all
-            //    the files it opens, and
-            // 2. to use seccomp to sandbox itself before processing the first
-            //    walredo request.
-            .spawn_no_leak_child(tenant_shard_id)
-            .context("spawn process")?;
-        WAL_REDO_PROCESS_COUNTERS.started.inc();
-        let mut child = scopeguard::guard(child, |child| {
-            error!("killing wal-redo-postgres process due to a problem during launch");
-            child.kill_and_wait(WalRedoKillCause::Startup);
-        });
-
-        let stdin = child.stdin.take().unwrap();
-        let stdout = child.stdout.take().unwrap();
-        let stderr = child.stderr.take().unwrap();
-        let stderr = tokio::process::ChildStderr::from_std(stderr)
-            .context("convert to tokio::ChildStderr")?;
-        let stdin =
-            tokio::process::ChildStdin::from_std(stdin).context("convert to tokio::ChildStdin")?;
-        let stdout = tokio::process::ChildStdout::from_std(stdout)
-            .context("convert to tokio::ChildStdout")?;
-
-        // all fallible operations post-spawn are complete, so get rid of the guard
-        let child = scopeguard::ScopeGuard::into_inner(child);
-
-        tokio::spawn(
-            async move {
-                scopeguard::defer! {
-                    debug!("wal-redo-postgres stderr_logger_task finished");
-                    crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
-                }
-                debug!("wal-redo-postgres stderr_logger_task started");
-                crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
-
-                use tokio::io::AsyncBufReadExt;
-                let mut stderr_lines = tokio::io::BufReader::new(stderr);
-                let mut buf = Vec::new();
-                let res = loop {
-                    buf.clear();
-                    // TODO we don't trust the process to cap its stderr length.
-                    // Currently it can do unbounded Vec allocation.
-                    match stderr_lines.read_until(b'\n', &mut buf).await {
-                        Ok(0) => break Ok(()), // eof
-                        Ok(num_bytes) => {
-                            let output = String::from_utf8_lossy(&buf[..num_bytes]);
-                            error!(%output, "received output");
-                        }
-                        Err(e) => {
-                            break Err(e);
-                        }
-                    }
-                };
-                match res {
-                    Ok(()) => (),
-                    Err(e) => {
-                        error!(error=?e, "failed to read from walredo stderr");
-                    }
-                }
-            }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
-        );
-
-        Ok(Self {
+        if conf.walredo_process_kind != Kind::Async {
+            warn!(
+                configured = %conf.walredo_process_kind,
+                "the walredo_process_kind setting has been turned into a no-op, using async implementation"
+            );
+        }
+        Ok(Self(process_impl::process_async::WalRedoProcess::launch(
            conf,
-            #[cfg(feature = "testing")]
            tenant_shard_id,
-            child: Some(child),
-            stdin: tokio::sync::Mutex::new(Poison::new(
-                "stdin",
-                ProcessInput {
-                    stdin,
-                    n_requests: 0,
-                },
-            )),
-            stdout: tokio::sync::Mutex::new(Poison::new(
-                "stdout",
-                ProcessOutput {
-                    stdout,
-                    pending_responses: VecDeque::new(),
-                    n_processed_responses: 0,
-                },
-            )),
-            #[cfg(feature = "testing")]
-            dump_sequence: AtomicUsize::default(),
-        })
+            pg_version,
+        )?))
    }

-    pub(crate) fn id(&self) -> u32 {
-        self.child
-            .as_ref()
-            .expect("must not call this during Drop")
-            .id()
-    }
-
-    /// Apply given WAL records ('records') over an old page image. Returns
-    /// new page image.
-    ///
-    /// # Cancel-Safety
-    ///
-    /// Cancellation safe.
-    #[instrument(skip_all, fields(pid=%self.id()))]
+    #[inline(always)]
    pub(crate) async fn apply_wal_records(
        &self,
        rel: RelTag,
@@ -190,193 +67,12 @@ impl WalRedoProcess {
        records: &[(Lsn, NeonWalRecord)],
        wal_redo_timeout: Duration,
    ) -> anyhow::Result<Bytes> {
-        debug_assert_current_span_has_tenant_id();
-
-        let tag = protocol::BufferTag { rel, blknum };
-
-        // Serialize all the messages to send the WAL redo process first.
-        //
-        // This could be problematic if there are millions of records to replay,
-        // but in practice the number of records is usually so small that it doesn't
-        // matter, and it's better to keep this code simple.
-        //
-        // Most requests start with a before-image with BLCKSZ bytes, followed by
-        // by some other WAL records. Start with a buffer that can hold that
-        // comfortably.
-        let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
-        protocol::build_begin_redo_for_block_msg(tag, &mut writebuf);
-        if let Some(img) = base_img {
-            protocol::build_push_page_msg(tag, img, &mut writebuf);
-        }
-        for (lsn, rec) in records.iter() {
-            if let NeonWalRecord::Postgres {
-                will_init: _,
-                rec: postgres_rec,
-            } = rec
-            {
-                protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf);
-            } else {
-                anyhow::bail!("tried to pass neon wal record to postgres WAL redo");
-            }
-        }
-        protocol::build_get_page_msg(tag, &mut writebuf);
-        WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
-
-        let Ok(res) =
-            tokio::time::timeout(wal_redo_timeout, self.apply_wal_records0(&writebuf)).await
-        else {
-            anyhow::bail!("WAL redo timed out");
-        };
-
-        if res.is_err() {
-            // not all of these can be caused by this particular input, however these are so rare
-            // in tests so capture all.
-            self.record_and_log(&writebuf);
-        }
-
-        res
+        self.0
+            .apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout)
+            .await
    }

-    /// # Cancel-Safety
-    ///
-    /// When not polled to completion (e.g. because in `tokio::select!` another
-    /// branch becomes ready before this future), concurrent and subsequent
-    /// calls may fail due to [`utils::poison::Poison::check_and_arm`] calls.
-    /// Dispose of this process instance and create a new one.
-    async fn apply_wal_records0(&self, writebuf: &[u8]) -> anyhow::Result<Bytes> {
-        let request_no = {
-            let mut lock_guard = self.stdin.lock().await;
-            let mut poison_guard = lock_guard.check_and_arm()?;
-            let input = poison_guard.data_mut();
-            input
-                .stdin
-                .write_all(writebuf)
-                .await
-                .context("write to walredo stdin")?;
-            let request_no = input.n_requests;
-            input.n_requests += 1;
-            poison_guard.disarm();
-            request_no
-        };
-
-        // To improve walredo performance we separate sending requests and receiving
-        // responses. Them are protected by different mutexes (output and input).
-        // If thread T1, T2, T3 send requests D1, D2, D3 to walredo process
-        // then there is not warranty that T1 will first granted output mutex lock.
-        // To address this issue we maintain number of sent requests, number of processed
-        // responses and ring buffer with pending responses. After sending response
-        // (under input mutex), threads remembers request number. Then it releases
-        // input mutex, locks output mutex and fetch in ring buffer all responses until
-        // its stored request number. The it takes correspondent element from
-        // pending responses ring buffer and truncate all empty elements from the front,
-        // advancing processed responses number.
-
-        let mut lock_guard = self.stdout.lock().await;
-        let mut poison_guard = lock_guard.check_and_arm()?;
-        let output = poison_guard.data_mut();
-        let n_processed_responses = output.n_processed_responses;
-        while n_processed_responses + output.pending_responses.len() <= request_no {
-            // We expect the WAL redo process to respond with an 8k page image. We read it
-            // into this buffer.
-            let mut resultbuf = vec![0; BLCKSZ.into()];
-            output
-                .stdout
-                .read_exact(&mut resultbuf)
-                .await
-                .context("read walredo stdout")?;
-            output
-                .pending_responses
-                .push_back(Some(Bytes::from(resultbuf)));
-        }
-        // Replace our request's response with None in `pending_responses`.
-        // Then make space in the ring buffer by clearing out any seqence of contiguous
-        // `None`'s from the front of `pending_responses`.
-        // NB: We can't pop_front() because other requests' responses because another
-        // requester might have grabbed the output mutex before us:
-        // T1: grab input mutex
-        // T1: send request_no 23
-        // T1: release input mutex
-        // T2: grab input mutex
-        // T2: send request_no 24
-        // T2: release input mutex
-        // T2: grab output mutex
-        // T2: n_processed_responses + output.pending_responses.len() <= request_no
-        //            23                                0                   24
-        // T2: enters poll loop that reads stdout
-        // T2: put response for 23 into pending_responses
-        // T2: put response for 24 into pending_resposnes
-        // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back
-        // T2: takes its response_24
-        // pending_responses now looks like this: Front Some(response_23) None Back
-        // T2: does the while loop below
-        // pending_responses now looks like this: Front Some(response_23) None Back
-        // T2: releases output mutex
-        // T1: grabs output mutex
-        // T1: n_processed_responses + output.pending_responses.len() > request_no
-        //            23                                2                   23
-        // T1: skips poll loop that reads stdout
-        // T1: takes its response_23
-        // pending_responses now looks like this: Front None None Back
-        // T2: does the while loop below
-        // pending_responses now looks like this: Front Back
-        // n_processed_responses now has value 25
-        let res = output.pending_responses[request_no - n_processed_responses]
-            .take()
-            .expect("we own this request_no, nobody else is supposed to take it");
-        while let Some(front) = output.pending_responses.front() {
-            if front.is_none() {
-                output.pending_responses.pop_front();
-                output.n_processed_responses += 1;
-            } else {
-                break;
-            }
-        }
-        poison_guard.disarm();
-        Ok(res)
-    }
-
-    #[cfg(feature = "testing")]
-    fn record_and_log(&self, writebuf: &[u8]) {
-        use std::sync::atomic::Ordering;
-
-        let millis = std::time::SystemTime::now()
-            .duration_since(std::time::SystemTime::UNIX_EPOCH)
-            .unwrap()
-            .as_millis();
-
-        let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed);
-
-        // these files will be collected to an allure report
-        let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
-
-        let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);
-
-        use std::io::Write;
-        let res = std::fs::OpenOptions::new()
-            .write(true)
-            .create_new(true)
-            .read(true)
-            .open(path)
-            .and_then(|mut f| f.write_all(writebuf));
-
-        // trip up allowed_errors
-        if let Err(e) = res {
-            tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}");
-        } else {
-            tracing::error!(filename, "erroring walredo input saved");
-        }
-    }
-
-    #[cfg(not(feature = "testing"))]
-    fn record_and_log(&self, _: &[u8]) {}
-}
-
-impl Drop for WalRedoProcess {
-    fn drop(&mut self) {
-        self.child
-            .take()
-            .expect("we only do this once")
-            .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
-        // no way to wait for stderr_logger_task from Drop because that is async only
+    pub(crate) fn id(&self) -> u32 {
+        self.0.id()
    }
 }
--- a/pageserver/src/walredo/process/process_impl/process_async.rs
+++ b/pageserver/src/walredo/process/process_impl/process_async.rs
@@ -0,0 +1,374 @@
+use self::no_leak_child::NoLeakChild;
+use crate::{
+    config::PageServerConf,
+    metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
+    walrecord::NeonWalRecord,
+    walredo::process::{no_leak_child, protocol},
+};
+use anyhow::Context;
+use bytes::Bytes;
+use pageserver_api::{reltag::RelTag, shard::TenantShardId};
+use postgres_ffi::BLCKSZ;
+#[cfg(feature = "testing")]
+use std::sync::atomic::AtomicUsize;
+use std::{
+    collections::VecDeque,
+    process::{Command, Stdio},
+    time::Duration,
+};
+use tokio::io::{AsyncReadExt, AsyncWriteExt};
+use tracing::{debug, error, instrument, Instrument};
+use utils::{lsn::Lsn, poison::Poison};
+
+pub struct WalRedoProcess {
+    #[allow(dead_code)]
+    conf: &'static PageServerConf,
+    tenant_shard_id: TenantShardId,
+    // Some() on construction, only becomes None on Drop.
+    child: Option<NoLeakChild>,
+    stdout: tokio::sync::Mutex<Poison<ProcessOutput>>,
+    stdin: tokio::sync::Mutex<Poison<ProcessInput>>,
+    /// Counter to separate same sized walredo inputs failing at the same millisecond.
+    #[cfg(feature = "testing")]
+    dump_sequence: AtomicUsize,
+}
+
+struct ProcessInput {
+    stdin: tokio::process::ChildStdin,
+    n_requests: usize,
+}
+
+struct ProcessOutput {
+    stdout: tokio::process::ChildStdout,
+    pending_responses: VecDeque<Option<Bytes>>,
+    n_processed_responses: usize,
+}
+
+impl WalRedoProcess {
+    //
+    // Start postgres binary in special WAL redo mode.
+    //
+    #[instrument(skip_all,fields(pg_version=pg_version))]
+    pub(crate) fn launch(
+        conf: &'static PageServerConf,
+        tenant_shard_id: TenantShardId,
+        pg_version: u32,
+    ) -> anyhow::Result<Self> {
+        crate::span::debug_assert_current_span_has_tenant_id();
+
+        let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
+        let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;
+
+        use no_leak_child::NoLeakChildCommandExt;
+        // Start postgres itself
+        let child = Command::new(pg_bin_dir_path.join("postgres"))
+            // the first arg must be --wal-redo so the child process enters into walredo mode
+            .arg("--wal-redo")
+            // the child doesn't process this arg, but having it in the argv helps identify the
+            // walredo process for a particular tenant when debugging a pageserver
+            .args(["--tenant-shard-id", &format!("{tenant_shard_id}")])
+            .stdin(Stdio::piped())
+            .stderr(Stdio::piped())
+            .stdout(Stdio::piped())
+            .env_clear()
+            .env("LD_LIBRARY_PATH", &pg_lib_dir_path)
+            .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
+            // NB: The redo process is not trusted after we sent it the first
+            // walredo work. Before that, it is trusted. Specifically, we trust
+            // it to
+            // 1. close all file descriptors except stdin, stdout, stderr because
+            //    pageserver might not be 100% diligent in setting FD_CLOEXEC on all
+            //    the files it opens, and
+            // 2. to use seccomp to sandbox itself before processing the first
+            //    walredo request.
+            .spawn_no_leak_child(tenant_shard_id)
+            .context("spawn process")?;
+        WAL_REDO_PROCESS_COUNTERS.started.inc();
+        let mut child = scopeguard::guard(child, |child| {
+            error!("killing wal-redo-postgres process due to a problem during launch");
+            child.kill_and_wait(WalRedoKillCause::Startup);
+        });
+
+        let stdin = child.stdin.take().unwrap();
+        let stdout = child.stdout.take().unwrap();
+        let stderr = child.stderr.take().unwrap();
+        let stderr = tokio::process::ChildStderr::from_std(stderr)
+            .context("convert to tokio::ChildStderr")?;
+        let stdin =
+            tokio::process::ChildStdin::from_std(stdin).context("convert to tokio::ChildStdin")?;
+        let stdout = tokio::process::ChildStdout::from_std(stdout)
+            .context("convert to tokio::ChildStdout")?;
+
+        // all fallible operations post-spawn are complete, so get rid of the guard
+        let child = scopeguard::ScopeGuard::into_inner(child);
+
+        tokio::spawn(
+            async move {
+                scopeguard::defer! {
+                    debug!("wal-redo-postgres stderr_logger_task finished");
+                    crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
+                }
+                debug!("wal-redo-postgres stderr_logger_task started");
+                crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
+
+                use tokio::io::AsyncBufReadExt;
+                let mut stderr_lines = tokio::io::BufReader::new(stderr);
+                let mut buf = Vec::new();
+                let res = loop {
+                    buf.clear();
+                    // TODO we don't trust the process to cap its stderr length.
+                    // Currently it can do unbounded Vec allocation.
+                    match stderr_lines.read_until(b'\n', &mut buf).await {
+                        Ok(0) => break Ok(()), // eof
+                        Ok(num_bytes) => {
+                            let output = String::from_utf8_lossy(&buf[..num_bytes]);
+                            error!(%output, "received output");
+                        }
+                        Err(e) => {
+                            break Err(e);
+                        }
+                    }
+                };
+                match res {
+                    Ok(()) => (),
+                    Err(e) => {
+                        error!(error=?e, "failed to read from walredo stderr");
+                    }
+                }
+            }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
+        );
+
+        Ok(Self {
+            conf,
+            tenant_shard_id,
+            child: Some(child),
+            stdin: tokio::sync::Mutex::new(Poison::new(
+                "stdin",
+                ProcessInput {
+                    stdin,
+                    n_requests: 0,
+                },
+            )),
+            stdout: tokio::sync::Mutex::new(Poison::new(
+                "stdout",
+                ProcessOutput {
+                    stdout,
+                    pending_responses: VecDeque::new(),
+                    n_processed_responses: 0,
+                },
+            )),
+            #[cfg(feature = "testing")]
+            dump_sequence: AtomicUsize::default(),
+        })
+    }
+
+    pub(crate) fn id(&self) -> u32 {
+        self.child
+            .as_ref()
+            .expect("must not call this during Drop")
+            .id()
+    }
+
+    /// Apply given WAL records ('records') over an old page image. Returns
+    /// new page image.
+    ///
+    /// # Cancel-Safety
+    ///
+    /// Cancellation safe.
+    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
+    pub(crate) async fn apply_wal_records(
+        &self,
+        rel: RelTag,
+        blknum: u32,
+        base_img: &Option<Bytes>,
+        records: &[(Lsn, NeonWalRecord)],
+        wal_redo_timeout: Duration,
+    ) -> anyhow::Result<Bytes> {
+        let tag = protocol::BufferTag { rel, blknum };
+
+        // Serialize all the messages to send the WAL redo process first.
+        //
+        // This could be problematic if there are millions of records to replay,
+        // but in practice the number of records is usually so small that it doesn't
+        // matter, and it's better to keep this code simple.
+        //
+        // Most requests start with a before-image with BLCKSZ bytes, followed by
+        // some other WAL records. Start with a buffer that can hold that
+        // comfortably.
+        let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
+        protocol::build_begin_redo_for_block_msg(tag, &mut writebuf);
+        if let Some(img) = base_img {
+            protocol::build_push_page_msg(tag, img, &mut writebuf);
+        }
+        for (lsn, rec) in records.iter() {
+            if let NeonWalRecord::Postgres {
+                will_init: _,
+                rec: postgres_rec,
+            } = rec
+            {
+                protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf);
+            } else {
+                anyhow::bail!("tried to pass neon wal record to postgres WAL redo");
+            }
+        }
+        protocol::build_get_page_msg(tag, &mut writebuf);
+        WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
+
+        let Ok(res) =
+            tokio::time::timeout(wal_redo_timeout, self.apply_wal_records0(&writebuf)).await
+        else {
+            anyhow::bail!("WAL redo timed out");
+        };
+
+        if res.is_err() {
+            // Not all of these errors are necessarily caused by this particular input, but they
+            // are rare enough in tests that we capture all of them.
+            self.record_and_log(&writebuf);
+        }
+
+        res
+    }
+
+    /// # Cancel-Safety
+    ///
+    /// When not polled to completion (e.g. because in `tokio::select!` another
+    /// branch becomes ready before this future), concurrent and subsequent
+    /// calls may fail due to [`utils::poison::Poison::check_and_arm`] calls.
+    /// Dispose of this process instance and create a new one.
+    async fn apply_wal_records0(&self, writebuf: &[u8]) -> anyhow::Result<Bytes> {
+        let request_no = {
+            let mut lock_guard = self.stdin.lock().await;
+            let mut poison_guard = lock_guard.check_and_arm()?;
+            let input = poison_guard.data_mut();
+            input
+                .stdin
+                .write_all(writebuf)
+                .await
+                .context("write to walredo stdin")?;
+            let request_no = input.n_requests;
+            input.n_requests += 1;
+            poison_guard.disarm();
+            request_no
+        };
+
+        // To improve walredo performance we separate sending requests and receiving
+        // responses. They are protected by different mutexes (input and output).
+        // If threads T1, T2, T3 send requests D1, D2, D3 to the walredo process,
+        // there is no guarantee that T1 will be granted the output mutex lock first.
+        // To address this issue we maintain the number of sent requests, the number of
+        // processed responses, and a ring buffer with pending responses. After sending
+        // its request (under the input mutex), a thread remembers its request number.
+        // Then it releases the input mutex, locks the output mutex and fetches responses
+        // into the ring buffer up to its stored request number. Then it takes the
+        // corresponding element from the pending responses ring buffer and truncates all
+        // empty elements from the front, advancing the processed responses number.
+
+        let mut lock_guard = self.stdout.lock().await;
+        let mut poison_guard = lock_guard.check_and_arm()?;
+        let output = poison_guard.data_mut();
+        let n_processed_responses = output.n_processed_responses;
+        while n_processed_responses + output.pending_responses.len() <= request_no {
+            // We expect the WAL redo process to respond with an 8k page image. We read it
+            // into this buffer.
+            let mut resultbuf = vec![0; BLCKSZ.into()];
+            output
+                .stdout
+                .read_exact(&mut resultbuf)
+                .await
+                .context("read walredo stdout")?;
+            output
+                .pending_responses
+                .push_back(Some(Bytes::from(resultbuf)));
+        }
+        // Replace our request's response with None in `pending_responses`.
+        // Then make space in the ring buffer by clearing out any sequence of contiguous
+        // `None`'s from the front of `pending_responses`.
+        // NB: We can't pop_front() because the front may hold other requests' responses: another
+        // requester might have grabbed the output mutex before us:
+        // T1: grab input mutex
+        // T1: send request_no 23
+        // T1: release input mutex
+        // T2: grab input mutex
+        // T2: send request_no 24
+        // T2: release input mutex
+        // T2: grab output mutex
+        // T2: n_processed_responses + output.pending_responses.len() <= request_no
+        //            23                                0                   24
+        // T2: enters poll loop that reads stdout
+        // T2: put response for 23 into pending_responses
+        // T2: put response for 24 into pending_responses
+        // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back
+        // T2: takes its response_24
+        // pending_responses now looks like this: Front Some(response_23) None Back
+        // T2: does the while loop below
+        // pending_responses now looks like this: Front Some(response_23) None Back
+        // T2: releases output mutex
+        // T1: grabs output mutex
+        // T1: n_processed_responses + output.pending_responses.len() > request_no
+        //            23                                2                   23
+        // T1: skips poll loop that reads stdout
+        // T1: takes its response_23
+        // pending_responses now looks like this: Front None None Back
+        // T1: does the while loop below
+        // pending_responses now looks like this: Front Back
+        // n_processed_responses now has value 25
+        let res = output.pending_responses[request_no - n_processed_responses]
+            .take()
+            .expect("we own this request_no, nobody else is supposed to take it");
+        while let Some(front) = output.pending_responses.front() {
+            if front.is_none() {
+                output.pending_responses.pop_front();
+                output.n_processed_responses += 1;
+            } else {
+                break;
+            }
+        }
+        poison_guard.disarm();
+        Ok(res)
+    }
+
+    #[cfg(feature = "testing")]
+    fn record_and_log(&self, writebuf: &[u8]) {
+        use std::sync::atomic::Ordering;
+
+        let millis = std::time::SystemTime::now()
+            .duration_since(std::time::SystemTime::UNIX_EPOCH)
+            .unwrap()
+            .as_millis();
+
+        let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed);
+
+        // these files will be collected to an allure report
+        let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
+
+        let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);
+
+        use std::io::Write;
+        let res = std::fs::OpenOptions::new()
+            .write(true)
+            .create_new(true)
+            .read(true)
+            .open(path)
+            .and_then(|mut f| f.write_all(writebuf));
+
+        // trip up allowed_errors
+        if let Err(e) = res {
+            tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}");
+        } else {
+            tracing::error!(filename, "erroring walredo input saved");
+        }
+    }
+
+    #[cfg(not(feature = "testing"))]
+    fn record_and_log(&self, _: &[u8]) {}
+}
+
+impl Drop for WalRedoProcess {
+    fn drop(&mut self) {
+        self.child
+            .take()
+            .expect("we only do this once")
+            .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
+        // no way to wait for stderr_logger_task from Drop because that is async only
+    }
+}
--- a/patches/pg_anon.patch
+++ b/patches/pg_anon.patch
@@ -1,223 +0,0 @@
-commit 7dd414ee75f2875cffb1d6ba474df1f135a6fc6f
-Author: Alexey Masterov <alexeymasterov@neon.tech>
-Date:   Fri May 31 06:34:26 2024 +0000
-
-    These alternative expected files were added to consider the neon features
-
-diff --git a/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out b/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out
-new file mode 100644
-index 0000000..2539cfd
--- /dev/null
-+++ b/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out
-@@ -0,0 +1,101 @@
-+BEGIN;
-+CREATE EXTENSION anon CASCADE;
-+NOTICE:  installing required extension "pgcrypto"
-+SELECT anon.init();
-+ init 
-+------
-+ t
-+(1 row)
-+
-+CREATE ROLE mallory_the_masked_user;
-+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS 'MASKED';
-+CREATE TABLE t1(i INT);
-+ALTER TABLE t1 ADD COLUMN t TEXT;
-+SECURITY LABEL FOR anon ON COLUMN t1.t
-+IS 'MASKED WITH VALUE NULL';
-+INSERT INTO t1 VALUES (1,'test');
-+--
-+-- We're checking the owner's permissions
-+--
-+-- see
-+-- https://postgresql-anonymizer.readthedocs.io/en/latest/SECURITY/#permissions
-+--
-+SET ROLE mallory_the_masked_user;
-+SELECT anon.pseudo_first_name(0) IS NOT NULL;
-+ ?column? 
-+----------
-+ t
-+(1 row)
-+
-+-- SHOULD FAIL
-+DO $$
-+BEGIN
-+  PERFORM anon.init();
-+  EXCEPTION WHEN insufficient_privilege
-+  THEN RAISE NOTICE 'insufficient_privilege';
-+END$$;
-+NOTICE:  insufficient_privilege
-+-- SHOULD FAIL
-+DO $$
-+BEGIN
-+  PERFORM anon.anonymize_table('t1');
-+  EXCEPTION WHEN insufficient_privilege
-+  THEN RAISE NOTICE 'insufficient_privilege';
-+END$$;
-+NOTICE:  insufficient_privilege
-+-- SHOULD FAIL
-+SAVEPOINT fail_start_engine;
-+SELECT anon.start_dynamic_masking();
-+ERROR:  Only supersusers can start the dynamic masking engine.
-+CONTEXT:  PL/pgSQL function anon.start_dynamic_masking(boolean) line 18 at RAISE
-+ROLLBACK TO fail_start_engine;
-+RESET ROLE;
-+SELECT anon.start_dynamic_masking();
-+ start_dynamic_masking 
-+-----------------------
-+ t
-+(1 row)
-+
-+SET ROLE mallory_the_masked_user;
-+SELECT * FROM mask.t1;
-+ i | t 
-+---+---
-+ 1 | 
-+(1 row)
-+
-+-- SHOULD FAIL
-+DO $$
-+BEGIN
-+  SELECT * FROM public.t1;
-+  EXCEPTION WHEN insufficient_privilege
-+  THEN RAISE NOTICE 'insufficient_privilege';
-+END$$;
-+NOTICE:  insufficient_privilege
-+-- SHOULD FAIL
-+SAVEPOINT fail_stop_engine;
-+SELECT anon.stop_dynamic_masking();
-+ERROR:  Only supersusers can stop the dynamic masking engine.
-+CONTEXT:  PL/pgSQL function anon.stop_dynamic_masking() line 18 at RAISE
-+ROLLBACK TO fail_stop_engine;
-+RESET ROLE;
-+SELECT anon.stop_dynamic_masking();
-+NOTICE:  The previous priviledges of 'mallory_the_masked_user' are not restored. You need to grant them manually.
-+ stop_dynamic_masking 
-+----------------------
-+ t
-+(1 row)
-+
-+SET ROLE mallory_the_masked_user;
-+SELECT COUNT(*)=1 FROM anon.pg_masking_rules;
-+ ?column? 
-+----------
-+ t
-+(1 row)
-+
-+-- SHOULD FAIL
-+SAVEPOINT fail_seclabel_on_role;
-+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS NULL;
-+ERROR:  permission denied
-+DETAIL:  The current user must have the CREATEROLE attribute.
-+ROLLBACK TO fail_seclabel_on_role;
-+ROLLBACK;
-diff --git a/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out b/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out
-new file mode 100644
-index 0000000..8b090fe
--- /dev/null
-+++ b/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out
-@@ -0,0 +1,104 @@
-+BEGIN;
-+CREATE EXTENSION anon CASCADE;
-+NOTICE:  installing required extension "pgcrypto"
-+SELECT anon.init();
-+ init 
-+------
-+ t
-+(1 row)
-+
-+CREATE ROLE oscar_the_owner;
-+ALTER DATABASE :DBNAME OWNER TO oscar_the_owner;
-+CREATE ROLE mallory_the_masked_user;
-+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS 'MASKED';
-+--
-+-- We're checking the owner's permissions
-+--
-+-- see
-+-- https://postgresql-anonymizer.readthedocs.io/en/latest/SECURITY/#permissions
-+--
-+SET ROLE oscar_the_owner;
-+SELECT anon.pseudo_first_name(0) IS NOT NULL;
-+ ?column? 
-+----------
-+ t
-+(1 row)
-+
-+-- SHOULD FAIL
-+DO $$
-+BEGIN
-+  PERFORM anon.init();
-+  EXCEPTION WHEN insufficient_privilege
-+  THEN RAISE NOTICE 'insufficient_privilege';
-+END$$;
-+NOTICE:  insufficient_privilege
-+CREATE TABLE t1(i INT);
-+ALTER TABLE t1 ADD COLUMN t TEXT;
-+SECURITY LABEL FOR anon ON COLUMN t1.t
-+IS 'MASKED WITH VALUE NULL';
-+INSERT INTO t1 VALUES (1,'test');
-+SELECT anon.anonymize_table('t1');
-+ anonymize_table 
-+-----------------
-+ t
-+(1 row)
-+
-+SELECT * FROM t1;
-+ i | t 
-+---+---
-+ 1 | 
-+(1 row)
-+
-+UPDATE t1 SET t='test' WHERE i=1;
-+-- SHOULD FAIL
-+SAVEPOINT fail_start_engine;
-+SELECT anon.start_dynamic_masking();
-+ start_dynamic_masking 
-+-----------------------
-+ t
-+(1 row)
-+
-+ROLLBACK TO fail_start_engine;
-+RESET ROLE;
-+SELECT anon.start_dynamic_masking();
-+ start_dynamic_masking 
-+-----------------------
-+ t
-+(1 row)
-+
-+SET ROLE oscar_the_owner;
-+SELECT * FROM t1;
-+ i |  t   
-+---+------
-+ 1 | test
-+(1 row)
-+
-+--SELECT * FROM mask.t1;
-+-- SHOULD FAIL
-+SAVEPOINT fail_stop_engine;
-+SELECT anon.stop_dynamic_masking();
-+ERROR:  permission denied for schema mask
-+CONTEXT:  SQL statement "DROP VIEW mask.t1;"
-+PL/pgSQL function anon.mask_drop_view(oid) line 3 at EXECUTE
-+SQL statement "SELECT anon.mask_drop_view(oid)
-+  FROM pg_catalog.pg_class
-+  WHERE relnamespace=quote_ident(pg_catalog.current_setting('anon.sourceschema'))::REGNAMESPACE
-+  AND relkind IN ('r','p','f')"
-+PL/pgSQL function anon.stop_dynamic_masking() line 22 at PERFORM
-+ROLLBACK TO fail_stop_engine;
-+RESET ROLE;
-+SELECT anon.stop_dynamic_masking();
-+NOTICE:  The previous priviledges of 'mallory_the_masked_user' are not restored. You need to grant them manually.
-+ stop_dynamic_masking 
-+----------------------
-+ t
-+(1 row)
-+
-+SET ROLE oscar_the_owner;
-+-- SHOULD FAIL
-+SAVEPOINT fail_seclabel_on_role;
-+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS NULL;
-+ERROR:  permission denied
-+DETAIL:  The current user must have the CREATEROLE attribute.
-+ROLLBACK TO fail_seclabel_on_role;
-+ROLLBACK;
--- a/patches/pg_cron.patch
+++ b/patches/pg_cron.patch
@@ -1,19 +0,0 @@
-commit b3ea51ee158f113f2f82d0b97c12c54343c9a695 (HEAD -> master)
-Author: Alexey Masterov <alexeymasterov@neon.tech>
-Date:   Fri Jun 7 19:23:42 2024 +0000
-
-    Disable REGRESS_OPTIONS causing initdb
-
-diff --git a/ext-src/pg_cron-src/Makefile b/ext-src/pg_cron-src/Makefile
-index 053314c..fbd5fb5 100644
--- a/ext-src/pg_cron-src/Makefile
-+++ b/ext-src/pg_cron-src/Makefile
-@@ -5,7 +5,7 @@ EXTENSION = pg_cron
- DATA_built = $(EXTENSION)--1.0.sql
- DATA = $(wildcard $(EXTENSION)--*--*.sql)
- 
-REGRESS_OPTS =--temp-config=./pg_cron.conf --temp-instance=./tmp_check
-+#REGRESS_OPTS =--temp-config=./pg_cron.conf --temp-instance=./tmp_check
- REGRESS = pg_cron-test 
- 
- # compilation configuration
--- a/patches/pg_hintplan.patch
+++ b/patches/pg_hintplan.patch
@@ -1,39 +0,0 @@
-commit f7925d4d1406c0f0229e3c691c94b69e381899b1 (HEAD -> master)
-Author: Alexey Masterov <alexeymasterov@neon.tech>
-Date:   Thu Jun 6 08:02:42 2024 +0000
-
-    Patch expected files to consider Neon's log messages
-
-diff --git a/ext-src/pg_hint_plan-src/expected/ut-A.out b/ext-src/pg_hint_plan-src/expected/ut-A.out
-index da723b8..f8d0102 100644
--- a/ext-src/pg_hint_plan-src/expected/ut-A.out
-+++ b/ext-src/pg_hint_plan-src/expected/ut-A.out
-@@ -9,13 +9,16 @@ SET search_path TO public;
- ----
- -- No.A-1-1-3
- CREATE EXTENSION pg_hint_plan;
-+LOG:  Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan
- -- No.A-1-2-3
- DROP EXTENSION pg_hint_plan;
- -- No.A-1-1-4
- CREATE SCHEMA other_schema;
- CREATE EXTENSION pg_hint_plan SCHEMA other_schema;
-+LOG:  Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan
- ERROR:  extension "pg_hint_plan" must be installed in schema "hint_plan"
- CREATE EXTENSION pg_hint_plan;
-+LOG:  Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan
- DROP SCHEMA other_schema;
- ----
- ---- No. A-5-1 comment pattern
-diff --git a/ext-src/pg_hint_plan-src/expected/ut-fdw.out b/ext-src/pg_hint_plan-src/expected/ut-fdw.out
-index d372459..6282afe 100644
--- a/ext-src/pg_hint_plan-src/expected/ut-fdw.out
-+++ b/ext-src/pg_hint_plan-src/expected/ut-fdw.out
-@@ -7,6 +7,7 @@ SET pg_hint_plan.debug_print TO on;
- SET client_min_messages TO LOG;
- SET pg_hint_plan.enable_hint TO on;
- CREATE EXTENSION file_fdw;
-+LOG:  Sending request to compute_ctl: http://localhost:3080/extension_server/file_fdw
- CREATE SERVER file_server FOREIGN DATA WRAPPER file_fdw;
- CREATE USER MAPPING FOR PUBLIC SERVER file_server;
- CREATE FOREIGN TABLE ft1 (id int, val int) SERVER file_server OPTIONS (format 'csv', filename :'filename');
--- a/patches/pgvector.patch
+++ b/patches/pgvector.patch
@@ -1,8 +1,19 @@
+From 0b0194a57bd0f3598bd57dbedd0df3932330169d Mon Sep 17 00:00:00 2001
+From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
+Date: Fri, 2 Feb 2024 22:26:45 +0200
+Subject: [PATCH 1/1] Make v0.6.0 work with Neon
+
+Now that the WAL-logging happens as a separate step at the end of the
+build, we need a few neon-specific hints to make it work.
+---
+ src/hnswbuild.c | 36 ++++++++++++++++++++++++++++++++++++
+ 1 file changed, 36 insertions(+)
+
 diff --git a/src/hnswbuild.c b/src/hnswbuild.c
-index dcfb2bd..d5189ee 100644
+index 680789b..ec54dea 100644
 --- a/src/hnswbuild.c
 +++ b/src/hnswbuild.c
-@@ -860,9 +860,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc)
+@@ -840,9 +840,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc)
 
 	hnswarea = shm_toc_lookup(toc, PARALLEL_KEY_HNSW_AREA, false);
 
@@ -20,7 +31,7 @@ index dcfb2bd..d5189ee 100644
 	/* Close relations within worker */
 	index_close(indexRel, indexLockmode);
 	table_close(heapRel, heapLockmode);
-@@ -1117,12 +1125,38 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo,
+@@ -1089,13 +1097,41 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo,
 	SeedRandom(42);
 #endif
 
@@ -32,13 +43,14 @@ index dcfb2bd..d5189ee 100644
 
 	BuildGraph(buildstate, forkNum);
 
-	if (RelationNeedsWAL(index) || forkNum == INIT_FORKNUM)
 +#ifdef NEON_SMGR
 +	smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index));
 +#endif
 +
-+	if (RelationNeedsWAL(index) || forkNum == INIT_FORKNUM) {
- 		log_newpage_range(index, forkNum, 0, RelationGetNumberOfBlocksInFork(index, forkNum), true);
+ 	if (RelationNeedsWAL(index))
+	{
+ 		log_newpage_range(index, forkNum, 0, RelationGetNumberOfBlocks(index), true);
+ 
 +#ifdef NEON_SMGR
 +		{
 +#if PG_VERSION_NUM >= 160000
@@ -48,7 +60,7 @@ index dcfb2bd..d5189ee 100644
 +#endif
 +
 +			SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator,
-+									   MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index));
+										   MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index));
 +			SetLastWrittenLSNForRelation(XactLastRecEnd, rlocator, MAIN_FORKNUM);
 +		}
 +#endif
@@ -57,6 +69,10 @@ index dcfb2bd..d5189ee 100644
 +#ifdef NEON_SMGR
 +	smgr_end_unlogged_build(RelationGetSmgr(index));
 +#endif
- 
+
 	FreeBuildState(buildstate);
 }
+ 
+-- 
+2.39.2
+
--- a/pgxn/.dir-locals.el
+++ b/pgxn/.dir-locals.el
@@ -1,19 +0,0 @@
-;; see also src/tools/editors/emacs.samples for more complete settings
-
-((c-mode . ((c-basic-offset . 4)
-            (c-file-style . "bsd")
-            (fill-column . 78)
-            (indent-tabs-mode . t)
-            (tab-width . 4)))
- (nxml-mode . ((fill-column . 78)
-               (indent-tabs-mode . nil)))
- (perl-mode . ((perl-indent-level . 4)
-               (perl-continued-statement-offset . 2)
-               (perl-continued-brace-offset . -2)
-               (perl-brace-offset . 0)
-               (perl-brace-imaginary-offset . 0)
-               (perl-label-offset . -2)
-               (indent-tabs-mode . t)
-               (tab-width . 4)))
- (sgml-mode . ((fill-column . 78)
-               (indent-tabs-mode . nil))))
--- a/pgxn/.editorconfig
+++ b/pgxn/.editorconfig
@@ -1,14 +0,0 @@
-root = true
-
-[*.{c,h,l,y,pl,pm}]
-indent_style = tab
-indent_size = tab
-tab_width = 4
-
-[*.{sgml,xml}]
-indent_style = space
-indent_size = 1
-
-[*.xsl]
-indent_style = space
-indent_size = 2
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -19,7 +19,6 @@
 #include "catalog/pg_type.h"
 #include "postmaster/bgworker.h"
 #include "postmaster/interrupt.h"
-#include "replication/logical.h"
 #include "replication/slot.h"
 #include "replication/walsender.h"
 #include "storage/procsignal.h"
@@ -281,7 +280,6 @@ _PG_init(void)
 	pg_init_libpagestore();
 	pg_init_walproposer();
        WalSender_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
-	LogicalFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;

 	InitLogicalReplicationMonitor();

--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -295,10 +295,16 @@ extern void neon_immedsync(SMgrRelation reln, ForkNumber forknum);
 /* utils for neon relsize cache */
 extern void relsize_hash_init(void);
 extern bool get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size);
-extern void set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size);
+extern void set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber new_size);
 extern void update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size);
 extern void forget_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum);

+extern void start_unlogged_build(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum);
+extern bool is_unlogged_build(NRelFileInfo rinfo, ForkNumber forknum);
+extern void stop_unlogged_build(NRelFileInfo rinfo, ForkNumber forknum);
+
+
+
 /* functions for local file cache */
 #if PG_MAJORVERSION_NUM < 16
 extern void lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -10,10 +10,6 @@
 * Temporary and unlogged tables are stored locally, by md.c. The functions
 * here just pass the calls through to corresponding md.c functions.
 *
- * Index build operations that use the buffer cache are also handled locally,
- * just like unlogged tables. Such operations must be marked by calling
- * smgr_start_unlogged_build() and friends.
- *
 * In order to know what relations are permanent and which ones are not, we
 * have added a 'smgr_relpersistence' field to SmgrRelationData, and it is set
 * by smgropen() callers, when they have the relcache entry at hand.  However,
@@ -64,6 +60,7 @@
 #include "storage/fsm_internals.h"
 #include "storage/md.h"
 #include "storage/smgr.h"
+#include "utils/rel.h"

 #include "pagestore_client.h"

@@ -100,17 +97,7 @@ const int	SmgrTrace = DEBUG5;

 page_server_api *page_server;

-/* unlogged relation build states */
-typedef enum
-{
-	UNLOGGED_BUILD_NOT_IN_PROGRESS = 0,
-	UNLOGGED_BUILD_PHASE_1,
-	UNLOGGED_BUILD_PHASE_2,
-	UNLOGGED_BUILD_NOT_PERMANENT
-} UnloggedBuildPhase;
-
-static SMgrRelation unlogged_build_rel = NULL;
-static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
+const PGAlignedBlock zero_buffer;

 static bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id);
 static bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;
@@ -605,7 +592,7 @@ prefetch_read(PrefetchRequest *slot)
 	}
 	else
 	{
-		neon_shard_log(slot->shard_no, LOG,
+		neon_shard_log(slot->shard_no, WARNING,
 					   "No response from reading prefetch entry %lu: %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect",
 					   (long)slot->my_ring_index,
 					   RelFileInfoFmt(BufTagGetNRelFileInfo(slot->buftag)),
@@ -1402,10 +1389,6 @@ PageIsEmptyHeapPage(char *buffer)
 	return memcmp(buffer, empty_page.data, BLCKSZ) == 0;
 }

-/*
- * A page is being evicted from the shared buffer cache. Update the
- * last-written LSN of the page, and WAL-log it if needed.
- */
 static void
 #if PG_MAJORVERSION_NUM < 16
 neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool force)
@@ -1413,6 +1396,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch
 neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const char *buffer, bool force)
 #endif
 {
+	BlockNumber relsize;
 	XLogRecPtr	lsn = PageGetLSN((Page) buffer);
 	bool		log_page;

@@ -1429,13 +1413,28 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
 		Assert(XLogInsertAllowed());
 		log_page = true;
 	}
-	else if (XLogInsertAllowed() &&
-			 !ShutdownRequestPending &&
-			 (forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM))
+	else if (XLogInsertAllowed() && !ShutdownRequestPending)
 	{
-		log_page = true;
+		if (forknum == MAIN_FORKNUM)
+		{
+			if (!PageIsNew((Page) buffer))
+			{
+				if (lsn < FirstNormalUnloggedLSN)
+				{
+					start_unlogged_build(InfoFromSMgrRel(reln), forknum, blocknum);
+					log_page = true;
+				}
+				else if (is_unlogged_build(InfoFromSMgrRel(reln), forknum))
+				{
+					log_page = true;
+				}
+			}
+		}
+		else
+		{
+			log_page = true;
+		}
 	}
-
 	if (log_page)
 	{
 		XLogRecPtr	recptr;
@@ -1508,14 +1507,6 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
 			lsn = GetXLogReplayRecPtr(NULL); /* in standby mode, soldier on */
 		}
 	}
-	else
-	{
-		ereport(SmgrTrace,
-				(errmsg(NEON_TAG "Evicting page %u of relation %u/%u/%u.%u with lsn=%X/%X",
-						blocknum,
-						RelFileInfoFmt(InfoFromSMgrRel(reln)),
-						forknum, LSN_FORMAT_ARGS(lsn))));
-	}

 	/*
 	 * Remember the LSN on this page. When we read the page again, we must
@@ -1524,6 +1515,19 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
 	SetLastWrittenLSNForBlock(lsn, InfoFromSMgrRel(reln), forknum, blocknum);
 }

+/*
+ * Stop the unlogged build, if one is in progress, for the given fork of the
+ * relation. Installed as log_newpage_range_callback (see neon_init()); the
+ * hook is presumably invoked by log_newpage_range() at the end of the build.
+ */
+static void
+neon_log_newpage_range_callback(Relation rel, ForkNumber forknum)
+{
+	SMgrRelation smgr = RelationGetSmgr(rel);
+	stop_unlogged_build(InfoFromSMgrRel(smgr), forknum);
+}
+
+
 /*
 *	neon_init() -- Initialize private state
 */
@@ -1559,6 +1563,8 @@ neon_init(void)
 	old_redo_read_buffer_filter = redo_read_buffer_filter;
 	redo_read_buffer_filter = neon_redo_read_buffer_filter;

+	log_newpage_range_callback = neon_log_newpage_range_callback;
+
 #ifdef DEBUG_COMPARE_LOCAL
 	mdinit();
 #endif
@@ -2132,6 +2138,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 		neon_wallog_page(reln, forkNum, n_blocks++, buffer, true);

 	neon_wallog_page(reln, forkNum, blkno, buffer, false);
+
 	set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blkno + 1);

 	lsn = PageGetLSN((Page) buffer);
@@ -2167,8 +2174,7 @@ void
 neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
 				int nblocks, bool skipFsync)
 {
-	const PGAlignedBlock buffer = {0};
-	int			remblocks = nblocks;
+	BlockNumber	remblocks = nblocks;
 	XLogRecPtr	lsn = 0;

 	switch (reln->smgr_relpersistence)
@@ -2218,8 +2224,24 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
 	if (!XLogInsertAllowed())
 		return;

-	/* ensure we have enough xlog buffers to log max-sized records */
-	XLogEnsureRecordSpace(Min(remblocks, (XLR_MAX_BLOCK_ID - 1)), 0);
+	set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blocknum + nblocks);
+
+	if (forkNum != MAIN_FORKNUM) /* no need to wal-log zero pages except VM/FSM forks  */
+	{
+		/* ensure we have enough xlog buffers to log max-sized records */
+		XLogEnsureRecordSpace(Min(remblocks, (XLR_MAX_BLOCK_ID - 1)), 0);
+	}
+	else
+	{
+		/*
+		 * smgr_extend is often called with an all-zeroes page, so
+		 * lsn==InvalidXLogRecPtr. An smgr_write() call will come for the buffer
+		 * later, after it has been initialized with the real page contents, and
+		 * it is eventually evicted from the buffer cache. But we need a valid LSN
+		 * to the relation metadata update now.
+		 */
+		lsn = GetXLogInsertRecPtr();
+	}

 	/*
 	 * Iterate over all the pages. They are collected into batches of
@@ -2230,17 +2252,19 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
 	{
 		int			count = Min(remblocks, XLR_MAX_BLOCK_ID);

-		XLogBeginInsert();
+		if (forkNum != MAIN_FORKNUM) /* no need to wal-log zero pages except VM/FSM forks  */
+		{
+			XLogBeginInsert();

-		for (int i = 0; i < count; i++)
-			XLogRegisterBlock(i, &InfoFromSMgrRel(reln), forkNum, blocknum + i,
-							  (char *) buffer.data, REGBUF_FORCE_IMAGE | REGBUF_STANDARD);
-
-		lsn = XLogInsert(RM_XLOG_ID, XLOG_FPI);
+			for (int i = 0; i < count; i++)
+				XLogRegisterBlock(i, &InfoFromSMgrRel(reln), forkNum, blocknum + i,
+								  (char *) zero_buffer.data, REGBUF_FORCE_IMAGE | REGBUF_STANDARD);

+			lsn = XLogInsert(RM_XLOG_ID, XLOG_FPI);
+		}
 		for (int i = 0; i < count; i++)
 		{
-			lfc_write(InfoFromSMgrRel(reln), forkNum, blocknum + i, buffer.data);
+			lfc_write(InfoFromSMgrRel(reln), forkNum, blocknum + i, zero_buffer.data);
 			SetLastWrittenLSNForBlock(lsn, InfoFromSMgrRel(reln), forkNum,
 									  blocknum + i);
 		}
@@ -2252,7 +2276,6 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
 	Assert(lsn != 0);

 	SetLastWrittenLSNForRelation(lsn, InfoFromSMgrRel(reln), forkNum);
-	set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blocknum);
 }
 #endif

@@ -2519,6 +2542,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
 #endif
 {
 	neon_request_lsns request_lsns;
+	BlockNumber relsize;

 	switch (reln->smgr_relpersistence)
 	{
@@ -2939,150 +2963,6 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum)
 #endif
 }

-/*
- * neon_start_unlogged_build() -- Starting build operation on a rel.
- *
- * Some indexes are built in two phases, by first populating the table with
- * regular inserts, using the shared buffer cache but skipping WAL-logging,
- * and WAL-logging the whole relation after it's done. Neon relies on the
- * WAL to reconstruct pages, so we cannot use the page server in the
- * first phase when the changes are not logged.
- */
-static void
-neon_start_unlogged_build(SMgrRelation reln)
-{
-	/*
-	 * Currently, there can be only one unlogged relation build operation in
-	 * progress at a time. That's enough for the current usage.
-	 */
-	if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS)
-		neon_log(ERROR, "unlogged relation build is already in progress");
-	Assert(unlogged_build_rel == NULL);
-
-	ereport(SmgrTrace,
-			(errmsg(NEON_TAG "starting unlogged build of relation %u/%u/%u",
-					RelFileInfoFmt(InfoFromSMgrRel(reln)))));
-
-	switch (reln->smgr_relpersistence)
-	{
-		case 0:
-			neon_log(ERROR, "cannot call smgr_start_unlogged_build() on rel with unknown persistence");
-			break;
-
-		case RELPERSISTENCE_PERMANENT:
-			break;
-
-		case RELPERSISTENCE_TEMP:
-		case RELPERSISTENCE_UNLOGGED:
-			unlogged_build_rel = reln;
-			unlogged_build_phase = UNLOGGED_BUILD_NOT_PERMANENT;
-			return;
-
-		default:
-			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
-	}
-
-	if (smgrnblocks(reln, MAIN_FORKNUM) != 0)
-		neon_log(ERROR, "cannot perform unlogged index build, index is not empty ");
-
-	unlogged_build_rel = reln;
-	unlogged_build_phase = UNLOGGED_BUILD_PHASE_1;
-
-	/* Make the relation look like it's unlogged */
-	reln->smgr_relpersistence = RELPERSISTENCE_UNLOGGED;
-
-	/*
-	 * Create the local file. In a parallel build, the leader is expected to
-	 * call this first and do it.
-	 *
-	 * FIXME: should we pass isRedo true to create the tablespace dir if it
-	 * doesn't exist? Is it needed?
-	 */
-	if (!IsParallelWorker())
-		mdcreate(reln, MAIN_FORKNUM, false);
-}
-
-/*
- * neon_finish_unlogged_build_phase_1()
- *
- * Call this after you have finished populating a relation in unlogged mode,
- * before you start WAL-logging it.
- */
-static void
-neon_finish_unlogged_build_phase_1(SMgrRelation reln)
-{
-	Assert(unlogged_build_rel == reln);
-
-	ereport(SmgrTrace,
-			(errmsg(NEON_TAG "finishing phase 1 of unlogged build of relation %u/%u/%u",
-					RelFileInfoFmt(InfoFromSMgrRel(reln)))));
-
-	if (unlogged_build_phase == UNLOGGED_BUILD_NOT_PERMANENT)
-		return;
-
-	Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_1);
-	Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED);
-
-	/*
-	 * In a parallel build, (only) the leader process performs the 2nd
-	 * phase.
-	 */
-	if (IsParallelWorker())
-	{
-		unlogged_build_rel = NULL;
-		unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
-	}
-	else
-		unlogged_build_phase = UNLOGGED_BUILD_PHASE_2;
-}
-
-/*
- * neon_end_unlogged_build() -- Finish an unlogged rel build.
- *
- * Call this after you have finished WAL-logging an relation that was
- * first populated without WAL-logging.
- *
- * This removes the local copy of the rel, since it's now been fully
- * WAL-logged and is present in the page server.
- */
-static void
-neon_end_unlogged_build(SMgrRelation reln)
-{
-	NRelFileInfoBackend rinfob = InfoBFromSMgrRel(reln);
-
-	Assert(unlogged_build_rel == reln);
-
-	ereport(SmgrTrace,
-			(errmsg(NEON_TAG "ending unlogged build of relation %u/%u/%u",
-					RelFileInfoFmt(InfoFromNInfoB(rinfob)))));
-
-	if (unlogged_build_phase != UNLOGGED_BUILD_NOT_PERMANENT)
-	{
-		Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_2);
-		Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED);
-
-		/* Make the relation look permanent again */
-		reln->smgr_relpersistence = RELPERSISTENCE_PERMANENT;
-
-		/* Remove local copy */
-		rinfob = InfoBFromSMgrRel(reln);
-		for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
-		{
-			neon_log(SmgrTrace, "forgetting cached relsize for %u/%u/%u.%u",
-				 RelFileInfoFmt(InfoFromNInfoB(rinfob)),
-				 forknum);
-
-			forget_cached_relsize(InfoFromNInfoB(rinfob), forknum);
-			mdclose(reln, forknum);
-			/* use isRedo == true, so that we drop it immediately */
-			mdunlink(rinfob, forknum, true);
-		}
-	}
-
-	unlogged_build_rel = NULL;
-	unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
-}
-
 #define STRPREFIX(str, prefix) (strncmp(str, prefix, strlen(prefix)) == 0)

 static int
@@ -3112,12 +2992,12 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
 		request_lsn = UINT64_MAX;

 	/*
-	 * GetRedoStartLsn() returns LSN of the basebackup. We know that the SLRU
+	 * GetRedoStartLsn() returns LSN of basebackup. We know that the SLRU
 	 * segment has not changed since the basebackup, because in order to
 	 * modify it, we would have had to download it already. And once
 	 * downloaded, we never evict SLRU segments from local disk.
 	 */
-	not_modified_since = nm_adjust_lsn(GetRedoStartLsn());
+	not_modified_since = GetRedoStartLsn();

 	SlruKind kind;

@@ -3176,40 +3056,6 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
 	return n_blocks;
 }

-static void
-AtEOXact_neon(XactEvent event, void *arg)
-{
-	switch (event)
-	{
-		case XACT_EVENT_ABORT:
-		case XACT_EVENT_PARALLEL_ABORT:
-
-			/*
-			 * Forget about any build we might have had in progress. The local
-			 * file will be unlinked by smgrDoPendingDeletes()
-			 */
-			unlogged_build_rel = NULL;
-			unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
-			break;
-
-		case XACT_EVENT_COMMIT:
-		case XACT_EVENT_PARALLEL_COMMIT:
-		case XACT_EVENT_PREPARE:
-		case XACT_EVENT_PRE_COMMIT:
-		case XACT_EVENT_PARALLEL_PRE_COMMIT:
-		case XACT_EVENT_PRE_PREPARE:
-			if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS)
-			{
-				unlogged_build_rel = NULL;
-				unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
-				ereport(ERROR,
-						(errcode(ERRCODE_INTERNAL_ERROR),
-						 (errmsg(NEON_TAG "unlogged index build was not properly finished"))));
-			}
-			break;
-	}
-}
-
 static const struct f_smgr neon_smgr =
 {
 	.smgr_init = neon_init,
@@ -3231,10 +3077,6 @@ static const struct f_smgr neon_smgr =
 	.smgr_truncate = neon_truncate,
 	.smgr_immedsync = neon_immedsync,

-	.smgr_start_unlogged_build = neon_start_unlogged_build,
-	.smgr_finish_unlogged_build_phase_1 = neon_finish_unlogged_build_phase_1,
-	.smgr_end_unlogged_build = neon_end_unlogged_build,
-
 	.smgr_read_slru_segment = neon_read_slru_segment,
 };

@@ -3252,8 +3094,6 @@ smgr_neon(BackendId backend, NRelFileInfo rinfo)
 void
 smgr_init_neon(void)
 {
-	RegisterXactCallback(AtEOXact_neon, NULL);
-
 	smgr_init_standard();
 	neon_init();
 }
--- a/pgxn/neon/relsize_cache.c
+++ b/pgxn/neon/relsize_cache.c
@@ -39,7 +39,8 @@ typedef struct
 typedef struct
 {
 	RelTag		tag;
-	BlockNumber size;
+	BlockNumber size : 31;
+	BlockNumber unlogged : 1;
 	dlist_node	lru_node;		/* LRU list node */
 } RelSizeEntry;

@@ -117,9 +118,12 @@ get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size)
 			*size = entry->size;
 			relsize_ctl->hits += 1;
 			found = true;
-			/* Move entry to the LRU list tail */
-			dlist_delete(&entry->lru_node);
-			dlist_push_tail(&relsize_ctl->lru, &entry->lru_node);
+			if (!entry->unlogged) /* entries of relation involved in unlogged build are pinned */
+			{
+				/* Move entry to the LRU list tail */
+				dlist_delete(&entry->lru_node);
+				dlist_push_tail(&relsize_ctl->lru, &entry->lru_node);
+			}
 		}
 		else
 		{
@@ -130,6 +134,9 @@ get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size)
 	return found;
 }

+/*
+ * Cache relation size.
+ */
 void
 set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size)
 {
@@ -148,31 +155,53 @@ set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size)
 		 */
 		while ((entry = hash_search(relsize_hash, &tag, HASH_ENTER_NULL, &found)) == NULL)
 		{
-			RelSizeEntry *victim = dlist_container(RelSizeEntry, lru_node, dlist_pop_head_node(&relsize_ctl->lru));
-			hash_search(relsize_hash, &victim->tag, HASH_REMOVE, NULL);
-			Assert(relsize_ctl->size > 0);
-			relsize_ctl->size -= 1;
+			if (dlist_is_empty(&relsize_ctl->lru))
+			{
+				elog(FATAL, "No more free relsize cache entries");
+			}
+			else
+			{
+				RelSizeEntry *victim = dlist_container(RelSizeEntry, lru_node, dlist_pop_head_node(&relsize_ctl->lru));
+				hash_search(relsize_hash, &victim->tag, HASH_REMOVE, NULL);
+				Assert(relsize_ctl->size > 0);
+				relsize_ctl->size -= 1;
+			}
 		}
 		entry->size = size;
 		if (!found)
 		{
-			if (++relsize_ctl->size == relsize_hash_size)
+			entry->unlogged = false;
+			if (relsize_ctl->size+1 == relsize_hash_size)
 			{
 				/*
 				 * Remove least recently used elment from the hash.
 				 * Hash size after is becomes `relsize_hash_size-1`.
 				 * But it is not considered to be a problem, because size of this hash is expecrted large enough and +-1 doesn't matter.
 				 */
-				RelSizeEntry *victim = dlist_container(RelSizeEntry, lru_node, dlist_pop_head_node(&relsize_ctl->lru));
-				hash_search(relsize_hash, &victim->tag, HASH_REMOVE, NULL);
-				relsize_ctl->size -= 1;
+				if (dlist_is_empty(&relsize_ctl->lru))
+				{
+					elog(FATAL, "No more free relsize cache entries");
+				}
+				else
+				{
+					RelSizeEntry *victim = dlist_container(RelSizeEntry, lru_node, dlist_pop_head_node(&relsize_ctl->lru));
+					hash_search(relsize_hash, &victim->tag, HASH_REMOVE, NULL);
+				}
+			}
+			else
+			{
+				relsize_ctl->size += 1;
 			}
 		}
-		else
+		else if (!entry->unlogged) /* pinned (unlogged build) entries are not in the LRU list; unlink only unpinned ones before re-adding them below */
 		{
 			dlist_delete(&entry->lru_node);
 		}
-		dlist_push_tail(&relsize_ctl->lru, &entry->lru_node);
+
+		if (!entry->unlogged) /* entries of relation involved in unlogged build are pinned */
+		{
+			dlist_push_tail(&relsize_ctl->lru, &entry->lru_node);
+		}
 		relsize_ctl->writes += 1;
 		LWLockRelease(relsize_lock);
 	}
@@ -191,23 +220,42 @@ update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size)
 		tag.forknum = forknum;
 		LWLockAcquire(relsize_lock, LW_EXCLUSIVE);
 		entry = hash_search(relsize_hash, &tag, HASH_ENTER, &found);
-		if (!found || entry->size < size)
+		if (!found) {
+			entry->unlogged = false;
 			entry->size = size;
-		if (!found)
-		{
-			if (++relsize_ctl->size == relsize_hash_size)
+
+			if (relsize_ctl->size+1 == relsize_hash_size)
 			{
-				RelSizeEntry *victim = dlist_container(RelSizeEntry, lru_node, dlist_pop_head_node(&relsize_ctl->lru));
-				hash_search(relsize_hash, &victim->tag, HASH_REMOVE, NULL);
-				relsize_ctl->size -= 1;
+				if (dlist_is_empty(&relsize_ctl->lru))
+				{
+					elog(FATAL, "No more free relsize cache entries");
+				}
+				else
+				{
+					RelSizeEntry *victim = dlist_container(RelSizeEntry, lru_node, dlist_pop_head_node(&relsize_ctl->lru));
+					hash_search(relsize_hash, &victim->tag, HASH_REMOVE, NULL);
+				}
+			}
+			else
+			{
+				relsize_ctl->size += 1;
 			}
 		}
 		else
 		{
-			dlist_delete(&entry->lru_node);
+			if (entry->size < size)
+				entry->size = size;
+
+			if (!entry->unlogged) /* entries of relation involved in unlogged build are pinned */
+			{
+				dlist_delete(&entry->lru_node);
+			}
 		}
 		relsize_ctl->writes += 1;
-		dlist_push_tail(&relsize_ctl->lru, &entry->lru_node);
+		if (!entry->unlogged) /* entries of relation involved in unlogged build are pinned */
+		{
+			dlist_push_tail(&relsize_ctl->lru, &entry->lru_node);
+		}
 		LWLockRelease(relsize_lock);
 	}
 }
@@ -225,13 +273,154 @@ forget_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum)
 		entry = hash_search(relsize_hash, &tag, HASH_REMOVE, NULL);
 		if (entry)
 		{
-			dlist_delete(&entry->lru_node);
+			if (!entry->unlogged)
+			{
+				/* Entries of relations involved in an unlogged build are pinned (not in the LRU list) */
+				dlist_delete(&entry->lru_node);
+			}
 			relsize_ctl->size -= 1;
 		}
 		LWLockRelease(relsize_lock);
 	}
 }

+/*
+ * This function starts an unlogged build if it was not yet started.
+ * The criterion for starting an unlogged build is writing a page without a normal LSN.
+ * It can happen in any backend when a page is evicted from shared buffers,
+ * or it may not happen at all if the index fits in shared buffers.
+ */
+void
+start_unlogged_build(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum)
+{
+	if (relsize_hash_size > 0)
+	{
+		RelTag		tag;
+		RelSizeEntry *entry;
+		bool		found;
+		bool start = false;
+
+		tag.rinfo = rinfo;
+		tag.forknum = forknum;
+		LWLockAcquire(relsize_lock, LW_EXCLUSIVE);
+		entry = hash_search(relsize_hash, &tag, HASH_ENTER, &found);
+		if (!found) {
+			entry->size = blocknum + 1;
+			start = true;
+
+			if (relsize_ctl->size+1 == relsize_hash_size)
+			{
+				if (dlist_is_empty(&relsize_ctl->lru))
+				{
+					elog(FATAL, "No more free relsize cache entries");
+				}
+				else
+				{
+					RelSizeEntry *victim = dlist_container(RelSizeEntry, lru_node, dlist_pop_head_node(&relsize_ctl->lru));
+					hash_search(relsize_hash, &victim->tag, HASH_REMOVE, NULL);
+				}
+			}
+			else
+			{
+				relsize_ctl->size += 1;
+			}
+		}
+		else
+		{
+			start = !entry->unlogged;
+
+			if (entry->size <= blocknum)
+			{
+				entry->size = blocknum + 1;
+			}
+
+			if (start)
+			{
+				/* entries of relations involved in an unlogged build are pinned (removed from the LRU list) until the end of the build */
+				dlist_delete(&entry->lru_node);
+			}
+		}
+		entry->unlogged = true;
+		relsize_ctl->writes += 1;
+
+		/*
+		 * The entry is deliberately not put into the LRU list, to protect it from eviction until the end of the unlogged build
+		 */
+
+		if (start)
+			elog(LOG, "Start unlogged build for %u/%u/%u.%u",
+				 RelFileInfoFmt(rinfo), forknum);
+		LWLockRelease(relsize_lock);
+	}
+}
+
+/*
+ * Check if an unlogged build is in progress: returns the unlogged flag of the fork's relsize cache entry, or false if there is no entry or the cache is disabled.
+ */
+bool
+is_unlogged_build(NRelFileInfo rinfo, ForkNumber forknum)
+{
+	bool		unlogged = false;
+
+	if (relsize_hash_size > 0)
+	{
+		RelTag		tag;
+		RelSizeEntry *entry;
+
+		tag.rinfo = rinfo;
+		tag.forknum = forknum;
+		LWLockAcquire(relsize_lock, LW_SHARED);
+		entry = hash_search(relsize_hash, &tag, HASH_FIND, NULL);
+		if (entry != NULL)
+		{
+			unlogged = entry->unlogged;
+			relsize_ctl->hits += 1;
+		}
+		else
+		{
+			relsize_ctl->misses += 1;
+		}
+		LWLockRelease(relsize_lock);
+	}
+	return unlogged;
+}
+
+/*
+ * Clear the unlogged-build flag of the fork's relsize cache entry; if it was set, the entry is returned to the LRU list (unpinned).
+ */
+void
+stop_unlogged_build(NRelFileInfo rinfo, ForkNumber forknum)
+{
+	if (relsize_hash_size > 0)
+	{
+		RelTag		tag;
+		RelSizeEntry *entry;
+
+		tag.rinfo = rinfo;
+		tag.forknum = forknum;
+		LWLockAcquire(relsize_lock, LW_EXCLUSIVE);
+		entry = hash_search(relsize_hash, &tag, HASH_FIND, NULL);
+		if (entry != NULL)
+		{
+			bool unlogged = entry->unlogged;
+			entry->unlogged = false;
+			relsize_ctl->hits += 1;
+			if (unlogged)
+			{
+				elog(LOG, "Stop unlogged build for %u/%u/%u.%u",
+					 RelFileInfoFmt(rinfo), forknum);
+				/* Return entry to the LRU list */
+				dlist_push_tail(&relsize_ctl->lru, &entry->lru_node);
+			}
+		}
+		else
+		{
+			relsize_ctl->misses += 1;
+		}
+		LWLockRelease(relsize_lock);
+	}
+}
+
 void
 relsize_hash_init(void)
 {
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -286,6 +286,7 @@ WalProposerPoll(WalProposer *wp)
 void
 WalProposerStart(WalProposer *wp)
 {
+
 	/* Initiate connections to all safekeeper nodes */
 	for (int i = 0; i < wp->n_safekeepers; i++)
 	{
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Konstantin Knizhnik	92e72cc3f3	Restore checkl for FSM/VM fork in neon_wallog_page	2024-06-04 14:08:45 +03:00
Konstantin Knizhnik	f9416ebf2b	Do not write pages to the local disk during unlogged build	2024-06-04 09:20:51 +03:00
Konstantin Knizhnik	0c9dee9d06	Rebase with main	2024-06-03 21:36:37 +03:00
Konstantin Knizhnik	5a5775806f	Restore check for poreserving pgdata_dir content	2024-06-03 21:16:04 +03:00
Konstantin Knizhnik	947f8c59dd	Fix unlogged build	2024-06-03 21:16:02 +03:00
Konstantin Knizhnik	520101170f	Pin information about unlogged relations in relsize cache until end of the build	2024-06-03 21:15:14 +03:00
Konstantin Knizhnik	1bd86c5c6a	Rewrite unlogged relation build	2024-06-03 21:15:12 +03:00
Konstantin Knizhnik	e4fc6c3162	Comment check for pgdatadir match	2024-06-03 21:12:23 +03:00
Konstantin Knizhnik	fcd7d7008f	Support unlogged build in Neon erxtension	2024-06-03 21:12:21 +03:00
				`@@ -1 +0,0 @@`
				`GRANT pg_monitor TO neon_superuser WITH ADMIN OPTION;`