Add /terminate API (#6745) (#6853)

This is to speed up suspends; see https://github.com/neondatabase/cloud/issues/10284. Cherry-pick to release branch to build new compute images.
Merge pull request #6803 from neondatabase/releases/2024-02-19
2026-05-21 15:10:44 +00:00 · 2024-02-22 11:51:19 +02:00 · 2024-02-19 16:38:35 +04:00 · 2024-02-15 09:45:08 +00:00 · 2024-02-15 07:42:12 +00:00 · 2024-02-14 14:57:22 +00:00
51 changed files with 875 additions and 1872 deletions
--- a/.github/actions/allure-report-generate/action.yml
+++ b/.github/actions/allure-report-generate/action.yml
@@ -59,7 +59,7 @@ runs:
        BUCKET: neon-github-public-dev

    # TODO: We can replace with a special docker image with Java and Allure pre-installed
-    - uses: actions/setup-java@v4
+    - uses: actions/setup-java@v3
      with:
        distribution: 'temurin'
        java-version: '17'
@@ -180,7 +180,7 @@ runs:
        fi

    - name: Cache poetry deps
-      uses: actions/cache@v4
+      uses: actions/cache@v3
      with:
        path: ~/.cache/pypoetry/virtualenvs
        key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
@@ -215,7 +215,7 @@ runs:
          rm -rf ${WORKDIR}
        fi

-    - uses: actions/github-script@v7
+    - uses: actions/github-script@v6
      if: always()
      env:
        REPORT_URL: ${{ steps.generate-report.outputs.report-url }}
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -80,13 +80,13 @@ runs:

    - name: Checkout
      if: inputs.needs_postgres_source == 'true'
-      uses: actions/checkout@v4
+      uses: actions/checkout@v3
      with:
        submodules: true
        fetch-depth: 1

    - name: Cache poetry deps
-      uses: actions/cache@v4
+      uses: actions/cache@v3
      with:
        path: ~/.cache/pypoetry/virtualenvs
        key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
--- a/.github/workflows/approved-for-ci-run.yml
+++ b/.github/workflows/approved-for-ci-run.yml
@@ -64,7 +64,7 @@ jobs:
    steps:
      - run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"

-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v3
        with:
          ref: main
          token: ${{ secrets.CI_ACCESS_TOKEN }}
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -66,7 +66,7 @@ jobs:
      options: --init

    steps:
-    - uses: actions/checkout@v4
+    - uses: actions/checkout@v3

    - name: Download Neon artifact
      uses: ./.github/actions/download
@@ -221,7 +221,7 @@ jobs:
    timeout-minutes: 480

    steps:
-    - uses: actions/checkout@v4
+    - uses: actions/checkout@v3

    - name: Download Neon artifact
      uses: ./.github/actions/download
@@ -366,7 +366,7 @@ jobs:
      options: --init

    steps:
-    - uses: actions/checkout@v4
+    - uses: actions/checkout@v3

    - name: Download Neon artifact
      uses: ./.github/actions/download
@@ -465,7 +465,7 @@ jobs:
      options: --init

    steps:
-    - uses: actions/checkout@v4
+    - uses: actions/checkout@v3

    - name: Download Neon artifact
      uses: ./.github/actions/download
@@ -562,7 +562,7 @@ jobs:
      options: --init

    steps:
-    - uses: actions/checkout@v4
+    - uses: actions/checkout@v3

    - name: Download Neon artifact
      uses: ./.github/actions/download
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -69,7 +69,7 @@ jobs:

    steps:
      - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3
        with:
          fetch-depth: 0

@@ -106,13 +106,13 @@ jobs:

    steps:
      - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3
        with:
          submodules: false
          fetch-depth: 1

      - name: Cache poetry deps
-        uses: actions/cache@v4
+        uses: actions/cache@v3
        with:
          path: ~/.cache/pypoetry/virtualenvs
          key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
@@ -138,7 +138,7 @@ jobs:

    steps:
      - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3
        with:
          submodules: true
          fetch-depth: 1
@@ -146,7 +146,7 @@ jobs:
 #      Disabled for now
 #      - name: Restore cargo deps cache
 #        id: cache_cargo
-#        uses: actions/cache@v4
+#        uses: actions/cache@v3
 #        with:
 #          path: |
 #            !~/.cargo/registry/src
@@ -231,7 +231,7 @@ jobs:
          done

      - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3
        with:
          submodules: true
          fetch-depth: 1
@@ -303,7 +303,7 @@ jobs:
      # compressed crates.
 #      - name: Cache cargo deps
 #        id: cache_cargo
-#        uses: actions/cache@v4
+#        uses: actions/cache@v3
 #        with:
 #          path: |
 #            ~/.cargo/registry/
@@ -317,21 +317,21 @@ jobs:

      - name: Cache postgres v14 build
        id: cache_pg_14
-        uses: actions/cache@v4
+        uses: actions/cache@v3
        with:
          path: pg_install/v14
          key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}

      - name: Cache postgres v15 build
        id: cache_pg_15
-        uses: actions/cache@v4
+        uses: actions/cache@v3
        with:
          path: pg_install/v15
          key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}

      - name: Cache postgres v16 build
        id: cache_pg_16
-        uses: actions/cache@v4
+        uses: actions/cache@v3
        with:
          path: pg_install/v16
          key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
@@ -451,7 +451,7 @@ jobs:
        pg_version: [ v14, v15, v16 ]
    steps:
      - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3
        with:
          submodules: true
          fetch-depth: 1
@@ -473,12 +473,8 @@ jobs:
          BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs

-      # Temporary disable this step until we figure out why it's so flaky
-      # Ref https://github.com/neondatabase/neon/issues/4540
      - name: Merge and upload coverage data
-        if: |
-          false &&
-          matrix.build_type == 'debug' && matrix.pg_version == 'v14'
+        if: matrix.build_type == 'debug' && matrix.pg_version == 'v14'
        uses: ./.github/actions/save-coverage-data

  get-benchmarks-durations:
@@ -492,10 +488,10 @@ jobs:
    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
    steps:
      - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3

      - name: Cache poetry deps
-        uses: actions/cache@v4
+        uses: actions/cache@v3
        with:
          path: ~/.cache/pypoetry/virtualenvs
          key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
@@ -529,7 +525,7 @@ jobs:
        build_type: [ release ]
    steps:
      - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3

      - name: Pytest benchmarks
        uses: ./.github/actions/run-python-test-set
@@ -558,7 +554,7 @@ jobs:
      options: --init

    steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v3

      - name: Create Allure report
        if: ${{ !cancelled() }}
@@ -569,7 +565,7 @@ jobs:
        env:
          REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}

-      - uses: actions/github-script@v7
+      - uses: actions/github-script@v6
        if: ${{ !cancelled() }}
        with:
          # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
@@ -609,7 +605,7 @@ jobs:
        coverage-json: ${{ steps.upload-coverage-report-new.outputs.summary-json }}
    steps:
      - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3
        with:
          submodules: true
          fetch-depth: 0
@@ -678,7 +674,7 @@ jobs:
          REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/lcov/summary.json
          echo "summary-json=${REPORT_URL}" >> $GITHUB_OUTPUT

-      - uses: actions/github-script@v7
+      - uses: actions/github-script@v6
        env:
          REPORT_URL_NEW: ${{ steps.upload-coverage-report-new.outputs.report-url }}
          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
@@ -904,7 +900,7 @@ jobs:

    steps:
      - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3
        with:
          fetch-depth: 0

@@ -1118,7 +1114,7 @@ jobs:
          done

      - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3
        with:
          submodules: false
          fetch-depth: 0
@@ -1141,7 +1137,7 @@ jobs:

      - name: Create git tag
        if: github.ref_name == 'release'
-        uses: actions/github-script@v7
+        uses: actions/github-script@v6
        with:
          # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
          retries: 5
@@ -1155,7 +1151,7 @@ jobs:

      - name: Create GitHub release
        if: github.ref_name == 'release'
-        uses: actions/github-script@v7
+        uses: actions/github-script@v6
        with:
          # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
          retries: 5
@@ -1204,80 +1200,3 @@ jobs:

            time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/${PREFIX}/${FILENAME}
          done
-
-  compute-node-image-merged-base:
-    needs: [ check-permissions, build-buildtools-image, tag ]
-    runs-on: ubuntu-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        version: [ v14, v15, v16 ]
-    defaults:
-      run:
-        shell: sh -eu {0}
-
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v3
-        with:
-          submodules: true
-          fetch-depth: 0
-
-      - name: Configure Docker Hub login
-        run: |
-          DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64)
-          echo "::add-mask::${DOCKERHUB_AUTH}"
-
-          cat <<-EOF > ~/.docker/config.json
-            {
-              "auths": {
-                "https://index.docker.io/v1/": {
-                  "auth": "${DOCKERHUB_AUTH}"
-                }
-              }
-            }
-          EOF
-
-      - name: Build merged image base
-        run: |
-          docker image build . -f Dockerfile.compute-node-simple -t neondatabase/tmp-compute-node-merged-base-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
-            --build-arg PG_VERSION=${{ matrix.version }} \
-            --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} \
-            --build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
-          docker image push neondatabase/tmp-compute-node-merged-base-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
-
-  compute-node-image-merged:
-    needs: [ tag, compute-node-image-merged-base ]
-    runs-on: ubuntu-latest
-    defaults:
-      run:
-        shell: sh -eu {0}
-
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v3
-        with:
-          submodules: true
-          fetch-depth: 0
-
-      - name: Configure Docker Hub login
-        run: |
-          DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64)
-          echo "::add-mask::${DOCKERHUB_AUTH}"
-
-          echo ~
-          cat <<-EOF > ~/.docker/config.json
-            {
-              "auths": {
-                "https://index.docker.io/v1/": {
-                  "auth": "${DOCKERHUB_AUTH}"
-                }
-              }
-            }
-          EOF
-
-      - name: Build merged image
-        run: |
-          docker image build . -f Dockerfile.compute-node-merged -t neondatabase/tmp-compute-node-merged:${{needs.tag.outputs.build-tag}} \
-            --build-arg TAG=${{needs.tag.outputs.build-tag}}
-          docker image push neondatabase/tmp-compute-node-merged:${{needs.tag.outputs.build-tag}}
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -57,21 +57,21 @@ jobs:

      - name: Cache postgres v14 build
        id: cache_pg_14
-        uses: actions/cache@v4
+        uses: actions/cache@v3
        with:
          path: pg_install/v14
          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}

      - name: Cache postgres v15 build
        id: cache_pg_15
-        uses: actions/cache@v4
+        uses: actions/cache@v3
        with:
          path: pg_install/v15
          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}

      - name: Cache postgres v16 build
        id: cache_pg_16
-        uses: actions/cache@v4
+        uses: actions/cache@v3
        with:
          path: pg_install/v16
          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
@@ -82,7 +82,7 @@ jobs:
          echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV

      - name: Cache cargo deps
-        uses: actions/cache@v4
+        uses: actions/cache@v3
        with:
          path: |
            ~/.cargo/registry
@@ -172,21 +172,21 @@ jobs:

      - name: Cache postgres v14 build
        id: cache_pg_14
-        uses: actions/cache@v4
+        uses: actions/cache@v3
        with:
          path: pg_install/v14
          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}

      - name: Cache postgres v15 build
        id: cache_pg_15
-        uses: actions/cache@v4
+        uses: actions/cache@v3
        with:
          path: pg_install/v15
          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}

      - name: Cache postgres v16 build
        id: cache_pg_16
-        uses: actions/cache@v4
+        uses: actions/cache@v3
        with:
          path: pg_install/v16
          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
@@ -356,7 +356,7 @@ jobs:
          echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT

      - name: Publish build stats report
-        uses: actions/github-script@v7
+        uses: actions/github-script@v6
        env:
          REPORT_URL: ${{ steps.upload-stats.outputs.report-url }}
          SHA: ${{ github.event.pull_request.head.sha || github.sha }}
--- a/.github/workflows/pg_clients.yml
+++ b/.github/workflows/pg_clients.yml
@@ -28,7 +28,7 @@ jobs:

    steps:
    - name: Checkout
-      uses: actions/checkout@v4
+      uses: actions/checkout@v3

    - uses: actions/setup-python@v4
      with:
@@ -38,7 +38,7 @@ jobs:
      uses: snok/install-poetry@v1

    - name: Cache poetry deps
-      uses: actions/cache@v4
+      uses: actions/cache@v3
      with:
        path: ~/.cache/pypoetry/virtualenvs
        key: v2-${{ runner.os }}-python-deps-ubunutu-latest-${{ hashFiles('poetry.lock') }}
@@ -82,7 +82,7 @@ jobs:
    # It will be fixed after switching to gen2 runner
    - name: Upload python test logs
      if: always()
-      uses: actions/upload-artifact@v4
+      uses: actions/upload-artifact@v3
      with:
        retention-days: 7
        name: python-test-pg_clients-${{ runner.os }}-stage-logs
--- a/.github/workflows/trigger-e2e-tests.yml
+++ b/.github/workflows/trigger-e2e-tests.yml
@@ -9,7 +9,7 @@ on:
 defaults:
  run:
    shell: bash -euxo pipefail {0}
-
+    
 env:
  # A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix
  E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
@@ -37,7 +37,7 @@ jobs:

    steps:
      - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3
        with:
          fetch-depth: 0

@@ -115,3 +115,4 @@ jobs:
                \"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\"
              }
            }"
+ 
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -286,7 +286,6 @@ dependencies = [
 "git-version",
 "hyper",
 "metrics",
- "once_cell",
 "pageserver_api",
 "pageserver_client",
 "postgres_connection",
--- a/Dockerfile.compute-node-merged
+++ b/Dockerfile.compute-node-merged
@@ -1,83 +0,0 @@
-ARG TAG
-FROM neondatabase/tmp-compute-node-merged-base-v14:$TAG as pg14
-FROM neondatabase/tmp-compute-node-merged-base-v15:$TAG as pg15
-FROM neondatabase/tmp-compute-node-merged-base-v16:$TAG as pg16
-
-#########################################################################################
-#
-# Compile and run the Neon-specific `compute_ctl` binary
-#
-#########################################################################################
-FROM neondatabase/build-tools:pinned AS compute-tools
-ARG BUILD_TAG
-ENV BUILD_TAG=$BUILD_TAG
-
-USER nonroot
-# Copy entire project to get Cargo.* files with proper dependencies for the whole project
-COPY --chown=nonroot . .
-RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto
-
-#########################################################################################
-#
-# Final layer
-# Put it all together into the final image
-#
-#########################################################################################
-FROM debian:bullseye-slim
-ARG TAG
-# Add user postgres
-RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
-    echo "postgres:test_console_pass" | chpasswd && \
-    mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \
-    mkdir /var/db/postgres/pgbouncer && \
-    chown -R postgres:postgres /var/db/postgres && \
-    chmod 0750 /var/db/postgres/compute && \
-    chmod 0750 /var/db/postgres/pgbouncer && \
-    echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig && \
-    # create folder for file cache
-    mkdir -p -m 777 /neon/cache
-
-COPY --from=pg14 --chown=postgres /usr/local/pgsql /usr/local/pgsql-v14
-COPY --from=pg15 --chown=postgres /usr/local/pgsql /usr/local/pgsql-v15
-COPY --from=pg16 --chown=postgres /usr/local/pgsql /usr/local/pgsql-v16
-COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
-
-# Install:
-# libreadline8 for psql
-# libicu67, locales for collations (including ICU and plpgsql_check)
-# liblz4-1 for lz4
-# libossp-uuid16 for extension ossp-uuid
-# libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS
-# libxml2, libxslt1.1 for xml2
-# libzstd1 for zstd
-# libboost* for rdkit
-# ca-certificates for communicating with s3 by compute_ctl
-RUN apt update &&  \
-    apt install --no-install-recommends -y \
-        gdb \
-        libicu67 \
-        liblz4-1 \
-        libreadline8 \
-        libboost-iostreams1.74.0 \
-        libboost-regex1.74.0 \
-        libboost-serialization1.74.0 \
-        libboost-system1.74.0 \
-        libossp-uuid16 \
-        libgeos-c1v5 \
-        libgdal28 \
-        libproj19 \
-        libprotobuf-c1 \
-        libsfcgal1 \
-        libxml2 \
-        libxslt1.1 \
-        libzstd1 \
-        libcurl4-openssl-dev \
-        locales \
-        procps \
-        ca-certificates && \
-    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
-    localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
-
-ENV LANG en_US.utf8
-USER postgres
-ENTRYPOINT ["/usr/local/bin/compute_ctl"]
--- a/Dockerfile.compute-node-simple
+++ b/Dockerfile.compute-node-simple
@@ -1,159 +0,0 @@
-ARG PG_VERSION
-ARG REPOSITORY=neondatabase
-ARG IMAGE=build-tools
-ARG TAG=pinned
-ARG BUILD_TAG
-
-#########################################################################################
-#
-# Layer "build-deps"
-#
-#########################################################################################
-FROM debian:bullseye-slim AS build-deps
-RUN apt update &&  \
-    apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
-    zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \
-    libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd
-
-#########################################################################################
-#
-# Layer "pg-build"
-# Build Postgres from the neon postgres repository.
-#
-#########################################################################################
-FROM build-deps AS pg-build
-ARG PG_VERSION
-COPY vendor/postgres-${PG_VERSION} postgres
-RUN cd postgres && \
-    export CONFIGURE_CMD="./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp \
-    --with-icu --with-libxml --with-libxslt --with-lz4" && \
-    if [ "${PG_VERSION}" != "v14" ]; then \
-        # zstd is available only from PG15
-        export CONFIGURE_CMD="${CONFIGURE_CMD} --with-zstd"; \
-    fi && \
-    eval $CONFIGURE_CMD && \
-    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
-    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
-    # Install headers
-    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \
-    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install && \
-    # Enable some of contrib extensions
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/autoinc.control && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/bloom.control && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/insert_username.control && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/intagg.control && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/moddatetime.control && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_stat_statements.control && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/refint.control && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control && \
-    # We need to grant EXECUTE on pg_stat_statements_reset() to neon_superuser.
-    # In vanilla postgres this function is limited to Postgres role superuser.
-    # In neon we have neon_superuser role that is not a superuser but replaces superuser in some cases.
-    # We could add the additional grant statements to the postgres repository but it would be hard to maintain,
-    # whenever we need to pick up a new postgres version and we want to limit the changes in our postgres fork,
-    # so we do it here.
-    old_list="pg_stat_statements--1.0--1.1.sql pg_stat_statements--1.1--1.2.sql pg_stat_statements--1.2--1.3.sql pg_stat_statements--1.3--1.4.sql pg_stat_statements--1.4--1.5.sql pg_stat_statements--1.4.sql pg_stat_statements--1.5--1.6.sql"; \
-    # the first loop is for pg_stat_statement extension version <= 1.6
-    for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
-        filename=$(basename "$file"); \
-        if echo "$old_list" | grep -q -F "$filename"; then \
-            echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> $file; \
-        fi; \
-    done; \
-    # the second loop is for pg_stat_statement extension versions >= 1.7,
-    # where pg_stat_statement_reset() got 3 additional arguments
-    for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
-        filename=$(basename "$file"); \
-        if ! echo "$old_list" | grep -q -F "$filename"; then \
-            echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> $file; \
-        fi; \
-    done
-
-#########################################################################################
-#
-# Layer "neon-pg-ext-build"
-# compile neon extensions
-#
-#########################################################################################
-FROM build-deps AS neon-pg-ext-build
-ARG PG_VERSION
-
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
-COPY pgxn/ pgxn/
-
-RUN make -j $(getconf _NPROCESSORS_ONLN) \
-        PG_CONFIG=/usr/local/pgsql/bin/pg_config \
-        -C pgxn/neon \
-        -s install && \
-    make -j $(getconf _NPROCESSORS_ONLN) \
-        PG_CONFIG=/usr/local/pgsql/bin/pg_config \
-        -C pgxn/neon_utils \
-        -s install && \
-    make -j $(getconf _NPROCESSORS_ONLN) \
-        PG_CONFIG=/usr/local/pgsql/bin/pg_config \
-        -C pgxn/neon_test_utils \
-        -s install && \
-    make -j $(getconf _NPROCESSORS_ONLN) \
-        PG_CONFIG=/usr/local/pgsql/bin/pg_config \
-        -C pgxn/neon_rmgr \
-        -s install && \
-    case "${PG_VERSION}" in \
-        "v14" | "v15") \
-        ;; \
-        "v16") \
-            echo "Skipping HNSW for PostgreSQL 16" && exit 0 \
-        ;; \
-        *) \
-            echo "unexpected PostgreSQL version" && exit 1 \
-        ;; \
-        esac && \
-    make -j $(getconf _NPROCESSORS_ONLN) \
-        PG_CONFIG=/usr/local/pgsql/bin/pg_config \
-        -C pgxn/hnsw \
-        -s install
-
-#########################################################################################
-#
-# Compile and run the Neon-specific `compute_ctl` binary
-#
-#########################################################################################
-FROM $REPOSITORY/$IMAGE:$TAG AS compute-tools
-ARG BUILD_TAG
-ENV BUILD_TAG=$BUILD_TAG
-
-USER nonroot
-# Copy entire project to get Cargo.* files with proper dependencies for the whole project
-COPY --chown=nonroot . .
-RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto
-
-#########################################################################################
-#
-# Clean up postgres folder before inclusion
-#
-#########################################################################################
-FROM neon-pg-ext-build AS postgres-cleanup-layer
-COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql
-
-# Remove binaries from /bin/ that we won't use (or would manually copy & install otherwise)
-RUN cd /usr/local/pgsql/bin && rm ecpg
-
-# Remove headers that we won't need anymore - we've completed installation of all extensions
-RUN rm -r /usr/local/pgsql/include
-
-# Remove static postgresql libraries - all compilation is finished, so we
-# can now remove these files - they must be included in other binaries by now
-# if they were to be used by other libraries.
-RUN rm /usr/local/pgsql/lib/lib*.a
-
-#########################################################################################
-#
-# Final layer
-# Put it all together into the final image
-#
-#########################################################################################
-FROM debian:bullseye-slim
-
-COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local/pgsql
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -45,7 +45,6 @@ use std::{thread, time::Duration};
 use anyhow::{Context, Result};
 use chrono::Utc;
 use clap::Arg;
-use nix::sys::signal::{kill, Signal};
 use signal_hook::consts::{SIGQUIT, SIGTERM};
 use signal_hook::{consts::SIGINT, iterator::Signals};
 use tracing::{error, info};
@@ -53,7 +52,9 @@ use url::Url;

 use compute_api::responses::ComputeStatus;

-use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec, PG_PID, SYNC_SAFEKEEPERS_PID};
+use compute_tools::compute::{
+    forward_termination_signal, ComputeNode, ComputeState, ParsedSpec, PG_PID,
+};
 use compute_tools::configurator::launch_configurator;
 use compute_tools::extension_server::get_pg_version;
 use compute_tools::http::api::launch_http_server;
@@ -394,6 +395,15 @@ fn main() -> Result<()> {
        info!("synced safekeepers at lsn {lsn}");
    }

+    let mut state = compute.state.lock().unwrap();
+    if state.status == ComputeStatus::TerminationPending {
+        state.status = ComputeStatus::Terminated;
+        compute.state_changed.notify_all();
+        // we were asked to terminate gracefully, don't exit to avoid restart
+        delay_exit = true
+    }
+    drop(state);
+
    if let Err(err) = compute.check_for_core_dumps() {
        error!("error while checking for core dumps: {err:?}");
    }
@@ -523,16 +533,7 @@ fn cli() -> clap::Command {
 /// wait for termination which would be easy then.
 fn handle_exit_signal(sig: i32) {
    info!("received {sig} termination signal");
-    let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst);
-    if ss_pid != 0 {
-        let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32);
-        kill(ss_pid, Signal::SIGTERM).ok();
-    }
-    let pg_pid = PG_PID.load(Ordering::SeqCst);
-    if pg_pid != 0 {
-        let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
-        kill(pg_pid, Signal::SIGTERM).ok();
-    }
+    forward_termination_signal();
    exit(1);
 }

--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -28,6 +28,8 @@ use compute_api::responses::{ComputeMetrics, ComputeStatus};
 use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec};
 use utils::measured_stream::MeasuredReader;

+use nix::sys::signal::{kill, Signal};
+
 use remote_storage::{DownloadError, RemotePath};

 use crate::checker::create_availability_check_data;
@@ -1322,3 +1324,17 @@ LIMIT 100",
        Ok(remote_ext_metrics)
    }
 }
+
+pub fn forward_termination_signal() { // Signal the processes whose PIDs are stored in SYNC_SAFEKEEPERS_PID and PG_PID.
+    let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst);
+    if ss_pid != 0 { // NOTE(review): 0 presumably means "no PID recorded" - confirm
+        let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32);
+        kill(ss_pid, Signal::SIGTERM).ok(); // best effort: the result of kill() is discarded
+    }
+    let pg_pid = PG_PID.load(Ordering::SeqCst);
+    if pg_pid != 0 { // NOTE(review): same 0 sentinel assumption as above - confirm
+        let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
+        // Send SIGQUIT, which Postgres treats as an 'immediate' shutdown: https://www.postgresql.org/docs/current/server-shutdown.html
+        kill(pg_pid, Signal::SIGQUIT).ok(); // best effort: the result of kill() is discarded
+    }
+}
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -5,6 +5,7 @@ use std::net::SocketAddr;
 use std::sync::Arc;
 use std::thread;

+use crate::compute::forward_termination_signal;
 use crate::compute::{ComputeNode, ComputeState, ParsedSpec};
 use compute_api::requests::ConfigurationRequest;
 use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError};
@@ -123,6 +124,17 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
            }
        }

+        (&Method::POST, "/terminate") => {
+            info!("serving /terminate POST request");
+            match handle_terminate_request(compute).await {
+                Ok(()) => Response::new(Body::empty()),
+                Err((msg, code)) => {
+                    error!("error handling /terminate request: {msg}");
+                    render_json_error(&msg, code)
+                }
+            }
+        }
+
        // download extension files from remote extension storage on demand
        (&Method::POST, route) if route.starts_with("/extension_server/") => {
            info!("serving {:?} POST request", route);
@@ -297,6 +309,49 @@ fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
        .unwrap()
 }

+async fn handle_terminate_request(compute: &Arc<ComputeNode>) -> Result<(), (String, StatusCode)> {
+    { // check the status and mark termination as pending under a single lock acquisition
+        let mut state = compute.state.lock().unwrap();
+        if state.status == ComputeStatus::Terminated { // already terminated: report success right away
+            return Ok(());
+        }
+        if state.status != ComputeStatus::Empty && state.status != ComputeStatus::Running {
+            let msg = format!(
+                "invalid compute status for termination request: {:?}",
+                state.status.clone()
+            );
+            return Err((msg, StatusCode::PRECONDITION_FAILED)); // only Empty or Running may proceed
+        }
+        state.status = ComputeStatus::TerminationPending;
+        compute.state_changed.notify_all();
+        drop(state); // lock is released before the signals are sent below
+    }
+    forward_termination_signal();
+    info!("sent signal and notified waiters");
+
+    // Spawn a blocking thread to wait for compute to become Terminated.
+    // This is needed so that we do not block the main pool of workers and
+    // can keep serving other requests while this particular request
+    // is waiting for compute to terminate.
+    let c = compute.clone();
+    task::spawn_blocking(move || {
+        let mut state = c.state.lock().unwrap();
+        while state.status != ComputeStatus::Terminated { // status is re-checked after every wakeup
+            state = c.state_changed.wait(state).unwrap();
+            info!(
+                "waiting for compute to become Terminated, current status: {:?}",
+                state.status
+            );
+        }
+
+        Ok(())
+    })
+    .await
+    .unwrap()?; // unwrap panics if the blocking task did not complete (e.g. it panicked)
+    info!("terminated Postgres");
+    Ok(())
+}
+
 // Main Hyper HTTP server function that runs it and blocks waiting on it forever.
 #[tokio::main]
 async fn serve(port: u16, state: Arc<ComputeNode>) {
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -168,6 +168,29 @@ paths:
              schema:
                $ref: "#/components/schemas/GenericError"

+  /terminate:
+    post:
+      tags:
+      - Terminate
+      summary: Terminate Postgres and wait for it to exit
+      description: ""
+      operationId: terminate
+      responses:
+        200:  # empty body; also returned when compute is already Terminated
+          description: Result
+        412:  # compute status is neither Empty nor Running
+          description: "wrong state"
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/GenericError"
+        500:  # NOTE(review): handle_terminate_request never builds a 500 itself - confirm the source
+          description: "Unexpected error"
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/GenericError"
+
+
 components:
  securitySchemes:
    JWT:
--- a/control_plane/attachment_service/Cargo.toml
+++ b/control_plane/attachment_service/Cargo.toml
@@ -18,7 +18,6 @@ clap.workspace = true
 futures.workspace = true
 git-version.workspace = true
 hyper.workspace = true
-once_cell.workspace = true
 pageserver_api.workspace = true
 pageserver_client.workspace = true
 postgres_connection.workspace = true
--- a/control_plane/attachment_service/src/http.rs
+++ b/control_plane/attachment_service/src/http.rs
@@ -333,22 +333,6 @@ async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiErr
    json_response(StatusCode::OK, state.service.tenant_drop(tenant_id).await?)
 }

-async fn handle_tenants_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    let state = get_state(&req);
-    state.service.tenants_dump()
-}
-
-async fn handle_scheduler_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    let state = get_state(&req);
-    state.service.scheduler_dump()
-}
-
-async fn handle_consistency_check(req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    let state = get_state(&req);
-
-    json_response(StatusCode::OK, state.service.consistency_check().await?)
-}
-
 /// Status endpoint is just used for checking that our HTTP listener is up
 async fn handle_status(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
    json_response(StatusCode::OK, ())
@@ -437,13 +421,6 @@ pub fn make_router(
        .post("/debug/v1/node/:node_id/drop", |r| {
            request_span(r, handle_node_drop)
        })
-        .get("/debug/v1/tenant", |r| request_span(r, handle_tenants_dump))
-        .get("/debug/v1/scheduler", |r| {
-            request_span(r, handle_scheduler_dump)
-        })
-        .post("/debug/v1/consistency_check", |r| {
-            request_span(r, handle_consistency_check)
-        })
        .get("/control/v1/tenant/:tenant_id/locate", |r| {
            tenant_service_handler(r, handle_tenant_locate)
        })
--- a/control_plane/attachment_service/src/lib.rs
+++ b/control_plane/attachment_service/src/lib.rs
@@ -3,7 +3,6 @@ use utils::seqwait::MonotonicCounter;

 mod compute_hook;
 pub mod http;
-pub mod metrics;
 mod node;
 pub mod persistence;
 mod reconciler;
@@ -12,7 +11,7 @@ mod schema;
 pub mod service;
 mod tenant_state;

-#[derive(Clone, Serialize, Deserialize, Debug)]
+#[derive(Clone, Serialize, Deserialize)]
 enum PlacementPolicy {
    /// Cheapest way to attach a tenant: just one pageserver, no secondary
    Single,
@@ -23,7 +22,7 @@ enum PlacementPolicy {
    Detached,
 }

-#[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone, Serialize)]
+#[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone)]
 struct Sequence(u64);

 impl Sequence {
--- a/control_plane/attachment_service/src/main.rs
+++ b/control_plane/attachment_service/src/main.rs
@@ -6,7 +6,6 @@
 ///
 use anyhow::{anyhow, Context};
 use attachment_service::http::make_router;
-use attachment_service::metrics::preinitialize_metrics;
 use attachment_service::persistence::Persistence;
 use attachment_service::service::{Config, Service};
 use aws_config::{self, BehaviorVersion, Region};
@@ -206,8 +205,6 @@ async fn async_main() -> anyhow::Result<()> {
        logging::Output::Stdout,
    )?;

-    preinitialize_metrics();
-
    let args = Cli::parse();
    tracing::info!(
        "version: {}, launch_timestamp: {}, build_tag {}, state at {}, listening on {}",
--- a/control_plane/attachment_service/src/metrics.rs
+++ b/control_plane/attachment_service/src/metrics.rs
@@ -1,32 +0,0 @@
-use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
-use once_cell::sync::Lazy;
-
-pub(crate) struct ReconcilerMetrics {
-    pub(crate) spawned: IntCounter,
-    pub(crate) complete: IntCounterVec,
-}
-
-impl ReconcilerMetrics {
-    // Labels used on [`Self::complete`]
-    pub(crate) const SUCCESS: &'static str = "ok";
-    pub(crate) const ERROR: &'static str = "success";
-    pub(crate) const CANCEL: &'static str = "cancel";
-}
-
-pub(crate) static RECONCILER: Lazy<ReconcilerMetrics> = Lazy::new(|| ReconcilerMetrics {
-    spawned: register_int_counter!(
-        "storage_controller_reconcile_spawn",
-        "Count of how many times we spawn a reconcile task",
-    )
-    .expect("failed to define a metric"),
-    complete: register_int_counter_vec!(
-        "storage_controller_reconcile_complete",
-        "Reconciler tasks completed, broken down by success/failure/cancelled",
-        &["status"],
-    )
-    .expect("failed to define a metric"),
-});
-
-pub fn preinitialize_metrics() {
-    Lazy::force(&RECONCILER);
-}
--- a/control_plane/attachment_service/src/node.rs
+++ b/control_plane/attachment_service/src/node.rs
@@ -1,16 +1,9 @@
 use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
-use serde::Serialize;
 use utils::id::NodeId;

 use crate::persistence::NodePersistence;

-/// Represents the in-memory description of a Node.
-///
-/// Scheduling statistics are maintened separately in [`crate::scheduler`].
-///
-/// The persistent subset of the Node is defined in [`crate::persistence::NodePersistence`]: the
-/// implementation of serialization on this type is only for debug dumps.
-#[derive(Clone, Serialize, Eq, PartialEq)]
+#[derive(Clone)]
 pub(crate) struct Node {
    pub(crate) id: NodeId,

--- a/control_plane/attachment_service/src/persistence.rs
+++ b/control_plane/attachment_service/src/persistence.rs
@@ -477,7 +477,7 @@ impl Persistence {
 }

 /// Parts of [`crate::tenant_state::TenantState`] that are stored durably
-#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)]
+#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone)]
 #[diesel(table_name = crate::schema::tenant_shards)]
 pub(crate) struct TenantShardPersistence {
    #[serde(default)]
--- a/control_plane/attachment_service/src/reconciler.rs
+++ b/control_plane/attachment_service/src/reconciler.rs
@@ -27,7 +27,7 @@ pub(super) struct Reconciler {
    pub(super) tenant_shard_id: TenantShardId,
    pub(crate) shard: ShardIdentity,
    pub(crate) generation: Generation,
-    pub(crate) intent: TargetState,
+    pub(crate) intent: IntentState,
    pub(crate) config: TenantConfig,
    pub(crate) observed: ObservedState,

@@ -62,38 +62,10 @@ pub(super) struct Reconciler {
    pub(crate) persistence: Arc<Persistence>,
 }

-/// This is a snapshot of [`crate::tenant_state::IntentState`], but it does not do any
-/// reference counting for Scheduler.  The IntentState is what the scheduler works with,
-/// and the TargetState is just the instruction for a particular Reconciler run.
-#[derive(Debug)]
-pub(crate) struct TargetState {
-    pub(crate) attached: Option<NodeId>,
-    pub(crate) secondary: Vec<NodeId>,
-}
-
-impl TargetState {
-    pub(crate) fn from_intent(intent: &IntentState) -> Self {
-        Self {
-            attached: *intent.get_attached(),
-            secondary: intent.get_secondary().clone(),
-        }
-    }
-
-    fn all_pageservers(&self) -> Vec<NodeId> {
-        let mut result = self.secondary.clone();
-        if let Some(node_id) = &self.attached {
-            result.push(*node_id);
-        }
-        result
-    }
-}
-
 #[derive(thiserror::Error, Debug)]
 pub(crate) enum ReconcileError {
    #[error(transparent)]
    Notify(#[from] NotifyError),
-    #[error("Cancelled")]
-    Cancel,
    #[error(transparent)]
    Other(#[from] anyhow::Error),
 }
@@ -499,9 +471,6 @@ impl Reconciler {
        }

        for (node_id, conf) in changes {
-            if self.cancel.is_cancelled() {
-                return Err(ReconcileError::Cancel);
-            }
            self.location_config(node_id, conf, None).await?;
        }

--- a/control_plane/attachment_service/src/scheduler.rs
+++ b/control_plane/attachment_service/src/scheduler.rs
@@ -1,8 +1,9 @@
-use crate::{node::Node, tenant_state::TenantState};
-use serde::Serialize;
-use std::collections::HashMap;
+use pageserver_api::shard::TenantShardId;
+use std::collections::{BTreeMap, HashMap};
 use utils::{http::error::ApiError, id::NodeId};

+use crate::{node::Node, tenant_state::TenantState};
+
 /// Scenarios in which we cannot find a suitable location for a tenant shard
 #[derive(thiserror::Error, Debug)]
 pub enum ScheduleError {
@@ -18,179 +19,52 @@ impl From<ScheduleError> for ApiError {
    }
 }

-#[derive(Serialize, Eq, PartialEq)]
-struct SchedulerNode {
-    /// How many shards are currently scheduled on this node, via their [`crate::tenant_state::IntentState`].
-    shard_count: usize,
-
-    /// Whether this node is currently elegible to have new shards scheduled (this is derived
-    /// from a node's availability state and scheduling policy).
-    may_schedule: bool,
-}
-
-/// This type is responsible for selecting which node is used when a tenant shard needs to choose a pageserver
-/// on which to run.
-///
-/// The type has no persistent state of its own: this is all populated at startup.  The Serialize
-/// impl is only for debug dumps.
-#[derive(Serialize)]
 pub(crate) struct Scheduler {
-    nodes: HashMap<NodeId, SchedulerNode>,
+    tenant_counts: HashMap<NodeId, usize>,
 }

 impl Scheduler {
-    pub(crate) fn new<'a>(nodes: impl Iterator<Item = &'a Node>) -> Self {
-        let mut scheduler_nodes = HashMap::new();
-        for node in nodes {
-            scheduler_nodes.insert(
-                node.id,
-                SchedulerNode {
-                    shard_count: 0,
-                    may_schedule: node.may_schedule(),
-                },
-            );
+    pub(crate) fn new(
+        tenants: &BTreeMap<TenantShardId, TenantState>,
+        nodes: &HashMap<NodeId, Node>,
+    ) -> Self {
+        let mut tenant_counts = HashMap::new();
+        for node_id in nodes.keys() {
+            tenant_counts.insert(*node_id, 0);
        }

-        Self {
-            nodes: scheduler_nodes,
-        }
-    }
-
-    /// For debug/support: check that our internal statistics are in sync with the state of
-    /// the nodes & tenant shards.
-    ///
-    /// If anything is inconsistent, log details and return an error.
-    pub(crate) fn consistency_check<'a>(
-        &self,
-        nodes: impl Iterator<Item = &'a Node>,
-        shards: impl Iterator<Item = &'a TenantState>,
-    ) -> anyhow::Result<()> {
-        let mut expect_nodes: HashMap<NodeId, SchedulerNode> = HashMap::new();
-        for node in nodes {
-            expect_nodes.insert(
-                node.id,
-                SchedulerNode {
-                    shard_count: 0,
-                    may_schedule: node.may_schedule(),
-                },
-            );
-        }
-
-        for shard in shards {
-            if let Some(node_id) = shard.intent.get_attached() {
-                match expect_nodes.get_mut(node_id) {
-                    Some(node) => node.shard_count += 1,
-                    None => anyhow::bail!(
-                        "Tenant {} references nonexistent node {}",
-                        shard.tenant_shard_id,
-                        node_id
-                    ),
-                }
-            }
-
-            for node_id in shard.intent.get_secondary() {
-                match expect_nodes.get_mut(node_id) {
-                    Some(node) => node.shard_count += 1,
-                    None => anyhow::bail!(
-                        "Tenant {} references nonexistent node {}",
-                        shard.tenant_shard_id,
-                        node_id
-                    ),
-                }
+        for tenant in tenants.values() {
+            if let Some(ps) = tenant.intent.attached {
+                let entry = tenant_counts.entry(ps).or_insert(0);
+                *entry += 1;
            }
        }

-        for (node_id, expect_node) in &expect_nodes {
-            let Some(self_node) = self.nodes.get(node_id) else {
-                anyhow::bail!("Node {node_id} not found in Self")
-            };
-
-            if self_node != expect_node {
-                tracing::error!("Inconsistency detected in scheduling state for node {node_id}");
-                tracing::error!("Expected state: {}", serde_json::to_string(expect_node)?);
-                tracing::error!("Self state: {}", serde_json::to_string(self_node)?);
-
-                anyhow::bail!("Inconsistent state on {node_id}");
+        for (node_id, node) in nodes {
+            if !node.may_schedule() {
+                tenant_counts.remove(node_id);
            }
        }

-        if expect_nodes.len() != self.nodes.len() {
-            // We just checked that all the expected nodes are present.  If the lengths don't match,
-            // it means that we have nodes in Self that are unexpected.
-            for node_id in self.nodes.keys() {
-                if !expect_nodes.contains_key(node_id) {
-                    anyhow::bail!("Node {node_id} found in Self but not in expected nodes");
-                }
-            }
-        }
-
-        Ok(())
-    }
-
-    /// Increment the reference count of a node.  This reference count is used to guide scheduling
-    /// decisions, not for memory management: it represents one tenant shard whose IntentState targets
-    /// this node.
-    ///
-    /// It is an error to call this for a node that is not known to the scheduler (i.e. passed into
-    /// [`Self::new`] or [`Self::node_upsert`])
-    pub(crate) fn node_inc_ref(&mut self, node_id: NodeId) {
-        let Some(node) = self.nodes.get_mut(&node_id) else {
-            tracing::error!("Scheduler missing node {node_id}");
-            debug_assert!(false);
-            return;
-        };
-
-        node.shard_count += 1;
-    }
-
-    /// Decrement a node's reference count.  Inverse of [`Self::node_inc_ref`].
-    pub(crate) fn node_dec_ref(&mut self, node_id: NodeId) {
-        let Some(node) = self.nodes.get_mut(&node_id) else {
-            debug_assert!(false);
-            tracing::error!("Scheduler missing node {node_id}");
-            return;
-        };
-
-        node.shard_count -= 1;
-    }
-
-    pub(crate) fn node_upsert(&mut self, node: &Node) {
-        use std::collections::hash_map::Entry::*;
-        match self.nodes.entry(node.id) {
-            Occupied(mut entry) => {
-                entry.get_mut().may_schedule = node.may_schedule();
-            }
-            Vacant(entry) => {
-                entry.insert(SchedulerNode {
-                    shard_count: 0,
-                    may_schedule: node.may_schedule(),
-                });
-            }
-        }
-    }
-
-    pub(crate) fn node_remove(&mut self, node_id: NodeId) {
-        if self.nodes.remove(&node_id).is_none() {
-            tracing::warn!(node_id=%node_id, "Removed non-existent node from scheduler");
-        }
+        Self { tenant_counts }
    }

    pub(crate) fn schedule_shard(
        &mut self,
        hard_exclude: &[NodeId],
    ) -> Result<NodeId, ScheduleError> {
-        if self.nodes.is_empty() {
+        if self.tenant_counts.is_empty() {
            return Err(ScheduleError::NoPageservers);
        }

        let mut tenant_counts: Vec<(NodeId, usize)> = self
-            .nodes
+            .tenant_counts
            .iter()
            .filter_map(|(k, v)| {
-                if hard_exclude.contains(k) || !v.may_schedule {
+                if hard_exclude.contains(k) {
                    None
                } else {
-                    Some((*k, v.shard_count))
+                    Some((*k, *v))
                }
            })
            .collect();
@@ -199,18 +73,7 @@ impl Scheduler {
        tenant_counts.sort_by_key(|i| (i.1, i.0));

        if tenant_counts.is_empty() {
-            // After applying constraints, no pageservers were left.  We log some detail about
-            // the state of nodes to help understand why this happened.  This is not logged as an error because
-            // it is legitimately possible for enough nodes to be Offline to prevent scheduling a shard.
-            tracing::info!("Scheduling failure, while excluding {hard_exclude:?}, node states:");
-            for (node_id, node) in &self.nodes {
-                tracing::info!(
-                    "Node {node_id}: may_schedule={} shards={}",
-                    node.may_schedule,
-                    node.shard_count
-                );
-            }
-
+            // After applying constraints, no pageservers were left
            return Err(ScheduleError::ImpossibleConstraint);
        }

@@ -219,88 +82,7 @@ impl Scheduler {
            "scheduler selected node {node_id} (elegible nodes {:?}, exclude: {hard_exclude:?})",
            tenant_counts.iter().map(|i| i.0 .0).collect::<Vec<_>>()
        );
-
-        // Note that we do not update shard count here to reflect the scheduling: that
-        // is IntentState's job when the scheduled location is used.
-
+        *self.tenant_counts.get_mut(&node_id).unwrap() += 1;
        Ok(node_id)
    }
 }
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use std::collections::HashMap;
-
-    use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
-    use utils::id::NodeId;
-
-    use crate::{node::Node, tenant_state::IntentState};
-
-    #[test]
-    fn scheduler_basic() -> anyhow::Result<()> {
-        let mut nodes = HashMap::new();
-        nodes.insert(
-            NodeId(1),
-            Node {
-                id: NodeId(1),
-                availability: NodeAvailability::Active,
-                scheduling: NodeSchedulingPolicy::Active,
-                listen_http_addr: String::new(),
-                listen_http_port: 0,
-                listen_pg_addr: String::new(),
-                listen_pg_port: 0,
-            },
-        );
-
-        nodes.insert(
-            NodeId(2),
-            Node {
-                id: NodeId(2),
-                availability: NodeAvailability::Active,
-                scheduling: NodeSchedulingPolicy::Active,
-                listen_http_addr: String::new(),
-                listen_http_port: 0,
-                listen_pg_addr: String::new(),
-                listen_pg_port: 0,
-            },
-        );
-
-        let mut scheduler = Scheduler::new(nodes.values());
-        let mut t1_intent = IntentState::new();
-        let mut t2_intent = IntentState::new();
-
-        let scheduled = scheduler.schedule_shard(&[])?;
-        t1_intent.set_attached(&mut scheduler, Some(scheduled));
-        let scheduled = scheduler.schedule_shard(&[])?;
-        t2_intent.set_attached(&mut scheduler, Some(scheduled));
-
-        assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);
-        assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1);
-
-        let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers())?;
-        t1_intent.push_secondary(&mut scheduler, scheduled);
-
-        assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);
-        assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 2);
-
-        t1_intent.clear(&mut scheduler);
-        assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 0);
-        assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1);
-
-        if cfg!(debug_assertions) {
-            // Dropping an IntentState without clearing it causes a panic in debug mode,
-            // because we have failed to properly update scheduler shard counts.
-            let result = std::panic::catch_unwind(move || {
-                drop(t2_intent);
-            });
-            assert!(result.is_err());
-        } else {
-            t2_intent.clear(&mut scheduler);
-            assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 0);
-            assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 0);
-        }
-
-        Ok(())
-    }
-}
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
--- a/control_plane/attachment_service/src/tenant_state.rs
+++ b/control_plane/attachment_service/src/tenant_state.rs
@@ -1,12 +1,10 @@
 use std::{collections::HashMap, sync::Arc, time::Duration};

-use crate::{metrics, persistence::TenantShardPersistence};
 use control_plane::attachment_service::NodeAvailability;
 use pageserver_api::{
    models::{LocationConfig, LocationConfigMode, TenantConfig},
    shard::{ShardIdentity, TenantShardId},
 };
-use serde::Serialize;
 use tokio::task::JoinHandle;
 use tokio_util::sync::CancellationToken;
 use tracing::{instrument, Instrument};
@@ -21,27 +19,11 @@ use crate::{
    compute_hook::ComputeHook,
    node::Node,
    persistence::{split_state::SplitState, Persistence},
-    reconciler::{
-        attached_location_conf, secondary_location_conf, ReconcileError, Reconciler, TargetState,
-    },
+    reconciler::{attached_location_conf, secondary_location_conf, ReconcileError, Reconciler},
    scheduler::{ScheduleError, Scheduler},
    service, PlacementPolicy, Sequence,
 };

-/// Serialization helper
-fn read_mutex_content<S, T>(v: &std::sync::Mutex<T>, serializer: S) -> Result<S::Ok, S::Error>
-where
-    S: serde::ser::Serializer,
-    T: Clone + std::fmt::Display,
-{
-    serializer.collect_str(&v.lock().unwrap())
-}
-
-/// In-memory state for a particular tenant shard.
-///
-/// This struct implement Serialize for debugging purposes, but is _not_ persisted
-/// itself: see [`crate::persistence`] for the subset of tenant shard state that is persisted.
-#[derive(Serialize)]
 pub(crate) struct TenantState {
    pub(crate) tenant_shard_id: TenantShardId,

@@ -76,7 +58,6 @@ pub(crate) struct TenantState {
    /// If a reconcile task is currently in flight, it may be joined here (it is
    /// only safe to join if either the result has been received or the reconciler's
    /// cancellation token has been fired)
-    #[serde(skip)]
    pub(crate) reconciler: Option<ReconcilerHandle>,

    /// If a tenant is being split, then all shards with that TenantId will have a
@@ -86,19 +67,16 @@ pub(crate) struct TenantState {

    /// Optionally wait for reconciliation to complete up to a particular
    /// sequence number.
-    #[serde(skip)]
    pub(crate) waiter: std::sync::Arc<SeqWait<Sequence, Sequence>>,

    /// Indicates sequence number for which we have encountered an error reconciling.  If
    /// this advances ahead of [`Self::waiter`] then a reconciliation error has occurred,
    /// and callers should stop waiting for `waiter` and propagate the error.
-    #[serde(skip)]
    pub(crate) error_waiter: std::sync::Arc<SeqWait<Sequence, Sequence>>,

    /// The most recent error from a reconcile on this tenant
    /// TODO: generalize to an array of recent events
    /// TOOD: use a ArcSwap instead of mutex for faster reads?
-    #[serde(serialize_with = "read_mutex_content")]
    pub(crate) last_error: std::sync::Arc<std::sync::Mutex<String>>,

    /// If we have a pending compute notification that for some reason we weren't able to send,
@@ -108,112 +86,13 @@ pub(crate) struct TenantState {
    pub(crate) pending_compute_notification: bool,
 }

-#[derive(Default, Clone, Debug, Serialize)]
+#[derive(Default, Clone, Debug)]
 pub(crate) struct IntentState {
-    attached: Option<NodeId>,
-    secondary: Vec<NodeId>,
+    pub(crate) attached: Option<NodeId>,
+    pub(crate) secondary: Vec<NodeId>,
 }

-impl IntentState {
-    pub(crate) fn new() -> Self {
-        Self {
-            attached: None,
-            secondary: vec![],
-        }
-    }
-    pub(crate) fn single(scheduler: &mut Scheduler, node_id: Option<NodeId>) -> Self {
-        if let Some(node_id) = node_id {
-            scheduler.node_inc_ref(node_id);
-        }
-        Self {
-            attached: node_id,
-            secondary: vec![],
-        }
-    }
-
-    pub(crate) fn set_attached(&mut self, scheduler: &mut Scheduler, new_attached: Option<NodeId>) {
-        if self.attached != new_attached {
-            if let Some(old_attached) = self.attached.take() {
-                scheduler.node_dec_ref(old_attached);
-            }
-            if let Some(new_attached) = &new_attached {
-                scheduler.node_inc_ref(*new_attached);
-            }
-            self.attached = new_attached;
-        }
-    }
-
-    pub(crate) fn push_secondary(&mut self, scheduler: &mut Scheduler, new_secondary: NodeId) {
-        debug_assert!(!self.secondary.contains(&new_secondary));
-        scheduler.node_inc_ref(new_secondary);
-        self.secondary.push(new_secondary);
-    }
-
-    /// It is legal to call this with a node that is not currently a secondary: that is a no-op
-    pub(crate) fn remove_secondary(&mut self, scheduler: &mut Scheduler, node_id: NodeId) {
-        let index = self.secondary.iter().position(|n| *n == node_id);
-        if let Some(index) = index {
-            scheduler.node_dec_ref(node_id);
-            self.secondary.remove(index);
-        }
-    }
-
-    pub(crate) fn clear_secondary(&mut self, scheduler: &mut Scheduler) {
-        for secondary in self.secondary.drain(..) {
-            scheduler.node_dec_ref(secondary);
-        }
-    }
-
-    pub(crate) fn clear(&mut self, scheduler: &mut Scheduler) {
-        if let Some(old_attached) = self.attached.take() {
-            scheduler.node_dec_ref(old_attached);
-        }
-
-        self.clear_secondary(scheduler);
-    }
-
-    pub(crate) fn all_pageservers(&self) -> Vec<NodeId> {
-        let mut result = Vec::new();
-        if let Some(p) = self.attached {
-            result.push(p)
-        }
-
-        result.extend(self.secondary.iter().copied());
-
-        result
-    }
-
-    pub(crate) fn get_attached(&self) -> &Option<NodeId> {
-        &self.attached
-    }
-
-    pub(crate) fn get_secondary(&self) -> &Vec<NodeId> {
-        &self.secondary
-    }
-
-    /// When a node goes offline, we update intents to avoid using it
-    /// as their attached pageserver.
-    ///
-    /// Returns true if a change was made
-    pub(crate) fn notify_offline(&mut self, node_id: NodeId) -> bool {
-        if self.attached == Some(node_id) {
-            self.attached = None;
-            self.secondary.push(node_id);
-            true
-        } else {
-            false
-        }
-    }
-}
-
-impl Drop for IntentState {
-    fn drop(&mut self) {
-        // Must clear before dropping, to avoid leaving stale refcounts in the Scheduler
-        debug_assert!(self.attached.is_none() && self.secondary.is_empty());
-    }
-}
-
-#[derive(Default, Clone, Serialize)]
+#[derive(Default, Clone)]
 pub(crate) struct ObservedState {
    pub(crate) locations: HashMap<NodeId, ObservedStateLocation>,
 }
@@ -227,7 +106,7 @@ pub(crate) struct ObservedState {
 ///       what it is (e.g. we failed partway through configuring it)
 ///     * Instance exists with conf==Some: this tells us what we last successfully configured on this node,
 ///       and that configuration will still be present unless something external interfered.
-#[derive(Clone, Serialize)]
+#[derive(Clone)]
 pub(crate) struct ObservedStateLocation {
    /// If None, it means we do not know the status of this shard's location on this node, but
    /// we know that we might have some state on this node.
@@ -303,6 +182,46 @@ pub(crate) struct ReconcileResult {
    pub(crate) pending_compute_notification: bool,
 }

+impl IntentState {
+    pub(crate) fn new() -> Self { // no attached node and no secondaries
+        Self {
+            attached: None,
+            secondary: vec![],
+        }
+    }
+    pub(crate) fn all_pageservers(&self) -> Vec<NodeId> { // attached node (if any) first, then secondaries
+        let mut result = Vec::new();
+        if let Some(p) = self.attached {
+            result.push(p)
+        }
+
+        result.extend(self.secondary.iter().copied());
+
+        result
+    }
+
+    pub(crate) fn single(node_id: Option<NodeId>) -> Self { // `node_id` as attached, no secondaries
+        Self {
+            attached: node_id,
+            secondary: vec![],
+        }
+    }
+
+    /// When a node goes offline, stop using it as the attached pageserver:
+    /// if `node_id` is currently attached, clear the attachment and append
+    /// the node to the secondary list instead.
+    /// Returns true if a change was made, false if `node_id` was not attached.
+    pub(crate) fn notify_offline(&mut self, node_id: NodeId) -> bool {
+        if self.attached == Some(node_id) {
+            self.attached = None;
+            self.secondary.push(node_id);
+            true
+        } else {
+            false
+        }
+    }
+}
+
 impl ObservedState {
    pub(crate) fn new() -> Self {
        Self {
@@ -396,12 +315,12 @@ impl TenantState {
                // Should have exactly one attached, and zero secondaries
                if self.intent.attached.is_none() {
                    let node_id = scheduler.schedule_shard(&used_pageservers)?;
-                    self.intent.set_attached(scheduler, Some(node_id));
+                    self.intent.attached = Some(node_id);
                    used_pageservers.push(node_id);
                    modified = true;
                }
                if !self.intent.secondary.is_empty() {
-                    self.intent.clear_secondary(scheduler);
+                    self.intent.secondary.clear();
                    modified = true;
                }
            }
@@ -409,14 +328,14 @@ impl TenantState {
                // Should have exactly one attached, and N secondaries
                if self.intent.attached.is_none() {
                    let node_id = scheduler.schedule_shard(&used_pageservers)?;
-                    self.intent.set_attached(scheduler, Some(node_id));
+                    self.intent.attached = Some(node_id);
                    used_pageservers.push(node_id);
                    modified = true;
                }

                while self.intent.secondary.len() < secondary_count {
                    let node_id = scheduler.schedule_shard(&used_pageservers)?;
-                    self.intent.push_secondary(scheduler, node_id);
+                    self.intent.secondary.push(node_id);
                    used_pageservers.push(node_id);
                    modified = true;
                }
@@ -424,12 +343,12 @@ impl TenantState {
            Detached => {
                // Should have no attached or secondary pageservers
                if self.intent.attached.is_some() {
-                    self.intent.set_attached(scheduler, None);
+                    self.intent.attached = None;
                    modified = true;
                }

                if !self.intent.secondary.is_empty() {
-                    self.intent.clear_secondary(scheduler);
+                    self.intent.secondary.clear();
                    modified = true;
                }
            }
@@ -571,7 +490,7 @@ impl TenantState {
            tenant_shard_id: self.tenant_shard_id,
            shard: self.shard,
            generation: self.generation,
-            intent: TargetState::from_intent(&self.intent),
+            intent: self.intent.clone(),
            config: self.config.clone(),
            observed: self.observed.clone(),
            pageservers: pageservers.clone(),
@@ -590,7 +509,6 @@ impl TenantState {
        let reconciler_span = tracing::info_span!(parent: None, "reconciler", seq=%reconcile_seq,
                                                        tenant_id=%reconciler.tenant_shard_id.tenant_id,
                                                        shard_id=%reconciler.tenant_shard_id.shard_slug());
-        metrics::RECONCILER.spawned.inc();
        let join_handle = tokio::task::spawn(
            async move {
                // Wait for any previous reconcile task to complete before we start
@@ -607,10 +525,6 @@ impl TenantState {
                // TODO: wrap all remote API operations in cancellation check
                // as well.
                if reconciler.cancel.is_cancelled() {
-                    metrics::RECONCILER
-                        .complete
-                        .with_label_values(&[metrics::ReconcilerMetrics::CANCEL])
-                        .inc();
                    return;
                }

@@ -624,20 +538,6 @@ impl TenantState {
                    reconciler.compute_notify().await.ok();
                }

-                // Update result counter
-                match &result {
-                    Ok(_) => metrics::RECONCILER
-                        .complete
-                        .with_label_values(&[metrics::ReconcilerMetrics::SUCCESS]),
-                    Err(ReconcileError::Cancel) => metrics::RECONCILER
-                        .complete
-                        .with_label_values(&[metrics::ReconcilerMetrics::CANCEL]),
-                    Err(_) => metrics::RECONCILER
-                        .complete
-                        .with_label_values(&[metrics::ReconcilerMetrics::ERROR]),
-                }
-                .inc();
-
                result_tx
                    .send(ReconcileResult {
                        sequence: reconcile_seq,
@@ -680,18 +580,4 @@ impl TenantState {

        debug_assert!(!self.intent.all_pageservers().contains(&node_id));
    }
-
-    pub(crate) fn to_persistent(&self) -> TenantShardPersistence {
-        TenantShardPersistence {
-            tenant_id: self.tenant_shard_id.tenant_id.to_string(),
-            shard_number: self.tenant_shard_id.shard_number.0 as i32,
-            shard_count: self.tenant_shard_id.shard_count.literal() as i32,
-            shard_stripe_size: self.shard.stripe_size.0 as i32,
-            generation: self.generation.into().unwrap_or(0) as i32,
-            generation_pageserver: i64::MAX,
-            placement_policy: serde_json::to_string(&self.policy).unwrap(),
-            config: serde_json::to_string(&self.config).unwrap(),
-            splitting: SplitState::default(),
-        }
-    }
 }
--- a/control_plane/src/attachment_service.rs
+++ b/control_plane/src/attachment_service.rs
@@ -113,7 +113,7 @@ pub struct TenantShardMigrateRequest {
    pub node_id: NodeId,
 }

-#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
+#[derive(Serialize, Deserialize, Clone, Copy)]
 pub enum NodeAvailability {
    // Normal, happy state
    Active,
@@ -137,7 +137,7 @@ impl FromStr for NodeAvailability {

 /// FIXME: this is a duplicate of the type in the attachment_service crate, because the
 /// type needs to be defined with diesel traits in there.
-#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
+#[derive(Serialize, Deserialize, Clone, Copy)]
 pub enum NodeSchedulingPolicy {
    Active,
    Filling,
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -652,7 +652,9 @@ impl Endpoint {
                        }
                        ComputeStatus::Empty
                        | ComputeStatus::ConfigurationPending
-                        | ComputeStatus::Configuration => {
+                        | ComputeStatus::Configuration
+                        | ComputeStatus::TerminationPending
+                        | ComputeStatus::Terminated => {
                            bail!("unexpected compute status: {:?}", state.status)
                        }
                    }
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -52,6 +52,10 @@ pub enum ComputeStatus {
    // compute will exit soon or is waiting for
    // control-plane to terminate it.
    Failed,
+    // Termination requested
+    TerminationPending,
+    // Terminated Postgres
+    Terminated,
 }

 fn rfc3339_serialize<S>(x: &Option<DateTime<Utc>>, s: S) -> Result<S::Ok, S::Error>
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -622,7 +622,7 @@ async fn timeline_preserve_initdb_handler(
    // location where timeline recreation cand find it.

    async {
-        let tenant = mgr::get_tenant(tenant_shard_id, false)?;
+        let tenant = mgr::get_tenant(tenant_shard_id, true)?;

        let timeline = tenant
            .get_timeline(timeline_id, false)
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -15,7 +15,6 @@ use crate::walrecord::NeonWalRecord;
 use anyhow::{ensure, Context};
 use bytes::{Buf, Bytes, BytesMut};
 use enum_map::Enum;
-use itertools::Itertools;
 use pageserver_api::key::{
    dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key,
    rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key,
@@ -1493,7 +1492,7 @@ impl<'a> DatadirModification<'a> {
            return Ok(());
        }

-        let mut writer = self.tline.writer().await;
+        let writer = self.tline.writer().await;

        // Flush relation and  SLRU data blocks, keep metadata.
        let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
@@ -1532,23 +1531,13 @@ impl<'a> DatadirModification<'a> {
    /// All the modifications in this atomic update are stamped by the specified LSN.
    ///
    pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
-        let mut writer = self.tline.writer().await;
+        let writer = self.tline.writer().await;

        let pending_nblocks = self.pending_nblocks;
        self.pending_nblocks = 0;

        if !self.pending_updates.is_empty() {
-            let prev_pending_updates = std::mem::take(&mut self.pending_updates);
-
-            // The put_batch call below expects expects the inputs to be sorted by Lsn,
-            // so we do that first.
-            let lsn_ordered_batch: Vec<(Key, Lsn, Value)> = prev_pending_updates
-                .into_iter()
-                .map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (key, lsn, val)))
-                .kmerge_by(|lhs, rhs| lhs.1 .0 < rhs.1 .0)
-                .collect();
-
-            writer.put_batch(lsn_ordered_batch, ctx).await?;
+            writer.put_batch(&self.pending_updates, ctx).await?;
            self.pending_updates.clear();
        }

--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3890,7 +3890,7 @@ mod tests {
            .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
            .await?;

-        let mut writer = tline.writer().await;
+        let writer = tline.writer().await;
        writer
            .put(
                *TEST_KEY,
@@ -3902,7 +3902,7 @@ mod tests {
        writer.finish_write(Lsn(0x10));
        drop(writer);

-        let mut writer = tline.writer().await;
+        let writer = tline.writer().await;
        writer
            .put(
                *TEST_KEY,
@@ -3968,7 +3968,7 @@ mod tests {
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
            .await?;
-        let mut writer = tline.writer().await;
+        let writer = tline.writer().await;

        #[allow(non_snake_case)]
        let TEST_KEY_A: Key = Key::from_hex("110000000033333333444444445500000001").unwrap();
@@ -4002,7 +4002,7 @@ mod tests {
        let newtline = tenant
            .get_timeline(NEW_TIMELINE_ID, true)
            .expect("Should have a local timeline");
-        let mut new_writer = newtline.writer().await;
+        let new_writer = newtline.writer().await;
        new_writer
            .put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"), &ctx)
            .await?;
@@ -4034,7 +4034,7 @@ mod tests {
    ) -> anyhow::Result<()> {
        let mut lsn = start_lsn;
        {
-            let mut writer = tline.writer().await;
+            let writer = tline.writer().await;
            // Create a relation on the timeline
            writer
                .put(
@@ -4059,7 +4059,7 @@ mod tests {
        }
        tline.freeze_and_flush().await?;
        {
-            let mut writer = tline.writer().await;
+            let writer = tline.writer().await;
            writer
                .put(
                    *TEST_KEY,
@@ -4422,7 +4422,7 @@ mod tests {
            .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
            .await?;

-        let mut writer = tline.writer().await;
+        let writer = tline.writer().await;
        writer
            .put(
                *TEST_KEY,
@@ -4439,7 +4439,7 @@ mod tests {
            .compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
            .await?;

-        let mut writer = tline.writer().await;
+        let writer = tline.writer().await;
        writer
            .put(
                *TEST_KEY,
@@ -4456,7 +4456,7 @@ mod tests {
            .compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
            .await?;

-        let mut writer = tline.writer().await;
+        let writer = tline.writer().await;
        writer
            .put(
                *TEST_KEY,
@@ -4473,7 +4473,7 @@ mod tests {
            .compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
            .await?;

-        let mut writer = tline.writer().await;
+        let writer = tline.writer().await;
        writer
            .put(
                *TEST_KEY,
@@ -4535,7 +4535,7 @@ mod tests {
        for _ in 0..50 {
            for _ in 0..10000 {
                test_key.field6 = blknum;
-                let mut writer = tline.writer().await;
+                let writer = tline.writer().await;
                writer
                    .put(
                        test_key,
@@ -4597,7 +4597,7 @@ mod tests {
        for blknum in 0..NUM_KEYS {
            lsn = Lsn(lsn.0 + 0x10);
            test_key.field6 = blknum as u32;
-            let mut writer = tline.writer().await;
+            let writer = tline.writer().await;
            writer
                .put(
                    test_key,
@@ -4618,7 +4618,7 @@ mod tests {
                lsn = Lsn(lsn.0 + 0x10);
                let blknum = thread_rng().gen_range(0..NUM_KEYS);
                test_key.field6 = blknum as u32;
-                let mut writer = tline.writer().await;
+                let writer = tline.writer().await;
                writer
                    .put(
                        test_key,
@@ -4686,7 +4686,7 @@ mod tests {
        for blknum in 0..NUM_KEYS {
            lsn = Lsn(lsn.0 + 0x10);
            test_key.field6 = blknum as u32;
-            let mut writer = tline.writer().await;
+            let writer = tline.writer().await;
            writer
                .put(
                    test_key,
@@ -4715,7 +4715,7 @@ mod tests {
                lsn = Lsn(lsn.0 + 0x10);
                let blknum = thread_rng().gen_range(0..NUM_KEYS);
                test_key.field6 = blknum as u32;
-                let mut writer = tline.writer().await;
+                let writer = tline.writer().await;
                writer
                    .put(
                        test_key,
@@ -4792,7 +4792,7 @@ mod tests {
                lsn = Lsn(lsn.0 + 0x10);
                let blknum = thread_rng().gen_range(0..NUM_KEYS);
                test_key.field6 = blknum as u32;
-                let mut writer = tline.writer().await;
+                let writer = tline.writer().await;
                writer
                    .put(
                        test_key,
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -246,8 +246,6 @@ async fn cleanup_remaining_fs_traces(

    rm(conf.tenant_deleted_mark_file_path(tenant_shard_id), false).await?;

-    rm(conf.tenant_heatmap_path(tenant_shard_id), false).await?;
-
    fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| {
        Err(anyhow::anyhow!(
            "failpoint: tenant-delete-before-remove-tenant-dir"
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -246,17 +246,32 @@ impl InMemoryLayer {

    /// Common subroutine of the public put_wal_record() and put_page_image() functions.
    /// Adds the page version to the in-memory tree
-
    pub(crate) async fn put_value(
        &self,
        key: Key,
        lsn: Lsn,
-        buf: &[u8],
+        val: &Value,
        ctx: &RequestContext,
    ) -> Result<()> {
        let mut inner = self.inner.write().await;
        self.assert_writable();
-        self.put_value_locked(&mut inner, key, lsn, buf, ctx).await
+        self.put_value_locked(&mut inner, key, lsn, val, ctx).await
+    }
+
+    pub(crate) async fn put_values(
+        &self,
+        values: &HashMap<Key, Vec<(Lsn, Value)>>,
+        ctx: &RequestContext,
+    ) -> Result<()> {
+        let mut inner = self.inner.write().await;
+        self.assert_writable();
+        for (key, vals) in values {
+            for (lsn, val) in vals {
+                self.put_value_locked(&mut inner, *key, *lsn, val, ctx)
+                    .await?;
+            }
+        }
+        Ok(())
    }

    async fn put_value_locked(
@@ -264,16 +279,22 @@ impl InMemoryLayer {
        locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>,
        key: Key,
        lsn: Lsn,
-        buf: &[u8],
+        val: &Value,
        ctx: &RequestContext,
    ) -> Result<()> {
        trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);

        let off = {
+            // Avoid doing allocations for "small" values.
+            // In the regression test suite, the limit of 256 avoided allocations in 95% of cases:
+            // https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061
+            let mut buf = smallvec::SmallVec::<[u8; 256]>::new();
+            buf.clear();
+            val.ser_into(&mut buf)?;
            locked_inner
                .file
                .write_blob(
-                    buf,
+                    &buf,
                    &RequestContextBuilder::extend(ctx)
                        .page_content_kind(PageContentKind::InMemoryLayer)
                        .build(),
@@ -301,12 +322,7 @@ impl InMemoryLayer {
    pub async fn freeze(&self, end_lsn: Lsn) {
        let inner = self.inner.write().await;

-        assert!(
-            self.start_lsn < end_lsn,
-            "{} >= {}",
-            self.start_lsn,
-            end_lsn
-        );
+        assert!(self.start_lsn < end_lsn);
        self.end_lsn.set(end_lsn).expect("end_lsn set only once");

        for vec_map in inner.index.values() {
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -33,7 +33,7 @@ use tokio::{
 };
 use tokio_util::sync::CancellationToken;
 use tracing::*;
-use utils::{bin_ser::BeSer, sync::gate::Gate};
+use utils::sync::gate::Gate;

 use std::ops::{Deref, Range};
 use std::pin::pin;
@@ -274,7 +274,7 @@ pub struct Timeline {
    /// Locked automatically by [`TimelineWriter`] and checkpointer.
    /// Must always be acquired before the layer map/individual layer lock
    /// to avoid deadlock.
-    write_lock: tokio::sync::Mutex<Option<TimelineWriterState>>,
+    write_lock: tokio::sync::Mutex<()>,

    /// Used to avoid multiple `flush_loop` tasks running
    pub(super) flush_loop_state: Mutex<FlushLoopState>,
@@ -1051,10 +1051,53 @@ impl Timeline {
    pub(crate) async fn writer(&self) -> TimelineWriter<'_> {
        TimelineWriter {
            tl: self,
-            write_guard: self.write_lock.lock().await,
+            _write_guard: self.write_lock.lock().await,
        }
    }

+    /// Freeze the open in-memory layer and wake up the layer flusher if the WAL accumulated
+    /// since the last freeze reaches 'checkpoint_distance' or the open layer grows beyond it.
+    /// Also do so once 'checkpoint_timeout' has passed since the last freeze and the last record
+    /// LSN has advanced -- it helps safekeepers to regard pageserver as caught up and suspend
+    /// activity. Returns early without doing anything if there is no open layer.
+    pub(crate) async fn check_checkpoint_distance(self: &Arc<Timeline>) -> anyhow::Result<()> {
+        let last_lsn = self.get_last_record_lsn();
+        let open_layer_size = {
+            let guard = self.layers.read().await;
+            let layers = guard.layer_map();
+            let Some(open_layer) = layers.open_layer.as_ref() else {
+                return Ok(());
+            };
+            open_layer.size().await?
+        };
+        let last_freeze_at = self.last_freeze_at.load();
+        let last_freeze_ts = *(self.last_freeze_ts.read().unwrap());
+        let distance = last_lsn.widening_sub(last_freeze_at);
+        // Checkpointing the open layer can be triggered by layer size or LSN range.
+        // S3 has a 5 GB limit on the size of one upload (without multi-part upload), and
+        // we want to stay below that with a big margin.  The LSN distance determines how
+        // much WAL the safekeepers need to store.
+        if distance >= self.get_checkpoint_distance().into()
+            || open_layer_size > self.get_checkpoint_distance()
+            || (distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout())
+        {
+            info!(
+                "check_checkpoint_distance {}, layer size {}, elapsed since last flush {:?}",
+                distance,
+                open_layer_size,
+                last_freeze_ts.elapsed()
+            );
+            // NOTE(review): `true` is `write_lock_held`, so no write_lock is taken here -- confirm.
+            self.freeze_inmem_layer(true).await;
+            self.last_freeze_at.store(last_lsn);
+            *(self.last_freeze_ts.write().unwrap()) = Instant::now();
+
+            // Wake up the layer flusher
+            self.flush_frozen_layers();
+        }
+        Ok(())
+    }
+
    pub(crate) fn activate(
        self: &Arc<Self>,
        broker_client: BrokerClientChannel,
@@ -1486,7 +1529,7 @@ impl Timeline {
                layer_flush_start_tx,
                layer_flush_done_tx,

-                write_lock: tokio::sync::Mutex::new(None),
+                write_lock: tokio::sync::Mutex::new(()),

                gc_info: std::sync::RwLock::new(GcInfo {
                    retain_lsns: Vec::new(),
@@ -2659,6 +2702,43 @@ impl Timeline {
        Ok(layer)
    }

+    async fn put_value(
+        &self,
+        key: Key,
+        lsn: Lsn,
+        val: &Value,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        // Write one value into the layer that get_layer_for_write returns for `lsn`.
+        let layer = self.get_layer_for_write(lsn).await?;
+        layer.put_value(key, lsn, val, ctx).await?;
+        Ok(())
+    }
+
+    async fn put_values(
+        &self,
+        values: &HashMap<Key, Vec<(Lsn, Value)>>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        // The first LSN found in the batch selects the layer; an all-empty batch is a no-op.
+        let first_lsn = values
+            .values()
+            .find_map(|lsns| lsns.first().map(|(lsn, _)| *lsn));
+        if let Some(lsn) = first_lsn {
+            let layer = self.get_layer_for_write(lsn).await?;
+            layer.put_values(values, ctx).await?;
+        }
+        Ok(())
+    }
+
+    async fn put_tombstones(&self, tombstones: &[(Range<Key>, Lsn)]) -> anyhow::Result<()> {
+        if let Some((_, lsn)) = tombstones.first() {
+            let layer = self.get_layer_for_write(*lsn).await?; // layer chosen by the first LSN
+            layer.put_tombstones(tombstones).await?;
+        }
+        Ok(())
+    }
+
    pub(crate) fn finish_write(&self, new_lsn: Lsn) {
        assert!(new_lsn.is_aligned());

@@ -2669,20 +2749,14 @@ impl Timeline {
    async fn freeze_inmem_layer(&self, write_lock_held: bool) {
        // Freeze the current open in-memory layer. It will be written to disk on next
        // iteration.
-
        let _write_guard = if write_lock_held {
            None
        } else {
            Some(self.write_lock.lock().await)
        };
-
-        self.freeze_inmem_layer_at(self.get_last_record_lsn()).await;
-    }
-
-    async fn freeze_inmem_layer_at(&self, at: Lsn) {
        let mut guard = self.layers.write().await;
        guard
-            .try_freeze_in_memory_layer(at, &self.last_freeze_at)
+            .try_freeze_in_memory_layer(self.get_last_record_lsn(), &self.last_freeze_at)
            .await;
    }

@@ -3856,6 +3930,27 @@ impl Timeline {
                // Remember size of key value because at next iteration we will access next item
                key_values_total_size = next_key_size;
            }
+            if writer.is_none() {
+                // Create writer if not initialized yet
+                writer = Some(
+                    DeltaLayerWriter::new(
+                        self.conf,
+                        self.timeline_id,
+                        self.tenant_shard_id,
+                        key,
+                        if dup_end_lsn.is_valid() {
+                            // this is a layer containing slice of values of the same key
+                            debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn);
+                            dup_start_lsn..dup_end_lsn
+                        } else {
+                            debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
+                            lsn_range.clone()
+                        },
+                    )
+                    .await?,
+                );
+            }
+
            fail_point!("delta-layer-writer-fail-before-finish", |_| {
                Err(CompactionError::Other(anyhow::anyhow!(
                    "failpoint delta-layer-writer-fail-before-finish"
@@ -3863,27 +3958,6 @@ impl Timeline {
            });

            if !self.shard_identity.is_key_disposable(&key) {
-                if writer.is_none() {
-                    // Create writer if not initiaized yet
-                    writer = Some(
-                        DeltaLayerWriter::new(
-                            self.conf,
-                            self.timeline_id,
-                            self.tenant_shard_id,
-                            key,
-                            if dup_end_lsn.is_valid() {
-                                // this is a layer containing slice of values of the same key
-                                debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn);
-                                dup_start_lsn..dup_end_lsn
-                            } else {
-                                debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
-                                lsn_range.clone()
-                            },
-                        )
-                        .await?,
-                    );
-                }
-
                writer.as_mut().unwrap().put_value(key, lsn, value).await?;
            } else {
                debug!(
@@ -4705,43 +4779,13 @@ fn layer_traversal_error(msg: String, path: Vec<TraversalPathItem>) -> PageRecon
    PageReconstructError::from(msg)
 }

-struct TimelineWriterState {
-    open_layer: Arc<InMemoryLayer>,
-    current_size: u64,
-    // Previous Lsn which passed through
-    prev_lsn: Option<Lsn>,
-    // Largest Lsn which passed through the current writer
-    max_lsn: Option<Lsn>,
-    // Cached details of the last freeze. Avoids going trough the atomic/lock on every put.
-    cached_last_freeze_at: Lsn,
-    cached_last_freeze_ts: Instant,
-}
-
-impl TimelineWriterState {
-    fn new(
-        open_layer: Arc<InMemoryLayer>,
-        current_size: u64,
-        last_freeze_at: Lsn,
-        last_freeze_ts: Instant,
-    ) -> Self {
-        Self {
-            open_layer,
-            current_size,
-            prev_lsn: None,
-            max_lsn: None,
-            cached_last_freeze_at: last_freeze_at,
-            cached_last_freeze_ts: last_freeze_ts,
-        }
-    }
-}
-
 /// Various functions to mutate the timeline.
 // TODO Currently, Deref is used to allow easy access to read methods from this trait.
 // This is probably considered a bad practice in Rust and should be fixed eventually,
 // but will cause large code changes.
 pub(crate) struct TimelineWriter<'a> {
    tl: &'a Timeline,
-    write_guard: tokio::sync::MutexGuard<'a, Option<TimelineWriterState>>,
+    _write_guard: tokio::sync::MutexGuard<'a, ()>,
 }

 impl Deref for TimelineWriter<'_> {
@@ -4752,189 +4796,31 @@ impl Deref for TimelineWriter<'_> {
    }
 }

-impl Drop for TimelineWriter<'_> {
-    fn drop(&mut self) {
-        self.write_guard.take();
-    }
-}
-
-enum OpenLayerAction {
-    Roll,
-    Open,
-    None,
-}
-
 impl<'a> TimelineWriter<'a> {
    /// Put a new page version that can be constructed from a WAL record
    ///
    /// This will implicitly extend the relation, if the page is beyond the
    /// current end-of-file.
    pub(crate) async fn put(
-        &mut self,
+        &self,
        key: Key,
        lsn: Lsn,
        value: &Value,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
-        // Avoid doing allocations for "small" values.
-        // In the regression test suite, the limit of 256 avoided allocations in 95% of cases:
-        // https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061
-        let mut buf = smallvec::SmallVec::<[u8; 256]>::new();
-        buf.clear();
-        value.ser_into(&mut buf)?;
-        let buf_size: u64 = buf.len().try_into().expect("oversized value buf");
-
-        let action = self.get_open_layer_action(lsn, buf_size);
-        let layer = self.handle_open_layer_action(lsn, action).await?;
-        let res = layer.put_value(key, lsn, &buf, ctx).await;
-
-        if res.is_ok() {
-            // Update the current size only when the entire write was ok.
-            // In case of failures, we may have had partial writes which
-            // render the size tracking out of sync. That's ok because
-            // the checkpoint distance should be significantly smaller
-            // than the S3 single shot upload limit of 5GiB.
-            let state = self.write_guard.as_mut().unwrap();
-
-            state.current_size += buf_size;
-            state.prev_lsn = Some(lsn);
-            state.max_lsn = std::cmp::max(state.max_lsn, Some(lsn));
-        }
-
-        res
+        self.tl.put_value(key, lsn, value, ctx).await
    }

-    async fn handle_open_layer_action(
-        &mut self,
-        at: Lsn,
-        action: OpenLayerAction,
-    ) -> anyhow::Result<&Arc<InMemoryLayer>> {
-        match action {
-            OpenLayerAction::Roll => {
-                let max_lsn = self.write_guard.as_ref().unwrap().max_lsn.unwrap();
-                self.tl.freeze_inmem_layer_at(max_lsn).await;
-
-                let now = Instant::now();
-                *(self.last_freeze_ts.write().unwrap()) = now;
-
-                self.tl.flush_frozen_layers();
-
-                let current_size = self.write_guard.as_ref().unwrap().current_size;
-                if current_size > self.get_checkpoint_distance() {
-                    warn!("Flushed oversized open layer with size {}", current_size)
-                }
-
-                assert!(self.write_guard.is_some());
-
-                let layer = self.tl.get_layer_for_write(at).await?;
-                let initial_size = layer.size().await?;
-                self.write_guard.replace(TimelineWriterState::new(
-                    layer,
-                    initial_size,
-                    Lsn(max_lsn.0 + 1),
-                    now,
-                ));
-            }
-            OpenLayerAction::Open => {
-                assert!(self.write_guard.is_none());
-
-                let layer = self.tl.get_layer_for_write(at).await?;
-                let initial_size = layer.size().await?;
-
-                let last_freeze_at = self.last_freeze_at.load();
-                let last_freeze_ts = *self.last_freeze_ts.read().unwrap();
-                self.write_guard.replace(TimelineWriterState::new(
-                    layer,
-                    initial_size,
-                    last_freeze_at,
-                    last_freeze_ts,
-                ));
-            }
-            OpenLayerAction::None => {
-                assert!(self.write_guard.is_some());
-            }
-        }
-
-        Ok(&self.write_guard.as_ref().unwrap().open_layer)
-    }
-
-    fn get_open_layer_action(&self, lsn: Lsn, new_value_size: u64) -> OpenLayerAction {
-        let state = &*self.write_guard;
-        let Some(state) = &state else {
-            return OpenLayerAction::Open;
-        };
-
-        if state.prev_lsn == Some(lsn) {
-            // Rolling mid LSN is not supported by downstream code.
-            // Hence, only roll at LSN boundaries.
-            return OpenLayerAction::None;
-        }
-
-        let distance = lsn.widening_sub(state.cached_last_freeze_at);
-        let proposed_open_layer_size = state.current_size + new_value_size;
-
-        // Rolling the open layer can be triggered by:
-        // 1. The distance from the last LSN we rolled at. This bounds the amount of WAL that
-        //    the safekeepers need to store.
-        // 2. The size of the currently open layer.
-        // 3. The time since the last roll. It helps safekeepers to regard pageserver as caught
-        //    up and suspend activity.
-        if distance >= self.get_checkpoint_distance().into() {
-            info!(
-                "Will roll layer at {} with layer size {} due to LSN distance ({})",
-                lsn, state.current_size, distance
-            );
-
-            OpenLayerAction::Roll
-        } else if state.current_size > 0
-            && proposed_open_layer_size >= self.get_checkpoint_distance()
-        {
-            info!(
-                "Will roll layer at {} with layer size {} due to layer size ({})",
-                lsn, state.current_size, proposed_open_layer_size
-            );
-
-            OpenLayerAction::Roll
-        } else if distance > 0
-            && state.cached_last_freeze_ts.elapsed() >= self.get_checkpoint_timeout()
-        {
-            info!(
-                "Will roll layer at {} with layer size {} due to time since last flush ({:?})",
-                lsn,
-                state.current_size,
-                state.cached_last_freeze_ts.elapsed()
-            );
-
-            OpenLayerAction::Roll
-        } else {
-            OpenLayerAction::None
-        }
-    }
-
-    /// Put a batch keys at the specified Lsns.
-    ///
-    /// The batch should be sorted by Lsn such that it's safe
-    /// to roll the open layer mid batch.
    pub(crate) async fn put_batch(
-        &mut self,
-        batch: Vec<(Key, Lsn, Value)>,
+        &self,
+        batch: &HashMap<Key, Vec<(Lsn, Value)>>,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
-        for (key, lsn, val) in batch {
-            self.put(key, lsn, &val, ctx).await?
-        }
-
-        Ok(())
+        self.tl.put_values(batch, ctx).await
    }

-    pub(crate) async fn delete_batch(&mut self, batch: &[(Range<Key>, Lsn)]) -> anyhow::Result<()> {
-        if let Some((_, lsn)) = batch.first() {
-            let action = self.get_open_layer_action(*lsn, 0);
-            let layer = self.handle_open_layer_action(*lsn, action).await?;
-            layer.put_tombstones(batch).await?;
-        }
-
-        Ok(())
+    pub(crate) async fn delete_batch(&self, batch: &[(Range<Key>, Lsn)]) -> anyhow::Result<()> {
+        self.tl.put_tombstones(batch).await
    }

    /// Track the end of the latest digested WAL record.
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -343,6 +343,23 @@ pub(super) async fn handle_walreceiver_connection(
                            modification.commit(&ctx).await?;
                            uncommitted_records = 0;
                            filtered_records = 0;
+
+                            //
+                            // We should check the checkpoint distance after appending each ingest_batch_size bytes, because otherwise
+                            // the layer size can become much larger than `checkpoint_distance`.
+                            // This can happen because wal-sender sends WAL in 125kb chunks, and some WAL records can cause a large
+                            // amount of data to be written to key-value storage. So performing this check only after processing
+                            // all WAL records in the chunk can cause huge L0 layer files.
+                            //
+                            timeline
+                                .check_checkpoint_distance()
+                                .await
+                                .with_context(|| {
+                                    format!(
+                                        "Failed to check checkpoint distance for timeline {}",
+                                        timeline.timeline_id
+                                    )
+                                })?;
                        }
                    }

@@ -389,6 +406,16 @@ pub(super) async fn handle_walreceiver_connection(
            }
        }

+        timeline
+            .check_checkpoint_distance()
+            .await
+            .with_context(|| {
+                format!(
+                    "Failed to check checkpoint distance for timeline {}",
+                    timeline.timeline_id
+                )
+            })?;
+
        if let Some(last_lsn) = status_update {
            let timeline_remote_consistent_lsn = timeline
                .get_remote_consistent_lsn_visible()
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -132,8 +132,9 @@ struct Scram(scram::ServerSecret);

 impl Scram {
    fn new(password: &str) -> anyhow::Result<Self> {
-        let secret =
-            scram::ServerSecret::build(password).context("failed to generate scram secret")?;
+        let salt = rand::random::<[u8; 16]>();
+        let secret = scram::ServerSecret::build(password, &salt, 256)
+            .context("failed to generate scram secret")?;
        Ok(Scram(secret))
    }

--- a/proxy/src/scram.rs
+++ b/proxy/src/scram.rs
@@ -12,6 +12,9 @@ mod messages;
 mod secret;
 mod signature;

+#[cfg(any(test, doc))]
+mod password;
+
 pub use exchange::{exchange, Exchange};
 pub use key::ScramKey;
 pub use secret::ServerSecret;
@@ -56,21 +59,27 @@ fn sha256<'a>(parts: impl IntoIterator<Item = &'a [u8]>) -> [u8; 32] {

 #[cfg(test)]
 mod tests {
-    use postgres_protocol::authentication::sasl::{ChannelBinding, ScramSha256};
-
    use crate::sasl::{Mechanism, Step};

-    use super::{Exchange, ServerSecret};
+    use super::{password::SaltedPassword, Exchange, ServerSecret};

    #[test]
-    fn snapshot() {
+    fn happy_path() {
        let iterations = 4096;
-        let salt = "QSXCR+Q6sek8bf92";
-        let stored_key = "FO+9jBb3MUukt6jJnzjPZOWc5ow/Pu6JtPyju0aqaE8=";
-        let server_key = "qxJ1SbmSAi5EcS0J5Ck/cKAm/+Ixa+Kwp63f4OHDgzo=";
-        let secret = format!("SCRAM-SHA-256${iterations}:{salt}${stored_key}:{server_key}",);
-        let secret = ServerSecret::parse(&secret).unwrap();
+        let salt_base64 = "QSXCR+Q6sek8bf92";
+        let pw = SaltedPassword::new(
+            b"pencil",
+            base64::decode(salt_base64).unwrap().as_slice(),
+            iterations,
+        );

+        let secret = ServerSecret {
+            iterations,
+            salt_base64: salt_base64.to_owned(),
+            stored_key: pw.client_key().sha256(),
+            server_key: pw.server_key(),
+            doomed: false,
+        };
        const NONCE: [u8; 18] = [
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
        ];
@@ -112,33 +121,4 @@ mod tests {
            ]
        );
    }
-
-    fn run_round_trip_test(server_password: &str, client_password: &str) {
-        let scram_secret = ServerSecret::build(server_password).unwrap();
-        let sasl_client =
-            ScramSha256::new(client_password.as_bytes(), ChannelBinding::unsupported());
-
-        let outcome = super::exchange(
-            &scram_secret,
-            sasl_client,
-            crate::config::TlsServerEndPoint::Undefined,
-        )
-        .unwrap();
-
-        match outcome {
-            crate::sasl::Outcome::Success(_) => {}
-            crate::sasl::Outcome::Failure(r) => panic!("{r}"),
-        }
-    }
-
-    #[test]
-    fn round_trip() {
-        run_round_trip_test("pencil", "pencil")
-    }
-
-    #[test]
-    #[should_panic(expected = "password doesn't match")]
-    fn failure() {
-        run_round_trip_test("pencil", "eraser")
-    }
 }
--- a/proxy/src/scram/key.rs
+++ b/proxy/src/scram/key.rs
@@ -3,7 +3,7 @@
 /// Faithfully taken from PostgreSQL.
 pub const SCRAM_KEY_LEN: usize = 32;

-/// One of the keys derived from the user's password.
+/// One of the keys derived from the [password](super::password::SaltedPassword).
 /// We use the same structure for all keys, i.e.
 /// `ClientKey`, `StoredKey`, and `ServerKey`.
 #[derive(Clone, Default, PartialEq, Eq, Debug)]
--- a/proxy/src/scram/password.rs
+++ b/proxy/src/scram/password.rs
@@ -0,0 +1,74 @@
+//! Password hashing routines.
+
+use super::key::ScramKey;
+
+pub const SALTED_PASSWORD_LEN: usize = 32;
+
+/// Salted hashed password is essential for [key](super::key) derivation.
+#[repr(transparent)]
+pub struct SaltedPassword {
+    bytes: [u8; SALTED_PASSWORD_LEN],
+}
+
+impl SaltedPassword {
+    /// See `scram-common.c : scram_SaltedPassword` for details.
+    /// Further reading: <https://datatracker.ietf.org/doc/html/rfc2898> (see `PBKDF2`).
+    pub fn new(password: &[u8], salt: &[u8], iterations: u32) -> SaltedPassword {
+        pbkdf2::pbkdf2_hmac_array::<sha2::Sha256, 32>(password, salt, iterations).into()
+    }
+
+    /// Derive `ClientKey` from a salted hashed password.
+    pub fn client_key(&self) -> ScramKey {
+        super::hmac_sha256(&self.bytes, [b"Client Key".as_ref()]).into()
+    }
+
+    /// Derive `ServerKey` from a salted hashed password.
+    pub fn server_key(&self) -> ScramKey {
+        super::hmac_sha256(&self.bytes, [b"Server Key".as_ref()]).into()
+    }
+}
+
+impl From<[u8; SALTED_PASSWORD_LEN]> for SaltedPassword {
+    #[inline(always)]
+    fn from(bytes: [u8; SALTED_PASSWORD_LEN]) -> Self {
+        Self { bytes }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::SaltedPassword;
+
+    fn legacy_pbkdf2_impl(password: &[u8], salt: &[u8], iterations: u32) -> SaltedPassword {
+        let one = 1_u32.to_be_bytes(); // magic
+
+        let mut current = super::super::hmac_sha256(password, [salt, &one]);
+        let mut result = current;
+        for _ in 1..iterations {
+            current = super::super::hmac_sha256(password, [current.as_ref()]);
+            // TODO: result = current.zip(result).map(|(x, y)| x ^ y), issue #80094
+            for (i, x) in current.iter().enumerate() {
+                result[i] ^= x;
+            }
+        }
+
+        result.into()
+    }
+
+    #[test]
+    fn pbkdf2() {
+        let password = "a-very-secure-password";
+        let salt = "such-a-random-salt";
+        let iterations = 4096;
+        let output = [
+            203, 18, 206, 81, 4, 154, 193, 100, 147, 41, 211, 217, 177, 203, 69, 210, 194, 211,
+            101, 1, 248, 156, 96, 0, 8, 223, 30, 87, 158, 41, 20, 42,
+        ];
+
+        let actual = SaltedPassword::new(password.as_bytes(), salt.as_bytes(), iterations);
+        let expected = legacy_pbkdf2_impl(password.as_bytes(), salt.as_bytes(), iterations);
+
+        assert_eq!(actual.bytes, output);
+        assert_eq!(actual.bytes, expected.bytes);
+    }
+}
--- a/proxy/src/scram/secret.rs
+++ b/proxy/src/scram/secret.rs
@@ -3,7 +3,7 @@
 use super::base64_decode_array;
 use super::key::ScramKey;

-/// Server secret is produced from user's password,
+/// Server secret is produced from [password](super::password::SaltedPassword)
 /// and is used throughout the authentication process.
 #[derive(Clone, Eq, PartialEq, Debug)]
 pub struct ServerSecret {
@@ -59,10 +59,21 @@ impl ServerSecret {
    /// Build a new server secret from the prerequisites.
    /// XXX: We only use this function in tests.
    #[cfg(test)]
-    pub fn build(password: &str) -> Option<Self> {
-        Self::parse(&postgres_protocol::password::scram_sha_256(
-            password.as_bytes(),
-        ))
+    pub fn build(password: &str, salt: &[u8], iterations: u32) -> Option<Self> {
+        // TODO: implement proper password normalization required by the RFC
+        if !password.is_ascii() {
+            return None;
+        }
+
+        let password = super::password::SaltedPassword::new(password.as_bytes(), salt, iterations);
+
+        Some(Self {
+            iterations,
+            salt_base64: base64::encode(salt),
+            stored_key: password.client_key().sha256(),
+            server_key: password.server_key(),
+            doomed: false,
+        })
    }
 }

@@ -92,4 +103,20 @@ mod tests {
        assert_eq!(base64::encode(parsed.stored_key), stored_key);
        assert_eq!(base64::encode(parsed.server_key), server_key);
    }
+
+    #[test]
+    fn build_scram_secret() {
+        let salt = b"salt";
+        let secret = ServerSecret::build("password", salt, 4096).unwrap();
+        assert_eq!(secret.iterations, 4096);
+        assert_eq!(secret.salt_base64, base64::encode(salt));
+        assert_eq!(
+            base64::encode(secret.stored_key.as_ref()),
+            "lF4cRm/Jky763CN4HtxdHnjV4Q8AWTNlKvGmEFFU8IQ="
+        );
+        assert_eq!(
+            base64::encode(secret.server_key.as_ref()),
+            "ub8OgRsftnk2ccDMOt7ffHXNcikRkQkq1lh4xaAqrSw="
+        );
+    }
 }
--- a/scripts/comment-test-report.js
+++ b/scripts/comment-test-report.js
@@ -188,7 +188,7 @@ const reportSummary = async (params) => {
 }

 const parseCoverageSummary = async ({ summaryJsonUrl, coverageUrl, fetch }) => {
-    let summary = `\n### Code coverage* ([full report](${coverageUrl}))\n`
+    let summary = `\n### Code coverage ([full report](${coverageUrl}))\n`

    const coverage = await (await fetch(summaryJsonUrl)).json()
    for (const covType of Object.keys(coverage).sort()) {
@@ -198,7 +198,7 @@ const parseCoverageSummary = async ({ summaryJsonUrl, coverageUrl, fetch }) => {

        summary += `- \`${covType}s\`: \`${coverage[covType]["_summary"]}\`\n`
    }
-    summary += "\n\\* collected from Rust tests only\n"
+
    summary += `\n___\n`

    return summary
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -4,8 +4,6 @@ from typing import Dict, List, Optional, Tuple
 from prometheus_client.parser import text_string_to_metric_families
 from prometheus_client.samples import Sample

-from fixtures.log_helper import log
-

 class Metrics:
    metrics: Dict[str, List[Sample]]
@@ -33,55 +31,6 @@ class Metrics:
        return res[0]


-class MetricsGetter:
-    """
-    Mixin for types that implement a `get_metrics` function and would like associated
-    helpers for querying the metrics
-    """
-
-    def get_metrics(self) -> Metrics:
-        raise NotImplementedError()
-
-    def get_metric_value(
-        self, name: str, filter: Optional[Dict[str, str]] = None
-    ) -> Optional[float]:
-        metrics = self.get_metrics()
-        results = metrics.query_all(name, filter=filter)
-        if not results:
-            log.info(f'could not find metric "{name}"')
-            return None
-        assert len(results) == 1, f"metric {name} with given filters is not unique, got: {results}"
-        return results[0].value
-
-    def get_metrics_values(
-        self, names: list[str], filter: Optional[Dict[str, str]] = None
-    ) -> Dict[str, float]:
-        """
-        When fetching multiple named metrics, it is more efficient to use this
-        than to call `get_metric_value` repeatedly.
-
-        Throws RuntimeError if no metrics matching `names` are found, or if
-        not all of `names` are found: this method is intended for loading sets
-        of metrics whose existence is coupled.
-        """
-        metrics = self.get_metrics()
-        samples = []
-        for name in names:
-            samples.extend(metrics.query_all(name, filter=filter))
-
-        result = {}
-        for sample in samples:
-            if sample.name in result:
-                raise RuntimeError(f"Multiple values found for {sample.name}")
-            result[sample.name] = sample.value
-
-        if len(result) != len(names):
-            log.info(f"Metrics found: {metrics.metrics}")
-            raise RuntimeError(f"could not find all metrics {' '.join(names)}")
-
-        return result
-
-
 def parse_metrics(text: str, name: str = "") -> Metrics:
    metrics = Metrics(name)
    gen = text_string_to_metric_families(text)
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -46,7 +46,6 @@ from urllib3.util.retry import Retry
 from fixtures import overlayfs
 from fixtures.broker import NeonBroker
 from fixtures.log_helper import log
-from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
 from fixtures.pageserver.allowed_errors import (
    DEFAULT_PAGESERVER_ALLOWED_ERRORS,
    scan_pageserver_log_for_errors,
@@ -1914,7 +1913,7 @@ class Pagectl(AbstractNeonCli):
        return IndexPartDump.from_json(parsed)


-class NeonAttachmentService(MetricsGetter):
+class NeonAttachmentService:
    def __init__(self, env: NeonEnv, auth_enabled: bool):
        self.env = env
        self.running = False
@@ -1952,11 +1951,6 @@ class NeonAttachmentService(MetricsGetter):

        return headers

-    def get_metrics(self) -> Metrics:
-        res = self.request("GET", f"{self.env.attachment_service_api}/metrics")
-        res.raise_for_status()
-        return parse_metrics(res.text)
-
    def ready(self) -> bool:
        resp = self.request("GET", f"{self.env.attachment_service_api}/ready")
        if resp.status_code == 503:
@@ -2100,17 +2094,6 @@ class NeonAttachmentService(MetricsGetter):
        log.info(f"Migrated tenant {tenant_shard_id} to pageserver {dest_ps_id}")
        assert self.env.get_tenant_pageserver(tenant_shard_id).id == dest_ps_id

-    def consistency_check(self):
-        """
-        Throw an exception if the service finds any inconsistencies in its state
-        """
-        response = self.request(
-            "POST",
-            f"{self.env.attachment_service_api}/debug/v1/consistency_check",
-        )
-        response.raise_for_status()
-        log.info("Attachment service passed consistency check")
-
    def __enter__(self) -> "NeonAttachmentService":
        return self

--- a/test_runner/fixtures/pageserver/allowed_errors.py
+++ b/test_runner/fixtures/pageserver/allowed_errors.py
@@ -82,11 +82,6 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
    # During shutdown, DownloadError::Cancelled may be logged as an error.  Cleaning this
    # up is tracked in https://github.com/neondatabase/neon/issues/6096
    ".*Cancelled, shutting down.*",
-    # Open layers are only rolled at Lsn boundaries to avoid name clashses.
-    # Hence, we can overshoot the soft limit set by checkpoint distance.
-    # This is especially pronounced in tests that set small checkpoint
-    # distances.
-    ".*Flushed oversized open layer with size.*",
 )


--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -12,7 +12,7 @@ from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry

 from fixtures.log_helper import log
-from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
+from fixtures.metrics import Metrics, parse_metrics
 from fixtures.pg_version import PgVersion
 from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId
 from fixtures.utils import Fn
@@ -125,7 +125,7 @@ class TenantConfig:
        )


-class PageserverHttpClient(requests.Session, MetricsGetter):
+class PageserverHttpClient(requests.Session):
    def __init__(
        self,
        port: int,
@@ -721,6 +721,45 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
            assert len(matches) < 2, "above filter should uniquely identify metric"
        return value

+    def get_metric_value(
+        self, name: str, filter: Optional[Dict[str, str]] = None
+    ) -> Optional[float]:
+        metrics = self.get_metrics()
+        results = metrics.query_all(name, filter=filter)
+        if not results:
+            log.info(f'could not find metric "{name}"')
+            return None
+        assert len(results) == 1, f"metric {name} with given filters is not unique, got: {results}"
+        return results[0].value
+
+    def get_metrics_values(
+        self, names: list[str], filter: Optional[Dict[str, str]] = None
+    ) -> Dict[str, float]:
+        """
+        When fetching multiple named metrics, it is more efficient to use this
+        than to call `get_metric_value` repeatedly.
+
+        Throws RuntimeError if no metrics matching `names` are found, or if
+        not all of `names` are found: this method is intended for loading sets
+        of metrics whose existence is coupled.
+        """
+        metrics = self.get_metrics()
+        samples = []
+        for name in names:
+            samples.extend(metrics.query_all(name, filter=filter))
+
+        result = {}
+        for sample in samples:
+            if sample.name in result:
+                raise RuntimeError(f"Multiple values found for {sample.name}")
+            result[sample.name] = sample.value
+
+        if len(result) != len(names):
+            log.info(f"Metrics found: {metrics.metrics}")
+            raise RuntimeError(f"could not find all metrics {' '.join(names)}")
+
+        return result
+
    def layer_map_info(
        self,
        tenant_id: Union[TenantId, TenantShardId],
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -2,7 +2,6 @@ import time
 from typing import Any, Dict, List, Optional, Union

 from mypy_boto3_s3.type_defs import (
-    DeleteObjectOutputTypeDef,
    EmptyResponseMetadataTypeDef,
    ListObjectsV2OutputTypeDef,
    ObjectTypeDef,
@@ -332,6 +331,7 @@ def list_prefix(
    """
    # For local_fs we need to properly handle empty directories, which we currently dont, so for simplicity stick to s3 api.
    assert isinstance(remote, S3Storage), "localfs is currently not supported"
+    assert remote.client is not None

    prefix_in_bucket = remote.prefix_in_bucket or ""
    if not prefix:
@@ -350,29 +350,6 @@ def list_prefix(
    return response


-def remote_storage_delete_key(
-    remote: RemoteStorage,
-    key: str,
-) -> DeleteObjectOutputTypeDef:
-    """
-    Note that this function takes into account prefix_in_bucket.
-    """
-    # For local_fs we need to use a different implementation. As we don't need local_fs, just don't support it for now.
-    assert isinstance(remote, S3Storage), "localfs is currently not supported"
-
-    prefix_in_bucket = remote.prefix_in_bucket or ""
-
-    # real s3 tests have uniqie per test prefix
-    # mock_s3 tests use special pageserver prefix for pageserver stuff
-    key = "/".join((prefix_in_bucket, key))
-
-    response = remote.client.delete_object(
-        Bucket=remote.bucket_name,
-        Key=key,
-    )
-    return response
-
-
 def enable_remote_storage_versioning(
    remote: RemoteStorage,
 ) -> EmptyResponseMetadataTypeDef:
@@ -381,6 +358,7 @@ def enable_remote_storage_versioning(
    """
    # local_fs has no support for versioning
    assert isinstance(remote, S3Storage), "localfs is currently not supported"
+    assert remote.client is not None

    # The SDK supports enabling versioning on normal S3 as well but we don't want to change
    # these settings from a test in a live bucket (also, our access isn't enough nor should it be)
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -7,7 +7,6 @@ from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver, S3Scrubber
 from fixtures.pageserver.utils import (
    assert_prefix_empty,
-    poll_for_remote_storage_iterations,
    tenant_delete_wait_completed,
 )
 from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
@@ -225,8 +224,9 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder):
    Test the sequence of location states that are used in a live migration.
    """
    neon_env_builder.num_pageservers = 2
-    remote_storage_kind = RemoteStorageKind.MOCK_S3
-    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind=remote_storage_kind)
+    neon_env_builder.enable_pageserver_remote_storage(
+        remote_storage_kind=RemoteStorageKind.MOCK_S3,
+    )
    env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)

    tenant_id = env.initial_tenant
@@ -342,12 +342,6 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder):

    workload.churn_rows(64, pageserver_b.id)
    workload.validate(pageserver_b.id)
-    del workload
-
-    # Check that deletion works properly on a tenant that was live-migrated
-    # (reproduce https://github.com/neondatabase/neon/issues/6802)
-    iterations = poll_for_remote_storage_iterations(remote_storage_kind)
-    tenant_delete_wait_completed(pageserver_b.http_client(), tenant_id, iterations)


 def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder):
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -83,8 +83,6 @@ def test_sharding_smoke(
        )
        assert timelines == {env.initial_timeline, timeline_b}

-    env.attachment_service.consistency_check()
-

 def test_sharding_split_unsharded(
    neon_env_builder: NeonEnvBuilder,
@@ -115,8 +113,6 @@ def test_sharding_split_unsharded(

    workload.validate()

-    env.attachment_service.consistency_check()
-

 def test_sharding_split_smoke(
    neon_env_builder: NeonEnvBuilder,
@@ -259,28 +255,3 @@ def test_sharding_split_smoke(
        env.neon_cli.tenant_migrate(migrate_shard, destination, timeout_secs=10)

    workload.validate()
-
-    # Check that we didn't do any spurious reconciliations.
-    # Total number of reconciles should have been one per original shard, plus
-    # one for each shard that was migrated.
-    reconcile_ok = env.attachment_service.get_metric_value(
-        "storage_controller_reconcile_complete_total", filter={"status": "ok"}
-    )
-    assert reconcile_ok == shard_count + split_shard_count // 2
-
-    # Check that no cancelled or errored reconciliations occurred: this test does no
-    # failure injection and should run clean.
-    assert (
-        env.attachment_service.get_metric_value(
-            "storage_controller_reconcile_complete_total", filter={"status": "cancel"}
-        )
-        is None
-    )
-    assert (
-        env.attachment_service.get_metric_value(
-            "storage_controller_reconcile_complete_total", filter={"status": "error"}
-        )
-        is None
-    )
-
-    env.attachment_service.consistency_check()
--- a/test_runner/regress/test_sharding_service.py
+++ b/test_runner/regress/test_sharding_service.py
@@ -51,13 +51,13 @@ def test_sharding_service_smoke(
    # The pageservers we started should have registered with the sharding service on startup
    nodes = env.attachment_service.node_list()
    assert len(nodes) == 2
-    assert set(n["id"] for n in nodes) == {env.pageservers[0].id, env.pageservers[1].id}
+    assert set(n["node_id"] for n in nodes) == {env.pageservers[0].id, env.pageservers[1].id}

    # Starting an additional pageserver should register successfully
    env.pageservers[2].start()
    nodes = env.attachment_service.node_list()
    assert len(nodes) == 3
-    assert set(n["id"] for n in nodes) == {ps.id for ps in env.pageservers}
+    assert set(n["node_id"] for n in nodes) == {ps.id for ps in env.pageservers}

    # Use a multiple of pageservers to get nice even number of shards on each one
    tenant_shard_count = len(env.pageservers) * 4
@@ -127,8 +127,6 @@ def test_sharding_service_smoke(
    assert counts[env.pageservers[0].id] == tenant_shard_count // 2
    assert counts[env.pageservers[2].id] == tenant_shard_count // 2

-    env.attachment_service.consistency_check()
-

 def test_node_status_after_restart(
    neon_env_builder: NeonEnvBuilder,
@@ -161,8 +159,6 @@ def test_node_status_after_restart(
    # should have had its availabilty state set to Active.
    env.attachment_service.tenant_create(TenantId.generate())

-    env.attachment_service.consistency_check()
-

 def test_sharding_service_passthrough(
    neon_env_builder: NeonEnvBuilder,
@@ -188,8 +184,6 @@ def test_sharding_service_passthrough(
    }
    assert status["state"]["slug"] == "Active"

-    env.attachment_service.consistency_check()
-

 def test_sharding_service_restart(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()
@@ -222,8 +216,6 @@ def test_sharding_service_restart(neon_env_builder: NeonEnvBuilder):
    assert tenant_a not in observed
    assert tenant_b in observed

-    env.attachment_service.consistency_check()
-

 def test_sharding_service_onboarding(
    neon_env_builder: NeonEnvBuilder,
@@ -326,8 +318,6 @@ def test_sharding_service_onboarding(
    dest_ps.stop()
    dest_ps.start()

-    env.attachment_service.consistency_check()
-

 def test_sharding_service_compute_hook(
    httpserver: HTTPServer,
@@ -398,8 +388,6 @@ def test_sharding_service_compute_hook(

    wait_until(10, 1, received_restart_notification)

-    env.attachment_service.consistency_check()
-

 def test_sharding_service_debug_apis(neon_env_builder: NeonEnvBuilder):
    """
@@ -413,47 +401,13 @@ def test_sharding_service_debug_apis(neon_env_builder: NeonEnvBuilder):
    tenant_id = TenantId.generate()
    env.attachment_service.tenant_create(tenant_id, shard_count=2, shard_stripe_size=8192)

-    # Check that the consistency check passes on a freshly setup system
-    env.attachment_service.consistency_check()
-
    # These APIs are intentionally not implemented as methods on NeonAttachmentService, as
    # they're just for use in unanticipated circumstances.
-
-    # Initial tenant (1 shard) and the one we just created (2 shards) should be visible
-    response = env.attachment_service.request(
-        "GET", f"{env.attachment_service_api}/debug/v1/tenant"
-    )
-    response.raise_for_status()
-    assert len(response.json()) == 3
-
-    # Scheduler should report the expected nodes and shard counts
-    response = env.attachment_service.request(
-        "GET", f"{env.attachment_service_api}/debug/v1/scheduler"
-    )
-    response.raise_for_status()
-    # Two nodes, in a dict of node_id->node
-    assert len(response.json()["nodes"]) == 2
-    assert sum(v["shard_count"] for v in response.json()["nodes"].values()) == 3
-    assert all(v["may_schedule"] for v in response.json()["nodes"].values())
-
-    response = env.attachment_service.request(
+    env.attachment_service.request(
        "POST", f"{env.attachment_service_api}/debug/v1/node/{env.pageservers[1].id}/drop"
    )
-    response.raise_for_status()
    assert len(env.attachment_service.node_list()) == 1

-    response = env.attachment_service.request(
+    env.attachment_service.request(
        "POST", f"{env.attachment_service_api}/debug/v1/tenant/{tenant_id}/drop"
    )
-    response.raise_for_status()
-
-    # Tenant drop should be reflected in dump output
-    response = env.attachment_service.request(
-        "GET", f"{env.attachment_service_api}/debug/v1/tenant"
-    )
-    response.raise_for_status()
-    assert len(response.json()) == 1
-
-    # Check that the 'drop' APIs didn't leave things in a state that would fail a consistency check: they're
-    # meant to be unclean wrt the pageserver state, but not leave a broken storage controller behind.
-    env.attachment_service.consistency_check()
--- a/test_runner/regress/test_wal_restore.py
+++ b/test_runner/regress/test_wal_restore.py
@@ -2,7 +2,6 @@ import sys
 import tarfile
 import tempfile
 from pathlib import Path
-from typing import List

 import pytest
 import zstandard
@@ -12,17 +11,10 @@ from fixtures.neon_fixtures import (
    PgBin,
    VanillaPostgres,
 )
-from fixtures.pageserver.utils import (
-    list_prefix,
-    remote_storage_delete_key,
-    timeline_delete_wait_completed,
-)
+from fixtures.pageserver.utils import timeline_delete_wait_completed
 from fixtures.port_distributor import PortDistributor
-from fixtures.remote_storage import LocalFsStorage, S3Storage, s3_storage
+from fixtures.remote_storage import LocalFsStorage
 from fixtures.types import Lsn, TenantId, TimelineId
-from mypy_boto3_s3.type_defs import (
-    ObjectTypeDef,
-)


@pytest.mark.skipif(
@@ -136,11 +128,7 @@ def test_wal_restore_initdb(
        assert restored.safe_psql("select count(*) from t", user="cloud_admin") == [(300000,)]


-@pytest.mark.parametrize("broken_tenant", [True, False])
-def test_wal_restore_http(neon_env_builder: NeonEnvBuilder, broken_tenant: bool):
-    remote_storage_kind = s3_storage()
-    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
-
+def test_wal_restore_http(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()
    endpoint = env.endpoints.create_start("main")
    endpoint.safe_psql("create table t as select generate_series(1,300000)")
@@ -149,36 +137,15 @@ def test_wal_restore_http(neon_env_builder: NeonEnvBuilder, broken_tenant: bool)

    ps_client = env.pageserver.http_client()

-    if broken_tenant:
-        env.pageserver.allowed_errors.append(
-            r".* Changing Active tenant to Broken state, reason: broken from test"
-        )
-        ps_client.tenant_break(tenant_id)
-
    # Mark the initdb archive for preservation
    ps_client.timeline_preserve_initdb_archive(tenant_id, timeline_id)

    # shut down the endpoint and delete the timeline from the pageserver
    endpoint.stop()

-    assert isinstance(env.pageserver_remote_storage, S3Storage)
+    assert isinstance(env.pageserver_remote_storage, LocalFsStorage)

-    if broken_tenant:
-        ps_client.tenant_detach(tenant_id)
-        objects: List[ObjectTypeDef] = list_prefix(
-            env.pageserver_remote_storage, f"tenants/{tenant_id}/timelines/{timeline_id}/"
-        ).get("Contents", [])
-        for obj in objects:
-            obj_key = obj["Key"]
-            if "initdb-preserved.tar.zst" in obj_key:
-                continue
-            log.info(f"Deleting key from remote storage: {obj_key}")
-            remote_storage_delete_key(env.pageserver_remote_storage, obj_key)
-            pass
-
-        ps_client.tenant_attach(tenant_id, generation=10)
-    else:
-        timeline_delete_wait_completed(ps_client, tenant_id, timeline_id)
+    timeline_delete_wait_completed(ps_client, tenant_id, timeline_id)

    # issue the restoration command
    ps_client.timeline_create(