Merge pull request #8647 from neondatabase/rc/proxy/2024-08-08

Proxy release 2024-08-08
Merge branch 'release-proxy' into rc/proxy/2024-08-08
2026-05-15 12:10:37 +00:00 · 2024-08-08 15:37:09 +01:00 · 2024-08-08 13:53:33 +01:00 · 2024-08-07 21:17:08 +01:00 · 2024-08-07 21:04:19 +01:00 · 2024-08-07 17:14:45 +00:00
37 changed files with 468 additions and 1026 deletions
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -13,4 +13,3 @@ config-variables:
  - REMOTE_STORAGE_AZURE_CONTAINER
  - REMOTE_STORAGE_AZURE_REGION
  - SLACK_UPCOMING_RELEASE_CHANNEL_ID
-  - DEV_AWS_OIDC_ROLE_ARN
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -56,10 +56,6 @@ concurrency:
 jobs:
  bench:
    if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
-    permissions:
-      contents: write
-      statuses: write
-      id-token: write # Required for OIDC authentication in azure runners
    strategy:
      fail-fast: false
      matrix:
@@ -67,13 +63,9 @@ jobs:
          - DEFAULT_PG_VERSION: 16
            PLATFORM: "neon-staging"
            region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
-            RUNNER: [ self-hosted, us-east-2, x64 ]
-            IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
          - DEFAULT_PG_VERSION: 16
            PLATFORM: "azure-staging"
            region_id: 'azure-eastus2'
-            RUNNER: [ self-hosted, eastus2, x64 ]
-            IMAGE: neondatabase/build-tools:pinned
    env:
      TEST_PG_BENCH_DURATIONS_MATRIX: "300"
      TEST_PG_BENCH_SCALES_MATRIX: "10,100"
@@ -84,21 +76,14 @@ jobs:
      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
      PLATFORM: ${{ matrix.PLATFORM }}

-    runs-on: ${{ matrix.RUNNER }}
+    runs-on: [ self-hosted, us-east-2, x64 ]
    container:
-      image: ${{ matrix.IMAGE }}
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
      options: --init

    steps:
    - uses: actions/checkout@v4

-    - name: Configure AWS credentials # necessary on Azure runners
-      uses: aws-actions/configure-aws-credentials@v4
-      with:
-        aws-region: eu-central-1
-        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}  
-        role-duration-seconds: 18000 # 5 hours
-
    - name: Download Neon artifact
      uses: ./.github/actions/download
      with:
@@ -176,7 +161,6 @@ jobs:
    steps:
    - uses: actions/checkout@v4

-    
    - name: Download Neon artifact
      uses: ./.github/actions/download
      with:
@@ -253,9 +237,6 @@ jobs:
      id: pgbench-compare-matrix
      run: |
        region_id_default=${{ env.DEFAULT_REGION_ID }}
-        runner_default='["self-hosted", "us-east-2", "x64"]'
-        runner_azure='["self-hosted", "eastus2", "x64"]'
-        image_default="369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned"
        matrix='{
          "pg_version" : [
            16
@@ -269,19 +250,16 @@ jobs:
            "neonvm-captest-new"
          ],
          "db_size": [ "10gb" ],
-          "runner": ['"$runner_default"'],
-          "image": [ "'"$image_default"'" ],
-          "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier",       "db_size": "3gb" ,"runner": '"$runner_default"', "image": "'"$image_default"'" },
-                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new",            "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" },
-                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new",            "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" },
-                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_azure"',   "image": "neondatabase/build-tools:pinned" },
-                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-new",      "db_size": "10gb","runner": '"$runner_azure"',   "image": "neondatabase/build-tools:pinned" },
-                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-new",      "db_size": "50gb","runner": '"$runner_azure"',   "image": "neondatabase/build-tools:pinned" },
-                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }]
+          "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier",       "db_size": "3gb"  },
+                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new",            "db_size": "50gb" },
+                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-freetier", "db_size": "3gb"  },
+                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-new",      "db_size": "10gb" },
+                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-new",      "db_size": "50gb" },
+                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
        }'

        if [ "$(date +%A)" = "Saturday" ]; then
-          matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }]')
+          matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb"}]')
        fi

        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -324,10 +302,6 @@ jobs:
  pgbench-compare:
    if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
    needs: [ generate-matrices ]
-    permissions:
-      contents: write
-      statuses: write
-      id-token: write # Required for OIDC authentication in azure runners

    strategy:
      fail-fast: false
@@ -343,9 +317,9 @@ jobs:
      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
      PLATFORM: ${{ matrix.platform }}

-    runs-on: ${{ matrix.runner }}
+    runs-on: [ self-hosted, us-east-2, x64 ]
    container:
-      image: ${{ matrix.image }}
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
      options: --init

    # Increase timeout to 8h, default timeout is 6h
@@ -354,13 +328,6 @@ jobs:
    steps:
    - uses: actions/checkout@v4

-    - name: Configure AWS credentials # necessary on Azure runners
-      uses: aws-actions/configure-aws-credentials@v4
-      with:
-        aws-region: eu-central-1
-        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
-        role-duration-seconds: 18000 # 5 hours
-        
    - name: Download Neon artifact
      uses: ./.github/actions/download
      with:
@@ -468,20 +435,12 @@ jobs:
        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

  pgbench-pgvector:
-    permissions:
-      contents: write
-      statuses: write
-      id-token: write # Required for OIDC authentication in azure runners
    strategy:
      fail-fast: false
      matrix:
        include:
          - PLATFORM: "neonvm-captest-pgvector"
-            RUNNER: [ self-hosted, us-east-2, x64 ]
-            IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
          - PLATFORM: "azure-captest-pgvector"
-            RUNNER: [ self-hosted, eastus2, x64 ]
-            IMAGE: neondatabase/build-tools:pinned

    env:
      TEST_PG_BENCH_DURATIONS_MATRIX: "15m"
@@ -494,9 +453,9 @@ jobs:
      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
      PLATFORM: ${{ matrix.PLATFORM }}

-    runs-on: ${{ matrix.RUNNER }}
+    runs-on: [ self-hosted, us-east-2, x64 ]
    container:
-      image: ${{ matrix.IMAGE }}
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
      options: --init

    steps:
@@ -507,12 +466,12 @@ jobs:
    - name: Install postgresql-16 where pytest expects it
      run: |
        cd /home/nonroot
-        wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/libpq5_16.4-1.pgdg110%2B1_amd64.deb
-        wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.4-1.pgdg110%2B1_amd64.deb
-        wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.4-1.pgdg110%2B1_amd64.deb 
-        dpkg -x libpq5_16.4-1.pgdg110+1_amd64.deb pg
-        dpkg -x postgresql-client-16_16.4-1.pgdg110+1_amd64.deb pg
-        dpkg -x postgresql-16_16.4-1.pgdg110+1_amd64.deb pg
+        wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/libpq5_16.3-1.pgdg110%2B1_amd64.deb
+        wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.3-1.pgdg110%2B1_amd64.deb
+        wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.3-1.pgdg110%2B1_amd64.deb 
+        dpkg -x libpq5_16.3-1.pgdg110+1_amd64.deb pg
+        dpkg -x postgresql-client-16_16.3-1.pgdg110+1_amd64.deb pg
+        dpkg -x postgresql-16_16.3-1.pgdg110+1_amd64.deb pg
        mkdir -p /tmp/neon/pg_install/v16/bin
        ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench  
        ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v16/bin/psql  
@@ -537,13 +496,6 @@ jobs:
        esac

        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
-        
-    - name: Configure AWS credentials # necessary on Azure runners to read/write from/to S3
-      uses: aws-actions/configure-aws-credentials@v4
-      with:
-        aws-region: eu-central-1
-        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
-        role-duration-seconds: 18000 # 5 hours

    - name: Benchmark pgvector hnsw indexing
      uses: ./.github/actions/run-python-test-set
@@ -572,7 +524,7 @@ jobs:
        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
-    
+
    - name: Create Allure report
      if: ${{ !cancelled() }}
      uses: ./.github/actions/allure-report-generate
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -149,6 +149,8 @@ jobs:

    env:
      BUILD_TYPE: release
+      # remove the cachepot wrapper and build without crate caches
+      RUSTC_WRAPPER: ""
      # build with incremental compilation produce partial results
      # so do not attempt to cache this build, also disable the incremental compilation
      CARGO_INCREMENTAL: 0
--- a/.github/workflows/pin-build-tools-image.yml
+++ b/.github/workflows/pin-build-tools-image.yml
@@ -7,20 +7,12 @@ on:
        description: 'Source tag'
        required: true
        type: string
-      force:
-        description: 'Force the image to be pinned'
-        default: false
-        type: boolean
  workflow_call:
    inputs:
      from-tag:
        description: 'Source tag'
        required: true
        type: string
-      force:
-        description: 'Force the image to be pinned'
-        default: false
-        type: boolean

 defaults:
  run:
@@ -30,18 +22,15 @@ concurrency:
  group: pin-build-tools-image-${{ inputs.from-tag }}
  cancel-in-progress: false

-# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
 permissions: {}

-env:
-  FROM_TAG: ${{ inputs.from-tag }}
-  TO_TAG: pinned
-
 jobs:
-  check-manifests:
+  tag-image:
    runs-on: ubuntu-22.04
-    outputs:
-      skip: ${{ steps.check-manifests.outputs.skip }}
+
+    env:
+      FROM_TAG: ${{ inputs.from-tag }}
+      TO_TAG: pinned

    steps:
      - name: Check if we really need to pin the image
@@ -58,31 +47,27 @@ jobs:

          echo "skip=${skip}" | tee -a $GITHUB_OUTPUT

-  tag-image:
-    needs: check-manifests
-
-    # use format(..) to catch both inputs.force = true AND inputs.force = 'true'
-    if: needs.check-manifests.outputs.skip == 'false' || format('{0}', inputs.force) == 'true'
-
-    runs-on: ubuntu-22.04
-
-    permissions:
-      id-token: write # for `azure/login`
-
-    steps:
      - uses: docker/login-action@v3
-
+        if: steps.check-manifests.outputs.skip == 'false'
        with:
          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

+      - name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub
+        if: steps.check-manifests.outputs.skip == 'false'
+        run: |
+          docker buildx imagetools create -t neondatabase/build-tools:${TO_TAG} \
+                                             neondatabase/build-tools:${FROM_TAG}
+
      - uses: docker/login-action@v3
+        if: steps.check-manifests.outputs.skip == 'false'
        with:
          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
          password: ${{ secrets.AWS_SECRET_KEY_DEV }}

      - name: Azure login
+        if: steps.check-manifests.outputs.skip == 'false'
        uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a  # @v2.1.1
        with:
          client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }}
@@ -90,12 +75,13 @@ jobs:
          subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }}

      - name: Login to ACR
+        if: steps.check-manifests.outputs.skip == 'false'
        run: |
          az acr login --name=neoneastus2

-      - name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub, ECR, and ACR
+      - name: Tag build-tools with `${{ env.TO_TAG }}` in ECR and ACR
+        if: steps.check-manifests.outputs.skip == 'false'
        run: |
          docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG} \
                                          -t neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG} \
-                                          -t neondatabase/build-tools:${TO_TAG} \
                                             neondatabase/build-tools:${FROM_TAG}
--- a/21
+++ b/21
@@ -17,7 +17,7 @@ COPY --chown=nonroot pgxn pgxn
 COPY --chown=nonroot Makefile Makefile
 COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh

-ENV BUILD_TYPE=release
+ENV BUILD_TYPE release
 RUN set -e \
    && mold -run make -j $(nproc) -s neon-pg-ext \
    && rm -rf pg_install/build \
@@ -29,12 +29,24 @@ WORKDIR /home/nonroot
 ARG GIT_VERSION=local
 ARG BUILD_TAG

+# Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds.
+# Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations.
+# cachepot falls back to local filesystem if S3 is misconfigured, not failing the build
+ARG RUSTC_WRAPPER=cachepot
+ENV AWS_REGION=eu-central-1
+ENV CACHEPOT_S3_KEY_PREFIX=cachepot
+ARG CACHEPOT_BUCKET=neon-github-dev
+#ARG AWS_ACCESS_KEY_ID
+#ARG AWS_SECRET_ACCESS_KEY
+
 COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server
 COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server
 COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server
 COPY --from=pg-build /home/nonroot/pg_install/v16/lib                       pg_install/v16/lib
 COPY --chown=nonroot . .

+# Show build caching stats to check if it was used in the end.
+# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
 RUN set -e \
    && PQ_LIB_DIR=$(pwd)/pg_install/v16/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \
      --bin pg_sni_router  \
@@ -46,7 +58,8 @@ RUN set -e \
      --bin proxy  \
      --bin neon_local \
      --bin storage_scrubber \
-      --locked --release
+      --locked --release \
+    && cachepot -s

 # Build final image
 #
@@ -91,7 +104,7 @@ RUN mkdir -p /data/.neon/ && \

 # When running a binary that links with libpq, default to using our most recent postgres version.  Binaries
 # that want a particular postgres version will select it explicitly: this is just a default.
-ENV LD_LIBRARY_PATH=/usr/local/v16/lib
+ENV LD_LIBRARY_PATH /usr/local/v16/lib


 VOLUME ["/data"]
@@ -99,5 +112,5 @@ USER neon
 EXPOSE 6400
 EXPOSE 9898

-CMD ["/usr/local/bin/pageserver", "-D", "/data/.neon"]
+CMD /usr/local/bin/pageserver -D /data/.neon

--- a/Dockerfile.build-tools
+++ b/Dockerfile.build-tools
@@ -58,7 +58,7 @@ RUN set -e \
    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

 # protobuf-compiler (protoc)
-ENV PROTOC_VERSION=25.1
+ENV PROTOC_VERSION 25.1
 RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \
    && unzip -q protoc.zip -d protoc \
    && mv protoc/bin/protoc /usr/local/bin/protoc \
@@ -99,7 +99,7 @@ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "aws
    && rm awscliv2.zip

 # Mold: A Modern Linker
-ENV MOLD_VERSION=v2.33.0
+ENV MOLD_VERSION v2.31.0
 RUN set -e \
    && git clone https://github.com/rui314/mold.git \
    && mkdir mold/build \
@@ -168,7 +168,7 @@ USER nonroot:nonroot
 WORKDIR /home/nonroot

 # Python
-ENV PYTHON_VERSION=3.9.19 \
+ENV PYTHON_VERSION=3.9.18 \
    PYENV_ROOT=/home/nonroot/.pyenv \
    PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH
 RUN set -e \
@@ -192,14 +192,9 @@ WORKDIR /home/nonroot

 # Rust
 # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.80.1
+ENV RUSTC_VERSION=1.80.0
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
-ARG RUSTFILT_VERSION=0.2.1
-ARG CARGO_HAKARI_VERSION=0.9.30
-ARG CARGO_DENY_VERSION=0.16.1
-ARG CARGO_HACK_VERSION=0.6.31
-ARG CARGO_NEXTEST_VERSION=0.9.72
 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
 	chmod +x rustup-init && \
 	./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \
@@ -208,13 +203,15 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
    . "$HOME/.cargo/env" && \
    cargo --version && rustup --version && \
    rustup component add llvm-tools-preview rustfmt clippy && \
-    cargo install rustfilt            --version ${RUSTFILT_VERSION} && \
-    cargo install cargo-hakari        --version ${CARGO_HAKARI_VERSION} && \
-    cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \
-    cargo install cargo-hack          --version ${CARGO_HACK_VERSION} && \
-    cargo install cargo-nextest       --version ${CARGO_NEXTEST_VERSION} && \
+    cargo install --git https://github.com/paritytech/cachepot && \
+    cargo install rustfilt && \
+    cargo install cargo-hakari && \
+    cargo install cargo-deny --locked && \
+    cargo install cargo-hack && \
+    cargo install cargo-nextest && \
    rm -rf /home/nonroot/.cargo/registry && \
    rm -rf /home/nonroot/.cargo/git
+ENV RUSTC_WRAPPER=cachepot

 # Show versions
 RUN whoami \
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -94,7 +94,7 @@ RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar
    DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \
    make clean && cp -R /sfcgal/* /

-ENV PATH="/usr/local/pgsql/bin:$PATH"
+ENV PATH "/usr/local/pgsql/bin:$PATH"

 RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \
    echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \
@@ -411,7 +411,7 @@ FROM build-deps AS timescaledb-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 ARG PG_VERSION
-ENV PATH="/usr/local/pgsql/bin:$PATH"
+ENV PATH "/usr/local/pgsql/bin:$PATH"

 RUN case "${PG_VERSION}" in \
      "v14" | "v15") \
@@ -444,7 +444,7 @@ FROM build-deps AS pg-hint-plan-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 ARG PG_VERSION
-ENV PATH="/usr/local/pgsql/bin:$PATH"
+ENV PATH "/usr/local/pgsql/bin:$PATH"

 RUN case "${PG_VERSION}" in \
      "v14") \
@@ -480,7 +480,7 @@ RUN case "${PG_VERSION}" in \
 FROM build-deps AS pg-cron-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-ENV PATH="/usr/local/pgsql/bin/:$PATH"
+ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \
    echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \
    mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . && \
@@ -506,7 +506,7 @@ RUN apt-get update && \
        libboost-system1.74-dev \
        libeigen3-dev

-ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
+ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
 RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \
    echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \
    mkdir rdkit-src && cd rdkit-src && tar xzf ../rdkit.tar.gz --strip-components=1 -C . && \
@@ -546,7 +546,7 @@ RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.
 FROM build-deps AS pg-uuidv7-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-ENV PATH="/usr/local/pgsql/bin/:$PATH"
+ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \
    echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \
    mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \
@@ -563,7 +563,7 @@ RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz
 FROM build-deps AS pg-roaringbitmap-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-ENV PATH="/usr/local/pgsql/bin/:$PATH"
+ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \
    echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \
    mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \
@@ -580,7 +580,7 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4
 FROM build-deps AS pg-semver-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-ENV PATH="/usr/local/pgsql/bin/:$PATH"
+ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \
    echo "fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 pg_semver.tar.gz" | sha256sum --check && \
    mkdir pg_semver-src && cd pg_semver-src && tar xzf ../pg_semver.tar.gz --strip-components=1 -C . && \
@@ -598,7 +598,7 @@ FROM build-deps AS pg-embedding-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 ARG PG_VERSION
-ENV PATH="/usr/local/pgsql/bin/:$PATH"
+ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN case "${PG_VERSION}" in \
      "v14" | "v15") \
        export PG_EMBEDDING_VERSION=0.3.5 \
@@ -622,7 +622,7 @@ RUN case "${PG_VERSION}" in \
 FROM build-deps AS pg-anon-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-ENV PATH="/usr/local/pgsql/bin/:$PATH"
+ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget  https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \
    echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9  pg_anon.tar.gz" | sha256sum --check && \
    mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \
@@ -750,7 +750,7 @@ RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -
 FROM build-deps AS wal2json-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-ENV PATH="/usr/local/pgsql/bin/:$PATH"
+ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \
    echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \
    mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \
@@ -766,7 +766,7 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.
 FROM build-deps AS pg-ivm-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-ENV PATH="/usr/local/pgsql/bin/:$PATH"
+ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \
    echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \
    mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . && \
@@ -783,7 +783,7 @@ RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_iv
 FROM build-deps AS pg-partman-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-ENV PATH="/usr/local/pgsql/bin/:$PATH"
+ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \
    echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \
    mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . && \
@@ -1034,6 +1034,6 @@ RUN apt update &&  \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
    localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8

-ENV LANG=en_US.utf8
+ENV LANG en_US.utf8
 USER postgres
 ENTRYPOINT ["/usr/local/bin/compute_ctl"]
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -158,8 +158,6 @@ pub struct NeonStorageControllerConf {

    /// Threshold for auto-splitting a tenant into shards
    pub split_threshold: Option<u64>,
-
-    pub max_secondary_lag_bytes: Option<u64>,
 }

 impl NeonStorageControllerConf {
@@ -175,7 +173,6 @@ impl Default for NeonStorageControllerConf {
            max_offline: Self::DEFAULT_MAX_OFFLINE_INTERVAL,
            max_warming_up: Self::DEFAULT_MAX_WARMING_UP_INTERVAL,
            split_threshold: None,
-            max_secondary_lag_bytes: None,
        }
    }
 }
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -383,10 +383,6 @@ impl StorageController {
            args.push(format!("--split-threshold={split_threshold}"))
        }

-        if let Some(lag) = self.config.max_secondary_lag_bytes.as_ref() {
-            args.push(format!("--max-secondary-lag-bytes={lag}"))
-        }
-
        args.push(format!(
            "--neon-local-repo-dir={}",
            self.env.base_data_dir.display()
--- a/deny.toml
+++ b/deny.toml
@@ -4,7 +4,6 @@
 # to your expectations and requirements.

 # Root options
-[graph]
 targets = [
    { triple = "x86_64-unknown-linux-gnu" },
    { triple = "aarch64-unknown-linux-gnu" },
@@ -13,7 +12,6 @@ targets = [
 ]
 all-features = false
 no-default-features = false
-[output]
 feature-depth = 1

 # This section is considered when running `cargo deny check advisories`
@@ -21,13 +19,17 @@ feature-depth = 1
 # https://embarkstudios.github.io/cargo-deny/checks/advisories/cfg.html
 [advisories]
 db-urls = ["https://github.com/rustsec/advisory-db"]
+vulnerability = "deny"
+unmaintained = "warn"
 yanked = "warn"
+notice = "warn"
 ignore = []

 # This section is considered when running `cargo deny check licenses`
 # More documentation for the licenses section can be found here:
 # https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html
 [licenses]
+unlicensed = "deny"
 allow = [
    "Apache-2.0",
    "Artistic-2.0",
@@ -40,6 +42,10 @@ allow = [
    "OpenSSL",
    "Unicode-DFS-2016",
 ]
+deny = []
+copyleft = "warn"
+allow-osi-fsf-free = "neither"
+default = "deny"
 confidence-threshold = 0.8
 exceptions = [
    # Zlib license has some restrictions if we decide to change sth
--- a/docs/SUMMARY.md
+++ b/docs/SUMMARY.md
@@ -1,18 +1,13 @@
 # Summary

-# Looking for `neon.tech` docs?
-
-This page linkes to a selection of technical content about the open source code in this repository.
-
-Please visit https://neon.tech/docs for documentation about using the Neon service, which is based on the code
-in this repository.
-
-# Architecture
-
 [Introduction]()
 - [Separation of Compute and Storage](./separation-compute-storage.md)

+# Architecture
+
 - [Compute]()
+  - [WAL proposer]()
+  - [WAL Backpressure]()
  - [Postgres changes](./core_changes.md)

 - [Pageserver](./pageserver.md)
@@ -21,15 +16,33 @@ in this repository.
    - [WAL Redo](./pageserver-walredo.md)
    - [Page cache](./pageserver-pagecache.md)
    - [Storage](./pageserver-storage.md)
+        - [Datadir mapping]()
+        - [Layer files]()
+        - [Branching]()
+        - [Garbage collection]()
+    - [Cloud Storage]()
    - [Processing a GetPage request](./pageserver-processing-getpage.md)
    - [Processing WAL](./pageserver-processing-wal.md)
+	- [Management API]()
+	- [Tenant Rebalancing]()

 - [WAL Service](walservice.md)
  - [Consensus protocol](safekeeper-protocol.md)
+  - [Management API]()
+  - [Rebalancing]()
+
+- [Control Plane]()
+
+- [Proxy]()

 - [Source view](./sourcetree.md)
  - [docker.md](./docker.md) — Docker images and building pipeline.
  - [Error handling and logging](./error-handling.md)
+  - [Testing]()
+    - [Unit testing]()
+    - [Integration testing]()
+    - [Benchmarks]()
+

 - [Glossary](./glossary.md)

@@ -45,6 +58,28 @@ in this repository.

 # RFCs

-Major changes are documented in RFCS:
- See [RFCs](./rfcs/README.md) for more information
- view the RFCs at https://github.com/neondatabase/neon/tree/main/docs/rfcs
+- [RFCs](./rfcs/README.md)
+
+- [002-storage](rfcs/002-storage.md)
+- [003-laptop-cli](rfcs/003-laptop-cli.md)
+- [004-durability](rfcs/004-durability.md)
+- [005-zenith_local](rfcs/005-zenith_local.md)
+- [006-laptop-cli-v2-CLI](rfcs/006-laptop-cli-v2-CLI.md)
+- [006-laptop-cli-v2-repository-structure](rfcs/006-laptop-cli-v2-repository-structure.md)
+- [007-serverless-on-laptop](rfcs/007-serverless-on-laptop.md)
+- [008-push-pull](rfcs/008-push-pull.md)
+- [009-snapshot-first-storage-cli](rfcs/009-snapshot-first-storage-cli.md)
+- [009-snapshot-first-storage](rfcs/009-snapshot-first-storage.md)
+- [009-snapshot-first-storage-pitr](rfcs/009-snapshot-first-storage-pitr.md)
+- [010-storage_details](rfcs/010-storage_details.md)
+- [011-retention-policy](rfcs/011-retention-policy.md)
+- [012-background-tasks](rfcs/012-background-tasks.md)
+- [013-term-history](rfcs/013-term-history.md)
+- [014-safekeepers-gossip](rfcs/014-safekeepers-gossip.md)
+- [014-storage-lsm](rfcs/014-storage-lsm.md)
+- [015-storage-messaging](rfcs/015-storage-messaging.md)
+- [016-connection-routing](rfcs/016-connection-routing.md)
+- [017-timeline-data-management](rfcs/017-timeline-data-management.md)
+- [018-storage-messaging-2](rfcs/018-storage-messaging-2.md)
+- [019-tenant-timeline-lifecycles](rfcs/019-tenant-timeline-lifecycles.md)
+- [cluster-size-limits](rfcs/cluster-size-limits.md)
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -128,7 +128,7 @@ pub mod circuit_breaker;
 ///
 /// #############################################################################################
 /// TODO this macro is not the way the library is intended to be used, see <https://github.com/neondatabase/neon/issues/1565> for details.
-/// We used `cachepot` to reduce our current CI build times: <https://github.com/neondatabase/cloud/pull/1033#issuecomment-1100935036>
+/// We use `cachepot` to reduce our current CI build times: <https://github.com/neondatabase/cloud/pull/1033#issuecomment-1100935036>
 /// Yet, it seems to ignore the GIT_VERSION env variable, passed to Docker build, even with build.rs that contains
 /// `println!("cargo:rerun-if-env-changed=GIT_VERSION");` code for cachepot cache invalidation.
 /// The problem needs further investigation and regular `const` declaration instead of a macro.
--- a/pageserver/src/statvfs.rs
+++ b/pageserver/src/statvfs.rs
@@ -56,6 +56,7 @@ impl Statvfs {
 }

 pub mod mock {
+    use anyhow::Context;
    use camino::Utf8Path;
    use regex::Regex;
    use tracing::log::info;
@@ -134,30 +135,14 @@ pub mod mock {
            {
                continue;
            }
-            let m = match entry.metadata() {
-                Ok(m) => m,
-                Err(e) if is_not_found(&e) => {
-                    // some temp file which got removed right as we are walking
-                    continue;
-                }
-                Err(e) => {
-                    return Err(anyhow::Error::new(e)
-                        .context(format!("get metadata of {:?}", entry.path())))
-                }
-            };
-            total += m.len();
+            total += entry
+                .metadata()
+                .with_context(|| format!("get metadata of {:?}", entry.path()))?
+                .len();
        }
        Ok(total)
    }

-    fn is_not_found(e: &walkdir::Error) -> bool {
-        let Some(io_error) = e.io_error() else {
-            return false;
-        };
-        let kind = io_error.kind();
-        matches!(kind, std::io::ErrorKind::NotFound)
-    }
-
    pub struct Statvfs {
        pub blocks: u64,
        pub blocks_available: u64,
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3012,6 +3012,54 @@ impl Tenant {
        // because that will stall branch creation.
        let gc_cs = self.gc_cs.lock().await;

+        // Paranoia check: it is critical that GcInfo's list of child timelines is correct, to avoid incorrectly GC'ing data they
+        // depend on.  So although GcInfo is updated continuously by Timeline::new and Timeline::drop, we also calculate it here
+        // and fail out if it's inaccurate.
+        // (this can be removed later, it's a risk mitigation for https://github.com/neondatabase/neon/pull/8427)
+        {
+            let mut all_branchpoints: BTreeMap<TimelineId, Vec<(Lsn, TimelineId)>> =
+                BTreeMap::new();
+            timelines.iter().for_each(|timeline| {
+                if let Some(ancestor_timeline_id) = &timeline.get_ancestor_timeline_id() {
+                    let ancestor_children =
+                        all_branchpoints.entry(*ancestor_timeline_id).or_default();
+                    ancestor_children.push((timeline.get_ancestor_lsn(), timeline.timeline_id));
+                }
+            });
+
+            for timeline in &timelines {
+                let mut branchpoints: Vec<(Lsn, TimelineId)> = all_branchpoints
+                    .remove(&timeline.timeline_id)
+                    .unwrap_or_default();
+
+                branchpoints.sort_by_key(|b| b.0);
+
+                let target = timeline.gc_info.read().unwrap();
+
+                // We require that retain_lsns contains everything in `branchpoints`, but not that
+                // they are exactly equal: timeline deletions can race with us, so retain_lsns
+                // may contain some extra stuff.  It is safe to have extra timelines in there, because it
+                // just means that we retain slightly more data than we otherwise might.
+                let have_branchpoints = target.retain_lsns.iter().copied().collect::<HashSet<_>>();
+                for b in &branchpoints {
+                    if !have_branchpoints.contains(b) {
+                        tracing::error!(
+                            "Bug: `retain_lsns` is set incorrectly.  Expected be {:?}, but found {:?}",
+                            branchpoints,
+                            target.retain_lsns
+                        );
+                        debug_assert!(false);
+                        // Do not GC based on bad information!
+                        // (ab-use an existing GcError type rather than adding a new one, since this is a
+                        // "should never happen" check that will be removed soon).
+                        return Err(GcError::Remote(anyhow::anyhow!(
+                            "retain_lsns failed validation!"
+                        )));
+                    }
+                }
+            }
+        }
+
        // Ok, we now know all the branch points.
        // Update the GC information for each timeline.
        let mut gc_timelines = Vec::with_capacity(timelines.len());
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -224,8 +224,21 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<U
 }

 /// See [`Self::spawn`].
-#[derive(Clone, Default)]
-pub struct BackgroundPurges(tokio_util::task::TaskTracker);
+#[derive(Clone)]
+pub struct BackgroundPurges(Arc<std::sync::Mutex<BackgroundPurgesInner>>);
+enum BackgroundPurgesInner {
+    Open(tokio::task::JoinSet<()>),
+    // we use the async mutex for coalescing
+    ShuttingDown(Arc<tokio::sync::Mutex<tokio::task::JoinSet<()>>>),
+}
+
+impl Default for BackgroundPurges {
+    fn default() -> Self {
+        Self(Arc::new(std::sync::Mutex::new(
+            BackgroundPurgesInner::Open(JoinSet::new()),
+        )))
+    }
+}

 impl BackgroundPurges {
    /// When we have moved a tenant's content to a temporary directory, we may delete it lazily in
@@ -234,32 +247,24 @@ impl BackgroundPurges {
    /// Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
    /// Thus the [`BackgroundPurges`] type to keep track of these tasks.
    pub fn spawn(&self, tmp_path: Utf8PathBuf) {
-        // because on shutdown we close and wait, we are misusing TaskTracker a bit.
-        //
-        // so first acquire a token, then check if the tracker has been closed. the tracker might get closed
-        // right after, but at least the shutdown will wait for what we are spawning next.
-        let token = self.0.token();
-
-        if self.0.is_closed() {
-            warn!(
-                %tmp_path,
-                "trying to spawn background purge during shutdown, ignoring"
-            );
-            return;
-        }
-
-        let span = info_span!(parent: None, "background_purge", %tmp_path);
-
-        let task = move || {
-            let _token = token;
-            let _entered = span.entered();
-            if let Err(error) = std::fs::remove_dir_all(tmp_path.as_path()) {
-                // should we fatal_io_error here?
-                warn!(%error, "failed to purge tenant directory");
+        let mut guard = self.0.lock().unwrap();
+        let jset = match &mut *guard {
+            BackgroundPurgesInner::Open(ref mut jset) => jset,
+            BackgroundPurgesInner::ShuttingDown(_) => {
+                warn!("trying to spawn background purge during shutdown, ignoring");
+                return;
            }
        };
-
-        BACKGROUND_RUNTIME.spawn_blocking(task);
+        jset.spawn_on(
+            async move {
+                if let Err(error) = fs::remove_dir_all(tmp_path.as_path()).await {
+                    // should we fatal_io_error here?
+                    warn!(%error, path=%tmp_path, "failed to purge tenant directory");
+                }
+            }
+            .instrument(info_span!(parent: None, "background_purge")),
+            BACKGROUND_RUNTIME.handle(),
+        );
    }

    /// When this future completes, all background purges have completed.
@@ -273,9 +278,42 @@ impl BackgroundPurges {
    /// instances of this future will continue to be correct.
    #[instrument(skip_all)]
    pub async fn shutdown(&self) {
-        // forbid new tasks (can be called many times)
-        self.0.close();
-        self.0.wait().await;
+        let jset = {
+            let mut guard = self.0.lock().unwrap();
+            match &mut *guard {
+                BackgroundPurgesInner::Open(jset) => {
+                    *guard = BackgroundPurgesInner::ShuttingDown(Arc::new(tokio::sync::Mutex::new(
+                        std::mem::take(jset),
+                    )))
+                }
+                BackgroundPurgesInner::ShuttingDown(_) => {
+                    // calling shutdown multiple times is most likely a bug in pageserver shutdown code
+                    warn!("already shutting down");
+                }
+            };
+            match &mut *guard {
+                BackgroundPurgesInner::ShuttingDown(ref mut jset) => jset.clone(),
+                BackgroundPurgesInner::Open(_) => {
+                    unreachable!("above code transitions into shut down state");
+                }
+            }
+        };
+        let mut jset = jset.lock().await; // concurrent callers coalesce here
+        while let Some(res) = jset.join_next().await {
+            match res {
+                Ok(()) => {}
+                Err(e) if e.is_panic() => {
+                    // If it panicked, the error is already logged by the panic hook.
+                }
+                Err(e) if e.is_cancelled() => {
+                    unreachable!("we don't cancel the joinset or runtime")
+                }
+                Err(e) => {
+                    // No idea when this can happen, but let's log it.
+                    warn!(%e, "background purge task failed or panicked");
+                }
+            }
+        }
    }
 }

--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -55,7 +55,7 @@ use tokio_util::sync::CancellationToken;
 use tracing::{info_span, instrument, warn, Instrument};
 use utils::{
    backoff, completion::Barrier, crashsafe::path_with_suffix_extension, failpoint_support, fs_ext,
-    id::TimelineId, pausable_failpoint, serde_system_time,
+    id::TimelineId, serde_system_time,
 };

 use super::{
@@ -1146,14 +1146,12 @@ impl<'a> TenantDownloader<'a> {
        layer: HeatMapLayer,
        ctx: &RequestContext,
    ) -> Result<Option<HeatMapLayer>, UpdateError> {
-        // Failpoints for simulating slow remote storage
+        // Failpoint for simulating slow remote storage
        failpoint_support::sleep_millis_async!(
            "secondary-layer-download-sleep",
            &self.secondary_state.cancel
        );

-        pausable_failpoint!("secondary-layer-download-pausable");
-
        let local_path = local_layer_path(
            self.conf,
            tenant_shard_id,
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -4412,11 +4412,11 @@ impl From<CollectKeySpaceError> for CompactionError {
 impl From<super::upload_queue::NotInitialized> for CompactionError {
    fn from(value: super::upload_queue::NotInitialized) -> Self {
        match value {
-            super::upload_queue::NotInitialized::Uninitialized => {
+            super::upload_queue::NotInitialized::Uninitialized
+            | super::upload_queue::NotInitialized::Stopped => {
                CompactionError::Other(anyhow::anyhow!(value))
            }
-            super::upload_queue::NotInitialized::ShuttingDown
-            | super::upload_queue::NotInitialized::Stopped => CompactionError::ShuttingDown,
+            super::upload_queue::NotInitialized::ShuttingDown => CompactionError::ShuttingDown,
        }
    }
 }
--- a/pgxn/neon/control_plane_connector.c
+++ b/pgxn/neon/control_plane_connector.c
@@ -45,7 +45,6 @@ static const char *jwt_token = NULL;
 /* GUCs */
 static char *ConsoleURL = NULL;
 static bool ForwardDDL = true;
-static bool RegressTestMode = false;

 /*
 * CURL docs say that this buffer must exist until we call curl_easy_cleanup
@@ -803,14 +802,6 @@ NeonProcessUtility(
 		case T_DropRoleStmt:
 			HandleDropRole(castNode(DropRoleStmt, parseTree));
 			break;
-		case T_CreateTableSpaceStmt:
-			if (!RegressTestMode)
-			{
-				ereport(ERROR,
-					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-					errmsg("CREATE TABLESPACE is not supported on Neon")));
-			}
-   			break;
 		default:
 			break;
 	}
@@ -873,18 +864,6 @@ InitControlPlaneConnector()
 							 NULL,
 							 NULL);

-	DefineCustomBoolVariable(
-							 "neon.regress_test_mode",
-							 "Controls whether we are running in the regression test mode",
-							 NULL,
-							 &RegressTestMode,
-							 false,
-							 PGC_SUSET,
-							 0,
-							 NULL,
-							 NULL,
-							 NULL);
-
 	jwt_token = getenv("NEON_CONTROL_PLANE_TOKEN");
 	if (!jwt_token)
 	{
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,103 +1,91 @@
 # This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.

-[[package]]
-name = "aiohappyeyeballs"
-version = "2.3.5"
-description = "Happy Eyeballs for asyncio"
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "aiohappyeyeballs-2.3.5-py3-none-any.whl", hash = "sha256:4d6dea59215537dbc746e93e779caea8178c866856a721c9c660d7a5a7b8be03"},
-    {file = "aiohappyeyeballs-2.3.5.tar.gz", hash = "sha256:6fa48b9f1317254f122a07a131a86b71ca6946ca989ce6326fff54a99a920105"},
-]
-
 [[package]]
 name = "aiohttp"
-version = "3.10.2"
+version = "3.9.4"
 description = "Async http client/server framework (asyncio)"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "aiohttp-3.10.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:95213b3d79c7e387144e9cb7b9d2809092d6ff2c044cb59033aedc612f38fb6d"},
-    {file = "aiohttp-3.10.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1aa005f060aff7124cfadaa2493f00a4e28ed41b232add5869e129a2e395935a"},
-    {file = "aiohttp-3.10.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:eabe6bf4c199687592f5de4ccd383945f485779c7ffb62a9b9f1f8a3f9756df8"},
-    {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:96e010736fc16d21125c7e2dc5c350cd43c528b85085c04bf73a77be328fe944"},
-    {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:99f81f9c1529fd8e03be4a7bd7df32d14b4f856e90ef6e9cbad3415dbfa9166c"},
-    {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d611d1a01c25277bcdea06879afbc11472e33ce842322496b211319aa95441bb"},
-    {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e00191d38156e09e8c81ef3d75c0d70d4f209b8381e71622165f22ef7da6f101"},
-    {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:74c091a5ded6cb81785de2d7a8ab703731f26de910dbe0f3934eabef4ae417cc"},
-    {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:18186a80ec5a701816adbf1d779926e1069392cf18504528d6e52e14b5920525"},
-    {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:5a7ceb2a0d2280f23a02c64cd0afdc922079bb950400c3dd13a1ab2988428aac"},
-    {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:8bd7be6ff6c162a60cb8fce65ee879a684fbb63d5466aba3fa5b9288eb04aefa"},
-    {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:fae962b62944eaebff4f4fddcf1a69de919e7b967136a318533d82d93c3c6bd1"},
-    {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:a0fde16d284efcacbe15fb0c1013f0967b6c3e379649239d783868230bf1db42"},
-    {file = "aiohttp-3.10.2-cp310-cp310-win32.whl", hash = "sha256:f81cd85a0e76ec7b8e2b6636fe02952d35befda4196b8c88f3cec5b4fb512839"},
-    {file = "aiohttp-3.10.2-cp310-cp310-win_amd64.whl", hash = "sha256:54ba10eb5a3481c28282eb6afb5f709aedf53cf9c3a31875ffbdc9fc719ffd67"},
-    {file = "aiohttp-3.10.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:87fab7f948e407444c2f57088286e00e2ed0003ceaf3d8f8cc0f60544ba61d91"},
-    {file = "aiohttp-3.10.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ec6ad66ed660d46503243cbec7b2b3d8ddfa020f984209b3b8ef7d98ce69c3f2"},
-    {file = "aiohttp-3.10.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a4be88807283bd96ae7b8e401abde4ca0bab597ba73b5e9a2d98f36d451e9aac"},
-    {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:01c98041f90927c2cbd72c22a164bb816fa3010a047d264969cf82e1d4bcf8d1"},
-    {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:54e36c67e1a9273ecafab18d6693da0fb5ac48fd48417e4548ac24a918c20998"},
-    {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7de3ddb6f424af54535424082a1b5d1ae8caf8256ebd445be68c31c662354720"},
-    {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7dd9c7db94b4692b827ce51dcee597d61a0e4f4661162424faf65106775b40e7"},
-    {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e57e21e1167705f8482ca29cc5d02702208d8bf4aff58f766d94bcd6ead838cd"},
-    {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:a1a50e59b720060c29e2951fd9f13c01e1ea9492e5a527b92cfe04dd64453c16"},
-    {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:686c87782481fda5ee6ba572d912a5c26d9f98cc5c243ebd03f95222af3f1b0f"},
-    {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:dafb4abb257c0ed56dc36f4e928a7341b34b1379bd87e5a15ce5d883c2c90574"},
-    {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:494a6f77560e02bd7d1ab579fdf8192390567fc96a603f21370f6e63690b7f3d"},
-    {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6fe8503b1b917508cc68bf44dae28823ac05e9f091021e0c41f806ebbb23f92f"},
-    {file = "aiohttp-3.10.2-cp311-cp311-win32.whl", hash = "sha256:4ddb43d06ce786221c0dfd3c91b4892c318eaa36b903f7c4278e7e2fa0dd5102"},
-    {file = "aiohttp-3.10.2-cp311-cp311-win_amd64.whl", hash = "sha256:ca2f5abcb0a9a47e56bac173c01e9f6c6e7f27534d91451c5f22e6a35a5a2093"},
-    {file = "aiohttp-3.10.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:14eb6b17f6246959fb0b035d4f4ae52caa870c4edfb6170aad14c0de5bfbf478"},
-    {file = "aiohttp-3.10.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:465e445ec348d4e4bd349edd8b22db75f025da9d7b6dc1369c48e7935b85581e"},
-    {file = "aiohttp-3.10.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:341f8ece0276a828d95b70cd265d20e257f5132b46bf77d759d7f4e0443f2906"},
-    {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c01fbb87b5426381cd9418b3ddcf4fc107e296fa2d3446c18ce6c76642f340a3"},
-    {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2c474af073e1a6763e1c5522bbb2d85ff8318197e4c6c919b8d7886e16213345"},
-    {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d9076810a5621236e29b2204e67a68e1fe317c8727ee4c9abbfbb1083b442c38"},
-    {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8f515d6859e673940e08de3922b9c4a2249653b0ac181169313bd6e4b1978ac"},
-    {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:655e583afc639bef06f3b2446972c1726007a21003cd0ef57116a123e44601bc"},
-    {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8da9449a575133828cc99985536552ea2dcd690e848f9d41b48d8853a149a959"},
-    {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:19073d57d0feb1865d12361e2a1f5a49cb764bf81a4024a3b608ab521568093a"},
-    {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:c8e98e1845805f184d91fda6f9ab93d7c7b0dddf1c07e0255924bfdb151a8d05"},
-    {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:377220a5efde6f9497c5b74649b8c261d3cce8a84cb661be2ed8099a2196400a"},
-    {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:92f7f4a4dc9cdb5980973a74d43cdbb16286dacf8d1896b6c3023b8ba8436f8e"},
-    {file = "aiohttp-3.10.2-cp312-cp312-win32.whl", hash = "sha256:9bb2834a6f11d65374ce97d366d6311a9155ef92c4f0cee543b2155d06dc921f"},
-    {file = "aiohttp-3.10.2-cp312-cp312-win_amd64.whl", hash = "sha256:518dc3cb37365255708283d1c1c54485bbacccd84f0a0fb87ed8917ba45eda5b"},
-    {file = "aiohttp-3.10.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:7f98e70bbbf693086efe4b86d381efad8edac040b8ad02821453083d15ec315f"},
-    {file = "aiohttp-3.10.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9f6f0b252a009e98fe84028a4ec48396a948e7a65b8be06ccfc6ef68cf1f614d"},
-    {file = "aiohttp-3.10.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:9360e3ffc7b23565600e729e8c639c3c50d5520e05fdf94aa2bd859eef12c407"},
-    {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3988044d1635c7821dd44f0edfbe47e9875427464e59d548aece447f8c22800a"},
-    {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:30a9d59da1543a6f1478c3436fd49ec59be3868bca561a33778b4391005e499d"},
-    {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f9f49bdb94809ac56e09a310a62f33e5f22973d6fd351aac72a39cd551e98194"},
-    {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddfd2dca3f11c365d6857a07e7d12985afc59798458a2fdb2ffa4a0332a3fd43"},
-    {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:685c1508ec97b2cd3e120bfe309a4ff8e852e8a7460f1ef1de00c2c0ed01e33c"},
-    {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:49904f38667c44c041a0b44c474b3ae36948d16a0398a8f8cd84e2bb3c42a069"},
-    {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:352f3a4e5f11f3241a49b6a48bc5b935fabc35d1165fa0d87f3ca99c1fcca98b"},
-    {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:fc61f39b534c5d5903490478a0dd349df397d2284a939aa3cbaa2fb7a19b8397"},
-    {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:ad2274e707be37420d0b6c3d26a8115295fe9d8e6e530fa6a42487a8ca3ad052"},
-    {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:c836bf3c7512100219fe1123743fd8dd9a2b50dd7cfb0c3bb10d041309acab4b"},
-    {file = "aiohttp-3.10.2-cp38-cp38-win32.whl", hash = "sha256:53e8898adda402be03ff164b0878abe2d884e3ea03a4701e6ad55399d84b92dc"},
-    {file = "aiohttp-3.10.2-cp38-cp38-win_amd64.whl", hash = "sha256:7cc8f65f5b22304693de05a245b6736b14cb5bc9c8a03da6e2ae9ef15f8b458f"},
-    {file = "aiohttp-3.10.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:9dfc906d656e14004c5bc672399c1cccc10db38df2b62a13fb2b6e165a81c316"},
-    {file = "aiohttp-3.10.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:91b10208b222ddf655c3a3d5b727879d7163db12b634492df41a9182a76edaae"},
-    {file = "aiohttp-3.10.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9fd16b5e1a7bdd14668cd6bde60a2a29b49147a535c74f50d8177d11b38433a7"},
-    {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b2bfdda4971bd79201f59adbad24ec2728875237e1c83bba5221284dbbf57bda"},
-    {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:69d73f869cf29e8a373127fc378014e2b17bcfbe8d89134bc6fb06a2f67f3cb3"},
-    {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:df59f8486507c421c0620a2c3dce81fbf1d54018dc20ff4fecdb2c106d6e6abc"},
-    {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0df930015db36b460aa9badbf35eccbc383f00d52d4b6f3de2ccb57d064a6ade"},
-    {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:562b1153ab7f766ee6b8b357ec777a302770ad017cf18505d34f1c088fccc448"},
-    {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:d984db6d855de58e0fde1ef908d48fe9a634cadb3cf715962722b4da1c40619d"},
-    {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:14dc3fcb0d877911d775d511eb617a486a8c48afca0a887276e63db04d3ee920"},
-    {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:b52a27a5c97275e254704e1049f4b96a81e67d6205f52fa37a4777d55b0e98ef"},
-    {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:cd33d9de8cfd006a0d0fe85f49b4183c57e91d18ffb7e9004ce855e81928f704"},
-    {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:1238fc979160bc03a92fff9ad021375ff1c8799c6aacb0d8ea1b357ea40932bb"},
-    {file = "aiohttp-3.10.2-cp39-cp39-win32.whl", hash = "sha256:e2f43d238eae4f0b04f58d4c0df4615697d4ca3e9f9b1963d49555a94f0f5a04"},
-    {file = "aiohttp-3.10.2-cp39-cp39-win_amd64.whl", hash = "sha256:947847f07a8f81d7b39b2d0202fd73e61962ebe17ac2d8566f260679e467da7b"},
-    {file = "aiohttp-3.10.2.tar.gz", hash = "sha256:4d1f694b5d6e459352e5e925a42e05bac66655bfde44d81c59992463d2897014"},
+    {file = "aiohttp-3.9.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:76d32588ef7e4a3f3adff1956a0ba96faabbdee58f2407c122dd45aa6e34f372"},
+    {file = "aiohttp-3.9.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:56181093c10dbc6ceb8a29dfeea1e815e1dfdc020169203d87fd8d37616f73f9"},
+    {file = "aiohttp-3.9.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c7a5b676d3c65e88b3aca41816bf72831898fcd73f0cbb2680e9d88e819d1e4d"},
+    {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d1df528a85fb404899d4207a8d9934cfd6be626e30e5d3a5544a83dbae6d8a7e"},
+    {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f595db1bceabd71c82e92df212dd9525a8a2c6947d39e3c994c4f27d2fe15b11"},
+    {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9c0b09d76e5a4caac3d27752027fbd43dc987b95f3748fad2b924a03fe8632ad"},
+    {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:689eb4356649ec9535b3686200b231876fb4cab4aca54e3bece71d37f50c1d13"},
+    {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a3666cf4182efdb44d73602379a66f5fdfd5da0db5e4520f0ac0dcca644a3497"},
+    {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b65b0f8747b013570eea2f75726046fa54fa8e0c5db60f3b98dd5d161052004a"},
+    {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:a1885d2470955f70dfdd33a02e1749613c5a9c5ab855f6db38e0b9389453dce7"},
+    {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:0593822dcdb9483d41f12041ff7c90d4d1033ec0e880bcfaf102919b715f47f1"},
+    {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:47f6eb74e1ecb5e19a78f4a4228aa24df7fbab3b62d4a625d3f41194a08bd54f"},
+    {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:c8b04a3dbd54de6ccb7604242fe3ad67f2f3ca558f2d33fe19d4b08d90701a89"},
+    {file = "aiohttp-3.9.4-cp310-cp310-win32.whl", hash = "sha256:8a78dfb198a328bfb38e4308ca8167028920fb747ddcf086ce706fbdd23b2926"},
+    {file = "aiohttp-3.9.4-cp310-cp310-win_amd64.whl", hash = "sha256:e78da6b55275987cbc89141a1d8e75f5070e577c482dd48bd9123a76a96f0bbb"},
+    {file = "aiohttp-3.9.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c111b3c69060d2bafc446917534150fd049e7aedd6cbf21ba526a5a97b4402a5"},
+    {file = "aiohttp-3.9.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:efbdd51872cf170093998c87ccdf3cb5993add3559341a8e5708bcb311934c94"},
+    {file = "aiohttp-3.9.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7bfdb41dc6e85d8535b00d73947548a748e9534e8e4fddd2638109ff3fb081df"},
+    {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2bd9d334412961125e9f68d5b73c1d0ab9ea3f74a58a475e6b119f5293eee7ba"},
+    {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:35d78076736f4a668d57ade00c65d30a8ce28719d8a42471b2a06ccd1a2e3063"},
+    {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:824dff4f9f4d0f59d0fa3577932ee9a20e09edec8a2f813e1d6b9f89ced8293f"},
+    {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:52b8b4e06fc15519019e128abedaeb56412b106ab88b3c452188ca47a25c4093"},
+    {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eae569fb1e7559d4f3919965617bb39f9e753967fae55ce13454bec2d1c54f09"},
+    {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:69b97aa5792428f321f72aeb2f118e56893371f27e0b7d05750bcad06fc42ca1"},
+    {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4d79aad0ad4b980663316f26d9a492e8fab2af77c69c0f33780a56843ad2f89e"},
+    {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:d6577140cd7db19e430661e4b2653680194ea8c22c994bc65b7a19d8ec834403"},
+    {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:9860d455847cd98eb67897f5957b7cd69fbcb436dd3f06099230f16a66e66f79"},
+    {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:69ff36d3f8f5652994e08bd22f093e11cfd0444cea310f92e01b45a4e46b624e"},
+    {file = "aiohttp-3.9.4-cp311-cp311-win32.whl", hash = "sha256:e27d3b5ed2c2013bce66ad67ee57cbf614288bda8cdf426c8d8fe548316f1b5f"},
+    {file = "aiohttp-3.9.4-cp311-cp311-win_amd64.whl", hash = "sha256:d6a67e26daa686a6fbdb600a9af8619c80a332556245fa8e86c747d226ab1a1e"},
+    {file = "aiohttp-3.9.4-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:c5ff8ff44825736a4065d8544b43b43ee4c6dd1530f3a08e6c0578a813b0aa35"},
+    {file = "aiohttp-3.9.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:d12a244627eba4e9dc52cbf924edef905ddd6cafc6513849b4876076a6f38b0e"},
+    {file = "aiohttp-3.9.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:dcad56c8d8348e7e468899d2fb3b309b9bc59d94e6db08710555f7436156097f"},
+    {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f7e69a7fd4b5ce419238388e55abd220336bd32212c673ceabc57ccf3d05b55"},
+    {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4870cb049f10d7680c239b55428916d84158798eb8f353e74fa2c98980dcc0b"},
+    {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3b2feaf1b7031ede1bc0880cec4b0776fd347259a723d625357bb4b82f62687b"},
+    {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:939393e8c3f0a5bcd33ef7ace67680c318dc2ae406f15e381c0054dd658397de"},
+    {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d2334e387b2adcc944680bebcf412743f2caf4eeebd550f67249c1c3696be04"},
+    {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:e0198ea897680e480845ec0ffc5a14e8b694e25b3f104f63676d55bf76a82f1a"},
+    {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:e40d2cd22914d67c84824045861a5bb0fb46586b15dfe4f046c7495bf08306b2"},
+    {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:aba80e77c227f4234aa34a5ff2b6ff30c5d6a827a91d22ff6b999de9175d71bd"},
+    {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:fb68dc73bc8ac322d2e392a59a9e396c4f35cb6fdbdd749e139d1d6c985f2527"},
+    {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:f3460a92638dce7e47062cf088d6e7663adb135e936cb117be88d5e6c48c9d53"},
+    {file = "aiohttp-3.9.4-cp312-cp312-win32.whl", hash = "sha256:32dc814ddbb254f6170bca198fe307920f6c1308a5492f049f7f63554b88ef36"},
+    {file = "aiohttp-3.9.4-cp312-cp312-win_amd64.whl", hash = "sha256:63f41a909d182d2b78fe3abef557fcc14da50c7852f70ae3be60e83ff64edba5"},
+    {file = "aiohttp-3.9.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:c3770365675f6be220032f6609a8fbad994d6dcf3ef7dbcf295c7ee70884c9af"},
+    {file = "aiohttp-3.9.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:305edae1dea368ce09bcb858cf5a63a064f3bff4767dec6fa60a0cc0e805a1d3"},
+    {file = "aiohttp-3.9.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6f121900131d116e4a93b55ab0d12ad72573f967b100e49086e496a9b24523ea"},
+    {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b71e614c1ae35c3d62a293b19eface83d5e4d194e3eb2fabb10059d33e6e8cbf"},
+    {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:419f009fa4cfde4d16a7fc070d64f36d70a8d35a90d71aa27670bba2be4fd039"},
+    {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7b39476ee69cfe64061fd77a73bf692c40021f8547cda617a3466530ef63f947"},
+    {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b33f34c9c7decdb2ab99c74be6443942b730b56d9c5ee48fb7df2c86492f293c"},
+    {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c78700130ce2dcebb1a8103202ae795be2fa8c9351d0dd22338fe3dac74847d9"},
+    {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:268ba22d917655d1259af2d5659072b7dc11b4e1dc2cb9662fdd867d75afc6a4"},
+    {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:17e7c051f53a0d2ebf33013a9cbf020bb4e098c4bc5bce6f7b0c962108d97eab"},
+    {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:7be99f4abb008cb38e144f85f515598f4c2c8932bf11b65add0ff59c9c876d99"},
+    {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:d58a54d6ff08d2547656356eea8572b224e6f9bbc0cf55fa9966bcaac4ddfb10"},
+    {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:7673a76772bda15d0d10d1aa881b7911d0580c980dbd16e59d7ba1422b2d83cd"},
+    {file = "aiohttp-3.9.4-cp38-cp38-win32.whl", hash = "sha256:e4370dda04dc8951012f30e1ce7956a0a226ac0714a7b6c389fb2f43f22a250e"},
+    {file = "aiohttp-3.9.4-cp38-cp38-win_amd64.whl", hash = "sha256:eb30c4510a691bb87081192a394fb661860e75ca3896c01c6d186febe7c88530"},
+    {file = "aiohttp-3.9.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:84e90494db7df3be5e056f91412f9fa9e611fbe8ce4aaef70647297f5943b276"},
+    {file = "aiohttp-3.9.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7d4845f8501ab28ebfdbeab980a50a273b415cf69e96e4e674d43d86a464df9d"},
+    {file = "aiohttp-3.9.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:69046cd9a2a17245c4ce3c1f1a4ff8c70c7701ef222fce3d1d8435f09042bba1"},
+    {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b73a06bafc8dcc508420db43b4dd5850e41e69de99009d0351c4f3007960019"},
+    {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:418bb0038dfafeac923823c2e63226179976c76f981a2aaad0ad5d51f2229bca"},
+    {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:71a8f241456b6c2668374d5d28398f8e8cdae4cce568aaea54e0f39359cd928d"},
+    {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:935c369bf8acc2dc26f6eeb5222768aa7c62917c3554f7215f2ead7386b33748"},
+    {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:74e4e48c8752d14ecfb36d2ebb3d76d614320570e14de0a3aa7a726ff150a03c"},
+    {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:916b0417aeddf2c8c61291238ce25286f391a6acb6f28005dd9ce282bd6311b6"},
+    {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:9b6787b6d0b3518b2ee4cbeadd24a507756ee703adbac1ab6dc7c4434b8c572a"},
+    {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:221204dbda5ef350e8db6287937621cf75e85778b296c9c52260b522231940ed"},
+    {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:10afd99b8251022ddf81eaed1d90f5a988e349ee7d779eb429fb07b670751e8c"},
+    {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:2506d9f7a9b91033201be9ffe7d89c6a54150b0578803cce5cb84a943d075bc3"},
+    {file = "aiohttp-3.9.4-cp39-cp39-win32.whl", hash = "sha256:e571fdd9efd65e86c6af2f332e0e95dad259bfe6beb5d15b3c3eca3a6eb5d87b"},
+    {file = "aiohttp-3.9.4-cp39-cp39-win_amd64.whl", hash = "sha256:7d29dd5319d20aa3b7749719ac9685fbd926f71ac8c77b2477272725f882072d"},
+    {file = "aiohttp-3.9.4.tar.gz", hash = "sha256:6ff71ede6d9a5a58cfb7b6fffc83ab5d4a63138276c771ac91ceaaddf5459644"},
 ]

 [package.dependencies]
-aiohappyeyeballs = ">=2.3.0"
 aiosignal = ">=1.1.2"
 async-timeout = {version = ">=4.0,<5.0", markers = "python_version < \"3.11\""}
 attrs = ">=17.3.0"
@@ -106,7 +94,7 @@ multidict = ">=4.5,<7.0"
 yarl = ">=1.0,<2.0"

 [package.extras]
-speedups = ["Brotli", "aiodns (>=3.2.0)", "brotlicffi"]
+speedups = ["Brotli", "aiodns", "brotlicffi"]

 [[package]]
 name = "aiopg"
@@ -3383,4 +3371,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "c09bcb333ab550958b33dbf4fec968c500d8e701fd4c96402cddbd9bb8048055"
+content-hash = "d569a3593b98baceb0a88e176bdad63cae99d6bfc2a81bf6741663a4abcafd72"
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,7 +32,7 @@ psutil = "^5.9.4"
 types-psutil = "^5.9.5.12"
 types-toml = "^0.10.8.6"
 pytest-httpserver = "^1.0.8"
-aiohttp = "3.10.2"
+aiohttp = "3.9.4"
 pytest-rerunfailures = "^13.0"
 types-pytest-lazy-fixture = "^0.6.3.3"
 pytest-split = "^0.8.1"
--- a/rust-toolchain.toml
+++ b/rust-toolchain.toml
@@ -1,5 +1,5 @@
 [toolchain]
-channel = "1.80.1"
+channel = "1.80.0"
 profile = "default"
 # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
 # https://rust-lang.github.io/rustup/concepts/profiles.html
--- a/storage_controller/src/drain_utils.rs
+++ b/storage_controller/src/drain_utils.rs
@@ -1,225 +0,0 @@
-use std::{
-    collections::{BTreeMap, HashMap},
-    sync::Arc,
-};
-
-use pageserver_api::controller_api::NodeSchedulingPolicy;
-use utils::{id::NodeId, shard::TenantShardId};
-
-use crate::{
-    background_node_operations::OperationError, node::Node, scheduler::Scheduler,
-    tenant_shard::TenantShard,
-};
-
-pub(crate) struct TenantShardIterator<F> {
-    tenants_accessor: F,
-    inspected_all_shards: bool,
-    last_inspected_shard: Option<TenantShardId>,
-}
-
-/// A simple iterator which can be used in tandem with [`crate::service::Service`]
-/// to iterate over all known tenant shard ids without holding the lock on the
-/// service state at all times.
-impl<F> TenantShardIterator<F>
-where
-    F: Fn(Option<TenantShardId>) -> Option<TenantShardId>,
-{
-    pub(crate) fn new(tenants_accessor: F) -> Self {
-        Self {
-            tenants_accessor,
-            inspected_all_shards: false,
-            last_inspected_shard: None,
-        }
-    }
-
-    /// Returns the next tenant shard id if one exists
-    pub(crate) fn next(&mut self) -> Option<TenantShardId> {
-        if self.inspected_all_shards {
-            return None;
-        }
-
-        match (self.tenants_accessor)(self.last_inspected_shard) {
-            Some(tid) => {
-                self.last_inspected_shard = Some(tid);
-                Some(tid)
-            }
-            None => {
-                self.inspected_all_shards = true;
-                None
-            }
-        }
-    }
-
-    /// Returns true when the end of the iterator is reached and false otherwise
-    pub(crate) fn finished(&self) -> bool {
-        self.inspected_all_shards
-    }
-}
-
-/// Check that the state of the node being drained is as expected:
-/// node is present in memory and scheduling policy is set to [`NodeSchedulingPolicy::Draining`]
-pub(crate) fn validate_node_state(
-    node_id: &NodeId,
-    nodes: Arc<HashMap<NodeId, Node>>,
-) -> Result<(), OperationError> {
-    let node = nodes.get(node_id).ok_or(OperationError::NodeStateChanged(
-        format!("node {} was removed", node_id).into(),
-    ))?;
-
-    let current_policy = node.get_scheduling();
-    if !matches!(current_policy, NodeSchedulingPolicy::Draining) {
-        // TODO(vlad): maybe cancel pending reconciles before erroring out. need to think
-        // about it
-        return Err(OperationError::NodeStateChanged(
-            format!("node {} changed state to {:?}", node_id, current_policy).into(),
-        ));
-    }
-
-    Ok(())
-}
-
-/// Struct that houses a few utility methods for draining pageserver nodes
-pub(crate) struct TenantShardDrain {
-    pub(crate) drained_node: NodeId,
-    pub(crate) tenant_shard_id: TenantShardId,
-}
-
-impl TenantShardDrain {
-    /// Check if the tenant shard under question is eligible for drainining:
-    /// it's primary attachment is on the node being drained
-    pub(crate) fn tenant_shard_eligible_for_drain(
-        &self,
-        tenants: &BTreeMap<TenantShardId, TenantShard>,
-        scheduler: &Scheduler,
-    ) -> Option<NodeId> {
-        let tenant_shard = tenants.get(&self.tenant_shard_id)?;
-
-        if *tenant_shard.intent.get_attached() != Some(self.drained_node) {
-            return None;
-        }
-
-        match scheduler.node_preferred(tenant_shard.intent.get_secondary()) {
-            Some(node) => Some(node),
-            None => {
-                tracing::warn!(
-                    tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(),
-                    "No eligible secondary while draining {}", self.drained_node
-                );
-
-                None
-            }
-        }
-    }
-
-    /// Attempt to reschedule the tenant shard under question to one of its secondary locations
-    /// Returns an Err when the operation should be aborted and Ok(None) when the tenant shard
-    /// should be skipped.
-    pub(crate) fn reschedule_to_secondary<'a>(
-        &self,
-        destination: NodeId,
-        tenants: &'a mut BTreeMap<TenantShardId, TenantShard>,
-        scheduler: &mut Scheduler,
-        nodes: &Arc<HashMap<NodeId, Node>>,
-    ) -> Result<Option<&'a mut TenantShard>, OperationError> {
-        let tenant_shard = match tenants.get_mut(&self.tenant_shard_id) {
-            Some(some) => some,
-            None => {
-                // Tenant shard was removed in the meantime.
-                // Skip to the next one, but don't fail the overall operation
-                return Ok(None);
-            }
-        };
-
-        if !nodes.contains_key(&destination) {
-            return Err(OperationError::NodeStateChanged(
-                format!("node {} was removed", destination).into(),
-            ));
-        }
-
-        if !tenant_shard.intent.get_secondary().contains(&destination) {
-            tracing::info!(
-                tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(),
-                "Secondary moved away from {destination} during drain"
-            );
-
-            return Ok(None);
-        }
-
-        match tenant_shard.reschedule_to_secondary(Some(destination), scheduler) {
-            Err(e) => {
-                tracing::warn!(
-                    tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(),
-                    "Scheduling error when draining pageserver {} : {}", self.drained_node, e
-                );
-
-                Ok(None)
-            }
-            Ok(()) => {
-                let scheduled_to = tenant_shard.intent.get_attached();
-                tracing::info!(
-                    tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(),
-                    "Rescheduled shard while draining node {}: {} -> {:?}",
-                    self.drained_node,
-                    self.drained_node,
-                    scheduled_to
-                );
-
-                Ok(Some(tenant_shard))
-            }
-        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use std::sync::Arc;
-
-    use utils::{
-        id::TenantId,
-        shard::{ShardCount, ShardNumber, TenantShardId},
-    };
-
-    use super::TenantShardIterator;
-
-    #[test]
-    fn test_tenant_shard_iterator() {
-        let tenant_id = TenantId::generate();
-        let shard_count = ShardCount(8);
-
-        let mut tenant_shards = Vec::default();
-        for i in 0..shard_count.0 {
-            tenant_shards.push((
-                TenantShardId {
-                    tenant_id,
-                    shard_number: ShardNumber(i),
-                    shard_count,
-                },
-                (),
-            ))
-        }
-
-        let tenant_shards = Arc::new(tenant_shards);
-
-        let mut tid_iter = TenantShardIterator::new({
-            let tenants = tenant_shards.clone();
-            move |last_inspected_shard: Option<TenantShardId>| {
-                let entry = match last_inspected_shard {
-                    Some(skip_past) => {
-                        let mut cursor = tenants.iter().skip_while(|(tid, _)| *tid != skip_past);
-                        cursor.nth(1)
-                    }
-                    None => tenants.first(),
-                };
-
-                entry.map(|(tid, _)| tid).copied()
-            }
-        });
-
-        let mut iterated_over = Vec::default();
-        while let Some(tid) = tid_iter.next() {
-            iterated_over.push((tid, ()));
-        }
-
-        assert_eq!(iterated_over, *tenant_shards);
-    }
-}
--- a/storage_controller/src/lib.rs
+++ b/storage_controller/src/lib.rs
@@ -4,7 +4,6 @@ use utils::seqwait::MonotonicCounter;
 mod auth;
 mod background_node_operations;
 mod compute_hook;
-mod drain_utils;
 mod heartbeater;
 pub mod http;
 mod id_lock_map;
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -92,11 +92,6 @@ struct Cli {
    /// Chaos testing
    #[arg(long)]
    chaos_interval: Option<humantime::Duration>,
-
-    // Maximum acceptable lag for the secondary location while draining
-    // a pageserver
-    #[arg(long)]
-    max_secondary_lag_bytes: Option<u64>,
 }

 enum StrictMode {
@@ -284,7 +279,6 @@ async fn async_main() -> anyhow::Result<()> {
            .unwrap_or(RECONCILER_CONCURRENCY_DEFAULT),
        split_threshold: args.split_threshold,
        neon_local_repo_dir: args.neon_local_repo_dir,
-        max_secondary_lag_bytes: args.max_secondary_lag_bytes,
    };

    // After loading secrets & config, but before starting anything else, apply database migrations
--- a/storage_controller/src/reconciler.rs
+++ b/storage_controller/src/reconciler.rs
@@ -39,9 +39,6 @@ pub(super) struct Reconciler {
    /// to detach this tenant shard.
    pub(crate) detach: Vec<Node>,

-    /// Configuration specific to this reconciler
-    pub(crate) reconciler_config: ReconcilerConfig,
-
    pub(crate) config: TenantConfig,
    pub(crate) observed: ObservedState,

@@ -76,65 +73,6 @@ pub(super) struct Reconciler {
    pub(crate) persistence: Arc<Persistence>,
 }

-pub(crate) struct ReconcilerConfigBuilder {
-    config: ReconcilerConfig,
-}
-
-impl ReconcilerConfigBuilder {
-    pub(crate) fn new() -> Self {
-        Self {
-            config: ReconcilerConfig::default(),
-        }
-    }
-
-    pub(crate) fn secondary_warmup_timeout(self, value: Duration) -> Self {
-        Self {
-            config: ReconcilerConfig {
-                secondary_warmup_timeout: Some(value),
-                ..self.config
-            },
-        }
-    }
-
-    pub(crate) fn secondary_download_request_timeout(self, value: Duration) -> Self {
-        Self {
-            config: ReconcilerConfig {
-                secondary_download_request_timeout: Some(value),
-                ..self.config
-            },
-        }
-    }
-
-    pub(crate) fn build(self) -> ReconcilerConfig {
-        self.config
-    }
-}
-
-#[derive(Default, Debug, Copy, Clone)]
-pub(crate) struct ReconcilerConfig {
-    // During live migration give up on warming-up the secondary
-    // after this timeout.
-    secondary_warmup_timeout: Option<Duration>,
-
-    // During live migrations this is the amount of time that
-    // the pagserver will hold our poll.
-    secondary_download_request_timeout: Option<Duration>,
-}
-
-impl ReconcilerConfig {
-    pub(crate) fn get_secondary_warmup_timeout(&self) -> Duration {
-        const SECONDARY_WARMUP_TIMEOUT_DEFAULT: Duration = Duration::from_secs(300);
-        self.secondary_warmup_timeout
-            .unwrap_or(SECONDARY_WARMUP_TIMEOUT_DEFAULT)
-    }
-
-    pub(crate) fn get_secondary_download_request_timeout(&self) -> Duration {
-        const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT_DEFAULT: Duration = Duration::from_secs(20);
-        self.secondary_download_request_timeout
-            .unwrap_or(SECONDARY_DOWNLOAD_REQUEST_TIMEOUT_DEFAULT)
-    }
-}
-
 /// RAII resource units granted to a Reconciler, which it should keep alive until it finishes doing I/O
 pub(crate) struct ReconcileUnits {
    _sem_units: tokio::sync::OwnedSemaphorePermit,
@@ -362,13 +300,11 @@ impl Reconciler {
    ) -> Result<(), ReconcileError> {
        // This is not the timeout for a request, but the total amount of time we're willing to wait
        // for a secondary location to get up to date before
-        let total_download_timeout = self.reconciler_config.get_secondary_warmup_timeout();
+        const TOTAL_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(300);

        // This the long-polling interval for the secondary download requests we send to destination pageserver
        // during a migration.
-        let request_download_timeout = self
-            .reconciler_config
-            .get_secondary_download_request_timeout();
+        const REQUEST_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(20);

        let started_at = Instant::now();

@@ -379,14 +315,14 @@ impl Reconciler {
                        client
                            .tenant_secondary_download(
                                tenant_shard_id,
-                                Some(request_download_timeout),
+                                Some(REQUEST_DOWNLOAD_TIMEOUT),
                            )
                            .await
                    },
                    &self.service_config.jwt_token,
                    1,
                    3,
-                    request_download_timeout * 2,
+                    REQUEST_DOWNLOAD_TIMEOUT * 2,
                    &self.cancel,
                )
                .await
@@ -414,7 +350,7 @@ impl Reconciler {
                return Ok(());
            } else if status == StatusCode::ACCEPTED {
                let total_runtime = started_at.elapsed();
-                if total_runtime > total_download_timeout {
+                if total_runtime > TOTAL_DOWNLOAD_TIMEOUT {
                    tracing::warn!("Timed out after {}ms downloading layers to {node}.  Progress so far: {}/{} layers, {}/{} bytes",
                        total_runtime.as_millis(),
                        progress.layers_downloaded,
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -14,11 +14,10 @@ use crate::{
        Drain, Fill, Operation, OperationError, OperationHandler, MAX_RECONCILES_PER_OPERATION,
    },
    compute_hook::NotifyError,
-    drain_utils::{self, TenantShardDrain, TenantShardIterator},
    id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard},
    metrics::LeadershipStatusGroup,
    persistence::{AbortShardSplitStatus, MetadataHealthPersistence, TenantFilter},
-    reconciler::{ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder},
+    reconciler::{ReconcileError, ReconcileUnits},
    scheduler::{MaySchedule, ScheduleContext, ScheduleMode},
    tenant_shard::{
        MigrateAttachment, ReconcileNeeded, ReconcilerStatus, ScheduleOptimization,
@@ -326,12 +325,6 @@ pub struct Config {

    // TODO: make this cfg(feature  = "testing")
    pub neon_local_repo_dir: Option<PathBuf>,
-
-    // Maximum acceptable download lag for the secondary location
-    // while draining a node. If the secondary location is lagging
-    // by more than the configured amount, then the secondary is not
-    // upgraded to primary.
-    pub max_secondary_lag_bytes: Option<u64>,
 }

 impl From<DatabaseError> for ApiError {
@@ -5194,22 +5187,11 @@ impl Service {
        Ok(())
    }

-    /// Like [`Self::maybe_configured_reconcile_shard`], but uses the default reconciler
-    /// configuration
+    /// Wrap [`TenantShard`] reconciliation methods with acquisition of [`Gate`] and [`ReconcileUnits`],
    fn maybe_reconcile_shard(
        &self,
        shard: &mut TenantShard,
        nodes: &Arc<HashMap<NodeId, Node>>,
-    ) -> Option<ReconcilerWaiter> {
-        self.maybe_configured_reconcile_shard(shard, nodes, ReconcilerConfig::default())
-    }
-
-    /// Wrap [`TenantShard`] reconciliation methods with acquisition of [`Gate`] and [`ReconcileUnits`],
-    fn maybe_configured_reconcile_shard(
-        &self,
-        shard: &mut TenantShard,
-        nodes: &Arc<HashMap<NodeId, Node>>,
-        reconciler_config: ReconcilerConfig,
    ) -> Option<ReconcilerWaiter> {
        let reconcile_needed = shard.get_reconcile_needed(nodes);

@@ -5259,7 +5241,6 @@ impl Service {
            &self.result_tx,
            nodes,
            &self.compute_hook,
-            reconciler_config,
            &self.config,
            &self.persistence,
            units,
@@ -5734,92 +5715,18 @@ impl Service {
        self.gate.close().await;
    }

-    /// Spot check the download lag for a secondary location of a shard.
-    /// Should be used as a heuristic, since it's not always precise: the
-    /// secondary might have not downloaded the new heat map yet and, hence,
-    /// is not aware of the lag.
-    ///
-    /// Returns:
-    /// * Ok(None) if the lag could not be determined from the status,
-    /// * Ok(Some(_)) if the lag could be determind
-    /// * Err on failures to query the pageserver.
-    async fn secondary_lag(
-        &self,
-        secondary: &NodeId,
-        tenant_shard_id: TenantShardId,
-    ) -> Result<Option<u64>, mgmt_api::Error> {
-        let nodes = self.inner.read().unwrap().nodes.clone();
-        let node = nodes.get(secondary).ok_or(mgmt_api::Error::ApiError(
-            StatusCode::NOT_FOUND,
-            format!("Node with id {} not found", secondary),
-        ))?;
-
-        match node
-            .with_client_retries(
-                |client| async move { client.tenant_secondary_status(tenant_shard_id).await },
-                &self.config.jwt_token,
-                1,
-                3,
-                Duration::from_millis(250),
-                &self.cancel,
-            )
-            .await
-        {
-            Some(Ok(status)) => match status.heatmap_mtime {
-                Some(_) => Ok(Some(status.bytes_total - status.bytes_downloaded)),
-                None => Ok(None),
-            },
-            Some(Err(e)) => Err(e),
-            None => Err(mgmt_api::Error::Cancelled),
-        }
-    }
-
    /// Drain a node by moving the shards attached to it as primaries.
    /// This is a long running operation and it should run as a separate Tokio task.
    pub(crate) async fn drain_node(
-        self: &Arc<Self>,
+        &self,
        node_id: NodeId,
        cancel: CancellationToken,
    ) -> Result<(), OperationError> {
-        const MAX_SECONDARY_LAG_BYTES_DEFAULT: u64 = 256 * 1024 * 1024;
-        let max_secondary_lag_bytes = self
-            .config
-            .max_secondary_lag_bytes
-            .unwrap_or(MAX_SECONDARY_LAG_BYTES_DEFAULT);
-
-        // By default, live migrations are generous about the wait time for getting
-        // the secondary location up to speed. When draining, give up earlier in order
-        // to not stall the operation when a cold secondary is encountered.
-        const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(20);
-        const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT: Duration = Duration::from_secs(5);
-        let reconciler_config = ReconcilerConfigBuilder::new()
-            .secondary_warmup_timeout(SECONDARY_WARMUP_TIMEOUT)
-            .secondary_download_request_timeout(SECONDARY_DOWNLOAD_REQUEST_TIMEOUT)
-            .build();
-
+        let mut last_inspected_shard: Option<TenantShardId> = None;
+        let mut inspected_all_shards = false;
        let mut waiters = Vec::new();

-        let mut tid_iter = TenantShardIterator::new({
-            let service = self.clone();
-            move |last_inspected_shard: Option<TenantShardId>| {
-                let locked = &service.inner.read().unwrap();
-                let tenants = &locked.tenants;
-                let entry = match last_inspected_shard {
-                    Some(skip_past) => {
-                        // Skip to the last seen tenant shard id
-                        let mut cursor = tenants.iter().skip_while(|(tid, _)| **tid != skip_past);
-
-                        // Skip past the last seen
-                        cursor.nth(1)
-                    }
-                    None => tenants.first_key_value(),
-                };
-
-                entry.map(|(tid, _)| tid).copied()
-            }
-        });
-
-        while !tid_iter.finished() {
+        while !inspected_all_shards {
            if cancel.is_cancelled() {
                match self
                    .node_configure(node_id, None, Some(NodeSchedulingPolicy::Active))
@@ -5838,82 +5745,71 @@ impl Service {
                }
            }

-            drain_utils::validate_node_state(&node_id, self.inner.read().unwrap().nodes.clone())?;
+            {
+                let mut locked = self.inner.write().unwrap();
+                let (nodes, tenants, scheduler) = locked.parts_mut();

-            while waiters.len() < MAX_RECONCILES_PER_OPERATION {
-                let tid = match tid_iter.next() {
-                    Some(tid) => tid,
-                    None => {
-                        break;
-                    }
-                };
+                let node = nodes.get(&node_id).ok_or(OperationError::NodeStateChanged(
+                    format!("node {node_id} was removed").into(),
+                ))?;

-                let tid_drain = TenantShardDrain {
-                    drained_node: node_id,
-                    tenant_shard_id: tid,
-                };
-
-                let dest_node_id = {
-                    let locked = self.inner.read().unwrap();
-
-                    match tid_drain
-                        .tenant_shard_eligible_for_drain(&locked.tenants, &locked.scheduler)
-                    {
-                        Some(node_id) => node_id,
-                        None => {
-                            continue;
-                        }
-                    }
-                };
-
-                match self.secondary_lag(&dest_node_id, tid).await {
-                    Ok(Some(lag)) if lag <= max_secondary_lag_bytes => {
-                        // The secondary is reasonably up to date.
-                        // Migrate to it
-                    }
-                    Ok(Some(lag)) => {
-                        tracing::info!(
-                            tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
-                            "Secondary on node {dest_node_id} is lagging by {lag}. Skipping reconcile."
-                        );
-                        continue;
-                    }
-                    Ok(None) => {
-                        tracing::info!(
-                            tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
-                            "Could not determine lag for secondary on node {dest_node_id}. Skipping reconcile."
-                        );
-                        continue;
-                    }
-                    Err(err) => {
-                        tracing::warn!(
-                            tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
-                            "Failed to get secondary lag from node {dest_node_id}. Skipping reconcile: {err}"
-                        );
-                        continue;
-                    }
+                let current_policy = node.get_scheduling();
+                if !matches!(current_policy, NodeSchedulingPolicy::Draining) {
+                    // TODO(vlad): maybe cancel pending reconciles before erroring out. need to think
+                    // about it
+                    return Err(OperationError::NodeStateChanged(
+                        format!("node {node_id} changed state to {current_policy:?}").into(),
+                    ));
                }

-                {
-                    let mut locked = self.inner.write().unwrap();
-                    let (nodes, tenants, scheduler) = locked.parts_mut();
-                    let rescheduled = tid_drain.reschedule_to_secondary(
-                        dest_node_id,
-                        tenants,
-                        scheduler,
-                        nodes,
-                    )?;
+                let mut cursor = tenants.iter_mut().skip_while({
+                    let skip_past = last_inspected_shard;
+                    move |(tid, _)| match skip_past {
+                        Some(last) => **tid != last,
+                        None => false,
+                    }
+                });

-                    if let Some(tenant_shard) = rescheduled {
-                        let waiter = self.maybe_configured_reconcile_shard(
-                            tenant_shard,
-                            nodes,
-                            reconciler_config,
-                        );
-                        if let Some(some) = waiter {
-                            waiters.push(some);
+                while waiters.len() < MAX_RECONCILES_PER_OPERATION {
+                    let (tid, tenant_shard) = match cursor.next() {
+                        Some(some) => some,
+                        None => {
+                            inspected_all_shards = true;
+                            break;
+                        }
+                    };
+
+                    // If the shard is not attached to the node being drained, skip it.
+                    if *tenant_shard.intent.get_attached() != Some(node_id) {
+                        last_inspected_shard = Some(*tid);
+                        continue;
+                    }
+
+                    match tenant_shard.reschedule_to_secondary(None, scheduler) {
+                        Err(e) => {
+                            tracing::warn!(
+                                tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
+                                "Scheduling error when draining pageserver {} : {e}", node_id
+                            );
+                        }
+                        Ok(()) => {
+                            let scheduled_to = tenant_shard.intent.get_attached();
+                            tracing::info!(
+                                tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
+                                "Rescheduled shard while draining node {}: {} -> {:?}",
+                                node_id,
+                                node_id,
+                                scheduled_to
+                            );
+
+                            let waiter = self.maybe_reconcile_shard(tenant_shard, nodes);
+                            if let Some(some) = waiter {
+                                waiters.push(some);
+                            }
                        }
                    }
+
+                    last_inspected_shard = Some(*tid);
                }
            }

--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -7,7 +7,7 @@ use std::{
 use crate::{
    metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome},
    persistence::TenantShardPersistence,
-    reconciler::{ReconcileUnits, ReconcilerConfig},
+    reconciler::ReconcileUnits,
    scheduler::{AffinityScore, MaySchedule, RefCountUpdate, ScheduleContext},
    service::ReconcileResultRequest,
 };
@@ -1063,7 +1063,6 @@ impl TenantShard {
        result_tx: &tokio::sync::mpsc::UnboundedSender<ReconcileResultRequest>,
        pageservers: &Arc<HashMap<NodeId, Node>>,
        compute_hook: &Arc<ComputeHook>,
-        reconciler_config: ReconcilerConfig,
        service_config: &service::Config,
        persistence: &Arc<Persistence>,
        units: ReconcileUnits,
@@ -1102,7 +1101,6 @@ impl TenantShard {
            generation: self.generation,
            intent: reconciler_intent,
            detach,
-            reconciler_config,
            config: self.config.clone(),
            observed: self.observed.clone(),
            compute_hook: compute_hook.clone(),
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -14,7 +14,6 @@ import textwrap
 import threading
 import time
 import uuid
-from collections import defaultdict
 from contextlib import closing, contextmanager
 from dataclasses import dataclass
 from datetime import datetime
@@ -2668,69 +2667,6 @@ class NeonStorageController(MetricsGetter, LogUtils):
        log.info(f"Got failpoints request response code {res.status_code}")
        res.raise_for_status()

-    def get_tenants_placement(self) -> defaultdict[str, Dict[str, Any]]:
-        """
-        Get the intent and observed placements of all tenants known to the storage controller.
-        """
-        tenants = self.tenant_list()
-
-        tenant_placement: defaultdict[str, Dict[str, Any]] = defaultdict(
-            lambda: {
-                "observed": {"attached": None, "secondary": []},
-                "intent": {"attached": None, "secondary": []},
-            }
-        )
-
-        for t in tenants:
-            for node_id, loc_state in t["observed"]["locations"].items():
-                if (
-                    loc_state is not None
-                    and "conf" in loc_state
-                    and loc_state["conf"] is not None
-                    and loc_state["conf"]["mode"]
-                    in set(["AttachedSingle", "AttachedMulti", "AttachedStale"])
-                ):
-                    tenant_placement[t["tenant_shard_id"]]["observed"]["attached"] = int(node_id)
-
-                if (
-                    loc_state is not None
-                    and "conf" in loc_state
-                    and loc_state["conf"] is not None
-                    and loc_state["conf"]["mode"] == "Secondary"
-                ):
-                    tenant_placement[t["tenant_shard_id"]]["observed"]["secondary"].append(
-                        int(node_id)
-                    )
-
-            if "attached" in t["intent"]:
-                tenant_placement[t["tenant_shard_id"]]["intent"]["attached"] = t["intent"][
-                    "attached"
-                ]
-
-            if "secondary" in t["intent"]:
-                tenant_placement[t["tenant_shard_id"]]["intent"]["secondary"] += t["intent"][
-                    "secondary"
-                ]
-
-        return tenant_placement
-
-    def warm_up_all_secondaries(self):
-        log.info("Warming up all secondary locations")
-
-        tenant_placement = self.get_tenants_placement()
-        for tid, placement in tenant_placement.items():
-            assert placement["observed"]["attached"] is not None
-            primary_id = placement["observed"]["attached"]
-
-            assert len(placement["observed"]["secondary"]) == 1
-            secondary_id = placement["observed"]["secondary"][0]
-
-            parsed_tid = TenantShardId.parse(tid)
-            self.env.get_pageserver(primary_id).http_client().tenant_heatmap_upload(parsed_tid)
-            self.env.get_pageserver(secondary_id).http_client().tenant_secondary_download(
-                parsed_tid, wait_ms=250
-            )
-
    @property
    def workdir(self) -> Path:
        return self.env.repo_dir
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -361,12 +361,6 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
        self.verbose_error(res)
        return (res.status_code, res.json())

-    def tenant_secondary_status(self, tenant_id: Union[TenantId, TenantShardId]):
-        url = f"http://localhost:{self.port}/v1/tenant/{tenant_id}/secondary/status"
-        res = self.get(url)
-        self.verbose_error(res)
-        return res.json()
-
    def set_tenant_config(self, tenant_id: Union[TenantId, TenantShardId], config: dict[str, Any]):
        assert "tenant_id" not in config.keys()
        res = self.put(
--- a/test_runner/fixtures/remote_storage.py
+++ b/test_runner/fixtures/remote_storage.py
@@ -177,14 +177,9 @@ class S3Storage:

    def access_env_vars(self) -> Dict[str, str]:
        if self.aws_profile is not None:
-            env = {
+            return {
                "AWS_PROFILE": self.aws_profile,
            }
-            # Pass through HOME env var because AWS_PROFILE needs it in order to work
-            home = os.getenv("HOME")
-            if home is not None:
-                env["HOME"] = home
-            return env
        if self.access_key is not None and self.secret_key is not None:
            return {
                "AWS_ACCESS_KEY_ID": self.access_key,
--- a/test_runner/logical_repl/test_debezium.py
+++ b/test_runner/logical_repl/test_debezium.py
@@ -12,6 +12,7 @@ import requests
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import RemotePostgres
 from fixtures.utils import wait_until
+from kafka import KafkaConsumer


 class DebeziumAPI:
@@ -94,8 +95,6 @@ def debezium(remote_pg: RemotePostgres):
    log.debug("%s %s %s", resp.status_code, resp.ok, resp.text)
    assert resp.status_code == 201
    assert len(dbz.list_connectors()) == 1
-    from kafka import KafkaConsumer
-
    consumer = KafkaConsumer(
        "dbserver1.inventory.customers",
        bootstrap_servers=["kafka:9092"],
--- a/test_runner/performance/test_storage_controller_scale.py
+++ b/test_runner/performance/test_storage_controller_scale.py
@@ -2,6 +2,7 @@ import concurrent.futures
 import random
 import time
 from collections import defaultdict
+from typing import Any, Dict

 import pytest
 from fixtures.common_types import TenantId, TenantShardId, TimelineId
@@ -23,14 +24,51 @@ def get_consistent_node_shard_counts(env: NeonEnv, total_shards) -> defaultdict[
    This function takes into account the intersection of the intent and the observed state.
    If they do not match, it asserts out.
    """
-    tenant_placement = env.storage_controller.get_tenants_placement()
+    tenants = env.storage_controller.tenant_list()
+
+    intent = dict()
+    observed = dict()
+
+    tenant_placement: defaultdict[str, Dict[str, Any]] = defaultdict(
+        lambda: {
+            "observed": {"attached": None, "secondary": []},
+            "intent": {"attached": None, "secondary": []},
+        }
+    )
+
+    for t in tenants:
+        for node_id, loc_state in t["observed"]["locations"].items():
+            if (
+                loc_state is not None
+                and "conf" in loc_state
+                and loc_state["conf"] is not None
+                and loc_state["conf"]["mode"]
+                in set(["AttachedSingle", "AttachedMulti", "AttachedStale"])
+            ):
+                observed[t["tenant_shard_id"]] = int(node_id)
+                tenant_placement[t["tenant_shard_id"]]["observed"]["attached"] = int(node_id)
+
+            if (
+                loc_state is not None
+                and "conf" in loc_state
+                and loc_state["conf"] is not None
+                and loc_state["conf"]["mode"] == "Secondary"
+            ):
+                tenant_placement[t["tenant_shard_id"]]["observed"]["secondary"].append(int(node_id))
+
+        if "attached" in t["intent"]:
+            intent[t["tenant_shard_id"]] = t["intent"]["attached"]
+            tenant_placement[t["tenant_shard_id"]]["intent"]["attached"] = t["intent"]["attached"]
+
+        if "secondary" in t["intent"]:
+            tenant_placement[t["tenant_shard_id"]]["intent"]["secondary"] += t["intent"][
+                "secondary"
+            ]
+
    log.info(f"{tenant_placement=}")

    matching = {
-        tid: tenant_placement[tid]["intent"]["attached"]
-        for tid in tenant_placement
-        if tenant_placement[tid]["intent"]["attached"]
-        == tenant_placement[tid]["observed"]["attached"]
+        tid: intent[tid] for tid in observed if tid in intent and intent[tid] == observed[tid]
    }
    assert len(matching) == total_shards

--- a/test_runner/regress/test_pg_regress.py
+++ b/test_runner/regress/test_pg_regress.py
@@ -144,13 +144,7 @@ def test_pg_regress(
    )

    # Connect to postgres and create a database called "regression".
-    endpoint = env.endpoints.create_start(
-        "main",
-        config_lines=[
-            # Enable the test mode, so that we don't need to patch the test cases.
-            "neon.regress_test_mode = true",
-        ],
-    )
+    endpoint = env.endpoints.create_start("main")
    endpoint.safe_psql(f"CREATE DATABASE {DBNAME}")

    # Create some local directories for pg_regress to run in.
@@ -213,14 +207,7 @@ def test_isolation(

    # Connect to postgres and create a database called "regression".
    # isolation tests use prepared transactions, so enable them
-    endpoint = env.endpoints.create_start(
-        "main",
-        config_lines=[
-            "max_prepared_transactions=100",
-            # Enable the test mode, so that we don't need to patch the test cases.
-            "neon.regress_test_mode = true",
-        ],
-    )
+    endpoint = env.endpoints.create_start("main", config_lines=["max_prepared_transactions=100"])
    endpoint.safe_psql(f"CREATE DATABASE {DBNAME}")

    # Create some local directories for pg_isolation_regress to run in.
@@ -281,13 +268,7 @@ def test_sql_regress(
    )

    # Connect to postgres and create a database called "regression".
-    endpoint = env.endpoints.create_start(
-        "main",
-        config_lines=[
-            # Enable the test mode, so that we don't need to patch the test cases.
-            "neon.regress_test_mode = true",
-        ],
-    )
+    endpoint = env.endpoints.create_start("main")
    endpoint.safe_psql(f"CREATE DATABASE {DBNAME}")

    # Create some local directories for pg_regress to run in.
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -17,7 +17,6 @@ from fixtures.neon_fixtures import (
    PgBin,
    StorageControllerApiException,
    TokenScope,
-    last_flush_lsn_upload,
 )
 from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.utils import (
@@ -1598,8 +1597,6 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder):

    # Perform a graceful rolling restart
    for ps in env.pageservers:
-        env.storage_controller.warm_up_all_secondaries()
-
        env.storage_controller.retryable_node_operation(
            lambda ps_id: env.storage_controller.node_drain(ps_id), ps.id, max_attempts=3, backoff=2
        )
@@ -1648,115 +1645,6 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder):
    assert_shard_counts_balanced(env, shard_counts, total_shards)


-def test_skip_drain_on_secondary_lag(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
-    """
-    Artificially make a tenant shard's secondary location lag behind the primary
-    and check that storage controller driven node drains skip the lagging tenant shard.
-    Finally, validate that the tenant shard is migrated when a new drain request comes
-    in and it's no longer lagging.
-    """
-    neon_env_builder.num_pageservers = 2
-    neon_env_builder.storage_controller_config = {
-        "max_secondary_lag_bytes": 1 * 1024 * 1024,
-    }
-
-    env = neon_env_builder.init_configs()
-    env.start()
-
-    tid, timeline_id = env.neon_cli.create_tenant(placement_policy='{"Attached":1}')
-
-    # Give things a chance to settle.
-    env.storage_controller.reconcile_until_idle(timeout_secs=30)
-
-    locations = env.storage_controller.locate(tid)
-    assert len(locations) == 1
-    primary: int = locations[0]["node_id"]
-    not_primary = [ps.id for ps in env.pageservers if ps.id != primary]
-    assert len(not_primary) == 1
-    secondary = not_primary[0]
-
-    log.info(f"Paused secondary downloads on {secondary}")
-    env.get_pageserver(secondary).http_client().configure_failpoints(
-        ("secondary-layer-download-pausable", "pause")
-    )
-
-    log.info(f"Ingesting some data for {tid}")
-
-    with env.endpoints.create_start("main", tenant_id=tid) as endpoint:
-        run_pg_bench_small(pg_bin, endpoint.connstr())
-        endpoint.safe_psql("CREATE TABLE created_foo(id integer);")
-        last_flush_lsn_upload(env, endpoint, tid, timeline_id)
-
-    log.info(f"Uploading heatmap from {primary} and requesting download from {secondary}")
-
-    env.get_pageserver(primary).http_client().tenant_heatmap_upload(tid)
-    env.get_pageserver(secondary).http_client().tenant_secondary_download(tid, wait_ms=100)
-
-    def secondary_is_lagging():
-        resp = env.get_pageserver(secondary).http_client().tenant_secondary_status(tid)
-        lag = resp["bytes_total"] - resp["bytes_downloaded"]
-
-        if lag <= 1 * 1024 * 1024:
-            raise Exception(f"Secondary lag not big enough: {lag}")
-
-    log.info(f"Looking for lag to develop on the secondary {secondary}")
-    wait_until(10, 1, secondary_is_lagging)
-
-    log.info(f"Starting drain of primary {primary} with laggy secondary {secondary}")
-    env.storage_controller.retryable_node_operation(
-        lambda ps_id: env.storage_controller.node_drain(ps_id), primary, max_attempts=3, backoff=2
-    )
-
-    env.storage_controller.poll_node_status(
-        primary,
-        PageserverAvailability.ACTIVE,
-        PageserverSchedulingPolicy.PAUSE_FOR_RESTART,
-        max_attempts=6,
-        backoff=5,
-    )
-
-    locations = env.storage_controller.locate(tid)
-    assert len(locations) == 1
-    assert locations[0]["node_id"] == primary
-
-    log.info(f"Unpausing secondary downloads on {secondary}")
-    env.get_pageserver(secondary).http_client().configure_failpoints(
-        ("secondary-layer-download-pausable", "off")
-    )
-    env.get_pageserver(secondary).http_client().tenant_secondary_download(tid, wait_ms=100)
-
-    log.info(f"Waiting for lag to reduce on {secondary}")
-
-    def lag_is_acceptable():
-        resp = env.get_pageserver(secondary).http_client().tenant_secondary_status(tid)
-        lag = resp["bytes_total"] - resp["bytes_downloaded"]
-
-        if lag > 1 * 1024 * 1024:
-            raise Exception(f"Secondary lag not big enough: {lag}")
-
-    wait_until(10, 1, lag_is_acceptable)
-
-    env.storage_controller.node_configure(primary, {"scheduling": "Active"})
-
-    log.info(f"Starting drain of primary {primary} with non-laggy secondary {secondary}")
-
-    env.storage_controller.retryable_node_operation(
-        lambda ps_id: env.storage_controller.node_drain(ps_id), primary, max_attempts=3, backoff=2
-    )
-
-    env.storage_controller.poll_node_status(
-        primary,
-        PageserverAvailability.ACTIVE,
-        PageserverSchedulingPolicy.PAUSE_FOR_RESTART,
-        max_attempts=6,
-        backoff=5,
-    )
-
-    locations = env.storage_controller.locate(tid)
-    assert len(locations) == 1
-    assert locations[0]["node_id"] == secondary
-
-
 def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder):
    neon_env_builder.num_pageservers = 2
    env = neon_env_builder.init_configs()
@@ -1783,7 +1671,6 @@ def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder):

    ps_id_to_drain = env.pageservers[0].id

-    env.storage_controller.warm_up_all_secondaries()
    env.storage_controller.retryable_node_operation(
        lambda ps_id: env.storage_controller.node_drain(ps_id),
        ps_id_to_drain,
--- a/test_runner/regress/test_subscriber_restart.py
+++ b/test_runner/regress/test_subscriber_restart.py
@@ -37,9 +37,7 @@ def test_subscriber_restart(neon_simple_env: NeonEnv):
            scur.execute("CREATE TABLE t (pk integer primary key, sk integer)")
            # scur.execute("CREATE INDEX on t(sk)") # slowdown applying WAL at replica
            pub_conn = f"host=localhost port={pub.pg_port} dbname=postgres user=cloud_admin"
-            # synchronous_commit=on to test a hypothesis for why this test has been flaky.
-            # XXX: Add link to the issue
-            query = f"CREATE SUBSCRIPTION sub CONNECTION '{pub_conn}' PUBLICATION pub with (synchronous_commit=on)"
+            query = f"CREATE SUBSCRIPTION sub CONNECTION '{pub_conn}' PUBLICATION pub"
            scur.execute(query)
            time.sleep(2)  # let initial table sync complete

--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -128,8 +128,6 @@ def test_tenant_delete_smoke(
    assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1
    assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "inprogress"}) == 0

-    env.pageserver.stop()
-

 def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonEnvBuilder):
    """Reproduction of 2023-11-23 stuck tenants investigation"""
@@ -202,10 +200,11 @@ def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonE
        if deletion is not None:
            deletion.join()

-    env.pageserver.stop()

-
-def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder):
+def test_tenant_delete_races_timeline_creation(
+    neon_env_builder: NeonEnvBuilder,
+    pg_bin: PgBin,
+):
    """
    Validate that timeline creation executed in parallel with deletion works correctly.

@@ -319,8 +318,6 @@ def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder)
    # We deleted our only tenant, and the scrubber fails if it detects nothing
    neon_env_builder.disable_scrub_on_exit()

-    env.pageserver.stop()
-

 def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder):
    """
--- a/vm-image-spec.yaml
+++ b/vm-image-spec.yaml
@@ -416,7 +416,7 @@ build: |
  # libcgroup) that doesn't support cgroup v2 (version 0.41-11). Unfortunately, the vm-monitor
  # requires cgroup v2, so we'll build cgroup-tools ourselves.
  FROM debian:bullseye-slim as libcgroup-builder
-  ENV LIBCGROUP_VERSION=v2.0.3
+  ENV LIBCGROUP_VERSION v2.0.3

  RUN set -exu \
      && apt update \
@@ -460,7 +460,7 @@ build: |
          pkg-config

  # Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc)
-  ENV PGBOUNCER_TAG=pgbouncer_1_22_1
+  ENV PGBOUNCER_TAG pgbouncer_1_22_1
  RUN set -e \
      && git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/pgbouncer/pgbouncer.git pgbouncer \
      && cd pgbouncer \