Merge remote-tracking branch 'origin/main' into problame/integrate-tokio-epoll-uring/benchmarking/2024-01-31-prs/async-walredo

2026-01-16 09:52:54 +00:00 · 2024-04-03 21:11:03 +02:00
parent 67a7abc7cf b30b15e7cb
commit c77ce7cdc3
182 changed files with 7330 additions and 2410 deletions
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -147,15 +147,16 @@ jobs:
            "neonvm-captest-new"
          ],
          "db_size": [ "10gb" ],
-          "include": [{ "platform": "neon-captest-freetier",   "db_size": "3gb"  },
-                      { "platform": "neon-captest-new",        "db_size": "50gb" },
-                      { "platform": "neonvm-captest-freetier", "db_size": "3gb"  },
-                      { "platform": "neonvm-captest-new",      "db_size": "50gb" }]
+          "include": [{ "platform": "neon-captest-freetier",         "db_size": "3gb"  },
+                      { "platform": "neon-captest-new",              "db_size": "50gb" },
+                      { "platform": "neonvm-captest-freetier",       "db_size": "3gb"  },
+                      { "platform": "neonvm-captest-new",            "db_size": "50gb" },
+                      { "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
        }'

        if [ "$(date +%A)" = "Saturday" ]; then
          matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
-                                                   { "platform": "rds-aurora",   "db_size": "50gb"}]')
+                                                     { "platform": "rds-aurora",   "db_size": "50gb"}]')
        fi

        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -171,7 +172,7 @@ jobs:

        if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
          matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" },
-                                                   { "platform": "rds-aurora"   }]')
+                                                     { "platform": "rds-aurora"   }]')
        fi

        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -190,7 +191,7 @@ jobs:

        if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
          matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" },
-                                                    { "platform": "rds-aurora",   "scale": "10" }]')
+                                                     { "platform": "rds-aurora",   "scale": "10" }]')
        fi

        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -253,6 +254,9 @@ jobs:
          neon-captest-reuse)
            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
            ;;
+          neonvm-captest-sharding-reuse)
+            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }}
+            ;;
          neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier)
            CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
            ;;
@@ -270,11 +274,15 @@ jobs:

        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-        QUERY="SELECT version();"
+        QUERIES=("SELECT version()")
        if [[ "${PLATFORM}" = "neon"* ]]; then
-          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
+          QUERIES+=("SHOW neon.tenant_id")
+          QUERIES+=("SHOW neon.timeline_id")
        fi
-        psql ${CONNSTR} -c "${QUERY}"
+
+        for q in "${QUERIES[@]}"; do
+          psql ${CONNSTR} -c "${q}"
+        done

    - name: Benchmark init
      uses: ./.github/actions/run-python-test-set
@@ -401,11 +409,15 @@ jobs:

        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-        QUERY="SELECT version();"
+        QUERIES=("SELECT version()")
        if [[ "${PLATFORM}" = "neon"* ]]; then
-          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
+          QUERIES+=("SHOW neon.tenant_id")
+          QUERIES+=("SHOW neon.timeline_id")
        fi
-        psql ${CONNSTR} -c "${QUERY}"
+
+        for q in "${QUERIES[@]}"; do
+          psql ${CONNSTR} -c "${q}"
+        done

    - name: ClickBench benchmark
      uses: ./.github/actions/run-python-test-set
@@ -507,11 +519,15 @@ jobs:

        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-        QUERY="SELECT version();"
+        QUERIES=("SELECT version()")
        if [[ "${PLATFORM}" = "neon"* ]]; then
-          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
+          QUERIES+=("SHOW neon.tenant_id")
+          QUERIES+=("SHOW neon.timeline_id")
        fi
-        psql ${CONNSTR} -c "${QUERY}"
+
+        for q in "${QUERIES[@]}"; do
+          psql ${CONNSTR} -c "${q}"
+        done

    - name: Run TPC-H benchmark
      uses: ./.github/actions/run-python-test-set
@@ -597,11 +613,15 @@ jobs:

        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-        QUERY="SELECT version();"
+        QUERIES=("SELECT version()")
        if [[ "${PLATFORM}" = "neon"* ]]; then
-          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
+          QUERIES+=("SHOW neon.tenant_id")
+          QUERIES+=("SHOW neon.timeline_id")
        fi
-        psql ${CONNSTR} -c "${QUERY}"
+
+        for q in "${QUERIES[@]}"; do
+          psql ${CONNSTR} -c "${q}"
+        done

    - name: Run user examples
      uses: ./.github/actions/run-python-test-set
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -1121,18 +1121,36 @@ jobs:
        run: |
          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
-
-            # TODO: move deployPreprodRegion to release (`"$GITHUB_REF_NAME" == "release"` block), once Staging support different compute tag prefixes for different regions
-            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true
          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
+            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
+              -f deployPgSniRouter=false \
+              -f deployProxy=false \
+              -f deployStorage=true \
+              -f deployStorageBroker=true \
+              -f deployStorageController=true \
+              -f branch=main \
+              -f dockerTag=${{needs.tag.outputs.build-tag}} \
+              -f deployPreprodRegion=true
+
            gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
              -f deployPgSniRouter=false \
              -f deployProxy=false \
              -f deployStorage=true \
              -f deployStorageBroker=true \
+              -f deployStorageController=true \
              -f branch=main \
              -f dockerTag=${{needs.tag.outputs.build-tag}}
          elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
+            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
+              -f deployPgSniRouter=true \
+              -f deployProxy=true \
+              -f deployStorage=false \
+              -f deployStorageBroker=false \
+              -f deployStorageController=false \
+              -f branch=main \
+              -f dockerTag=${{needs.tag.outputs.build-tag}} \
+              -f deployPreprodRegion=true
+
            gh workflow --repo neondatabase/aws run deploy-proxy-prod.yml --ref main \
              -f deployPgSniRouter=true \
              -f deployProxy=true \
--- a/.github/workflows/trigger-e2e-tests.yml
+++ b/.github/workflows/trigger-e2e-tests.yml
@@ -62,14 +62,14 @@ jobs:

  trigger-e2e-tests:
    needs: [ tag ]
-    runs-on: [ self-hosted, gen3, small ]
+    runs-on: ubuntu-latest
    env:
      TAG: ${{ needs.tag.outputs.build-tag }}
-    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
-      options: --init
    steps:
      - name: check if ecr image are present
+        env:
+          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
        run: |
          for REPO in neon compute-tools compute-node-v14 vm-compute-node-v14 compute-node-v15 vm-compute-node-v15 compute-node-v16 vm-compute-node-v16; do
            OUTPUT=$(aws ecr describe-images --repository-name ${REPO} --region eu-central-1 --query "imageDetails[?imageTags[?contains(@, '${TAG}')]]" --output text)
@@ -79,41 +79,55 @@ jobs:
            fi
          done

-      - name: Set PR's status to pending and request a remote CI test
+      - name: Set e2e-platforms
+        id: e2e-platforms
+        env:
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
-          # For pull requests, GH Actions set "github.sha" variable to point at a fake merge commit
-          # but we need to use a real sha of a latest commit in the PR's branch for the e2e job,
-          # to place a job run status update later.
-          COMMIT_SHA=${{ github.event.pull_request.head.sha }}
-          # For non-PR kinds of runs, the above will produce an empty variable, pick the original sha value for those
-          COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
+          # Default set of platforms to run e2e tests on
+          platforms='["docker", "k8s"]'

-          REMOTE_REPO="${{ github.repository_owner }}/cloud"
+          # If the PR changes vendor/, pgxn/ or libs/vm_monitor/ directories, or Dockerfile.compute-node, add k8s-neonvm to the list of platforms.
+          # If the workflow run is not a pull request, add k8s-neonvm to the list.
+          if [ "$GITHUB_EVENT_NAME" == "pull_request" ]; then
+            for f in $(gh api "/repos/${GITHUB_REPOSITORY}/pulls/${PR_NUMBER}/files" --paginate --jq '.[].filename'); do
+              case "$f" in
+                vendor/*|pgxn/*|libs/vm_monitor/*|Dockerfile.compute-node)
+                  platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
+                  ;;
+                *)
+                  # no-op
+                  ;;
+              esac
+            done
+          else
+            platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
+          fi

-          curl -f -X POST \
-          https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
-          -H "Accept: application/vnd.github.v3+json" \
-          --user "${{ secrets.CI_ACCESS_TOKEN }}" \
-          --data \
-            "{
-              \"state\": \"pending\",
-              \"context\": \"neon-cloud-e2e\",
-              \"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
-            }"
+          echo "e2e-platforms=${platforms}" | tee -a $GITHUB_OUTPUT

-          curl -f -X POST \
-          https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \
-          -H "Accept: application/vnd.github.v3+json" \
-          --user "${{ secrets.CI_ACCESS_TOKEN }}" \
-          --data \
-            "{
-              \"ref\": \"main\",
-              \"inputs\": {
-                \"ci_job_name\": \"neon-cloud-e2e\",
-                \"commit_hash\": \"$COMMIT_SHA\",
-                \"remote_repo\": \"${{ github.repository }}\",
-                \"storage_image_tag\": \"${TAG}\",
-                \"compute_image_tag\": \"${TAG}\",
-                \"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\"
-              }
-            }"
+      - name: Set PR's status to pending and request a remote CI test
+        env:
+          E2E_PLATFORMS: ${{ steps.e2e-platforms.outputs.e2e-platforms }}
+          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
+          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
+        run: |
+          REMOTE_REPO="${GITHUB_REPOSITORY_OWNER}/cloud"
+
+          gh api "/repos/${GITHUB_REPOSITORY}/statuses/${COMMIT_SHA}" \
+            --method POST \
+            --raw-field "state=pending" \
+            --raw-field "description=[$REMOTE_REPO] Remote CI job is about to start" \
+            --raw-field "context=neon-cloud-e2e"
+
+          gh workflow --repo ${REMOTE_REPO} \
+            run testing.yml \
+              --ref "main" \
+              --raw-field "ci_job_name=neon-cloud-e2e" \
+              --raw-field "commit_hash=$COMMIT_SHA" \
+              --raw-field "remote_repo=${GITHUB_REPOSITORY}" \
+              --raw-field "storage_image_tag=${TAG}" \
+              --raw-field "compute_image_tag=${TAG}" \
+              --raw-field "concurrency_group=${E2E_CONCURRENCY_GROUP}" \
+              --raw-field "e2e-platforms=${E2E_PLATFORMS}"
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -276,7 +276,6 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "aws-config",
- "aws-sdk-secretsmanager",
 "bytes",
 "camino",
 "clap",
@@ -289,6 +288,7 @@ dependencies = [
 "hex",
 "humantime",
 "hyper",
+ "itertools",
 "lasso",
 "measured",
 "metrics",
@@ -347,9 +347,9 @@ dependencies = [

 [[package]]
 name = "aws-credential-types"
-version = "1.1.4"
+version = "1.1.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "33cc49dcdd31c8b6e79850a179af4c367669150c7ac0135f176c61bec81a70f7"
+checksum = "fa8587ae17c8e967e4b05a62d495be2fb7701bec52a97f7acfe8a29f938384c8"
 dependencies = [
 "aws-smithy-async",
 "aws-smithy-runtime-api",
@@ -359,9 +359,9 @@ dependencies = [

 [[package]]
 name = "aws-runtime"
-version = "1.1.4"
+version = "1.1.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eb031bff99877c26c28895766f7bb8484a05e24547e370768d6cc9db514662aa"
+checksum = "b13dc54b4b49f8288532334bba8f87386a40571c47c37b1304979b556dc613c8"
 dependencies = [
 "aws-credential-types",
 "aws-sigv4",
@@ -381,6 +381,29 @@ dependencies = [
 "uuid",
 ]

+[[package]]
+name = "aws-sdk-iam"
+version = "1.17.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b8ae76026bfb1b80a6aed0bb400c1139cd9c0563e26bce1986cd021c6a968c7b"
+dependencies = [
+ "aws-credential-types",
+ "aws-runtime",
+ "aws-smithy-async",
+ "aws-smithy-http",
+ "aws-smithy-json",
+ "aws-smithy-query",
+ "aws-smithy-runtime",
+ "aws-smithy-runtime-api",
+ "aws-smithy-types",
+ "aws-smithy-xml",
+ "aws-types",
+ "http 0.2.9",
+ "once_cell",
+ "regex-lite",
+ "tracing",
+]
+
 [[package]]
 name = "aws-sdk-s3"
 version = "1.14.0"
@@ -410,29 +433,6 @@ dependencies = [
 "url",
 ]

-[[package]]
-name = "aws-sdk-secretsmanager"
-version = "1.14.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0a0b64e61e7d632d9df90a2e0f32630c68c24960cab1d27d848718180af883d3"
-dependencies = [
- "aws-credential-types",
- "aws-runtime",
- "aws-smithy-async",
- "aws-smithy-http",
- "aws-smithy-json",
- "aws-smithy-runtime",
- "aws-smithy-runtime-api",
- "aws-smithy-types",
- "aws-types",
- "bytes",
- "fastrand 2.0.0",
- "http 0.2.9",
- "once_cell",
- "regex-lite",
- "tracing",
-]
-
 [[package]]
 name = "aws-sdk-sso"
 version = "1.12.0"
@@ -502,9 +502,9 @@ dependencies = [

 [[package]]
 name = "aws-sigv4"
-version = "1.1.4"
+version = "1.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c371c6b0ac54d4605eb6f016624fb5c7c2925d315fdf600ac1bf21b19d5f1742"
+checksum = "11d6f29688a4be9895c0ba8bef861ad0c0dac5c15e9618b9b7a6c233990fc263"
 dependencies = [
 "aws-credential-types",
 "aws-smithy-eventstream",
@@ -517,7 +517,7 @@ dependencies = [
 "hex",
 "hmac",
 "http 0.2.9",
- "http 1.0.0",
+ "http 1.1.0",
 "once_cell",
 "p256",
 "percent-encoding",
@@ -531,9 +531,9 @@ dependencies = [

 [[package]]
 name = "aws-smithy-async"
-version = "1.1.4"
+version = "1.1.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "72ee2d09cce0ef3ae526679b522835d63e75fb427aca5413cd371e490d52dcc6"
+checksum = "d26ea8fa03025b2face2b3038a63525a10891e3d8829901d502e5384a0d8cd46"
 dependencies = [
 "futures-util",
 "pin-project-lite",
@@ -574,9 +574,9 @@ dependencies = [

 [[package]]
 name = "aws-smithy-http"
-version = "0.60.4"
+version = "0.60.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dab56aea3cd9e1101a0a999447fb346afb680ab1406cebc44b32346e25b4117d"
+checksum = "3f10fa66956f01540051b0aa7ad54574640f748f9839e843442d99b970d3aff9"
 dependencies = [
 "aws-smithy-eventstream",
 "aws-smithy-runtime-api",
@@ -595,18 +595,18 @@ dependencies = [

 [[package]]
 name = "aws-smithy-json"
-version = "0.60.4"
+version = "0.60.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fd3898ca6518f9215f62678870064398f00031912390efd03f1f6ef56d83aa8e"
+checksum = "4683df9469ef09468dad3473d129960119a0d3593617542b7d52086c8486f2d6"
 dependencies = [
 "aws-smithy-types",
 ]

 [[package]]
 name = "aws-smithy-query"
-version = "0.60.4"
+version = "0.60.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bda4b1dfc9810e35fba8a620e900522cd1bd4f9578c446e82f49d1ce41d2e9f9"
+checksum = "f2fbd61ceb3fe8a1cb7352e42689cec5335833cd9f94103a61e98f9bb61c64bb"
 dependencies = [
 "aws-smithy-types",
 "urlencoding",
@@ -614,9 +614,9 @@ dependencies = [

 [[package]]
 name = "aws-smithy-runtime"
-version = "1.1.4"
+version = "1.1.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fafdab38f40ad7816e7da5dec279400dd505160780083759f01441af1bbb10ea"
+checksum = "ec81002d883e5a7fd2bb063d6fb51c4999eb55d404f4fff3dd878bf4733b9f01"
 dependencies = [
 "aws-smithy-async",
 "aws-smithy-http",
@@ -639,14 +639,15 @@ dependencies = [

 [[package]]
 name = "aws-smithy-runtime-api"
-version = "1.1.4"
+version = "1.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c18276dd28852f34b3bf501f4f3719781f4999a51c7bff1a5c6dc8c4529adc29"
+checksum = "9acb931e0adaf5132de878f1398d83f8677f90ba70f01f65ff87f6d7244be1c5"
 dependencies = [
 "aws-smithy-async",
 "aws-smithy-types",
 "bytes",
 "http 0.2.9",
+ "http 1.1.0",
 "pin-project-lite",
 "tokio",
 "tracing",
@@ -655,9 +656,9 @@ dependencies = [

 [[package]]
 name = "aws-smithy-types"
-version = "1.1.4"
+version = "1.1.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bb3e134004170d3303718baa2a4eb4ca64ee0a1c0a7041dca31b38be0fb414f3"
+checksum = "abe14dceea1e70101d38fbf2a99e6a34159477c0fb95e68e05c66bd7ae4c3729"
 dependencies = [
 "base64-simd",
 "bytes",
@@ -678,18 +679,18 @@ dependencies = [

 [[package]]
 name = "aws-smithy-xml"
-version = "0.60.4"
+version = "0.60.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8604a11b25e9ecaf32f9aa56b9fe253c5e2f606a3477f0071e96d3155a5ed218"
+checksum = "872c68cf019c0e4afc5de7753c4f7288ce4b71663212771bf5e4542eb9346ca9"
 dependencies = [
 "xmlparser",
 ]

 [[package]]
 name = "aws-types"
-version = "1.1.4"
+version = "1.1.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "789bbe008e65636fe1b6dbbb374c40c8960d1232b96af5ff4aec349f9c4accf4"
+checksum = "0dbf2f3da841a8930f159163175cf6a3d16ddde517c1b0fba7aa776822800f40"
 dependencies = [
 "aws-credential-types",
 "aws-smithy-async",
@@ -2396,9 +2397,9 @@ dependencies = [

 [[package]]
 name = "http"
-version = "1.0.0"
+version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b32afd38673a8016f7c9ae69e5af41a58f81b1d31689040f2f1959594ce194ea"
+checksum = "21b9ddb458710bc376481b842f5da65cdf31522de232c1ca8146abce2a358258"
 dependencies = [
 "bytes",
 "fnv",
@@ -2498,7 +2499,7 @@ dependencies = [
 "hyper",
 "log",
 "rustls 0.21.9",
- "rustls-native-certs",
+ "rustls-native-certs 0.6.2",
 "tokio",
 "tokio-rustls 0.24.0",
 ]
@@ -3581,6 +3582,7 @@ dependencies = [
 "strum_macros",
 "svg_fmt",
 "sync_wrapper",
+ "sysinfo",
 "tenant_size_model",
 "thiserror",
 "tokio",
@@ -4198,7 +4200,12 @@ name = "proxy"
 version = "0.1.0"
 dependencies = [
 "anyhow",
+ "async-compression",
 "async-trait",
+ "aws-config",
+ "aws-sdk-iam",
+ "aws-sigv4",
+ "aws-types",
 "base64 0.13.1",
 "bstr",
 "bytes",
@@ -4209,6 +4216,7 @@ dependencies = [
 "consumption_metrics",
 "dashmap",
 "env_logger",
+ "fallible-iterator",
 "futures",
 "git-version",
 "hashbrown 0.13.2",
@@ -4216,6 +4224,7 @@ dependencies = [
 "hex",
 "hmac",
 "hostname",
+ "http 1.1.0",
 "humantime",
 "hyper",
 "hyper-tungstenite",
@@ -4431,9 +4440,9 @@ dependencies = [

 [[package]]
 name = "redis"
-version = "0.24.0"
+version = "0.25.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c580d9cbbe1d1b479e8d67cf9daf6a62c957e6846048408b80b43ac3f6af84cd"
+checksum = "71d64e978fd98a0e6b105d066ba4889a7301fca65aeac850a877d8797343feeb"
 dependencies = [
 "async-trait",
 "bytes",
@@ -4442,15 +4451,15 @@ dependencies = [
 "itoa",
 "percent-encoding",
 "pin-project-lite",
- "rustls 0.21.9",
- "rustls-native-certs",
- "rustls-pemfile 1.0.2",
- "rustls-webpki 0.101.7",
+ "rustls 0.22.2",
+ "rustls-native-certs 0.7.0",
+ "rustls-pemfile 2.1.1",
+ "rustls-pki-types",
 "ryu",
 "sha1_smol",
- "socket2 0.4.9",
+ "socket2 0.5.5",
 "tokio",
- "tokio-rustls 0.24.0",
+ "tokio-rustls 0.25.0",
 "tokio-util",
 "url",
 ]
@@ -4879,6 +4888,19 @@ dependencies = [
 "security-framework",
 ]

+[[package]]
+name = "rustls-native-certs"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8f1fb85efa936c42c6d5fc28d2629bb51e4b2f4b8a5211e297d599cc5a093792"
+dependencies = [
+ "openssl-probe",
+ "rustls-pemfile 2.1.1",
+ "rustls-pki-types",
+ "schannel",
+ "security-framework",
+]
+
 [[package]]
 name = "rustls-pemfile"
 version = "1.0.2"
@@ -5601,6 +5623,26 @@ dependencies = [
 "workspace_hack",
 ]

+[[package]]
+name = "storcon_cli"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "clap",
+ "comfy-table",
+ "hyper",
+ "pageserver_api",
+ "pageserver_client",
+ "reqwest",
+ "serde",
+ "serde_json",
+ "thiserror",
+ "tokio",
+ "tracing",
+ "utils",
+ "workspace_hack",
+]
+
 [[package]]
 name = "stringprep"
 version = "0.1.2"
@@ -5914,9 +5956,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"

 [[package]]
 name = "tokio"
-version = "1.36.0"
+version = "1.37.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931"
+checksum = "1adbebffeca75fcfd058afa480fb6c0b81e165a0323f9c9d39c9697e37c46787"
 dependencies = [
 "backtrace",
 "bytes",
@@ -6159,7 +6201,7 @@ dependencies = [
 "percent-encoding",
 "pin-project",
 "prost",
- "rustls-native-certs",
+ "rustls-native-certs 0.6.2",
 "rustls-pemfile 1.0.2",
 "tokio",
 "tokio-rustls 0.24.0",
@@ -7045,7 +7087,6 @@ dependencies = [
 "aws-sigv4",
 "aws-smithy-async",
 "aws-smithy-http",
- "aws-smithy-runtime-api",
 "aws-smithy-types",
 "axum",
 "base64 0.21.1",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -4,6 +4,7 @@ members = [
    "compute_tools",
    "control_plane",
    "control_plane/attachment_service",
+    "control_plane/storcon_cli",
    "pageserver",
    "pageserver/compaction",
    "pageserver/ctl",
@@ -52,10 +53,12 @@ async-stream = "0.3"
 async-trait = "0.1"
 aws-config = { version = "1.1.4", default-features = false, features=["rustls"] }
 aws-sdk-s3 = "1.14"
-aws-sdk-secretsmanager = { version = "1.14.0" }
+aws-sdk-iam = "1.15.0"
 aws-smithy-async = { version = "1.1.4", default-features = false, features=["rt-tokio"] }
 aws-smithy-types = "1.1.4"
 aws-credential-types = "1.1.4"
+aws-sigv4 = { version = "1.2.0", features = ["sign-http"] }
+aws-types = "1.1.7"
 axum = { version = "0.6.20", features = ["ws"] }
 base64 = "0.13.0"
 bincode = "1.3"
@@ -76,6 +79,7 @@ either = "1.8"
 enum-map = "2.4.2"
 enumset = "1.0.12"
 fail = "0.5.0"
+fallible-iterator = "0.2"
 fs2 = "0.4.3"
 futures = "0.3"
 futures-core = "0.3"
@@ -88,6 +92,7 @@ hex = "0.4"
 hex-literal = "0.4"
 hmac = "0.12.1"
 hostname = "0.3.1"
+http = {version = "1.1.0", features = ["std"]}
 http-types = { version = "2", default-features = false }
 humantime = "2.1"
 humantime-serde = "1.1.1"
@@ -121,7 +126,7 @@ procfs = "0.14"
 prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
 prost = "0.11"
 rand = "0.8"
-redis = { version = "0.24.0", features = ["tokio-rustls-comp", "keep-alive"] }
+redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] }
 regex = "1.10.2"
 reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
 reqwest-tracing = { version = "0.4.7", features = ["opentelemetry_0_20"] }
--- a/Dockerfile.build-tools
+++ b/Dockerfile.build-tools
@@ -135,7 +135,7 @@ WORKDIR /home/nonroot

 # Rust
 # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.76.0
+ENV RUSTC_VERSION=1.77.0
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
@@ -149,7 +149,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
    cargo install --git https://github.com/paritytech/cachepot && \
    cargo install rustfilt && \
    cargo install cargo-hakari && \
-    cargo install cargo-deny && \
+    cargo install cargo-deny --locked && \
    cargo install cargo-hack && \
    cargo install cargo-nextest && \
    rm -rf /home/nonroot/.cargo/registry && \
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -944,6 +944,9 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
 COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local
 COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl

+# Create remote extension download directory
+RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/local/download_extensions
+
 # Install:
 # libreadline8 for psql
 # libicu67, locales for collations (including ICU and plpgsql_check)
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1262,10 +1262,12 @@ LIMIT 100",
        .await
        .map_err(DownloadError::Other);

-        self.ext_download_progress
-            .write()
-            .expect("bad lock")
-            .insert(ext_archive_name.to_string(), (download_start, true));
+        if download_size.is_ok() {
+            self.ext_download_progress
+                .write()
+                .expect("bad lock")
+                .insert(ext_archive_name.to_string(), (download_start, true));
+        }

        download_size
    }
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -17,6 +17,7 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
        .write(true)
        .create(true)
        .append(false)
+        .truncate(false)
        .open(path)?;
    let buf = io::BufReader::new(&file);
    let mut count: usize = 0;
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -302,9 +302,9 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
            RoleAction::Create => {
                // This branch only runs when roles are created through the console, so it is
                // safe to add more permissions here. BYPASSRLS and REPLICATION are inherited
-                // from neon_superuser. (NOTE: REPLICATION has been removed from here for now).
+                // from neon_superuser.
                let mut query: String = format!(
-                    "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser",
+                    "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
                    name.pg_quote()
                );
                info!("running role create query: '{}'", &query);
@@ -743,21 +743,24 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
    // which may happen in two cases:
    // - extension was just installed
    // - extension was already installed and is up to date
-    // DISABLED due to compute node unpinning epic
-    // let query = "ALTER EXTENSION neon UPDATE";
-    // info!("update neon extension version with query: {}", query);
-    // client.simple_query(query)?;
+    let query = "ALTER EXTENSION neon UPDATE";
+    info!("update neon extension version with query: {}", query);
+    if let Err(e) = client.simple_query(query) {
+        error!(
+            "failed to upgrade neon extension during `handle_extension_neon`: {}",
+            e
+        );
+    }

    Ok(())
 }

 #[instrument(skip_all)]
-pub fn handle_neon_extension_upgrade(_client: &mut Client) -> Result<()> {
-    info!("handle neon extension upgrade (not really)");
-    // DISABLED due to compute node unpinning epic
-    // let query = "ALTER EXTENSION neon UPDATE";
-    // info!("update neon extension version with query: {}", query);
-    // client.simple_query(query)?;
+pub fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> {
+    info!("handle neon extension upgrade");
+    let query = "ALTER EXTENSION neon UPDATE";
+    info!("update neon extension version with query: {}", query);
+    client.simple_query(query)?;

    Ok(())
 }
@@ -806,19 +809,8 @@ $$;"#,
        "",
        "",
        "",
+        "",
        // Add new migrations below.
-        r#"
-DO $$
-DECLARE
-    role_name TEXT;
-BEGIN
-    FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE
-    LOOP
-        RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name);
-        EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION';
-    END LOOP;
-END
-$$;"#,
    ];

    let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
--- a/control_plane/attachment_service/Cargo.toml
+++ b/control_plane/attachment_service/Cargo.toml
@@ -16,7 +16,6 @@ testing = []
 [dependencies]
 anyhow.workspace = true
 aws-config.workspace = true
-aws-sdk-secretsmanager.workspace = true
 bytes.workspace = true
 camino.workspace = true
 clap.workspace = true
@@ -26,6 +25,7 @@ git-version.workspace = true
 hex.workspace = true
 hyper.workspace = true
 humantime.workspace = true
+itertools.workspace = true
 lasso.workspace = true
 once_cell.workspace = true
 pageserver_api.workspace = true
--- a/control_plane/attachment_service/migrations/2024-03-27-133204_tenant_policies/down.sql
+++ b/control_plane/attachment_service/migrations/2024-03-27-133204_tenant_policies/down.sql
@@ -0,0 +1,3 @@
+-- This file should undo anything in `up.sql`
+
+ALTER TABLE tenant_shards drop scheduling_policy;
--- a/control_plane/attachment_service/migrations/2024-03-27-133204_tenant_policies/up.sql
+++ b/control_plane/attachment_service/migrations/2024-03-27-133204_tenant_policies/up.sql
@@ -0,0 +1,2 @@
+
+ALTER TABLE tenant_shards add scheduling_policy VARCHAR NOT NULL DEFAULT '"Active"';
--- a/control_plane/attachment_service/src/compute_hook.rs
+++ b/control_plane/attachment_service/src/compute_hook.rs
@@ -14,7 +14,6 @@ use utils::{

 use crate::service::Config;

-const BUSY_DELAY: Duration = Duration::from_secs(1);
 const SLOWDOWN_DELAY: Duration = Duration::from_secs(5);

 pub(crate) const API_CONCURRENCY: usize = 32;
@@ -280,11 +279,10 @@ impl ComputeHook {
                Err(NotifyError::SlowDown)
            }
            StatusCode::LOCKED => {
-                // Delay our retry if busy: the usual fast exponential backoff in backoff::retry
-                // is not appropriate
-                tokio::time::timeout(BUSY_DELAY, cancel.cancelled())
-                    .await
-                    .ok();
+                // We consider this fatal, because it's possible that the operation blocking the control one is
+                // also the one that is waiting for this reconcile.  We should let the reconciler calling
+                // this hook fail, to give control plane a chance to un-lock.
+                tracing::info!("Control plane reports tenant is locked, dropping out of notify");
                Err(NotifyError::Busy)
            }
            StatusCode::SERVICE_UNAVAILABLE
@@ -306,7 +304,12 @@ impl ComputeHook {
        let client = reqwest::Client::new();
        backoff::retry(
            || self.do_notify_iteration(&client, url, &reconfigure_request, cancel),
-            |e| matches!(e, NotifyError::Fatal(_) | NotifyError::Unexpected(_)),
+            |e| {
+                matches!(
+                    e,
+                    NotifyError::Fatal(_) | NotifyError::Unexpected(_) | NotifyError::Busy
+                )
+            },
            3,
            10,
            "Send compute notification",
--- a/control_plane/attachment_service/src/heartbeater.rs
+++ b/control_plane/attachment_service/src/heartbeater.rs
@@ -139,7 +139,7 @@ impl HeartbeaterTask {
                        .with_client_retries(
                            |client| async move { client.get_utilization().await },
                            &jwt_token,
-                            2,
+                            3,
                            3,
                            Duration::from_secs(1),
                            &cancel,
--- a/control_plane/attachment_service/src/http.rs
+++ b/control_plane/attachment_service/src/http.rs
@@ -34,7 +34,8 @@ use utils::{
 };

 use pageserver_api::controller_api::{
-    NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantShardMigrateRequest,
+    NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantPolicyRequest,
+    TenantShardMigrateRequest,
 };
 use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest};

@@ -398,6 +399,15 @@ async fn handle_tenant_describe(
    json_response(StatusCode::OK, service.tenant_describe(tenant_id)?)
 }

+async fn handle_tenant_list(
+    service: Arc<Service>,
+    req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
+    json_response(StatusCode::OK, service.tenant_list())
+}
+
 async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;

@@ -411,7 +421,10 @@ async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError
    check_permissions(&req, Scope::Admin)?;

    let state = get_state(&req);
-    json_response(StatusCode::OK, state.service.node_list().await?)
+    let nodes = state.service.node_list().await?;
+    let api_nodes = nodes.into_iter().map(|n| n.describe()).collect::<Vec<_>>();
+
+    json_response(StatusCode::OK, api_nodes)
 }

 async fn handle_node_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -478,6 +491,22 @@ async fn handle_tenant_shard_migrate(
    )
 }

+async fn handle_tenant_update_policy(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    let update_req = json_request::<TenantPolicyRequest>(&mut req).await?;
+    let state = get_state(&req);
+
+    json_response(
+        StatusCode::OK,
+        state
+            .service
+            .tenant_update_policy(tenant_id, update_req)
+            .await?,
+    )
+}
+
 async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
    check_permissions(&req, Scope::PageServerApi)?;
@@ -509,6 +538,14 @@ async fn handle_consistency_check(req: Request<Body>) -> Result<Response<Body>,
    json_response(StatusCode::OK, state.service.consistency_check().await?)
 }

+async fn handle_reconcile_all(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
+    let state = get_state(&req);
+
+    json_response(StatusCode::OK, state.service.reconcile_all_now().await?)
+}
+
 /// Status endpoint is just used for checking that our HTTP listener is up
 async fn handle_status(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
    json_response(StatusCode::OK, ())
@@ -726,6 +763,9 @@ pub fn make_router(
                RequestName("debug_v1_consistency_check"),
            )
        })
+        .post("/debug/v1/reconcile_all", |r| {
+            request_span(r, handle_reconcile_all)
+        })
        .put("/debug/v1/failpoints", |r| {
            request_span(r, |r| failpoints_handler(r, CancellationToken::new()))
        })
@@ -765,6 +805,16 @@ pub fn make_router(
                RequestName("control_v1_tenant_describe"),
            )
        })
+        .get("/control/v1/tenant", |r| {
+            tenant_service_handler(r, handle_tenant_list, RequestName("control_v1_tenant_list"))
+        })
+        .put("/control/v1/tenant/:tenant_id/policy", |r| {
+            named_request_span(
+                r,
+                handle_tenant_update_policy,
+                RequestName("control_v1_tenant_policy"),
+            )
+        })
        // Tenant operations
        // The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
        // this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
--- a/control_plane/attachment_service/src/main.rs
+++ b/control_plane/attachment_service/src/main.rs
@@ -3,7 +3,6 @@ use attachment_service::http::make_router;
 use attachment_service::metrics::preinitialize_metrics;
 use attachment_service::persistence::Persistence;
 use attachment_service::service::{Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT};
-use aws_config::{BehaviorVersion, Region};
 use camino::Utf8PathBuf;
 use clap::Parser;
 use diesel::Connection;
@@ -55,11 +54,31 @@ struct Cli {
    #[arg(long)]
    database_url: Option<String>,

+    /// Flag to enable dev mode, which permits running without auth
+    #[arg(long, default_value = "false")]
+    dev: bool,
+
    /// Grace period before marking unresponsive pageserver offline
    #[arg(long)]
    max_unavailable_interval: Option<humantime::Duration>,
 }

+enum StrictMode {
+    /// In strict mode, we will require that all secrets are loaded, i.e. security features
+    /// may not be implicitly turned off by omitting secrets in the environment.
+    Strict,
+    /// In dev mode, secrets are optional, and omitting a particular secret will implicitly
+    /// disable the auth related to it (e.g. no pageserver jwt key -> send unauthenticated
+    /// requests, no public key -> don't authenticate incoming requests).
+    Dev,
+}
+
+impl Default for StrictMode {
+    fn default() -> Self {
+        Self::Strict
+    }
+}
+
 /// Secrets may either be provided on the command line (for testing), or loaded from AWS SecretManager: this
 /// type encapsulates the logic to decide which and do the loading.
 struct Secrets {
@@ -70,13 +89,6 @@ struct Secrets {
 }

 impl Secrets {
-    const DATABASE_URL_SECRET: &'static str = "rds-neon-storage-controller-url";
-    const PAGESERVER_JWT_TOKEN_SECRET: &'static str =
-        "neon-storage-controller-pageserver-jwt-token";
-    const CONTROL_PLANE_JWT_TOKEN_SECRET: &'static str =
-        "neon-storage-controller-control-plane-jwt-token";
-    const PUBLIC_KEY_SECRET: &'static str = "neon-storage-controller-public-key";
-
    const DATABASE_URL_ENV: &'static str = "DATABASE_URL";
    const PAGESERVER_JWT_TOKEN_ENV: &'static str = "PAGESERVER_JWT_TOKEN";
    const CONTROL_PLANE_JWT_TOKEN_ENV: &'static str = "CONTROL_PLANE_JWT_TOKEN";
@@ -87,111 +99,41 @@ impl Secrets {
    /// - Environment variables if DATABASE_URL is set.
    /// - AWS Secrets Manager secrets
    async fn load(args: &Cli) -> anyhow::Result<Self> {
-        match &args.database_url {
-            Some(url) => Self::load_cli(url, args),
-            None => match std::env::var(Self::DATABASE_URL_ENV) {
-                Ok(database_url) => Self::load_env(database_url),
-                Err(_) => Self::load_aws_sm().await,
-            },
-        }
-    }
-
-    fn load_env(database_url: String) -> anyhow::Result<Self> {
-        let public_key = match std::env::var(Self::PUBLIC_KEY_ENV) {
-            Ok(public_key) => Some(JwtAuth::from_key(public_key).context("Loading public key")?),
-            Err(_) => None,
-        };
-        Ok(Self {
-            database_url,
-            public_key,
-            jwt_token: std::env::var(Self::PAGESERVER_JWT_TOKEN_ENV).ok(),
-            control_plane_jwt_token: std::env::var(Self::CONTROL_PLANE_JWT_TOKEN_ENV).ok(),
-        })
-    }
-
-    async fn load_aws_sm() -> anyhow::Result<Self> {
-        let Ok(region) = std::env::var("AWS_REGION") else {
-            anyhow::bail!("AWS_REGION is not set, cannot load secrets automatically: either set this, or use CLI args to supply secrets");
-        };
-        let config = aws_config::defaults(BehaviorVersion::v2023_11_09())
-            .region(Region::new(region.clone()))
-            .load()
-            .await;
-
-        let asm = aws_sdk_secretsmanager::Client::new(&config);
-
-        let Some(database_url) = asm
-            .get_secret_value()
-            .secret_id(Self::DATABASE_URL_SECRET)
-            .send()
-            .await?
-            .secret_string()
-            .map(str::to_string)
+        let Some(database_url) =
+            Self::load_secret(&args.database_url, Self::DATABASE_URL_ENV).await
        else {
            anyhow::bail!(
-                "Database URL secret not found at {region}/{}",
-                Self::DATABASE_URL_SECRET
+                "Database URL is not set (set `--database-url`, or `DATABASE_URL` environment)"
            )
        };

-        let jwt_token = asm
-            .get_secret_value()
-            .secret_id(Self::PAGESERVER_JWT_TOKEN_SECRET)
-            .send()
-            .await?
-            .secret_string()
-            .map(str::to_string);
-        if jwt_token.is_none() {
-            tracing::warn!("No pageserver JWT token set: this will only work if authentication is disabled on the pageserver");
-        }
-
-        let control_plane_jwt_token = asm
-            .get_secret_value()
-            .secret_id(Self::CONTROL_PLANE_JWT_TOKEN_SECRET)
-            .send()
-            .await?
-            .secret_string()
-            .map(str::to_string);
-        if jwt_token.is_none() {
-            tracing::warn!("No control plane JWT token set: this will only work if authentication is disabled on the pageserver");
-        }
-
-        let public_key = asm
-            .get_secret_value()
-            .secret_id(Self::PUBLIC_KEY_SECRET)
-            .send()
-            .await?
-            .secret_string()
-            .map(str::to_string);
-        let public_key = match public_key {
-            Some(key) => Some(JwtAuth::from_key(key)?),
-            None => {
-                tracing::warn!(
-                    "No public key set: inccoming HTTP requests will not be authenticated"
-                );
-                None
-            }
+        let public_key = match Self::load_secret(&args.public_key, Self::PUBLIC_KEY_ENV).await {
+            Some(v) => Some(JwtAuth::from_key(v).context("Loading public key")?),
+            None => None,
        };

-        Ok(Self {
+        let this = Self {
            database_url,
            public_key,
-            jwt_token,
-            control_plane_jwt_token,
-        })
+            jwt_token: Self::load_secret(&args.jwt_token, Self::PAGESERVER_JWT_TOKEN_ENV).await,
+            control_plane_jwt_token: Self::load_secret(
+                &args.control_plane_jwt_token,
+                Self::CONTROL_PLANE_JWT_TOKEN_ENV,
+            )
+            .await,
+        };
+
+        Ok(this)
    }

-    fn load_cli(database_url: &str, args: &Cli) -> anyhow::Result<Self> {
-        let public_key = match &args.public_key {
-            None => None,
-            Some(key) => Some(JwtAuth::from_key(key.clone()).context("Loading public key")?),
-        };
-        Ok(Self {
-            database_url: database_url.to_owned(),
-            public_key,
-            jwt_token: args.jwt_token.clone(),
-            control_plane_jwt_token: args.control_plane_jwt_token.clone(),
-        })
+    async fn load_secret(cli: &Option<String>, env_name: &str) -> Option<String> {
+        if let Some(v) = cli {
+            Some(v.clone())
+        } else if let Ok(v) = std::env::var(env_name) {
+            Some(v)
+        } else {
+            None
+        }
    }
 }

@@ -247,8 +189,42 @@ async fn async_main() -> anyhow::Result<()> {
        args.listen
    );

+    let strict_mode = if args.dev {
+        StrictMode::Dev
+    } else {
+        StrictMode::Strict
+    };
+
    let secrets = Secrets::load(&args).await?;

+    // Validate required secrets and arguments are provided in strict mode
+    match strict_mode {
+        StrictMode::Strict
+            if (secrets.public_key.is_none()
+                || secrets.jwt_token.is_none()
+                || secrets.control_plane_jwt_token.is_none()) =>
+        {
+            // Production systems should always have secrets configured: if public_key was not set
+            // then we would implicitly disable auth.
+            anyhow::bail!(
+                    "Insecure config!  One or more secrets is not set.  This is only permitted in `--dev` mode"
+                );
+        }
+        StrictMode::Strict if args.compute_hook_url.is_none() => {
+            // Production systems should always have a compute hook set, to prevent falling
+            // back to trying to use neon_local.
+            anyhow::bail!(
+                "`--compute-hook-url` is not set: this is only permitted in `--dev` mode"
+            );
+        }
+        StrictMode::Strict => {
+            tracing::info!("Starting in strict mode: configuration is OK.")
+        }
+        StrictMode::Dev => {
+            tracing::warn!("Starting in dev mode: this may be an insecure configuration.")
+        }
+    }
+
    let config = Config {
        jwt_token: secrets.jwt_token,
        control_plane_jwt_token: secrets.control_plane_jwt_token,
--- a/control_plane/attachment_service/src/metrics.rs
+++ b/control_plane/attachment_service/src/metrics.rs
@@ -37,6 +37,9 @@ pub(crate) struct StorageControllerMetricGroup {
    pub(crate) storage_controller_reconcile_complete:
        measured::CounterVec<ReconcileCompleteLabelGroupSet>,

+    /// Count of how many times we make an optimization change to a tenant's scheduling
+    pub(crate) storage_controller_schedule_optimization: measured::Counter,
+
    /// HTTP request status counters for handled requests
    pub(crate) storage_controller_http_request_status:
        measured::CounterVec<HttpRequestStatusLabelGroupSet>,
@@ -101,6 +104,7 @@ impl StorageControllerMetricGroup {
                    status: StaticLabelSet::new(),
                },
            ),
+            storage_controller_schedule_optimization: measured::Counter::new(),
            storage_controller_http_request_status: measured::CounterVec::new(
                HttpRequestStatusLabelGroupSet {
                    path: lasso::ThreadedRodeo::new(),
--- a/control_plane/attachment_service/src/node.rs
+++ b/control_plane/attachment_service/src/node.rs
@@ -3,7 +3,8 @@ use std::{str::FromStr, time::Duration};
 use hyper::StatusCode;
 use pageserver_api::{
    controller_api::{
-        NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, TenantLocateResponseShard,
+        NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy,
+        TenantLocateResponseShard,
    },
    shard::TenantShardId,
 };
@@ -256,6 +257,19 @@ impl Node {
        )
        .await
    }
+
+    /// Generate the simplified API-friendly description of a node's state
+    pub(crate) fn describe(&self) -> NodeDescribeResponse {
+        NodeDescribeResponse {
+            id: self.id,
+            availability: self.availability.into(),
+            scheduling: self.scheduling,
+            listen_http_addr: self.listen_http_addr.clone(),
+            listen_http_port: self.listen_http_port,
+            listen_pg_addr: self.listen_pg_addr.clone(),
+            listen_pg_port: self.listen_pg_port,
+        }
+    }
 }

 impl std::fmt::Display for Node {
--- a/control_plane/attachment_service/src/persistence.rs
+++ b/control_plane/attachment_service/src/persistence.rs
@@ -9,6 +9,7 @@ use camino::Utf8PathBuf;
 use diesel::pg::PgConnection;
 use diesel::prelude::*;
 use diesel::Connection;
+use pageserver_api::controller_api::ShardSchedulingPolicy;
 use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy};
 use pageserver_api::models::TenantConfig;
 use pageserver_api::shard::ShardConfigError;
@@ -107,6 +108,12 @@ pub(crate) enum AbortShardSplitStatus {

 pub(crate) type DatabaseResult<T> = Result<T, DatabaseError>;

+/// Some methods can operate on either a whole tenant or a single shard
+pub(crate) enum TenantFilter {
+    Tenant(TenantId),
+    Shard(TenantShardId),
+}
+
 impl Persistence {
    // The default postgres connection limit is 100.  We use up to 99, to leave one free for a human admin under
    // normal circumstances.  This assumes we have exclusive use of the database cluster to which we connect.
@@ -140,7 +147,7 @@ impl Persistence {
    /// Wraps `with_conn` in order to collect latency and error metrics
    async fn with_measured_conn<F, R>(&self, op: DatabaseOperation, func: F) -> DatabaseResult<R>
    where
-        F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
+        F: FnOnce(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
        R: Send + 'static,
    {
        let latency = &METRICS_REGISTRY
@@ -168,7 +175,7 @@ impl Persistence {
    /// Call the provided function in a tokio blocking thread, with a Diesel database connection.
    async fn with_conn<F, R>(&self, func: F) -> DatabaseResult<R>
    where
-        F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
+        F: FnOnce(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
        R: Send + 'static,
    {
        let mut conn = self.connection_pool.get()?;
@@ -275,6 +282,11 @@ impl Persistence {
                // Backward compat for test data after PR https://github.com/neondatabase/neon/pull/7165
                shard.placement_policy = "{\"Attached\":0}".to_string();
            }
+
+            if shard.scheduling_policy.is_empty() {
+                shard.scheduling_policy =
+                    serde_json::to_string(&ShardSchedulingPolicy::default()).unwrap();
+            }
        }

        let tenants: Vec<TenantShardPersistence> = decoded.tenants.into_values().collect();
@@ -465,59 +477,45 @@ impl Persistence {
    /// that we only do the first time a tenant is set to an attached policy via /location_config.
    pub(crate) async fn update_tenant_shard(
        &self,
-        tenant_shard_id: TenantShardId,
-        input_placement_policy: PlacementPolicy,
-        input_config: TenantConfig,
+        tenant: TenantFilter,
+        input_placement_policy: Option<PlacementPolicy>,
+        input_config: Option<TenantConfig>,
        input_generation: Option<Generation>,
+        input_scheduling_policy: Option<ShardSchedulingPolicy>,
    ) -> DatabaseResult<()> {
        use crate::schema::tenant_shards::dsl::*;

        self.with_measured_conn(DatabaseOperation::UpdateTenantShard, move |conn| {
-            let query = diesel::update(tenant_shards)
-                .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
-                .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
-                .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32));
+            let query = match tenant {
+                TenantFilter::Shard(tenant_shard_id) => diesel::update(tenant_shards)
+                    .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
+                    .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
+                    .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
+                    .into_boxed(),
+                TenantFilter::Tenant(input_tenant_id) => diesel::update(tenant_shards)
+                    .filter(tenant_id.eq(input_tenant_id.to_string()))
+                    .into_boxed(),
+            };

-            if let Some(input_generation) = input_generation {
-                // Update includes generation column
-                query
-                    .set((
-                        generation.eq(Some(input_generation.into().unwrap() as i32)),
-                        placement_policy
-                            .eq(serde_json::to_string(&input_placement_policy).unwrap()),
-                        config.eq(serde_json::to_string(&input_config).unwrap()),
-                    ))
-                    .execute(conn)?;
-            } else {
-                // Update does not include generation column
-                query
-                    .set((
-                        placement_policy
-                            .eq(serde_json::to_string(&input_placement_policy).unwrap()),
-                        config.eq(serde_json::to_string(&input_config).unwrap()),
-                    ))
-                    .execute(conn)?;
+            #[derive(AsChangeset)]
+            #[diesel(table_name = crate::schema::tenant_shards)]
+            struct ShardUpdate {
+                generation: Option<i32>,
+                placement_policy: Option<String>,
+                config: Option<String>,
+                scheduling_policy: Option<String>,
            }

-            Ok(())
-        })
-        .await?;
+            let update = ShardUpdate {
+                generation: input_generation.map(|g| g.into().unwrap() as i32),
+                placement_policy: input_placement_policy
+                    .map(|p| serde_json::to_string(&p).unwrap()),
+                config: input_config.map(|c| serde_json::to_string(&c).unwrap()),
+                scheduling_policy: input_scheduling_policy
+                    .map(|p| serde_json::to_string(&p).unwrap()),
+            };

-        Ok(())
-    }
-
-    pub(crate) async fn update_tenant_config(
-        &self,
-        input_tenant_id: TenantId,
-        input_config: TenantConfig,
-    ) -> DatabaseResult<()> {
-        use crate::schema::tenant_shards::dsl::*;
-
-        self.with_measured_conn(DatabaseOperation::UpdateTenantConfig, move |conn| {
-            diesel::update(tenant_shards)
-                .filter(tenant_id.eq(input_tenant_id.to_string()))
-                .set((config.eq(serde_json::to_string(&input_config).unwrap()),))
-                .execute(conn)?;
+            query.set(update).execute(conn)?;

            Ok(())
        })
@@ -728,6 +726,8 @@ pub(crate) struct TenantShardPersistence {
    pub(crate) splitting: SplitState,
    #[serde(default)]
    pub(crate) config: String,
+    #[serde(default)]
+    pub(crate) scheduling_policy: String,
 }

 impl TenantShardPersistence {
--- a/control_plane/attachment_service/src/reconciler.rs
+++ b/control_plane/attachment_service/src/reconciler.rs
@@ -487,6 +487,7 @@ impl Reconciler {
        while let Err(e) = self.compute_notify().await {
            match e {
                NotifyError::Fatal(_) => return Err(ReconcileError::Notify(e)),
+                NotifyError::ShuttingDown => return Err(ReconcileError::Cancel),
                _ => {
                    tracing::warn!(
                        "Live migration blocked by compute notification error, retrying: {e}"
--- a/control_plane/attachment_service/src/scheduler.rs
+++ b/control_plane/attachment_service/src/scheduler.rs
@@ -58,6 +58,70 @@ pub(crate) struct Scheduler {
    nodes: HashMap<NodeId, SchedulerNode>,
 }

+/// Score for soft constraint scheduling: lower scores are preferred to higher scores.
+///
+/// For example, we may set an affinity score based on the number of shards from the same
+/// tenant already on a node, to implicitly prefer to balance out shards.
+#[derive(Copy, Clone, Debug, Eq, PartialEq, PartialOrd, Ord)]
+pub(crate) struct AffinityScore(pub(crate) usize);
+
+impl AffinityScore {
+    /// If we have no anti-affinity at all toward a node, this is its score.  It means
+    /// the scheduler has a free choice amongst nodes with this score, and may pick a node
+    /// based on other information such as total utilization.
+    pub(crate) const FREE: Self = Self(0);
+
+    pub(crate) fn inc(&mut self) {
+        self.0 += 1;
+    }
+}
+
+impl std::ops::Add for AffinityScore {
+    type Output = Self;
+
+    fn add(self, rhs: Self) -> Self::Output {
+        Self(self.0 + rhs.0)
+    }
+}
+
+// For carrying state between multiple calls to [`TenantState::schedule`], e.g. when calling
+// it for many shards in the same tenant.
+#[derive(Debug, Default)]
+pub(crate) struct ScheduleContext {
+    /// Sparse map of nodes: omitting a node implicitly makes its affinity [`AffinityScore::FREE`]
+    pub(crate) nodes: HashMap<NodeId, AffinityScore>,
+
+    /// Specifically how many _attached_ locations are on each node
+    pub(crate) attached_nodes: HashMap<NodeId, usize>,
+}
+
+impl ScheduleContext {
+    /// Input is a list of nodes we would like to avoid using again within this context.  The more
+    /// times a node is passed into this call, the less inclined we are to use it.
+    pub(crate) fn avoid(&mut self, nodes: &[NodeId]) {
+        for node_id in nodes {
+            let entry = self.nodes.entry(*node_id).or_insert(AffinityScore::FREE);
+            entry.inc()
+        }
+    }
+
+    pub(crate) fn push_attached(&mut self, node_id: NodeId) {
+        let entry = self.attached_nodes.entry(node_id).or_default();
+        *entry += 1;
+    }
+
+    pub(crate) fn get_node_affinity(&self, node_id: NodeId) -> AffinityScore {
+        self.nodes
+            .get(&node_id)
+            .copied()
+            .unwrap_or(AffinityScore::FREE)
+    }
+
+    pub(crate) fn get_node_attachments(&self, node_id: NodeId) -> usize {
+        self.attached_nodes.get(&node_id).copied().unwrap_or(0)
+    }
+}
+
 impl Scheduler {
    pub(crate) fn new<'a>(nodes: impl Iterator<Item = &'a Node>) -> Self {
        let mut scheduler_nodes = HashMap::new();
@@ -224,27 +288,47 @@ impl Scheduler {
        node.and_then(|(node_id, may_schedule)| if may_schedule { Some(node_id) } else { None })
    }

-    pub(crate) fn schedule_shard(&self, hard_exclude: &[NodeId]) -> Result<NodeId, ScheduleError> {
+    /// hard_exclude: it is forbidden to use nodes in this list, typically becacuse they
+    /// are already in use by this shard -- we use this to avoid picking the same node
+    /// as both attached and secondary location.  This is a hard constraint: if we cannot
+    /// find any nodes that aren't in this list, then we will return a [`ScheduleError::ImpossibleConstraint`].
+    ///
+    /// context: we prefer to avoid using nodes identified in the context, according
+    /// to their anti-affinity score.  We use this to prefeer to avoid placing shards in
+    /// the same tenant on the same node.  This is a soft constraint: the context will never
+    /// cause us to fail to schedule a shard.
+    pub(crate) fn schedule_shard(
+        &self,
+        hard_exclude: &[NodeId],
+        context: &ScheduleContext,
+    ) -> Result<NodeId, ScheduleError> {
        if self.nodes.is_empty() {
            return Err(ScheduleError::NoPageservers);
        }

-        let mut tenant_counts: Vec<(NodeId, usize)> = self
+        let mut scores: Vec<(NodeId, AffinityScore, usize)> = self
            .nodes
            .iter()
            .filter_map(|(k, v)| {
                if hard_exclude.contains(k) || v.may_schedule == MaySchedule::No {
                    None
                } else {
-                    Some((*k, v.shard_count))
+                    Some((
+                        *k,
+                        context.nodes.get(k).copied().unwrap_or(AffinityScore::FREE),
+                        v.shard_count,
+                    ))
                }
            })
            .collect();

-        // Sort by tenant count.  Nodes with the same tenant count are sorted by ID.
-        tenant_counts.sort_by_key(|i| (i.1, i.0));
+        // Sort by, in order of precedence:
+        //  1st: Affinity score.  We should never pick a higher-score node if a lower-score node is available
+        //  2nd: Utilization.  Within nodes with the same affinity, use the least loaded nodes.
+        //  3rd: Node ID.  This is a convenience to make selection deterministic in tests and empty systems.
+        scores.sort_by_key(|i| (i.1, i.2, i.0));

-        if tenant_counts.is_empty() {
+        if scores.is_empty() {
            // After applying constraints, no pageservers were left.  We log some detail about
            // the state of nodes to help understand why this happened.  This is not logged as an error because
            // it is legitimately possible for enough nodes to be Offline to prevent scheduling a shard.
@@ -260,10 +344,11 @@ impl Scheduler {
            return Err(ScheduleError::ImpossibleConstraint);
        }

-        let node_id = tenant_counts.first().unwrap().0;
+        // Lowest score wins
+        let node_id = scores.first().unwrap().0;
        tracing::info!(
-            "scheduler selected node {node_id} (elegible nodes {:?}, exclude: {hard_exclude:?})",
-            tenant_counts.iter().map(|i| i.0 .0).collect::<Vec<_>>()
+            "scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?})",
+            scores.iter().map(|i| i.0 .0).collect::<Vec<_>>()
        );

        // Note that we do not update shard count here to reflect the scheduling: that
@@ -271,6 +356,12 @@ impl Scheduler {

        Ok(node_id)
    }
+
+    /// Unit test access to internal state
+    #[cfg(test)]
+    pub(crate) fn get_node_shard_count(&self, node_id: NodeId) -> usize {
+        self.nodes.get(&node_id).unwrap().shard_count
+    }
 }

 #[cfg(test)]
@@ -316,15 +407,17 @@ mod tests {
        let mut t1_intent = IntentState::new();
        let mut t2_intent = IntentState::new();

-        let scheduled = scheduler.schedule_shard(&[])?;
+        let context = ScheduleContext::default();
+
+        let scheduled = scheduler.schedule_shard(&[], &context)?;
        t1_intent.set_attached(&mut scheduler, Some(scheduled));
-        let scheduled = scheduler.schedule_shard(&[])?;
+        let scheduled = scheduler.schedule_shard(&[], &context)?;
        t2_intent.set_attached(&mut scheduler, Some(scheduled));

        assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);
        assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1);

-        let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers())?;
+        let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers(), &context)?;
        t1_intent.push_secondary(&mut scheduler, scheduled);

        assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);
--- a/control_plane/attachment_service/src/schema.rs
+++ b/control_plane/attachment_service/src/schema.rs
@@ -22,6 +22,7 @@ diesel::table! {
        placement_policy -> Varchar,
        splitting -> Int2,
        config -> Text,
+        scheduling_policy -> Varchar,
    }
 }

--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -8,7 +8,10 @@ use std::{
 };

 use crate::{
-    id_lock_map::IdLockMap, persistence::AbortShardSplitStatus, reconciler::ReconcileError,
+    id_lock_map::IdLockMap,
+    persistence::{AbortShardSplitStatus, TenantFilter},
+    reconciler::ReconcileError,
+    scheduler::ScheduleContext,
 };
 use anyhow::Context;
 use control_plane::storage_controller::{
@@ -17,12 +20,14 @@ use control_plane::storage_controller::{
 use diesel::result::DatabaseErrorKind;
 use futures::{stream::FuturesUnordered, StreamExt};
 use hyper::StatusCode;
+use itertools::Itertools;
 use pageserver_api::{
    controller_api::{
        NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
-        TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse,
-        TenantDescribeResponseShard, TenantLocateResponse, TenantShardMigrateRequest,
-        TenantShardMigrateResponse, UtilizationScore,
+        ShardSchedulingPolicy, TenantCreateResponse, TenantCreateResponseShard,
+        TenantDescribeResponse, TenantDescribeResponseShard, TenantLocateResponse,
+        TenantPolicyRequest, TenantShardMigrateRequest, TenantShardMigrateResponse,
+        UtilizationScore,
    },
    models::{SecondaryProgress, TenantConfigRequest},
 };
@@ -51,7 +56,6 @@ use utils::{
    generation::Generation,
    http::error::ApiError,
    id::{NodeId, TenantId, TimelineId},
-    seqwait::SeqWait,
    sync::gate::Gate,
 };

@@ -66,7 +70,6 @@ use crate::{
        IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError,
        ReconcilerWaiter, TenantState,
    },
-    Sequence,
 };

 // For operations that should be quick, like attaching a new tenant
@@ -344,9 +347,15 @@ impl Service {
            }

            // Populate each tenant's intent state
+            let mut schedule_context = ScheduleContext::default();
            for (tenant_shard_id, tenant_state) in tenants.iter_mut() {
+                if tenant_shard_id.shard_number == ShardNumber(0) {
+                    // Reset scheduling context each time we advance to the next Tenant
+                    schedule_context = ScheduleContext::default();
+                }
+
                tenant_state.intent_from_observed(scheduler);
-                if let Err(e) = tenant_state.schedule(scheduler) {
+                if let Err(e) = tenant_state.schedule(scheduler, &mut schedule_context) {
                    // Non-fatal error: we are unable to properly schedule the tenant, perhaps because
                    // not enough pageservers are available.  The tenant may well still be available
                    // to clients.
@@ -670,7 +679,13 @@ impl Service {
        let mut interval = tokio::time::interval(BACKGROUND_RECONCILE_PERIOD);
        while !self.cancel.is_cancelled() {
            tokio::select! {
-              _ = interval.tick() => { self.reconcile_all(); }
+              _ = interval.tick() => {
+                let reconciles_spawned = self.reconcile_all();
+                if reconciles_spawned == 0 {
+                    // Run optimizer only when we didn't find any other work to do
+                    self.optimize_all();
+                }
+            }
              _ = self.cancel.cancelled() => return
            }
        }
@@ -957,30 +972,14 @@ impl Service {
        }
        for tsp in tenant_shard_persistence {
            let tenant_shard_id = tsp.get_tenant_shard_id()?;
-            let shard_identity = tsp.get_shard_identity()?;
+
            // We will populate intent properly later in [`Self::startup_reconcile`], initially populate
            // it with what we can infer: the node for which a generation was most recently issued.
            let mut intent = IntentState::new();
            if let Some(generation_pageserver) = tsp.generation_pageserver {
                intent.set_attached(&mut scheduler, Some(NodeId(generation_pageserver as u64)));
            }
-
-            let new_tenant = TenantState {
-                tenant_shard_id,
-                shard: shard_identity,
-                sequence: Sequence::initial(),
-                generation: tsp.generation.map(|g| Generation::new(g as u32)),
-                policy: serde_json::from_str(&tsp.placement_policy).unwrap(),
-                intent,
-                observed: ObservedState::new(),
-                config: serde_json::from_str(&tsp.config).unwrap(),
-                reconciler: None,
-                splitting: tsp.splitting,
-                waiter: Arc::new(SeqWait::new(Sequence::initial())),
-                error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
-                last_error: Arc::default(),
-                pending_compute_notification: false,
-            };
+            let new_tenant = TenantState::from_persistent(tsp, intent)?;

            tenants.insert(tenant_shard_id, new_tenant);
        }
@@ -1104,6 +1103,8 @@ impl Service {
                placement_policy: serde_json::to_string(&PlacementPolicy::Attached(0)).unwrap(),
                config: serde_json::to_string(&TenantConfig::default()).unwrap(),
                splitting: SplitState::default(),
+                scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default())
+                    .unwrap(),
            };

            match self.persistence.insert_tenant_shards(vec![tsp]).await {
@@ -1156,9 +1157,10 @@ impl Service {
                    // when we reattaching a detached tenant.
                    self.persistence
                        .update_tenant_shard(
-                            attach_req.tenant_shard_id,
-                            PlacementPolicy::Attached(0),
-                            conf,
+                            TenantFilter::Shard(attach_req.tenant_shard_id),
+                            Some(PlacementPolicy::Attached(0)),
+                            Some(conf),
+                            None,
                            None,
                        )
                        .await?;
@@ -1523,6 +1525,8 @@ impl Service {
        &self,
        create_req: TenantCreateRequest,
    ) -> Result<TenantCreateResponse, ApiError> {
+        let tenant_id = create_req.new_tenant_id.tenant_id;
+
        // Exclude any concurrent attempts to create/access the same tenant ID
        let _tenant_lock = self
            .tenant_op_locks
@@ -1531,7 +1535,12 @@ impl Service {

        let (response, waiters) = self.do_tenant_create(create_req).await?;

-        self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await?;
+        if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await {
+            // Avoid deadlock: reconcile may fail while notifying compute, if the cloud control plane refuses to
+            // accept compute notifications while it is in the process of creating.  Reconciliation will
+            // be retried in the background.
+            tracing::warn!(%tenant_id, "Reconcile not done yet while creating tenant ({e})");
+        }
        Ok(response)
    }

@@ -1608,15 +1617,31 @@ impl Service {
                placement_policy: serde_json::to_string(&placement_policy).unwrap(),
                config: serde_json::to_string(&create_req.config).unwrap(),
                splitting: SplitState::default(),
+                scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default())
+                    .unwrap(),
            })
            .collect();
-        self.persistence
+
+        match self
+            .persistence
            .insert_tenant_shards(persist_tenant_shards)
            .await
-            .map_err(|e| {
-                // TODO: distinguish primary key constraint (idempotent, OK), from other errors
-                ApiError::InternalServerError(anyhow::anyhow!(e))
-            })?;
+        {
+            Ok(_) => {}
+            Err(DatabaseError::Query(diesel::result::Error::DatabaseError(
+                DatabaseErrorKind::UniqueViolation,
+                _,
+            ))) => {
+                // Unique key violation: this is probably a retry.  Because the shard count is part of the unique key,
+                // if we see a unique key violation it means that the creation request's shard count matches the previous
+                // creation's shard count.
+                tracing::info!("Tenant shards already present in database, proceeding with idempotent creation...");
+            }
+            // Any other database error is unexpected and a bug.
+            Err(e) => return Err(ApiError::InternalServerError(anyhow::anyhow!(e))),
+        };
+
+        let mut schedule_context = ScheduleContext::default();

        let (waiters, response_shards) = {
            let mut locked = self.inner.write().unwrap();
@@ -1639,11 +1664,14 @@ impl Service {
                        // attached and secondary locations (independently) away frorm those
                        // pageservers also holding a shard for this tenant.

-                        entry.get_mut().schedule(scheduler).map_err(|e| {
-                            ApiError::Conflict(format!(
-                                "Failed to schedule shard {tenant_shard_id}: {e}"
-                            ))
-                        })?;
+                        entry
+                            .get_mut()
+                            .schedule(scheduler, &mut schedule_context)
+                            .map_err(|e| {
+                                ApiError::Conflict(format!(
+                                    "Failed to schedule shard {tenant_shard_id}: {e}"
+                                ))
+                            })?;

                        if let Some(node_id) = entry.get().intent.get_attached() {
                            let generation = entry
@@ -1671,7 +1699,7 @@ impl Service {

                        state.generation = initial_generation;
                        state.config = create_req.config.clone();
-                        if let Err(e) = state.schedule(scheduler) {
+                        if let Err(e) = state.schedule(scheduler, &mut schedule_context) {
                            schcedule_error = Some(e);
                        }

@@ -1879,6 +1907,7 @@ impl Service {
                // Persist updates
                // Ordering: write to the database before applying changes in-memory, so that
                // we will not appear time-travel backwards on a restart.
+                let mut schedule_context = ScheduleContext::default();
                for ShardUpdate {
                    tenant_shard_id,
                    placement_policy,
@@ -1888,10 +1917,11 @@ impl Service {
                {
                    self.persistence
                        .update_tenant_shard(
-                            *tenant_shard_id,
-                            placement_policy.clone(),
-                            tenant_config.clone(),
+                            TenantFilter::Shard(*tenant_shard_id),
+                            Some(placement_policy.clone()),
+                            Some(tenant_config.clone()),
                            *generation,
+                            None,
                        )
                        .await?;
                }
@@ -1925,7 +1955,7 @@ impl Service {
                            shard.generation = Some(generation);
                        }

-                        shard.schedule(scheduler)?;
+                        shard.schedule(scheduler, &mut schedule_context)?;

                        let maybe_waiter = self.maybe_reconcile_shard(shard, nodes);
                        if let Some(waiter) = maybe_waiter {
@@ -1969,7 +1999,13 @@ impl Service {
        let config = req.config;

        self.persistence
-            .update_tenant_config(req.tenant_id, config.clone())
+            .update_tenant_shard(
+                TenantFilter::Tenant(req.tenant_id),
+                None,
+                Some(config.clone()),
+                None,
+                None,
+            )
            .await?;

        let waiters = {
@@ -2079,7 +2115,7 @@ impl Service {
            let scheduler = &locked.scheduler;
            // Right now we only perform the operation on a single node without parallelization
            // TODO fan out the operation to multiple nodes for better performance
-            let node_id = scheduler.schedule_shard(&[])?;
+            let node_id = scheduler.schedule_shard(&[], &ScheduleContext::default())?;
            let node = locked
                .nodes
                .get(&node_id)
@@ -2322,6 +2358,58 @@ impl Service {
        Ok(StatusCode::NOT_FOUND)
    }

+    /// Naming: this configures the storage controller's policies for a tenant, whereas [`Self::tenant_config_set`] is "set the TenantConfig"
+    /// for a tenant.  The TenantConfig is passed through to pageservers, whereas this function modifies
+    /// the tenant's policies (configuration) within the storage controller
+    pub(crate) async fn tenant_update_policy(
+        &self,
+        tenant_id: TenantId,
+        req: TenantPolicyRequest,
+    ) -> Result<(), ApiError> {
+        // We require an exclusive lock, because we are updating persistent and in-memory state
+        let _tenant_lock = self.tenant_op_locks.exclusive(tenant_id).await;
+
+        let TenantPolicyRequest {
+            placement,
+            scheduling,
+        } = req;
+
+        self.persistence
+            .update_tenant_shard(
+                TenantFilter::Tenant(tenant_id),
+                placement.clone(),
+                None,
+                None,
+                scheduling,
+            )
+            .await?;
+
+        let mut schedule_context = ScheduleContext::default();
+        let mut locked = self.inner.write().unwrap();
+        let (nodes, tenants, scheduler) = locked.parts_mut();
+        for (shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
+            if let Some(placement) = &placement {
+                shard.policy = placement.clone();
+
+                tracing::info!(tenant_id=%shard_id.tenant_id, shard_id=%shard_id.shard_slug(),
+                               "Updated placement policy to {placement:?}");
+            }
+
+            if let Some(scheduling) = &scheduling {
+                shard.set_scheduling_policy(*scheduling);
+
+                tracing::info!(tenant_id=%shard_id.tenant_id, shard_id=%shard_id.shard_slug(),
+                               "Updated scheduling policy to {scheduling:?}");
+            }
+
+            // In case scheduling is being switched back on, try it now.
+            shard.schedule(scheduler, &mut schedule_context).ok();
+            self.maybe_reconcile_shard(shard, nodes);
+        }
+
+        Ok(())
+    }
+
    pub(crate) async fn tenant_timeline_create(
        &self,
        tenant_id: TenantId,
@@ -2648,47 +2736,73 @@ impl Service {
        })
    }

-    pub(crate) fn tenant_describe(
+    /// Returns None if the input iterator of shards does not include a shard with number=0
+    fn tenant_describe_impl<'a>(
        &self,
-        tenant_id: TenantId,
-    ) -> Result<TenantDescribeResponse, ApiError> {
-        let locked = self.inner.read().unwrap();
-
+        shards: impl Iterator<Item = &'a TenantState>,
+    ) -> Option<TenantDescribeResponse> {
        let mut shard_zero = None;
-        let mut shards = Vec::new();
+        let mut describe_shards = Vec::new();

-        for (tenant_shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id))
-        {
-            if tenant_shard_id.is_zero() {
+        for shard in shards {
+            if shard.tenant_shard_id.is_zero() {
                shard_zero = Some(shard);
            }

-            let response_shard = TenantDescribeResponseShard {
-                tenant_shard_id: *tenant_shard_id,
+            describe_shards.push(TenantDescribeResponseShard {
+                tenant_shard_id: shard.tenant_shard_id,
                node_attached: *shard.intent.get_attached(),
                node_secondary: shard.intent.get_secondary().to_vec(),
                last_error: shard.last_error.lock().unwrap().clone(),
                is_reconciling: shard.reconciler.is_some(),
                is_pending_compute_notification: shard.pending_compute_notification,
                is_splitting: matches!(shard.splitting, SplitState::Splitting),
-            };
-            shards.push(response_shard);
+                scheduling_policy: *shard.get_scheduling_policy(),
+            })
        }

-        let Some(shard_zero) = shard_zero else {
-            return Err(ApiError::NotFound(
-                anyhow::anyhow!("Tenant {tenant_id} not found").into(),
-            ));
-        };
+        let shard_zero = shard_zero?;

-        Ok(TenantDescribeResponse {
-            shards,
+        Some(TenantDescribeResponse {
+            tenant_id: shard_zero.tenant_shard_id.tenant_id,
+            shards: describe_shards,
            stripe_size: shard_zero.shard.stripe_size,
            policy: shard_zero.policy.clone(),
            config: shard_zero.config.clone(),
        })
    }

+    pub(crate) fn tenant_describe(
+        &self,
+        tenant_id: TenantId,
+    ) -> Result<TenantDescribeResponse, ApiError> {
+        let locked = self.inner.read().unwrap();
+
+        self.tenant_describe_impl(
+            locked
+                .tenants
+                .range(TenantShardId::tenant_range(tenant_id))
+                .map(|(_k, v)| v),
+        )
+        .ok_or_else(|| ApiError::NotFound(anyhow::anyhow!("Tenant {tenant_id} not found").into()))
+    }
+
+    pub(crate) fn tenant_list(&self) -> Vec<TenantDescribeResponse> {
+        let locked = self.inner.read().unwrap();
+
+        let mut result = Vec::new();
+        for (_tenant_id, tenant_shards) in
+            &locked.tenants.iter().group_by(|(id, _shard)| id.tenant_id)
+        {
+            result.push(
+                self.tenant_describe_impl(tenant_shards.map(|(_k, v)| v))
+                    .expect("Groups are always non-empty"),
+            );
+        }
+
+        result
+    }
+
    #[instrument(skip_all, fields(tenant_id=%op.tenant_id))]
    async fn abort_tenant_shard_split(
        &self,
@@ -2779,7 +2893,7 @@ impl Service {

                tracing::info!("Restoring parent shard {tenant_shard_id}");
                shard.splitting = SplitState::Idle;
-                if let Err(e) = shard.schedule(scheduler) {
+                if let Err(e) = shard.schedule(scheduler, &mut ScheduleContext::default()) {
                    // If this shard can't be scheduled now (perhaps due to offline nodes or
                    // capacity issues), that must not prevent us rolling back a split.  In this
                    // case it should be eventually scheduled in the background.
@@ -2903,6 +3017,7 @@ impl Service {
                    )
                };

+                let mut schedule_context = ScheduleContext::default();
                for child in child_ids {
                    let mut child_shard = parent_ident;
                    child_shard.number = child.shard_number;
@@ -2938,7 +3053,7 @@ impl Service {

                    child_locations.push((child, pageserver, child_shard.stripe_size));

-                    if let Err(e) = child_state.schedule(scheduler) {
+                    if let Err(e) = child_state.schedule(scheduler, &mut schedule_context) {
                        // This is not fatal, because we've implicitly already got an attached
                        // location for the child shard.  Failure here just means we couldn't
                        // find a secondary (e.g. because cluster is overloaded).
@@ -3231,6 +3346,10 @@ impl Service {
                    placement_policy: serde_json::to_string(&policy).unwrap(),
                    config: serde_json::to_string(&config).unwrap(),
                    splitting: SplitState::Splitting,
+
+                    // Scheduling policies do not carry through to children
+                    scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default())
+                        .unwrap(),
                });
            }

@@ -3798,6 +3917,7 @@ impl Service {
            AvailabilityTransition::ToOffline => {
                tracing::info!("Node {} transition to offline", node_id);
                let mut tenants_affected: usize = 0;
+
                for (tenant_shard_id, tenant_state) in tenants {
                    if let Some(observed_loc) = tenant_state.observed.locations.get_mut(&node_id) {
                        // When a node goes offline, we set its observed configuration to None, indicating unknown: we will
@@ -3814,7 +3934,13 @@ impl Service {

                    if tenant_state.intent.demote_attached(node_id) {
                        tenant_state.sequence = tenant_state.sequence.next();
-                        match tenant_state.schedule(scheduler) {
+
+                        // TODO: populate a ScheduleContext including all shards in the same tenant_id (only matters
+                        // for tenants without secondary locations: if they have a secondary location, then this
+                        // schedule() call is just promoting an existing secondary)
+                        let mut schedule_context = ScheduleContext::default();
+
+                        match tenant_state.schedule(scheduler, &mut schedule_context) {
                            Err(e) => {
                                // It is possible that some tenants will become unschedulable when too many pageservers
                                // go offline: in this case there isn't much we can do other than make the issue observable.
@@ -3865,9 +3991,6 @@ impl Service {
    /// Helper for methods that will try and call pageserver APIs for
    /// a tenant, such as timeline CRUD: they cannot proceed unless the tenant
    /// is attached somewhere.
-    ///
-    /// TODO: this doesn't actually ensure attached unless the PlacementPolicy is
-    /// an attached policy.  We should error out if it isn't.
    fn ensure_attached_schedule(
        &self,
        mut locked: std::sync::RwLockWriteGuard<'_, ServiceState>,
@@ -3876,10 +3999,27 @@ impl Service {
        let mut waiters = Vec::new();
        let (nodes, tenants, scheduler) = locked.parts_mut();

-        for (_tenant_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
-            shard.schedule(scheduler)?;
+        let mut schedule_context = ScheduleContext::default();
+        for (tenant_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
+            shard.schedule(scheduler, &mut schedule_context)?;
+
+            // The shard's policies may not result in an attached location being scheduled: this
+            // is an error because our caller needs it attached somewhere.
+            if shard.intent.get_attached().is_none() {
+                return Err(anyhow::anyhow!(
+                    "Tenant {tenant_id} not scheduled to be attached"
+                ));
+            };
+
+            if shard.stably_attached().is_some() {
+                // We do not require the shard to be totally up to date on reconciliation: we just require
+                // that it has been attached on the intended node.   Other dirty state such as unattached secondary
+                // locations, or compute hook notifications can be ignored.
+                continue;
+            }

            if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) {
+                tracing::info!("Waiting for shard {tenant_shard_id} to reconcile, in order to ensure it is attached");
                waiters.push(waiter);
            }
        }
@@ -3941,8 +4081,144 @@ impl Service {
        let (nodes, tenants, _scheduler) = locked.parts_mut();
        let pageservers = nodes.clone();

+        let mut schedule_context = ScheduleContext::default();
+
        let mut reconciles_spawned = 0;
-        for (_tenant_shard_id, shard) in tenants.iter_mut() {
+        for (tenant_shard_id, shard) in tenants.iter_mut() {
+            if tenant_shard_id.is_zero() {
+                schedule_context = ScheduleContext::default();
+            }
+
+            // Eventual consistency: if an earlier reconcile job failed, and the shard is still
+            // dirty, spawn another rone
+            if self.maybe_reconcile_shard(shard, &pageservers).is_some() {
+                reconciles_spawned += 1;
+            }
+
+            schedule_context.avoid(&shard.intent.all_pageservers());
+        }
+
+        reconciles_spawned
+    }
+
+    /// `optimize` in this context means identifying shards which have valid scheduled locations, but
+    /// could be scheduled somewhere better:
+    /// - Cutting over to a secondary if the node with the secondary is more lightly loaded
+    ///    * e.g. after a node fails then recovers, to move some work back to it
+    /// - Cutting over to a secondary if it improves the spread of shard attachments within a tenant
+    ///    * e.g. after a shard split, the initial attached locations will all be on the node where
+    ///      we did the split, but are probably better placed elsewhere.
+    /// - Creating new secondary locations if it improves the spreading of a sharded tenant
+    ///    * e.g. after a shard split, some locations will be on the same node (where the split
+    ///     happened), and will probably be better placed elsewhere.
+    ///
+    /// To put it more briefly: whereas the scheduler respects soft constraints in a ScheduleContext at
+    /// the time of scheduling, this function looks for cases where a better-scoring location is available
+    /// according to those same soft constraints.
+    fn optimize_all(&self) -> usize {
+        let mut locked = self.inner.write().unwrap();
+        let (nodes, tenants, scheduler) = locked.parts_mut();
+        let pageservers = nodes.clone();
+
+        let mut schedule_context = ScheduleContext::default();
+
+        let mut reconciles_spawned = 0;
+
+        let mut tenant_shards: Vec<&TenantState> = Vec::new();
+
+        // Limit on how many shards' optmizations each call to this function will execute.  Combined
+        // with the frequency of background calls, this acts as an implicit rate limit that runs a small
+        // trickle of optimizations in the background, rather than executing a large number in parallel
+        // when a change occurs.
+        const MAX_OPTIMIZATIONS_PER_PASS: usize = 2;
+
+        let mut work = Vec::new();
+
+        for (tenant_shard_id, shard) in tenants.iter() {
+            if tenant_shard_id.is_zero() {
+                // Reset accumulators on the first shard in a tenant
+                schedule_context = ScheduleContext::default();
+                tenant_shards.clear();
+            }
+
+            if work.len() >= MAX_OPTIMIZATIONS_PER_PASS {
+                break;
+            }
+
+            match shard.get_scheduling_policy() {
+                ShardSchedulingPolicy::Active => {
+                    // Ok to do optimization
+                }
+                ShardSchedulingPolicy::Essential
+                | ShardSchedulingPolicy::Pause
+                | ShardSchedulingPolicy::Stop => {
+                    // Policy prevents optimizing this shard.
+                    continue;
+                }
+            }
+
+            // Accumulate the schedule context for all the shards in a tenant: we must have
+            // the total view of all shards before we can try to optimize any of them.
+            schedule_context.avoid(&shard.intent.all_pageservers());
+            if let Some(attached) = shard.intent.get_attached() {
+                schedule_context.push_attached(*attached);
+            }
+            tenant_shards.push(shard);
+
+            // Once we have seen the last shard in the tenant, proceed to search across all shards
+            // in the tenant for optimizations
+            if shard.shard.number.0 == shard.shard.count.count() - 1 {
+                if tenant_shards.iter().any(|s| s.reconciler.is_some()) {
+                    // Do not start any optimizations while another change to the tenant is ongoing: this
+                    // is not necessary for correctness, but simplifies operations and implicitly throttles
+                    // optimization changes to happen in a "trickle" over time.
+                    continue;
+                }
+
+                if tenant_shards.iter().any(|s| {
+                    !matches!(s.splitting, SplitState::Idle)
+                        || matches!(s.policy, PlacementPolicy::Detached)
+                }) {
+                    // Never attempt to optimize a tenant that is currently being split, or
+                    // a tenant that is meant to be detached
+                    continue;
+                }
+
+                // TODO: optimization calculations are relatively expensive: create some fast-path for
+                // the common idle case (avoiding the search on tenants that we have recently checked)
+
+                for shard in &tenant_shards {
+                    if let Some(optimization) =
+                        // If idle, maybe ptimize attachments: if a shard has a secondary location that is preferable to
+                        // its primary location based on soft constraints, cut it over.
+                        shard.optimize_attachment(nodes, &schedule_context)
+                    {
+                        work.push((shard.tenant_shard_id, optimization));
+                        break;
+                    } else if let Some(optimization) =
+                        // If idle, maybe optimize secondary locations: if a shard has a secondary location that would be
+                        // better placed on another node, based on ScheduleContext, then adjust it.  This
+                        // covers cases like after a shard split, where we might have too many shards
+                        // in the same tenant with secondary locations on the node where they originally split.
+                        shard.optimize_secondary(scheduler, &schedule_context)
+                    {
+                        work.push((shard.tenant_shard_id, optimization));
+                        break;
+                    }
+
+                    // TODO: extend this mechanism to prefer attaching on nodes with fewer attached
+                    // tenants (i.e. extend schedule state to distinguish attached from secondary counts),
+                    // for the total number of attachments on a node (not just within a tenant.)
+                }
+            }
+        }
+
+        for (tenant_shard_id, optimization) in work {
+            let shard = tenants
+                .get_mut(&tenant_shard_id)
+                .expect("We held lock from place we got this ID");
+            shard.apply_optimization(scheduler, optimization);
+
            if self.maybe_reconcile_shard(shard, &pageservers).is_some() {
                reconciles_spawned += 1;
            }
@@ -3951,6 +4227,32 @@ impl Service {
        reconciles_spawned
    }

+    /// Useful for tests: run whatever work a background [`Self::reconcile_all`] would have done, but
+    /// also wait for any generated Reconcilers to complete.  Calling this until it returns zero should
+    /// put the system into a quiescent state where future background reconciliations won't do anything.
+    pub(crate) async fn reconcile_all_now(&self) -> Result<usize, ReconcileWaitError> {
+        let reconciles_spawned = self.reconcile_all();
+        if reconciles_spawned == 0 {
+            // Only optimize when we are otherwise idle
+            self.optimize_all();
+        }
+
+        let waiters = {
+            let mut waiters = Vec::new();
+            let locked = self.inner.read().unwrap();
+            for (_tenant_shard_id, shard) in locked.tenants.iter() {
+                if let Some(waiter) = shard.get_waiter() {
+                    waiters.push(waiter);
+                }
+            }
+            waiters
+        };
+
+        let waiter_count = waiters.len();
+        self.await_waiters(waiters, RECONCILE_TIMEOUT).await?;
+        Ok(waiter_count)
+    }
+
    pub async fn shutdown(&self) {
        // Note that this already stops processing any results from reconciles: so
        // we do not expect that our [`TenantState`] objects will reach a neat
--- a/control_plane/attachment_service/src/tenant_state.rs
+++ b/control_plane/attachment_service/src/tenant_state.rs
@@ -7,8 +7,9 @@ use std::{
 use crate::{
    metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome},
    persistence::TenantShardPersistence,
+    scheduler::{AffinityScore, MaySchedule, ScheduleContext},
 };
-use pageserver_api::controller_api::PlacementPolicy;
+use pageserver_api::controller_api::{PlacementPolicy, ShardSchedulingPolicy};
 use pageserver_api::{
    models::{LocationConfig, LocationConfigMode, TenantConfig},
    shard::{ShardIdentity, TenantShardId},
@@ -116,6 +117,10 @@ pub(crate) struct TenantState {
    /// sending it.  This is the mechanism by which compute notifications are included in the scope
    /// of state that we publish externally in an eventually consistent way.
    pub(crate) pending_compute_notification: bool,
+
+    // Support/debug tool: if something is going wrong or flapping with scheduling, this may
+    // be set to a non-active state to avoid making changes while the issue is fixed.
+    scheduling_policy: ShardSchedulingPolicy,
 }

 #[derive(Default, Clone, Debug, Serialize)]
@@ -246,8 +251,13 @@ impl IntentState {

 impl Drop for IntentState {
    fn drop(&mut self) {
-        // Must clear before dropping, to avoid leaving stale refcounts in the Scheduler
-        debug_assert!(self.attached.is_none() && self.secondary.is_empty());
+        // Must clear before dropping, to avoid leaving stale refcounts in the Scheduler.
+        // We do not check this while panicking, to avoid polluting unit test failures or
+        // other assertions with this assertion's output.  It's still wrong to leak these,
+        // but if we already have a panic then we don't need to independently flag this case.
+        if !(std::thread::panicking()) {
+            debug_assert!(self.attached.is_none() && self.secondary.is_empty());
+        }
    }
 }

@@ -292,6 +302,26 @@ pub enum ReconcileWaitError {
    Failed(TenantShardId, String),
 }

+#[derive(Eq, PartialEq, Debug)]
+pub(crate) struct ReplaceSecondary {
+    old_node_id: NodeId,
+    new_node_id: NodeId,
+}
+
+#[derive(Eq, PartialEq, Debug)]
+pub(crate) struct MigrateAttachment {
+    old_attached_node_id: NodeId,
+    new_attached_node_id: NodeId,
+}
+
+#[derive(Eq, PartialEq, Debug)]
+pub(crate) enum ScheduleOptimization {
+    // Replace one of our secondary locations with a different node
+    ReplaceSecondary(ReplaceSecondary),
+    // Migrate attachment to an existing secondary location
+    MigrateAttachment(MigrateAttachment),
+}
+
 impl ReconcilerWaiter {
    pub(crate) async fn wait_timeout(&self, timeout: Duration) -> Result<(), ReconcileWaitError> {
        tokio::select! {
@@ -370,6 +400,7 @@ impl TenantState {
            error_waiter: Arc::new(SeqWait::new(Sequence(0))),
            last_error: Arc::default(),
            pending_compute_notification: false,
+            scheduling_policy: ShardSchedulingPolicy::default(),
        }
    }

@@ -425,6 +456,7 @@ impl TenantState {
    fn schedule_attached(
        &mut self,
        scheduler: &mut Scheduler,
+        context: &ScheduleContext,
    ) -> Result<(bool, NodeId), ScheduleError> {
        // No work to do if we already have an attached tenant
        if let Some(node_id) = self.intent.attached {
@@ -438,14 +470,33 @@ impl TenantState {
            Ok((true, promote_secondary))
        } else {
            // Pick a fresh node: either we had no secondaries or none were schedulable
-            let node_id = scheduler.schedule_shard(&self.intent.secondary)?;
+            let node_id = scheduler.schedule_shard(&self.intent.secondary, context)?;
            tracing::debug!("Selected {} as attached", node_id);
            self.intent.set_attached(scheduler, Some(node_id));
            Ok((true, node_id))
        }
    }

-    pub(crate) fn schedule(&mut self, scheduler: &mut Scheduler) -> Result<(), ScheduleError> {
+    pub(crate) fn schedule(
+        &mut self,
+        scheduler: &mut Scheduler,
+        context: &mut ScheduleContext,
+    ) -> Result<(), ScheduleError> {
+        let r = self.do_schedule(scheduler, context);
+
+        context.avoid(&self.intent.all_pageservers());
+        if let Some(attached) = self.intent.get_attached() {
+            context.push_attached(*attached);
+        }
+
+        r
+    }
+
+    pub(crate) fn do_schedule(
+        &mut self,
+        scheduler: &mut Scheduler,
+        context: &ScheduleContext,
+    ) -> Result<(), ScheduleError> {
        // TODO: before scheduling new nodes, check if any existing content in
        // self.intent refers to pageservers that are offline, and pick other
        // pageservers if so.
@@ -453,6 +504,16 @@ impl TenantState {
        // TODO: respect the splitting bit on tenants: if they are currently splitting then we may not
        // change their attach location.

+        match self.scheduling_policy {
+            ShardSchedulingPolicy::Active | ShardSchedulingPolicy::Essential => {}
+            ShardSchedulingPolicy::Pause | ShardSchedulingPolicy::Stop => {
+                // Warn to make it obvious why other things aren't happening/working, if we skip scheduling
+                tracing::warn!(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(),
+                    "Scheduling is disabled by policy {:?}", self.scheduling_policy);
+                return Ok(());
+            }
+        }
+
        // Build the set of pageservers already in use by this tenant, to avoid scheduling
        // more work on the same pageservers we're already using.
        let mut modified = false;
@@ -479,12 +540,13 @@ impl TenantState {
                }

                // Should have exactly one attached, and N secondaries
-                let (modified_attached, attached_node_id) = self.schedule_attached(scheduler)?;
+                let (modified_attached, attached_node_id) =
+                    self.schedule_attached(scheduler, context)?;
                modified |= modified_attached;

                let mut used_pageservers = vec![attached_node_id];
                while self.intent.secondary.len() < secondary_count {
-                    let node_id = scheduler.schedule_shard(&used_pageservers)?;
+                    let node_id = scheduler.schedule_shard(&used_pageservers, context)?;
                    self.intent.push_secondary(scheduler, node_id);
                    used_pageservers.push(node_id);
                    modified = true;
@@ -497,7 +559,7 @@ impl TenantState {
                    modified = true;
                } else if self.intent.secondary.is_empty() {
                    // Populate secondary by scheduling a fresh node
-                    let node_id = scheduler.schedule_shard(&[])?;
+                    let node_id = scheduler.schedule_shard(&[], context)?;
                    self.intent.push_secondary(scheduler, node_id);
                    modified = true;
                }
@@ -524,6 +586,167 @@ impl TenantState {
        Ok(())
    }

+    /// Optimize attachments: if a shard has a secondary location that is preferable to
+    /// its primary location based on soft constraints, switch that secondary location
+    /// to be attached.
+    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
+    pub(crate) fn optimize_attachment(
+        &self,
+        nodes: &HashMap<NodeId, Node>,
+        schedule_context: &ScheduleContext,
+    ) -> Option<ScheduleOptimization> {
+        let attached = (*self.intent.get_attached())?;
+        if self.intent.secondary.is_empty() {
+            // We can only do useful work if we have both attached and secondary locations: this
+            // function doesn't schedule new locations, only swaps between attached and secondaries.
+            return None;
+        }
+
+        let current_affinity_score = schedule_context.get_node_affinity(attached);
+        let current_attachment_count = schedule_context.get_node_attachments(attached);
+
+        // Generate score for each node, dropping any un-schedulable nodes.
+        let all_pageservers = self.intent.all_pageservers();
+        let mut scores = all_pageservers
+            .iter()
+            .flat_map(|node_id| {
+                if matches!(
+                    nodes
+                        .get(node_id)
+                        .map(|n| n.may_schedule())
+                        .unwrap_or(MaySchedule::No),
+                    MaySchedule::No
+                ) {
+                    None
+                } else {
+                    let affinity_score = schedule_context.get_node_affinity(*node_id);
+                    let attachment_count = schedule_context.get_node_attachments(*node_id);
+                    Some((*node_id, affinity_score, attachment_count))
+                }
+            })
+            .collect::<Vec<_>>();
+
+        // Sort precedence:
+        //  1st - prefer nodes with the lowest total affinity score
+        //  2nd - prefer nodes with the lowest number of attachments in this context
+        //  3rd - if all else is equal, sort by node ID for determinism in tests.
+        scores.sort_by_key(|i| (i.1, i.2, i.0));
+
+        if let Some((preferred_node, preferred_affinity_score, preferred_attachment_count)) =
+            scores.first()
+        {
+            if attached != *preferred_node {
+                // The best alternative must be more than 1 better than us, otherwise we could end
+                // up flapping back next time we're called (e.g. there's no point migrating from
+                // a location with score 1 to a score zero, because on next location the situation
+                // would be the same, but in reverse).
+                if current_affinity_score > *preferred_affinity_score + AffinityScore(1)
+                    || current_attachment_count > *preferred_attachment_count + 1
+                {
+                    tracing::info!(
+                        "Identified optimization: migrate attachment {attached}->{preferred_node} (secondaries {:?})",
+                        self.intent.get_secondary()
+                    );
+                    return Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment {
+                        old_attached_node_id: attached,
+                        new_attached_node_id: *preferred_node,
+                    }));
+                }
+            } else {
+                tracing::debug!(
+                    "Node {} is already preferred (score {:?})",
+                    preferred_node,
+                    preferred_affinity_score
+                );
+            }
+        }
+
+        // Fall-through: we didn't find an optimization
+        None
+    }
+
+    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
+    pub(crate) fn optimize_secondary(
+        &self,
+        scheduler: &Scheduler,
+        schedule_context: &ScheduleContext,
+    ) -> Option<ScheduleOptimization> {
+        if self.intent.secondary.is_empty() {
+            // We can only do useful work if we have both attached and secondary locations: this
+            // function doesn't schedule new locations, only swaps between attached and secondaries.
+            return None;
+        }
+
+        for secondary in self.intent.get_secondary() {
+            let Some(affinity_score) = schedule_context.nodes.get(secondary) else {
+                // We're already on a node unaffected any affinity constraints,
+                // so we won't change it.
+                continue;
+            };
+
+            // Let the scheduler suggest a node, where it would put us if we were scheduling afresh
+            // This implicitly limits the choice to nodes that are available, and prefers nodes
+            // with lower utilization.
+            let Ok(candidate_node) =
+                scheduler.schedule_shard(&self.intent.all_pageservers(), schedule_context)
+            else {
+                // A scheduling error means we have no possible candidate replacements
+                continue;
+            };
+
+            let candidate_affinity_score = schedule_context
+                .nodes
+                .get(&candidate_node)
+                .unwrap_or(&AffinityScore::FREE);
+
+            // The best alternative must be more than 1 better than us, otherwise we could end
+            // up flapping back next time we're called.
+            if *candidate_affinity_score + AffinityScore(1) < *affinity_score {
+                // If some other node is available and has a lower score than this node, then
+                // that other node is a good place to migrate to.
+                tracing::info!(
+                    "Identified optimization: replace secondary {secondary}->{candidate_node} (current secondaries {:?})",
+                    self.intent.get_secondary()
+                );
+                return Some(ScheduleOptimization::ReplaceSecondary(ReplaceSecondary {
+                    old_node_id: *secondary,
+                    new_node_id: candidate_node,
+                }));
+            }
+        }
+
+        None
+    }
+
+    pub(crate) fn apply_optimization(
+        &mut self,
+        scheduler: &mut Scheduler,
+        optimization: ScheduleOptimization,
+    ) {
+        metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_schedule_optimization
+            .inc();
+
+        match optimization {
+            ScheduleOptimization::MigrateAttachment(MigrateAttachment {
+                old_attached_node_id,
+                new_attached_node_id,
+            }) => {
+                self.intent.demote_attached(old_attached_node_id);
+                self.intent
+                    .promote_attached(scheduler, new_attached_node_id);
+            }
+            ScheduleOptimization::ReplaceSecondary(ReplaceSecondary {
+                old_node_id,
+                new_node_id,
+            }) => {
+                self.intent.remove_secondary(scheduler, old_node_id);
+                self.intent.push_secondary(scheduler, new_node_id);
+            }
+        }
+    }
+
    /// Query whether the tenant's observed state for attached node matches its intent state, and if so,
    /// yield the node ID.  This is appropriate for emitting compute hook notifications: we are checking that
    /// the node in question is not only where we intend to attach, but that the tenant is indeed already attached there.
@@ -668,6 +891,19 @@ impl TenantState {
            }
        }

+        // Pre-checks done: finally check whether we may actually do the work
+        match self.scheduling_policy {
+            ShardSchedulingPolicy::Active
+            | ShardSchedulingPolicy::Essential
+            | ShardSchedulingPolicy::Pause => {}
+            ShardSchedulingPolicy::Stop => {
+                // We only reach this point if there is work to do and we're going to skip
+                // doing it: warn it obvious why this tenant isn't doing what it ought to.
+                tracing::warn!("Skipping reconcile for policy {:?}", self.scheduling_policy);
+                return None;
+            }
+        }
+
        // Build list of nodes from which the reconciler should detach
        let mut detach = Vec::new();
        for node_id in self.observed.locations.keys() {
@@ -804,6 +1040,22 @@ impl TenantState {
        })
    }

+    /// Get a waiter for any reconciliation in flight, but do not start reconciliation
+    /// if it is not already running
+    pub(crate) fn get_waiter(&self) -> Option<ReconcilerWaiter> {
+        if self.reconciler.is_some() {
+            Some(ReconcilerWaiter {
+                tenant_shard_id: self.tenant_shard_id,
+                seq_wait: self.waiter.clone(),
+                error_seq_wait: self.error_waiter.clone(),
+                error: self.last_error.clone(),
+                seq: self.sequence,
+            })
+        } else {
+            None
+        }
+    }
+
    /// Called when a ReconcileResult has been emitted and the service is updating
    /// our state: if the result is from a sequence >= my ReconcileHandle, then drop
    /// the handle to indicate there is no longer a reconciliation in progress.
@@ -829,6 +1081,40 @@ impl TenantState {
        debug_assert!(!self.intent.all_pageservers().contains(&node_id));
    }

+    pub(crate) fn set_scheduling_policy(&mut self, p: ShardSchedulingPolicy) {
+        self.scheduling_policy = p;
+    }
+
+    pub(crate) fn get_scheduling_policy(&self) -> &ShardSchedulingPolicy {
+        &self.scheduling_policy
+    }
+
+    pub(crate) fn from_persistent(
+        tsp: TenantShardPersistence,
+        intent: IntentState,
+    ) -> anyhow::Result<Self> {
+        let tenant_shard_id = tsp.get_tenant_shard_id()?;
+        let shard_identity = tsp.get_shard_identity()?;
+
+        Ok(Self {
+            tenant_shard_id,
+            shard: shard_identity,
+            sequence: Sequence::initial(),
+            generation: tsp.generation.map(|g| Generation::new(g as u32)),
+            policy: serde_json::from_str(&tsp.placement_policy).unwrap(),
+            intent,
+            observed: ObservedState::new(),
+            config: serde_json::from_str(&tsp.config).unwrap(),
+            reconciler: None,
+            splitting: tsp.splitting,
+            waiter: Arc::new(SeqWait::new(Sequence::initial())),
+            error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
+            last_error: Arc::default(),
+            pending_compute_notification: false,
+            scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(),
+        })
+    }
+
    pub(crate) fn to_persistent(&self) -> TenantShardPersistence {
        TenantShardPersistence {
            tenant_id: self.tenant_shard_id.tenant_id.to_string(),
@@ -840,6 +1126,7 @@ impl TenantState {
            placement_policy: serde_json::to_string(&self.policy).unwrap(),
            config: serde_json::to_string(&self.config).unwrap(),
            splitting: SplitState::default(),
+            scheduling_policy: serde_json::to_string(&self.scheduling_policy).unwrap(),
        }
    }
 }
@@ -878,6 +1165,32 @@ pub(crate) mod tests {
        )
    }

+    fn make_test_tenant(policy: PlacementPolicy, shard_count: ShardCount) -> Vec<TenantState> {
+        let tenant_id = TenantId::generate();
+
+        (0..shard_count.count())
+            .map(|i| {
+                let shard_number = ShardNumber(i);
+
+                let tenant_shard_id = TenantShardId {
+                    tenant_id,
+                    shard_number,
+                    shard_count,
+                };
+                TenantState::new(
+                    tenant_shard_id,
+                    ShardIdentity::new(
+                        shard_number,
+                        shard_count,
+                        pageserver_api::shard::ShardStripeSize(32768),
+                    )
+                    .unwrap(),
+                    policy.clone(),
+                )
+            })
+            .collect()
+    }
+
    /// Test the scheduling behaviors used when a tenant configured for HA is subject
    /// to nodes being marked offline.
    #[test]
@@ -887,10 +1200,11 @@ pub(crate) mod tests {
        let mut nodes = make_test_nodes(3);

        let mut scheduler = Scheduler::new(nodes.values());
+        let mut context = ScheduleContext::default();

        let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1));
        tenant_state
-            .schedule(&mut scheduler)
+            .schedule(&mut scheduler, &mut context)
            .expect("we have enough nodes, scheduling should work");

        // Expect to initially be schedule on to different nodes
@@ -916,7 +1230,7 @@ pub(crate) mod tests {

        // Scheduling the node should promote the still-available secondary node to attached
        tenant_state
-            .schedule(&mut scheduler)
+            .schedule(&mut scheduler, &mut context)
            .expect("active nodes are available");
        assert_eq!(tenant_state.intent.attached.unwrap(), secondary_node_id);

@@ -980,4 +1294,219 @@ pub(crate) mod tests {
        tenant_state.intent.clear(&mut scheduler);
        Ok(())
    }
+
+    #[test]
+    fn scheduling_mode() -> anyhow::Result<()> {
+        let nodes = make_test_nodes(3);
+        let mut scheduler = Scheduler::new(nodes.values());
+
+        let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1));
+
+        // In pause mode, schedule() shouldn't do anything
+        tenant_state.scheduling_policy = ShardSchedulingPolicy::Pause;
+        assert!(tenant_state
+            .schedule(&mut scheduler, &mut ScheduleContext::default())
+            .is_ok());
+        assert!(tenant_state.intent.all_pageservers().is_empty());
+
+        // In active mode, schedule() works
+        tenant_state.scheduling_policy = ShardSchedulingPolicy::Active;
+        assert!(tenant_state
+            .schedule(&mut scheduler, &mut ScheduleContext::default())
+            .is_ok());
+        assert!(!tenant_state.intent.all_pageservers().is_empty());
+
+        tenant_state.intent.clear(&mut scheduler);
+        Ok(())
+    }
+
+    #[test]
+    fn optimize_attachment() -> anyhow::Result<()> {
+        let nodes = make_test_nodes(3);
+        let mut scheduler = Scheduler::new(nodes.values());
+
+        let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1));
+        let mut shard_b = make_test_tenant_shard(PlacementPolicy::Attached(1));
+
+        // Initially: both nodes attached on shard 1, and both have secondary locations
+        // on different nodes.
+        shard_a.intent.set_attached(&mut scheduler, Some(NodeId(1)));
+        shard_a.intent.push_secondary(&mut scheduler, NodeId(2));
+        shard_b.intent.set_attached(&mut scheduler, Some(NodeId(1)));
+        shard_b.intent.push_secondary(&mut scheduler, NodeId(3));
+
+        let mut schedule_context = ScheduleContext::default();
+        schedule_context.avoid(&shard_a.intent.all_pageservers());
+        schedule_context.push_attached(shard_a.intent.get_attached().unwrap());
+        schedule_context.avoid(&shard_b.intent.all_pageservers());
+        schedule_context.push_attached(shard_b.intent.get_attached().unwrap());
+
+        let optimization_a = shard_a.optimize_attachment(&nodes, &schedule_context);
+
+        // Either shard should recognize that it has the option to switch to a secondary location where there
+        // would be no other shards from the same tenant, and request to do so.
+        assert_eq!(
+            optimization_a,
+            Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment {
+                old_attached_node_id: NodeId(1),
+                new_attached_node_id: NodeId(2)
+            }))
+        );
+
+        // Note that these optimizing two shards in the same tenant with the same ScheduleContext is
+        // mutually exclusive (the optimization of one invalidates the stats) -- it is the responsibility
+        // of [`Service::optimize_all`] to avoid trying
+        // to do optimizations for multiple shards in the same tenant at the same time.  Generating
+        // both optimizations is just done for test purposes
+        let optimization_b = shard_b.optimize_attachment(&nodes, &schedule_context);
+        assert_eq!(
+            optimization_b,
+            Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment {
+                old_attached_node_id: NodeId(1),
+                new_attached_node_id: NodeId(3)
+            }))
+        );
+
+        // Applying these optimizations should result in the end state proposed
+        shard_a.apply_optimization(&mut scheduler, optimization_a.unwrap());
+        assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(2)));
+        assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(1)]);
+        shard_b.apply_optimization(&mut scheduler, optimization_b.unwrap());
+        assert_eq!(shard_b.intent.get_attached(), &Some(NodeId(3)));
+        assert_eq!(shard_b.intent.get_secondary(), &vec![NodeId(1)]);
+
+        shard_a.intent.clear(&mut scheduler);
+        shard_b.intent.clear(&mut scheduler);
+
+        Ok(())
+    }
+
+    #[test]
+    fn optimize_secondary() -> anyhow::Result<()> {
+        let nodes = make_test_nodes(4);
+        let mut scheduler = Scheduler::new(nodes.values());
+
+        let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1));
+        let mut shard_b = make_test_tenant_shard(PlacementPolicy::Attached(1));
+
+        // Initially: both nodes attached on shard 1, and both have secondary locations
+        // on different nodes.
+        shard_a.intent.set_attached(&mut scheduler, Some(NodeId(1)));
+        shard_a.intent.push_secondary(&mut scheduler, NodeId(3));
+        shard_b.intent.set_attached(&mut scheduler, Some(NodeId(2)));
+        shard_b.intent.push_secondary(&mut scheduler, NodeId(3));
+
+        let mut schedule_context = ScheduleContext::default();
+        schedule_context.avoid(&shard_a.intent.all_pageservers());
+        schedule_context.push_attached(shard_a.intent.get_attached().unwrap());
+        schedule_context.avoid(&shard_b.intent.all_pageservers());
+        schedule_context.push_attached(shard_b.intent.get_attached().unwrap());
+
+        let optimization_a = shard_a.optimize_secondary(&scheduler, &schedule_context);
+
+        // Since there is a node with no locations available, the node with two locations for the
+        // same tenant should generate an optimization to move one away
+        assert_eq!(
+            optimization_a,
+            Some(ScheduleOptimization::ReplaceSecondary(ReplaceSecondary {
+                old_node_id: NodeId(3),
+                new_node_id: NodeId(4)
+            }))
+        );
+
+        shard_a.apply_optimization(&mut scheduler, optimization_a.unwrap());
+        assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(1)));
+        assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(4)]);
+
+        shard_a.intent.clear(&mut scheduler);
+        shard_b.intent.clear(&mut scheduler);
+
+        Ok(())
+    }
+
+    // Optimize til quiescent: this emulates what Service::optimize_all does, when
+    // called repeatedly in the background.
+    fn optimize_til_idle(
+        nodes: &HashMap<NodeId, Node>,
+        scheduler: &mut Scheduler,
+        shards: &mut [TenantState],
+    ) {
+        let mut loop_n = 0;
+        loop {
+            let mut schedule_context = ScheduleContext::default();
+            let mut any_changed = false;
+
+            for shard in shards.iter() {
+                schedule_context.avoid(&shard.intent.all_pageservers());
+                if let Some(attached) = shard.intent.get_attached() {
+                    schedule_context.push_attached(*attached);
+                }
+            }
+
+            for shard in shards.iter_mut() {
+                let optimization = shard.optimize_attachment(nodes, &schedule_context);
+                if let Some(optimization) = optimization {
+                    shard.apply_optimization(scheduler, optimization);
+                    any_changed = true;
+                    break;
+                }
+
+                let optimization = shard.optimize_secondary(scheduler, &schedule_context);
+                if let Some(optimization) = optimization {
+                    shard.apply_optimization(scheduler, optimization);
+                    any_changed = true;
+                    break;
+                }
+            }
+
+            if !any_changed {
+                break;
+            }
+
+            // Assert no infinite loop
+            loop_n += 1;
+            assert!(loop_n < 1000);
+        }
+    }
+
+    /// Test the balancing behavior of shard scheduling: that it achieves a balance, and
+    /// that it converges.
+    #[test]
+    fn optimize_add_nodes() -> anyhow::Result<()> {
+        let nodes = make_test_nodes(4);
+
+        // Only show the scheduler a couple of nodes
+        let mut scheduler = Scheduler::new([].iter());
+        scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap());
+        scheduler.node_upsert(nodes.get(&NodeId(2)).unwrap());
+
+        let mut shards = make_test_tenant(PlacementPolicy::Attached(1), ShardCount::new(4));
+        let mut schedule_context = ScheduleContext::default();
+        for shard in &mut shards {
+            assert!(shard
+                .schedule(&mut scheduler, &mut schedule_context)
+                .is_ok());
+        }
+
+        // We should see equal number of locations on the two nodes.
+        assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 4);
+        assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 4);
+
+        // Add another two nodes: we should see the shards spread out when their optimize
+        // methods are called
+        scheduler.node_upsert(nodes.get(&NodeId(3)).unwrap());
+        scheduler.node_upsert(nodes.get(&NodeId(4)).unwrap());
+        optimize_til_idle(&nodes, &mut scheduler, &mut shards);
+
+        assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 2);
+        assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 2);
+        assert_eq!(scheduler.get_node_shard_count(NodeId(3)), 2);
+        assert_eq!(scheduler.get_node_shard_count(NodeId(4)), 2);
+
+        for shard in shards.iter_mut() {
+            shard.intent.clear(&mut scheduler);
+        }
+
+        Ok(())
+    }
 }
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -294,7 +294,7 @@ where
    //      is in state 'taken' but the thread that would unlock it is
    //      not there.
    //   2. A rust object that represented some external resource in the
-    //      parent now got implicitly copied by the the fork, even though
+    //      parent now got implicitly copied by the fork, even though
    //      the object's type is not `Copy`. The parent program may use
    //      non-copyability as way to enforce unique ownership of an
    //      external resource in the typesystem. The fork breaks that
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -14,9 +14,7 @@ use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
 use control_plane::safekeeper::SafekeeperNode;
 use control_plane::storage_controller::StorageController;
 use control_plane::{broker, local_env};
-use pageserver_api::controller_api::{
-    NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy, PlacementPolicy,
-};
+use pageserver_api::controller_api::PlacementPolicy;
 use pageserver_api::models::{
    ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
 };
@@ -1060,21 +1058,6 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
            }
        }

-        Some(("set-state", subcommand_args)) => {
-            let pageserver = get_pageserver(env, subcommand_args)?;
-            let scheduling = subcommand_args.get_one("scheduling");
-            let availability = subcommand_args.get_one("availability");
-
-            let storage_controller = StorageController::from_env(env);
-            storage_controller
-                .node_configure(NodeConfigureRequest {
-                    node_id: pageserver.conf.id,
-                    scheduling: scheduling.cloned(),
-                    availability: availability.cloned(),
-                })
-                .await?;
-        }
-
        Some(("status", subcommand_args)) => {
            match get_pageserver(env, subcommand_args)?.check_status().await {
                Ok(_) => println!("Page server is up and running"),
@@ -1515,12 +1498,6 @@ fn cli() -> Command {
                    .about("Restart local pageserver")
                    .arg(pageserver_config_args.clone())
                )
-                .subcommand(Command::new("set-state")
-                    .arg(Arg::new("availability").value_parser(value_parser!(NodeAvailability)).long("availability").action(ArgAction::Set).help("Availability state: offline,active"))
-                    .arg(Arg::new("scheduling").value_parser(value_parser!(NodeSchedulingPolicy)).long("scheduling").action(ArgAction::Set).help("Scheduling state: draining,pause,filling,active"))
-                    .about("Set scheduling or availability state of pageserver node")
-                    .arg(pageserver_config_args.clone())
-                )
        )
        .subcommand(
            Command::new("storage_controller")
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -12,7 +12,7 @@
 //!
 //! The endpoint is managed by the `compute_ctl` binary. When an endpoint is
 //! started, we launch `compute_ctl` It synchronizes the safekeepers, downloads
-//! the basebackup from the pageserver to initialize the the data directory, and
+//! the basebackup from the pageserver to initialize the data directory, and
 //! finally launches the PostgreSQL process. It watches the PostgreSQL process
 //! until it exits.
 //!
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -389,6 +389,10 @@ impl PageServerNode {
                .remove("image_creation_threshold")
                .map(|x| x.parse::<usize>())
                .transpose()?,
+            image_layer_creation_check_threshold: settings
+                .remove("image_layer_creation_check_threshold")
+                .map(|x| x.parse::<u8>())
+                .transpose()?,
            pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
            walreceiver_connect_timeout: settings
                .remove("walreceiver_connect_timeout")
@@ -501,6 +505,12 @@ impl PageServerNode {
                    .map(|x| x.parse::<usize>())
                    .transpose()
                    .context("Failed to parse 'image_creation_threshold' as non zero integer")?,
+                image_layer_creation_check_threshold: settings
+                    .remove("image_layer_creation_check_threshold")
+                    .map(|x| x.parse::<u8>())
+                    .transpose()
+                    .context("Failed to parse 'image_creation_check_threshold' as integer")?,
+
                pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
                walreceiver_connect_timeout: settings
                    .remove("walreceiver_connect_timeout")
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -279,6 +279,7 @@ impl StorageController {
            &self.listen,
            "-p",
            self.path.as_ref(),
+            "--dev",
            "--database-url",
            &database_url,
            "--max-unavailable-interval",
--- a/control_plane/storcon_cli/Cargo.toml
+++ b/control_plane/storcon_cli/Cargo.toml
@@ -0,0 +1,23 @@
+[package]
+name = "storcon_cli"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+
+[dependencies]
+anyhow.workspace = true
+clap.workspace = true
+comfy-table.workspace = true
+hyper.workspace = true
+pageserver_api.workspace = true
+pageserver_client.workspace = true
+reqwest.workspace = true
+serde.workspace = true
+serde_json = { workspace = true, features = ["raw_value"] }
+thiserror.workspace = true
+tokio.workspace = true
+tracing.workspace = true
+utils.workspace = true
+workspace_hack.workspace = true
+
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -0,0 +1,587 @@
+use std::{collections::HashMap, str::FromStr};
+
+use clap::{Parser, Subcommand};
+use hyper::Method;
+use pageserver_api::{
+    controller_api::{
+        NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy,
+        TenantDescribeResponse, TenantPolicyRequest,
+    },
+    models::{
+        ShardParameters, TenantConfig, TenantConfigRequest, TenantCreateRequest,
+        TenantShardSplitRequest, TenantShardSplitResponse,
+    },
+    shard::{ShardStripeSize, TenantShardId},
+};
+use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt};
+use reqwest::Url;
+use serde::{de::DeserializeOwned, Serialize};
+use utils::id::{NodeId, TenantId};
+
+use pageserver_api::controller_api::{
+    NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
+    TenantLocateResponse, TenantShardMigrateRequest, TenantShardMigrateResponse,
+};
+
+#[derive(Subcommand, Debug)]
+enum Command {
+    /// Register a pageserver with the storage controller.  This shouldn't usually be necessary,
+    /// since pageservers auto-register when they start up
+    NodeRegister {
+        #[arg(long)]
+        node_id: NodeId,
+
+        #[arg(long)]
+        listen_pg_addr: String,
+        #[arg(long)]
+        listen_pg_port: u16,
+
+        #[arg(long)]
+        listen_http_addr: String,
+        #[arg(long)]
+        listen_http_port: u16,
+    },
+
+    /// Modify a node's configuration in the storage controller
+    NodeConfigure {
+        #[arg(long)]
+        node_id: NodeId,
+
+        /// Availability is usually auto-detected based on heartbeats.  Set 'offline' here to
+        /// manually mark a node offline
+        #[arg(long)]
+        availability: Option<NodeAvailabilityArg>,
+        /// Scheduling policy controls whether tenant shards may be scheduled onto this node.
+        #[arg(long)]
+        scheduling: Option<NodeSchedulingPolicy>,
+    },
+    /// Modify a tenant's policies in the storage controller
+    TenantPolicy {
+        #[arg(long)]
+        tenant_id: TenantId,
+        /// Placement policy controls whether a tenant is `detached`, has only a secondary location (`secondary`),
+        /// or is in the normal attached state with N secondary locations (`attached:N`)
+        #[arg(long)]
+        placement: Option<PlacementPolicyArg>,
+        /// Scheduling policy enables pausing the controller's scheduling activity involving this tenant.  `active` is normal,
+        /// `essential` disables optimization scheduling changes, `pause` disables all scheduling changes, and `stop` prevents
+        /// all reconciliation activity including for scheduling changes already made.  `pause` and `stop` can make a tenant
+        /// unavailable, and are only for use in emergencies.
+        #[arg(long)]
+        scheduling: Option<ShardSchedulingPolicyArg>,
+    },
+    /// List nodes known to the storage controller
+    Nodes {},
+    /// List tenants known to the storage controller
+    Tenants {},
+    /// Create a new tenant in the storage controller, and by extension on pageservers.
+    TenantCreate {
+        #[arg(long)]
+        tenant_id: TenantId,
+    },
+    /// Delete a tenant in the storage controller, and by extension on pageservers.
+    TenantDelete {
+        #[arg(long)]
+        tenant_id: TenantId,
+    },
+    /// Split an existing tenant into a higher number of shards than its current shard count.
+    TenantShardSplit {
+        #[arg(long)]
+        tenant_id: TenantId,
+        #[arg(long)]
+        shard_count: u8,
+        /// Optional, in 8kiB pages.  e.g. set 2048 for 16MB stripes.
+        #[arg(long)]
+        stripe_size: Option<u32>,
+    },
+    /// Migrate the attached location for a tenant shard to a specific pageserver.
+    TenantShardMigrate {
+        #[arg(long)]
+        tenant_shard_id: TenantShardId,
+        #[arg(long)]
+        node: NodeId,
+    },
+    /// Modify the pageserver tenant configuration of a tenant: this is the configuration structure
+    /// that is passed through to pageservers, and does not affect storage controller behavior.
+    TenantConfig {
+        #[arg(long)]
+        tenant_id: TenantId,
+        #[arg(long)]
+        config: String,
+    },
+    /// Attempt to balance the locations for a tenant across pageservers.  This is a client-side
+    /// alternative to the storage controller's scheduling optimization behavior.
+    TenantScatter {
+        #[arg(long)]
+        tenant_id: TenantId,
+    },
+    /// Print details about a particular tenant, including all its shards' states.
+    TenantDescribe {
+        #[arg(long)]
+        tenant_id: TenantId,
+    },
+}
+
+#[derive(Parser)]
+#[command(
+    author,
+    version,
+    about,
+    long_about = "CLI for Storage Controller Support/Debug"
+)]
+#[command(arg_required_else_help(true))]
+struct Cli {
+    #[arg(long)]
+    /// URL to storage controller.  e.g. http://127.0.0.1:1234 when using `neon_local`
+    api: Url,
+
+    #[arg(long)]
+    /// JWT token for authenticating with storage controller.  Depending on the API used, this
+    /// should have either `pageserverapi` or `admin` scopes: for convenience, you should mint
+    /// a token with both scopes to use with this tool.
+    jwt: Option<String>,
+
+    #[command(subcommand)]
+    command: Command,
+}
+
+#[derive(Debug, Clone)]
+struct PlacementPolicyArg(PlacementPolicy);
+
+impl FromStr for PlacementPolicyArg {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s {
+            "detached" => Ok(Self(PlacementPolicy::Detached)),
+            "secondary" => Ok(Self(PlacementPolicy::Secondary)),
+            _ if s.starts_with("attached:") => {
+                let mut splitter = s.split(':');
+                let _prefix = splitter.next().unwrap();
+                match splitter.next().and_then(|s| s.parse::<usize>().ok()) {
+                    Some(n) => Ok(Self(PlacementPolicy::Attached(n))),
+                    None => Err(anyhow::anyhow!(
+                        "Invalid format '{s}', a valid example is 'attached:1'"
+                    )),
+                }
+            }
+            _ => Err(anyhow::anyhow!(
+                "Unknown placement policy '{s}', try detached,secondary,attached:<n>"
+            )),
+        }
+    }
+}
+
+#[derive(Debug, Clone)]
+struct ShardSchedulingPolicyArg(ShardSchedulingPolicy);
+
+impl FromStr for ShardSchedulingPolicyArg {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s {
+            "active" => Ok(Self(ShardSchedulingPolicy::Active)),
+            "essential" => Ok(Self(ShardSchedulingPolicy::Essential)),
+            "pause" => Ok(Self(ShardSchedulingPolicy::Pause)),
+            "stop" => Ok(Self(ShardSchedulingPolicy::Stop)),
+            _ => Err(anyhow::anyhow!(
+                "Unknown scheduling policy '{s}', try active,essential,pause,stop"
+            )),
+        }
+    }
+}
+
+#[derive(Debug, Clone)]
+struct NodeAvailabilityArg(NodeAvailabilityWrapper);
+
+impl FromStr for NodeAvailabilityArg {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s {
+            "active" => Ok(Self(NodeAvailabilityWrapper::Active)),
+            "offline" => Ok(Self(NodeAvailabilityWrapper::Offline)),
+            _ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
+        }
+    }
+}
+
+struct Client {
+    base_url: Url,
+    jwt_token: Option<String>,
+    client: reqwest::Client,
+}
+
+impl Client {
+    fn new(base_url: Url, jwt_token: Option<String>) -> Self {
+        Self {
+            base_url,
+            jwt_token,
+            client: reqwest::ClientBuilder::new()
+                .build()
+                .expect("Failed to construct http client"),
+        }
+    }
+
+    /// Simple HTTP request wrapper for calling into attachment service
+    async fn dispatch<RQ, RS>(
+        &self,
+        method: hyper::Method,
+        path: String,
+        body: Option<RQ>,
+    ) -> mgmt_api::Result<RS>
+    where
+        RQ: Serialize + Sized,
+        RS: DeserializeOwned + Sized,
+    {
+        // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
+        // for general purpose API access.
+        let url = Url::from_str(&format!(
+            "http://{}:{}/{path}",
+            self.base_url.host_str().unwrap(),
+            self.base_url.port().unwrap()
+        ))
+        .unwrap();
+
+        let mut builder = self.client.request(method, url);
+        if let Some(body) = body {
+            builder = builder.json(&body)
+        }
+        if let Some(jwt_token) = &self.jwt_token {
+            builder = builder.header(
+                reqwest::header::AUTHORIZATION,
+                format!("Bearer {jwt_token}"),
+            );
+        }
+
+        let response = builder.send().await.map_err(mgmt_api::Error::ReceiveBody)?;
+        let response = response.error_from_body().await?;
+
+        response
+            .json()
+            .await
+            .map_err(pageserver_client::mgmt_api::Error::ReceiveBody)
+    }
+}
+
+#[tokio::main]
+async fn main() -> anyhow::Result<()> {
+    let cli = Cli::parse();
+
+    let storcon_client = Client::new(cli.api.clone(), cli.jwt.clone());
+
+    let mut trimmed = cli.api.to_string();
+    trimmed.pop();
+    let vps_client = mgmt_api::Client::new(trimmed, cli.jwt.as_deref());
+
+    match cli.command {
+        Command::NodeRegister {
+            node_id,
+            listen_pg_addr,
+            listen_pg_port,
+            listen_http_addr,
+            listen_http_port,
+        } => {
+            storcon_client
+                .dispatch::<_, ()>(
+                    Method::POST,
+                    "control/v1/node".to_string(),
+                    Some(NodeRegisterRequest {
+                        node_id,
+                        listen_pg_addr,
+                        listen_pg_port,
+                        listen_http_addr,
+                        listen_http_port,
+                    }),
+                )
+                .await?;
+        }
+        Command::TenantCreate { tenant_id } => {
+            vps_client
+                .tenant_create(&TenantCreateRequest {
+                    new_tenant_id: TenantShardId::unsharded(tenant_id),
+                    generation: None,
+                    shard_parameters: ShardParameters::default(),
+                    placement_policy: Some(PlacementPolicy::Attached(1)),
+                    config: TenantConfig::default(),
+                })
+                .await?;
+        }
+        Command::TenantDelete { tenant_id } => {
+            let status = vps_client
+                .tenant_delete(TenantShardId::unsharded(tenant_id))
+                .await?;
+            tracing::info!("Delete status: {}", status);
+        }
+        Command::Nodes {} => {
+            let resp = storcon_client
+                .dispatch::<(), Vec<NodeDescribeResponse>>(
+                    Method::GET,
+                    "control/v1/node".to_string(),
+                    None,
+                )
+                .await?;
+            let mut table = comfy_table::Table::new();
+            table.set_header(["Id", "Hostname", "Scheduling", "Availability"]);
+            for node in resp {
+                table.add_row([
+                    format!("{}", node.id),
+                    node.listen_http_addr,
+                    format!("{:?}", node.scheduling),
+                    format!("{:?}", node.availability),
+                ]);
+            }
+            println!("{table}");
+        }
+        Command::NodeConfigure {
+            node_id,
+            availability,
+            scheduling,
+        } => {
+            let req = NodeConfigureRequest {
+                node_id,
+                availability: availability.map(|a| a.0),
+                scheduling,
+            };
+            storcon_client
+                .dispatch::<_, ()>(
+                    Method::PUT,
+                    format!("control/v1/node/{node_id}/config"),
+                    Some(req),
+                )
+                .await?;
+        }
+        Command::Tenants {} => {
+            let resp = storcon_client
+                .dispatch::<(), Vec<TenantDescribeResponse>>(
+                    Method::GET,
+                    "control/v1/tenant".to_string(),
+                    None,
+                )
+                .await?;
+            let mut table = comfy_table::Table::new();
+            table.set_header([
+                "TenantId",
+                "ShardCount",
+                "StripeSize",
+                "Placement",
+                "Scheduling",
+            ]);
+            for tenant in resp {
+                let shard_zero = tenant.shards.into_iter().next().unwrap();
+                table.add_row([
+                    format!("{}", tenant.tenant_id),
+                    format!("{}", shard_zero.tenant_shard_id.shard_count.literal()),
+                    format!("{:?}", tenant.stripe_size),
+                    format!("{:?}", tenant.policy),
+                    format!("{:?}", shard_zero.scheduling_policy),
+                ]);
+            }
+
+            println!("{table}");
+        }
+        Command::TenantPolicy {
+            tenant_id,
+            placement,
+            scheduling,
+        } => {
+            let req = TenantPolicyRequest {
+                scheduling: scheduling.map(|s| s.0),
+                placement: placement.map(|p| p.0),
+            };
+            storcon_client
+                .dispatch::<_, ()>(
+                    Method::PUT,
+                    format!("control/v1/tenant/{tenant_id}/policy"),
+                    Some(req),
+                )
+                .await?;
+        }
+        Command::TenantShardSplit {
+            tenant_id,
+            shard_count,
+            stripe_size,
+        } => {
+            let req = TenantShardSplitRequest {
+                new_shard_count: shard_count,
+                new_stripe_size: stripe_size.map(ShardStripeSize),
+            };
+
+            let response = storcon_client
+                .dispatch::<TenantShardSplitRequest, TenantShardSplitResponse>(
+                    Method::PUT,
+                    format!("control/v1/tenant/{tenant_id}/shard_split"),
+                    Some(req),
+                )
+                .await?;
+            println!(
+                "Split tenant {} into {} shards: {}",
+                tenant_id,
+                shard_count,
+                response
+                    .new_shards
+                    .iter()
+                    .map(|s| format!("{:?}", s))
+                    .collect::<Vec<_>>()
+                    .join(",")
+            );
+        }
+        Command::TenantShardMigrate {
+            tenant_shard_id,
+            node,
+        } => {
+            let req = TenantShardMigrateRequest {
+                tenant_shard_id,
+                node_id: node,
+            };
+
+            storcon_client
+                .dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
+                    Method::PUT,
+                    format!("control/v1/tenant/{tenant_shard_id}/migrate"),
+                    Some(req),
+                )
+                .await?;
+        }
+        Command::TenantConfig { tenant_id, config } => {
+            let tenant_conf = serde_json::from_str(&config)?;
+
+            vps_client
+                .tenant_config(&TenantConfigRequest {
+                    tenant_id,
+                    config: tenant_conf,
+                })
+                .await?;
+        }
+        Command::TenantScatter { tenant_id } => {
+            // Find the shards
+            let locate_response = storcon_client
+                .dispatch::<(), TenantLocateResponse>(
+                    Method::GET,
+                    format!("control/v1/tenant/{tenant_id}/locate"),
+                    None,
+                )
+                .await?;
+            let shards = locate_response.shards;
+
+            let mut node_to_shards: HashMap<NodeId, Vec<TenantShardId>> = HashMap::new();
+            let shard_count = shards.len();
+            for s in shards {
+                let entry = node_to_shards.entry(s.node_id).or_default();
+                entry.push(s.shard_id);
+            }
+
+            // Load list of available nodes
+            let nodes_resp = storcon_client
+                .dispatch::<(), Vec<NodeDescribeResponse>>(
+                    Method::GET,
+                    "control/v1/node".to_string(),
+                    None,
+                )
+                .await?;
+
+            for node in nodes_resp {
+                if matches!(node.availability, NodeAvailabilityWrapper::Active) {
+                    node_to_shards.entry(node.id).or_default();
+                }
+            }
+
+            let max_shard_per_node = shard_count / node_to_shards.len();
+
+            loop {
+                let mut migrate_shard = None;
+                for shards in node_to_shards.values_mut() {
+                    if shards.len() > max_shard_per_node {
+                        // Pick the emptiest
+                        migrate_shard = Some(shards.pop().unwrap());
+                    }
+                }
+                let Some(migrate_shard) = migrate_shard else {
+                    break;
+                };
+
+                // Pick the emptiest node to migrate to
+                let mut destinations = node_to_shards
+                    .iter()
+                    .map(|(k, v)| (k, v.len()))
+                    .collect::<Vec<_>>();
+                destinations.sort_by_key(|i| i.1);
+                let (destination_node, destination_count) = *destinations.first().unwrap();
+                if destination_count + 1 > max_shard_per_node {
+                    // Even the emptiest destination doesn't have space: we're done
+                    break;
+                }
+                let destination_node = *destination_node;
+
+                node_to_shards
+                    .get_mut(&destination_node)
+                    .unwrap()
+                    .push(migrate_shard);
+
+                println!("Migrate {} -> {} ...", migrate_shard, destination_node);
+
+                storcon_client
+                    .dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
+                        Method::PUT,
+                        format!("control/v1/tenant/{migrate_shard}/migrate"),
+                        Some(TenantShardMigrateRequest {
+                            tenant_shard_id: migrate_shard,
+                            node_id: destination_node,
+                        }),
+                    )
+                    .await?;
+                println!("Migrate {} -> {} OK", migrate_shard, destination_node);
+            }
+
+            // Spread the shards across the nodes
+        }
+        Command::TenantDescribe { tenant_id } => {
+            let describe_response = storcon_client
+                .dispatch::<(), TenantDescribeResponse>(
+                    Method::GET,
+                    format!("control/v1/tenant/{tenant_id}"),
+                    None,
+                )
+                .await?;
+            let shards = describe_response.shards;
+            let mut table = comfy_table::Table::new();
+            table.set_header(["Shard", "Attached", "Secondary", "Last error", "status"]);
+            for shard in shards {
+                let secondary = shard
+                    .node_secondary
+                    .iter()
+                    .map(|n| format!("{}", n))
+                    .collect::<Vec<_>>()
+                    .join(",");
+
+                let mut status_parts = Vec::new();
+                if shard.is_reconciling {
+                    status_parts.push("reconciling");
+                }
+
+                if shard.is_pending_compute_notification {
+                    status_parts.push("pending_compute");
+                }
+
+                if shard.is_splitting {
+                    status_parts.push("splitting");
+                }
+                let status = status_parts.join(",");
+
+                table.add_row([
+                    format!("{}", shard.tenant_shard_id),
+                    shard
+                        .node_attached
+                        .map(|n| format!("{}", n))
+                        .unwrap_or(String::new()),
+                    secondary,
+                    shard.last_error,
+                    status,
+                ]);
+            }
+            println!("{table}");
+        }
+    }
+
+    Ok(())
+}
--- a/libs/metrics/src/hll.rs
+++ b/libs/metrics/src/hll.rs
@@ -40,7 +40,7 @@ macro_rules! register_hll {
    }};

    ($N:literal, $NAME:expr, $HELP:expr $(,)?) => {{
-        $crate::register_hll!($N, $crate::opts!($NAME, $HELP), $LABELS_NAMES)
+        $crate::register_hll!($N, $crate::opts!($NAME, $HELP))
    }};
 }

--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -4,7 +4,7 @@ use std::str::FromStr;
 /// API (`/control/v1` prefix).  Implemented by the server
 /// in [`attachment_service::http`]
 use serde::{Deserialize, Serialize};
-use utils::id::NodeId;
+use utils::id::{NodeId, TenantId};

 use crate::{
    models::{ShardParameters, TenantConfig},
@@ -42,6 +42,12 @@ pub struct NodeConfigureRequest {
    pub scheduling: Option<NodeSchedulingPolicy>,
 }

+#[derive(Serialize, Deserialize)]
+pub struct TenantPolicyRequest {
+    pub placement: Option<PlacementPolicy>,
+    pub scheduling: Option<ShardSchedulingPolicy>,
+}
+
 #[derive(Serialize, Deserialize, Debug)]
 pub struct TenantLocateResponseShard {
    pub shard_id: TenantShardId,
@@ -62,12 +68,27 @@ pub struct TenantLocateResponse {

 #[derive(Serialize, Deserialize)]
 pub struct TenantDescribeResponse {
+    pub tenant_id: TenantId,
    pub shards: Vec<TenantDescribeResponseShard>,
    pub stripe_size: ShardStripeSize,
    pub policy: PlacementPolicy,
    pub config: TenantConfig,
 }

+#[derive(Serialize, Deserialize)]
+pub struct NodeDescribeResponse {
+    pub id: NodeId,
+
+    pub availability: NodeAvailabilityWrapper,
+    pub scheduling: NodeSchedulingPolicy,
+
+    pub listen_http_addr: String,
+    pub listen_http_port: u16,
+
+    pub listen_pg_addr: String,
+    pub listen_pg_port: u16,
+}
+
 #[derive(Serialize, Deserialize)]
 pub struct TenantDescribeResponseShard {
    pub tenant_shard_id: TenantShardId,
@@ -83,6 +104,8 @@ pub struct TenantDescribeResponseShard {
    pub is_pending_compute_notification: bool,
    /// A shard split is currently underway
    pub is_splitting: bool,
+
+    pub scheduling_policy: ShardSchedulingPolicy,
 }

 /// Explicitly migrating a particular shard is a low level operation
@@ -97,7 +120,7 @@ pub struct TenantShardMigrateRequest {
 /// Utilisation score indicating how good a candidate a pageserver
 /// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`].
 /// Lower values are better.
-#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
+#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Debug)]
 pub struct UtilizationScore(pub u64);

 impl UtilizationScore {
@@ -106,7 +129,7 @@ impl UtilizationScore {
    }
 }

-#[derive(Serialize, Clone, Copy)]
+#[derive(Serialize, Deserialize, Clone, Copy, Debug)]
 #[serde(into = "NodeAvailabilityWrapper")]
 pub enum NodeAvailability {
    // Normal, happy state
@@ -129,7 +152,7 @@ impl Eq for NodeAvailability {}
 // This wrapper provides serde functionality and it should only be used to
 // communicate with external callers which don't know or care about the
 // utilisation score of the pageserver it is targeting.
-#[derive(Serialize, Deserialize, Clone)]
+#[derive(Serialize, Deserialize, Clone, Copy, Debug)]
 pub enum NodeAvailabilityWrapper {
    Active,
    Offline,
@@ -155,22 +178,33 @@ impl From<NodeAvailability> for NodeAvailabilityWrapper {
    }
 }

-impl FromStr for NodeAvailability {
-    type Err = anyhow::Error;
+#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
+pub enum ShardSchedulingPolicy {
+    // Normal mode: the tenant's scheduled locations may be updated at will, including
+    // for non-essential optimization.
+    Active,

-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        match s {
-            // This is used when parsing node configuration requests from neon-local.
-            // Assume the worst possible utilisation score
-            // and let it get updated via the heartbeats.
-            "active" => Ok(Self::Active(UtilizationScore::worst())),
-            "offline" => Ok(Self::Offline),
-            _ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
-        }
+    // Disable optimizations, but permit scheduling when necessary to fulfil the PlacementPolicy.
+    // For example, this still permits a node's attachment location to change to a secondary in
+    // response to a node failure, or to assign a new secondary if a node was removed.
+    Essential,
+
+    // No scheduling: leave the shard running wherever it currently is.  Even if the shard is
+    // unavailable, it will not be rescheduled to another node.
+    Pause,
+
+    // No reconciling: we will make no location_conf API calls to pageservers at all.  If the
+    // shard is unavailable, it stays that way.  If a node fails, this shard doesn't get failed over.
+    Stop,
+}
+
+impl Default for ShardSchedulingPolicy {
+    fn default() -> Self {
+        Self::Active
    }
 }

-#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
+#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
 pub enum NodeSchedulingPolicy {
    Active,
    Filling,
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -301,6 +301,7 @@ pub struct TenantConfig {
    pub heatmap_period: Option<String>,
    pub lazy_slru_download: Option<bool>,
    pub timeline_get_throttle: Option<ThrottleConfig>,
+    pub image_layer_creation_check_threshold: Option<u8>,
 }

 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
--- a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs
+++ b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs
@@ -1,5 +1,6 @@
 use anyhow::*;
 use clap::{value_parser, Arg, ArgMatches, Command};
+use postgres::Client;
 use std::{path::PathBuf, str::FromStr};
 use wal_craft::*;

@@ -8,8 +9,8 @@ fn main() -> Result<()> {
        .init();
    let arg_matches = cli().get_matches();

-    let wal_craft = |arg_matches: &ArgMatches, client| {
-        let (intermediate_lsns, end_of_wal_lsn) = match arg_matches
+    let wal_craft = |arg_matches: &ArgMatches, client: &mut Client| {
+        let intermediate_lsns = match arg_matches
            .get_one::<String>("type")
            .map(|s| s.as_str())
            .context("'type' is required")?
@@ -25,6 +26,7 @@ fn main() -> Result<()> {
            LastWalRecordCrossingSegment::NAME => LastWalRecordCrossingSegment::craft(client)?,
            a => panic!("Unknown --type argument: {a}"),
        };
+        let end_of_wal_lsn = client.pg_current_wal_insert_lsn()?;
        for lsn in intermediate_lsns {
            println!("intermediate_lsn = {lsn}");
        }
--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -5,7 +5,6 @@ use postgres::types::PgLsn;
 use postgres::Client;
 use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
 use postgres_ffi::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD};
-use std::cmp::Ordering;
 use std::path::{Path, PathBuf};
 use std::process::Command;
 use std::time::{Duration, Instant};
@@ -232,59 +231,52 @@ pub fn ensure_server_config(client: &mut impl postgres::GenericClient) -> anyhow
 pub trait Crafter {
    const NAME: &'static str;

-    /// Generates WAL using the client `client`. Returns a pair of:
-    /// * A vector of some valid "interesting" intermediate LSNs which one may start reading from.
-    ///   May include or exclude Lsn(0) and the end-of-wal.
-    /// * The expected end-of-wal LSN.
-    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)>;
+    /// Generates WAL using the client `client`. Returns a vector of some valid
+    /// "interesting" intermediate LSNs which one may start reading from.
+    /// test_end_of_wal uses this to check various starting points.
+    ///
+    /// Note that postgres is generally keen about writing some WAL. While we
+    /// try to disable it (autovacuum, big wal_writer_delay, etc) it is always
+    /// possible, e.g. xl_running_xacts are dumped each 15s. So checks about
+    /// stable WAL end would be flaky unless postgres is shut down. For this
+    /// reason returning potential end of WAL here is pointless. Most of the
+    /// time this doesn't happen though, so it is reasonable to create needed
+    /// WAL structure and immediately kill postgres like test_end_of_wal does.
+    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>>;
 }

+/// Wraps some WAL craft function, providing current LSN to it before the
+/// insertion and flushing WAL afterwards. Also pushes initial LSN to the
+/// result.
 fn craft_internal<C: postgres::GenericClient>(
    client: &mut C,
-    f: impl Fn(&mut C, PgLsn) -> anyhow::Result<(Vec<PgLsn>, Option<PgLsn>)>,
-) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
+    f: impl Fn(&mut C, PgLsn) -> anyhow::Result<Vec<PgLsn>>,
+) -> anyhow::Result<Vec<PgLsn>> {
    ensure_server_config(client)?;

    let initial_lsn = client.pg_current_wal_insert_lsn()?;
    info!("LSN initial = {}", initial_lsn);

-    let (mut intermediate_lsns, last_lsn) = f(client, initial_lsn)?;
-    let last_lsn = match last_lsn {
-        None => client.pg_current_wal_insert_lsn()?,
-        Some(last_lsn) => {
-            let insert_lsn = client.pg_current_wal_insert_lsn()?;
-            match last_lsn.cmp(&insert_lsn) {
-                Ordering::Less => bail!(
-                    "Some records were inserted after the crafted WAL: {} vs {}",
-                    last_lsn,
-                    insert_lsn
-                ),
-                Ordering::Equal => last_lsn,
-                Ordering::Greater => bail!("Reported LSN is greater than insert_lsn"),
-            }
-        }
-    };
+    let mut intermediate_lsns = f(client, initial_lsn)?;
    if !intermediate_lsns.starts_with(&[initial_lsn]) {
        intermediate_lsns.insert(0, initial_lsn);
    }

    // Some records may be not flushed, e.g. non-transactional logical messages.
+    //
+    // Note: this is broken if pg_current_wal_insert_lsn is at page boundary
+    // because pg_current_wal_insert_lsn skips page headers.
    client.execute("select neon_xlogflush(pg_current_wal_insert_lsn())", &[])?;
-    match last_lsn.cmp(&client.pg_current_wal_flush_lsn()?) {
-        Ordering::Less => bail!("Some records were flushed after the crafted WAL"),
-        Ordering::Equal => {}
-        Ordering::Greater => bail!("Reported LSN is greater than flush_lsn"),
-    }
-    Ok((intermediate_lsns, last_lsn))
+    Ok(intermediate_lsns)
 }

 pub struct Simple;
 impl Crafter for Simple {
    const NAME: &'static str = "simple";
-    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
+    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
        craft_internal(client, |client, _| {
            client.execute("CREATE table t(x int)", &[])?;
-            Ok((Vec::new(), None))
+            Ok(Vec::new())
        })
    }
 }
@@ -292,29 +284,36 @@ impl Crafter for Simple {
 pub struct LastWalRecordXlogSwitch;
 impl Crafter for LastWalRecordXlogSwitch {
    const NAME: &'static str = "last_wal_record_xlog_switch";
-    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
-        // Do not use generate_internal because here we end up with flush_lsn exactly on
+    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
+        // Do not use craft_internal because here we end up with flush_lsn exactly on
        // the segment boundary and insert_lsn after the initial page header, which is unusual.
        ensure_server_config(client)?;

        client.execute("CREATE table t(x int)", &[])?;
        let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
-        let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
-        let next_segment = PgLsn::from(0x0200_0000);
+        // pg_switch_wal returns end of last record of the switched segment,
+        // i.e. end of SWITCH itself.
+        let xlog_switch_record_end: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
+        let before_xlog_switch_u64 = u64::from(before_xlog_switch);
+        let next_segment = PgLsn::from(
+            before_xlog_switch_u64 - (before_xlog_switch_u64 % WAL_SEGMENT_SIZE as u64)
+                + WAL_SEGMENT_SIZE as u64,
+        );
        ensure!(
-            after_xlog_switch <= next_segment,
-            "XLOG_SWITCH message ended after the expected segment boundary: {} > {}",
-            after_xlog_switch,
+            xlog_switch_record_end <= next_segment,
+            "XLOG_SWITCH record ended after the expected segment boundary: {} > {}",
+            xlog_switch_record_end,
            next_segment
        );
-        Ok((vec![before_xlog_switch, after_xlog_switch], next_segment))
+        Ok(vec![before_xlog_switch, xlog_switch_record_end])
    }
 }

 pub struct LastWalRecordXlogSwitchEndsOnPageBoundary;
+/// Craft xlog SWITCH record ending at page boundary.
 impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
    const NAME: &'static str = "last_wal_record_xlog_switch_ends_on_page_boundary";
-    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
+    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
        // Do not use generate_internal because here we end up with flush_lsn exactly on
        // the segment boundary and insert_lsn after the initial page header, which is unusual.
        ensure_server_config(client)?;
@@ -361,28 +360,29 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {

        // Emit the XLOG_SWITCH
        let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
-        let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
+        let xlog_switch_record_end: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
        let next_segment = PgLsn::from(0x0200_0000);
        ensure!(
-            after_xlog_switch < next_segment,
-            "XLOG_SWITCH message ended on or after the expected segment boundary: {} > {}",
-            after_xlog_switch,
+            xlog_switch_record_end < next_segment,
+            "XLOG_SWITCH record ended on or after the expected segment boundary: {} > {}",
+            xlog_switch_record_end,
            next_segment
        );
        ensure!(
-            u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD,
+            u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD,
            "XLOG_SWITCH message ended not on page boundary: {}, offset = {}",
-            after_xlog_switch,
-            u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ
+            xlog_switch_record_end,
+            u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ
        );
-        Ok((vec![before_xlog_switch, after_xlog_switch], next_segment))
+        Ok(vec![before_xlog_switch, xlog_switch_record_end])
    }
 }

-fn craft_single_logical_message(
+/// Write ~16MB logical message; it should cross WAL segment.
+fn craft_seg_size_logical_message(
    client: &mut impl postgres::GenericClient,
    transactional: bool,
-) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
+) -> anyhow::Result<Vec<PgLsn>> {
    craft_internal(client, |client, initial_lsn| {
        ensure!(
            initial_lsn < PgLsn::from(0x0200_0000 - 1024 * 1024),
@@ -405,34 +405,24 @@ fn craft_single_logical_message(
            "Logical message crossed two segments"
        );

-        if transactional {
-            // Transactional logical messages are part of a transaction, so the one above is
-            // followed by a small COMMIT record.
-
-            let after_message_lsn = client.pg_current_wal_insert_lsn()?;
-            ensure!(
-                message_lsn < after_message_lsn,
-                "No record found after the emitted message"
-            );
-            Ok((vec![message_lsn], Some(after_message_lsn)))
-        } else {
-            Ok((Vec::new(), Some(message_lsn)))
-        }
+        Ok(vec![message_lsn])
    })
 }

 pub struct WalRecordCrossingSegmentFollowedBySmallOne;
 impl Crafter for WalRecordCrossingSegmentFollowedBySmallOne {
    const NAME: &'static str = "wal_record_crossing_segment_followed_by_small_one";
-    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
-        craft_single_logical_message(client, true)
+    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
+        // Transactional message crossing WAL segment will be followed by small
+        // commit record.
+        craft_seg_size_logical_message(client, true)
    }
 }

 pub struct LastWalRecordCrossingSegment;
 impl Crafter for LastWalRecordCrossingSegment {
    const NAME: &'static str = "last_wal_record_crossing_segment";
-    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
-        craft_single_logical_message(client, false)
+    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
+        craft_seg_size_logical_message(client, false)
    }
 }
--- a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs
+++ b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs
@@ -11,13 +11,15 @@ use utils::const_assert;
 use utils::lsn::Lsn;

 fn init_logging() {
-    let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or(
-        format!("crate=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace"),
-    ))
+    let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or(format!(
+        "crate=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace"
+    )))
    .is_test(true)
    .try_init();
 }

+/// Test that find_end_of_wal returns the same results as pg_dump on various
+/// WALs created by Crafter.
 fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
    use crate::*;

@@ -38,13 +40,13 @@ fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
    }
    cfg.initdb().unwrap();
    let srv = cfg.start_server().unwrap();
-    let (intermediate_lsns, expected_end_of_wal_partial) =
-        C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap();
+    let intermediate_lsns = C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap();
    let intermediate_lsns: Vec<Lsn> = intermediate_lsns
        .iter()
        .map(|&lsn| u64::from(lsn).into())
        .collect();
-    let expected_end_of_wal: Lsn = u64::from(expected_end_of_wal_partial).into();
+    // Kill postgres. Note that it might have inserted to WAL something after
+    // 'craft' did its job.
    srv.kill();

    // Check find_end_of_wal on the initial WAL
@@ -56,7 +58,7 @@ fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
        .filter(|fname| IsXLogFileName(fname))
        .max()
        .unwrap();
-    check_pg_waldump_end_of_wal(&cfg, &last_segment, expected_end_of_wal);
+    let expected_end_of_wal = find_pg_waldump_end_of_wal(&cfg, &last_segment);
    for start_lsn in intermediate_lsns
        .iter()
        .chain(std::iter::once(&expected_end_of_wal))
@@ -91,11 +93,7 @@ fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
    }
 }

-fn check_pg_waldump_end_of_wal(
-    cfg: &crate::Conf,
-    last_segment: &str,
-    expected_end_of_wal: Lsn,
-) {
+fn find_pg_waldump_end_of_wal(cfg: &crate::Conf, last_segment: &str) -> Lsn {
    // Get the actual end of WAL by pg_waldump
    let waldump_output = cfg
        .pg_waldump("000000010000000000000001", last_segment)
@@ -113,11 +111,8 @@ fn check_pg_waldump_end_of_wal(
        }
    };
    let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap();
-    info!(
-        "waldump erred on {}, expected wal end at {}",
-        waldump_wal_end, expected_end_of_wal
-    );
-    assert_eq!(waldump_wal_end, expected_end_of_wal);
+    info!("waldump erred on {}", waldump_wal_end);
+    waldump_wal_end
 }

 fn check_end_of_wal(
@@ -210,9 +205,9 @@ pub fn test_update_next_xid() {
 #[test]
 pub fn test_encode_logical_message() {
    let expected = [
-        64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255,
-        38, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114,
-        101, 102, 105, 120, 0, 109, 101, 115, 115, 97, 103, 101,
+        64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255, 38,
+        0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114, 101, 102,
+        105, 120, 0, 109, 101, 115, 115, 97, 103, 101,
    ];
    let actual = encode_logical_message("prefix", "message");
    assert_eq!(expected, actual[..]);
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -565,6 +565,16 @@ impl GenericRemoteStorage {
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct StorageMetadata(HashMap<String, String>);

+impl<const N: usize> From<[(&str, &str); N]> for StorageMetadata {
+    fn from(arr: [(&str, &str); N]) -> Self {
+        let map: HashMap<String, String> = arr
+            .iter()
+            .map(|(k, v)| (k.to_string(), v.to_string()))
+            .collect();
+        Self(map)
+    }
+}
+
 /// External backup storage configuration, enough for creating a client for that storage.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct RemoteStorageConfig {
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -198,6 +198,7 @@ impl LocalFs {
            fs::OpenOptions::new()
                .write(true)
                .create(true)
+                .truncate(true)
                .open(&temp_file_path)
                .await
                .with_context(|| {
--- a/libs/tenant_size_model/tests/tests.rs
+++ b/libs/tenant_size_model/tests/tests.rs
@@ -247,7 +247,7 @@ fn scenario_4() {
    //
    // This is in total 5000 + 1000 + 5000 + 1000 = 12000
    //
-    // (If we used the the method from the previous scenario, and
+    // (If we used the method from the previous scenario, and
    // kept only snapshot at the branch point, we'd need to keep
    // all the WAL between 10000-18000 on the main branch, so
    // the total size would be 5000 + 1000 + 8000 = 14000. The
--- a/libs/utils/src/lock_file.rs
+++ b/libs/utils/src/lock_file.rs
@@ -63,6 +63,7 @@ impl UnwrittenLockFile {
 pub fn create_exclusive(lock_file_path: &Utf8Path) -> anyhow::Result<UnwrittenLockFile> {
    let lock_file = fs::OpenOptions::new()
        .create(true) // O_CREAT
+        .truncate(true)
        .write(true)
        .open(lock_file_path)
        .context("open lock file")?;
--- a/libs/utils/src/seqwait.rs
+++ b/libs/utils/src/seqwait.rs
@@ -182,6 +182,18 @@ where
        }
    }

+    /// Check if [`Self::wait_for`] or [`Self::wait_for_timeout`] would wait if called with `num`.
+    pub fn would_wait_for(&self, num: V) -> Result<(), V> {
+        let internal = self.internal.lock().unwrap();
+        let cnt = internal.current.cnt_value();
+        drop(internal);
+        if cnt >= num {
+            Ok(())
+        } else {
+            Err(cnt)
+        }
+    }
+
    /// Register and return a channel that will be notified when a number arrives,
    /// or None, if it has already arrived.
    fn queue_for_wait(&self, num: V) -> Result<Option<Receiver<()>>, SeqWaitError> {
--- a/libs/vm_monitor/src/runner.rs
+++ b/libs/vm_monitor/src/runner.rs
@@ -69,7 +69,7 @@ pub struct Config {
    /// should be removed once we have a better solution there.
    sys_buffer_bytes: u64,

-    /// Minimum fraction of total system memory reserved *before* the the cgroup threshold; in
+    /// Minimum fraction of total system memory reserved *before* the cgroup threshold; in
    /// other words, providing a ceiling for the highest value of the threshold by enforcing that
    /// there's at least `cgroup_min_overhead_fraction` of the total memory remaining beyond the
    /// threshold.
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -59,6 +59,7 @@ signal-hook.workspace = true
 smallvec = { workspace = true, features = ["write"] }
 svg_fmt.workspace = true
 sync_wrapper.workspace = true
+sysinfo.workspace = true
 tokio-tar.workspace = true
 thiserror.workspace = true
 tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
--- a/pageserver/compaction/src/compact_tiered.rs
+++ b/pageserver/compaction/src/compact_tiered.rs
@@ -43,7 +43,8 @@ pub async fn compact_tiered<E: CompactionJobExecutor>(
    fanout: u64,
    ctx: &E::RequestContext,
 ) -> anyhow::Result<()> {
-    assert!(fanout >= 2);
+    assert!(fanout >= 1, "fanout needs to be at least 1 but is {fanout}");
+    let exp_base = fanout.max(2);
    // Start at L0
    let mut current_level_no = 0;
    let mut current_level_target_height = target_file_size;
@@ -106,7 +107,7 @@ pub async fn compact_tiered<E: CompactionJobExecutor>(
            break;
        }
        current_level_no += 1;
-        current_level_target_height = current_level_target_height.saturating_mul(fanout);
+        current_level_target_height = current_level_target_height.saturating_mul(exp_base);
    }
    Ok(())
 }
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -600,32 +600,37 @@ fn start_pageserver(
            None,
            "consumption metrics collection",
            true,
-            async move {
-                // first wait until background jobs are cleared to launch.
-                //
-                // this is because we only process active tenants and timelines, and the
-                // Timeline::get_current_logical_size will spawn the logical size calculation,
-                // which will not be rate-limited.
-                let cancel = task_mgr::shutdown_token();
+            {
+                let tenant_manager = tenant_manager.clone();
+                async move {
+                    // first wait until background jobs are cleared to launch.
+                    //
+                    // this is because we only process active tenants and timelines, and the
+                    // Timeline::get_current_logical_size will spawn the logical size calculation,
+                    // which will not be rate-limited.
+                    let cancel = task_mgr::shutdown_token();

-                tokio::select! {
-                    _ = cancel.cancelled() => { return Ok(()); },
-                    _ = background_jobs_barrier.wait() => {}
-                };
+                    tokio::select! {
+                        _ = cancel.cancelled() => { return Ok(()); },
+                        _ = background_jobs_barrier.wait() => {}
+                    };

-                pageserver::consumption_metrics::collect_metrics(
-                    metric_collection_endpoint,
-                    conf.metric_collection_interval,
-                    conf.cached_metric_collection_interval,
-                    conf.synthetic_size_calculation_interval,
-                    conf.id,
-                    local_disk_storage,
-                    cancel,
-                    metrics_ctx,
-                )
-                .instrument(info_span!("metrics_collection"))
-                .await?;
-                Ok(())
+                    pageserver::consumption_metrics::collect_metrics(
+                        tenant_manager,
+                        metric_collection_endpoint,
+                        &conf.metric_collection_bucket,
+                        conf.metric_collection_interval,
+                        conf.cached_metric_collection_interval,
+                        conf.synthetic_size_calculation_interval,
+                        conf.id,
+                        local_disk_storage,
+                        cancel,
+                        metrics_ctx,
+                    )
+                    .instrument(info_span!("metrics_collection"))
+                    .await?;
+                    Ok(())
+                }
            },
        );
    }
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -94,6 +94,8 @@ pub mod defaults {

    pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true;

+    pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
+
    pub const DEFAULT_WALREDO_PROCESS_KIND: &str = crate::walredo::ProcessKind::DEFAULT_TOML;

    ///
@@ -158,6 +160,8 @@ pub mod defaults {
 #heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY}
 #secondary_download_concurrency = {DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY}

+#ephemeral_bytes_per_memory_kb = {DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB}
+
 [remote_storage]

 "#
@@ -234,6 +238,7 @@ pub struct PageServerConf {
    // How often to send unchanged cached metrics to the metrics endpoint.
    pub cached_metric_collection_interval: Duration,
    pub metric_collection_endpoint: Option<Url>,
+    pub metric_collection_bucket: Option<RemoteStorageConfig>,
    pub synthetic_size_calculation_interval: Duration,

    pub disk_usage_based_eviction: Option<DiskUsageEvictionTaskConfig>,
@@ -279,6 +284,13 @@ pub struct PageServerConf {

    pub validate_vectored_get: bool,

+    /// How many bytes of ephemeral layer content will we allow per kilobyte of RAM.  When this
+    /// is exceeded, we start proactively closing ephemeral layers to limit the total amount
+    /// of ephemeral data.
+    ///
+    /// Setting this to zero disables limits on total ephemeral layer size.
+    pub ephemeral_bytes_per_memory_kb: usize,
+
    pub walredo_process_kind: crate::walredo::ProcessKind,
 }

@@ -374,6 +386,7 @@ struct PageServerConfigBuilder {
    cached_metric_collection_interval: BuilderValue<Duration>,
    metric_collection_endpoint: BuilderValue<Option<Url>>,
    synthetic_size_calculation_interval: BuilderValue<Duration>,
+    metric_collection_bucket: BuilderValue<Option<RemoteStorageConfig>>,

    disk_usage_based_eviction: BuilderValue<Option<DiskUsageEvictionTaskConfig>>,

@@ -400,6 +413,8 @@ struct PageServerConfigBuilder {

    validate_vectored_get: BuilderValue<bool>,

+    ephemeral_bytes_per_memory_kb: BuilderValue<usize>,
+
    walredo_process_kind: BuilderValue<crate::walredo::ProcessKind>,
 }

@@ -456,6 +471,8 @@ impl PageServerConfigBuilder {
            .expect("cannot parse default synthetic size calculation interval")),
            metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT),

+            metric_collection_bucket: Set(None),
+
            disk_usage_based_eviction: Set(None),

            test_remote_failures: Set(0),
@@ -483,6 +500,7 @@ impl PageServerConfigBuilder {
                NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
            )),
            validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
+            ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),

            walredo_process_kind: Set(DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap()),
        }
@@ -585,6 +603,13 @@ impl PageServerConfigBuilder {
        self.metric_collection_endpoint = BuilderValue::Set(metric_collection_endpoint)
    }

+    pub fn metric_collection_bucket(
+        &mut self,
+        metric_collection_bucket: Option<RemoteStorageConfig>,
+    ) {
+        self.metric_collection_bucket = BuilderValue::Set(metric_collection_bucket)
+    }
+
    pub fn synthetic_size_calculation_interval(
        &mut self,
        synthetic_size_calculation_interval: Duration,
@@ -653,6 +678,10 @@ impl PageServerConfigBuilder {
        self.validate_vectored_get = BuilderValue::Set(value);
    }

+    pub fn get_ephemeral_bytes_per_memory_kb(&mut self, value: usize) {
+        self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value);
+    }
+
    pub fn get_walredo_process_kind(&mut self, value: crate::walredo::ProcessKind) {
        self.walredo_process_kind = BuilderValue::Set(value);
    }
@@ -696,6 +725,7 @@ impl PageServerConfigBuilder {
                metric_collection_interval,
                cached_metric_collection_interval,
                metric_collection_endpoint,
+                metric_collection_bucket,
                synthetic_size_calculation_interval,
                disk_usage_based_eviction,
                test_remote_failures,
@@ -710,6 +740,7 @@ impl PageServerConfigBuilder {
                get_vectored_impl,
                max_vectored_read_bytes,
                validate_vectored_get,
+                ephemeral_bytes_per_memory_kb,
                walredo_process_kind,
            }
            CUSTOM LOGIC
@@ -944,6 +975,9 @@ impl PageServerConf {
                    let endpoint = parse_toml_string(key, item)?.parse().context("failed to parse metric_collection_endpoint")?;
                    builder.metric_collection_endpoint(Some(endpoint));
                },
+                "metric_collection_bucket" => {
+                    builder.metric_collection_bucket(RemoteStorageConfig::from_toml(item)?)
+                }
                "synthetic_size_calculation_interval" =>
                    builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?),
                "test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?),
@@ -997,6 +1031,9 @@ impl PageServerConf {
                "validate_vectored_get" => {
                    builder.get_validate_vectored_get(parse_toml_bool("validate_vectored_get", item)?)
                }
+                "ephemeral_bytes_per_memory_kb" => {
+                    builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize)
+                }
                "walredo_process_kind" => {
                    builder.get_walredo_process_kind(parse_toml_from_str("walredo_process_kind", item)?)
                }
@@ -1061,6 +1098,7 @@ impl PageServerConf {
            metric_collection_interval: Duration::from_secs(60),
            cached_metric_collection_interval: Duration::from_secs(60 * 60),
            metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
+            metric_collection_bucket: None,
            synthetic_size_calculation_interval: Duration::from_secs(60),
            disk_usage_based_eviction: None,
            test_remote_failures: 0,
@@ -1079,6 +1117,7 @@ impl PageServerConf {
                    .expect("Invalid default constant"),
            ),
            validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
+            ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
            walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
        }
    }
@@ -1292,6 +1331,7 @@ background_task_maximum_delay = '334 s'
                    defaults::DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL
                )?,
                metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
+                metric_collection_bucket: None,
                synthetic_size_calculation_interval: humantime::parse_duration(
                    defaults::DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL
                )?,
@@ -1314,6 +1354,7 @@ background_task_maximum_delay = '334 s'
                        .expect("Invalid default constant")
                ),
                validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
+                ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
                walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
            },
            "Correct defaults should be used when no config values are provided"
@@ -1366,6 +1407,7 @@ background_task_maximum_delay = '334 s'
                metric_collection_interval: Duration::from_secs(222),
                cached_metric_collection_interval: Duration::from_secs(22200),
                metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?),
+                metric_collection_bucket: None,
                synthetic_size_calculation_interval: Duration::from_secs(333),
                disk_usage_based_eviction: None,
                test_remote_failures: 0,
@@ -1384,6 +1426,7 @@ background_task_maximum_delay = '334 s'
                        .expect("Invalid default constant")
                ),
                validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
+                ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
                walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
            },
            "Should be able to parse all basic config values correctly"
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -3,10 +3,13 @@
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
 use crate::tenant::tasks::BackgroundLoopKind;
-use crate::tenant::{mgr, LogicalSizeCalculationCause, PageReconstructError, Tenant};
+use crate::tenant::{
+    mgr::TenantManager, LogicalSizeCalculationCause, PageReconstructError, Tenant,
+};
 use camino::Utf8PathBuf;
 use consumption_metrics::EventType;
 use pageserver_api::models::TenantState;
+use remote_storage::{GenericRemoteStorage, RemoteStorageConfig};
 use reqwest::Url;
 use std::collections::HashMap;
 use std::sync::Arc;
@@ -40,7 +43,9 @@ type Cache = HashMap<MetricsKey, (EventType, u64)>;
 /// Main thread that serves metrics collection
 #[allow(clippy::too_many_arguments)]
 pub async fn collect_metrics(
+    tenant_manager: Arc<TenantManager>,
    metric_collection_endpoint: &Url,
+    metric_collection_bucket: &Option<RemoteStorageConfig>,
    metric_collection_interval: Duration,
    _cached_metric_collection_interval: Duration,
    synthetic_size_calculation_interval: Duration,
@@ -65,15 +70,19 @@ pub async fn collect_metrics(
        None,
        "synthetic size calculation",
        false,
-        async move {
-            calculate_synthetic_size_worker(
-                synthetic_size_calculation_interval,
-                &cancel,
-                &worker_ctx,
-            )
-            .instrument(info_span!("synthetic_size_worker"))
-            .await?;
-            Ok(())
+        {
+            let tenant_manager = tenant_manager.clone();
+            async move {
+                calculate_synthetic_size_worker(
+                    tenant_manager,
+                    synthetic_size_calculation_interval,
+                    &cancel,
+                    &worker_ctx,
+                )
+                .instrument(info_span!("synthetic_size_worker"))
+                .await?;
+                Ok(())
+            }
        },
    );

@@ -94,13 +103,27 @@ pub async fn collect_metrics(
        .build()
        .expect("Failed to create http client with timeout");

+    let bucket_client = if let Some(bucket_config) = metric_collection_bucket {
+        match GenericRemoteStorage::from_config(bucket_config) {
+            Ok(client) => Some(client),
+            Err(e) => {
+                // Non-fatal error: if we were given an invalid config, we will proceed
+                // with sending metrics over the network, but not to S3.
+                tracing::warn!("Invalid configuration for metric_collection_bucket: {e}");
+                None
+            }
+        }
+    } else {
+        None
+    };
+
    let node_id = node_id.to_string();

    loop {
        let started_at = Instant::now();

        // these are point in time, with variable "now"
-        let metrics = metrics::collect_all_metrics(&cached_metrics, &ctx).await;
+        let metrics = metrics::collect_all_metrics(&tenant_manager, &cached_metrics, &ctx).await;

        let metrics = Arc::new(metrics);

@@ -118,10 +141,18 @@ pub async fn collect_metrics(
                    tracing::error!("failed to persist metrics to {path:?}: {e:#}");
                }
            }
+
+            if let Some(bucket_client) = &bucket_client {
+                let res =
+                    upload::upload_metrics_bucket(bucket_client, &cancel, &node_id, &metrics).await;
+                if let Err(e) = res {
+                    tracing::error!("failed to upload to S3: {e:#}");
+                }
+            }
        };

        let upload = async {
-            let res = upload::upload_metrics(
+            let res = upload::upload_metrics_http(
                &client,
                metric_collection_endpoint,
                &cancel,
@@ -132,7 +163,7 @@ pub async fn collect_metrics(
            .await;
            if let Err(e) = res {
                // serialization error which should never happen
-                tracing::error!("failed to upload due to {e:#}");
+                tracing::error!("failed to upload via HTTP due to {e:#}");
            }
        };

@@ -247,6 +278,7 @@ async fn reschedule(

 /// Caclculate synthetic size for each active tenant
 async fn calculate_synthetic_size_worker(
+    tenant_manager: Arc<TenantManager>,
    synthetic_size_calculation_interval: Duration,
    cancel: &CancellationToken,
    ctx: &RequestContext,
@@ -259,7 +291,7 @@ async fn calculate_synthetic_size_worker(
    loop {
        let started_at = Instant::now();

-        let tenants = match mgr::list_tenants().await {
+        let tenants = match tenant_manager.list_tenants() {
            Ok(tenants) => tenants,
            Err(e) => {
                warn!("cannot get tenant list: {e:#}");
@@ -278,10 +310,14 @@ async fn calculate_synthetic_size_worker(
                continue;
            }

-            let Ok(tenant) = mgr::get_tenant(tenant_shard_id, true) else {
+            let Ok(tenant) = tenant_manager.get_attached_tenant_shard(tenant_shard_id) else {
                continue;
            };

+            if !tenant.is_active() {
+                continue;
+            }
+
            // there is never any reason to exit calculate_synthetic_size_worker following any
            // return value -- we don't need to care about shutdown because no tenant is found when
            // pageserver is shut down.
@@ -319,9 +355,7 @@ async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &Re
    };

    // this error can be returned if timeline is shutting down, but it does not
-    // mean the synthetic size worker should terminate. we do not need any checks
-    // in this function because `mgr::get_tenant` will error out after shutdown has
-    // progressed to shutting down tenants.
+    // mean the synthetic size worker should terminate.
    let shutting_down = matches!(
        e.downcast_ref::<PageReconstructError>(),
        Some(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_))
--- a/pageserver/src/consumption_metrics/metrics.rs
+++ b/pageserver/src/consumption_metrics/metrics.rs
@@ -1,3 +1,4 @@
+use crate::tenant::mgr::TenantManager;
 use crate::{context::RequestContext, tenant::timeline::logical_size::CurrentLogicalSize};
 use chrono::{DateTime, Utc};
 use consumption_metrics::EventType;
@@ -181,6 +182,7 @@ impl MetricsKey {
 }

 pub(super) async fn collect_all_metrics(
+    tenant_manager: &Arc<TenantManager>,
    cached_metrics: &Cache,
    ctx: &RequestContext,
 ) -> Vec<RawMetric> {
@@ -188,7 +190,7 @@ pub(super) async fn collect_all_metrics(

    let started_at = std::time::Instant::now();

-    let tenants = match crate::tenant::mgr::list_tenants().await {
+    let tenants = match tenant_manager.list_tenants() {
        Ok(tenants) => tenants,
        Err(err) => {
            tracing::error!("failed to list tenants: {:?}", err);
@@ -200,7 +202,8 @@ pub(super) async fn collect_all_metrics(
        if state != TenantState::Active || !id.is_zero() {
            None
        } else {
-            crate::tenant::mgr::get_tenant(id, true)
+            tenant_manager
+                .get_attached_tenant_shard(id)
                .ok()
                .map(|tenant| (id.tenant_id, tenant))
        }
--- a/pageserver/src/consumption_metrics/upload.rs
+++ b/pageserver/src/consumption_metrics/upload.rs
@@ -1,4 +1,9 @@
+use std::time::SystemTime;
+
+use chrono::{DateTime, Utc};
 use consumption_metrics::{Event, EventChunk, IdempotencyKey, CHUNK_SIZE};
+use remote_storage::{GenericRemoteStorage, RemotePath};
+use tokio::io::AsyncWriteExt;
 use tokio_util::sync::CancellationToken;
 use tracing::Instrument;

@@ -13,8 +18,9 @@ struct Ids {
    pub(super) timeline_id: Option<TimelineId>,
 }

+/// Serialize and write metrics to an HTTP endpoint
 #[tracing::instrument(skip_all, fields(metrics_total = %metrics.len()))]
-pub(super) async fn upload_metrics(
+pub(super) async fn upload_metrics_http(
    client: &reqwest::Client,
    metric_collection_endpoint: &reqwest::Url,
    cancel: &CancellationToken,
@@ -74,6 +80,60 @@ pub(super) async fn upload_metrics(
    Ok(())
 }

+/// Serialize and write metrics to a remote storage object
+#[tracing::instrument(skip_all, fields(metrics_total = %metrics.len()))]
+pub(super) async fn upload_metrics_bucket(
+    client: &GenericRemoteStorage,
+    cancel: &CancellationToken,
+    node_id: &str,
+    metrics: &[RawMetric],
+) -> anyhow::Result<()> {
+    if metrics.is_empty() {
+        // Skip uploads if we have no metrics, so that readers don't have to handle the edge case
+        // of an empty object.
+        return Ok(());
+    }
+
+    // Compose object path
+    let datetime: DateTime<Utc> = SystemTime::now().into();
+    let ts_prefix = datetime.format("year=%Y/month=%m/day=%d/%H:%M:%SZ");
+    let path = RemotePath::from_string(&format!("{ts_prefix}_{node_id}.ndjson.gz"))?;
+
+    // Set up a gzip writer into a buffer
+    let mut compressed_bytes: Vec<u8> = Vec::new();
+    let compressed_writer = std::io::Cursor::new(&mut compressed_bytes);
+    let mut gzip_writer = async_compression::tokio::write::GzipEncoder::new(compressed_writer);
+
+    // Serialize and write into compressed buffer
+    let started_at = std::time::Instant::now();
+    for res in serialize_in_chunks(CHUNK_SIZE, metrics, node_id) {
+        let (_chunk, body) = res?;
+        gzip_writer.write_all(&body).await?;
+    }
+    gzip_writer.flush().await?;
+    gzip_writer.shutdown().await?;
+    let compressed_length = compressed_bytes.len();
+
+    // Write to remote storage
+    client
+        .upload_storage_object(
+            futures::stream::once(futures::future::ready(Ok(compressed_bytes.into()))),
+            compressed_length,
+            &path,
+            cancel,
+        )
+        .await?;
+    let elapsed = started_at.elapsed();
+
+    tracing::info!(
+        compressed_length,
+        elapsed_ms = elapsed.as_millis(),
+        "write metrics bucket at {path}",
+    );
+
+    Ok(())
+}
+
 // The return type is quite ugly, but we gain testability in isolation
 fn serialize_in_chunks<'a, F>(
    chunk_size: usize,
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -61,7 +61,6 @@ use crate::{
    metrics::disk_usage_based_eviction::METRICS,
    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
    tenant::{
-        self,
        mgr::TenantManager,
        remote_timeline_client::LayerFileMetadata,
        secondary::SecondaryTenant,
@@ -814,8 +813,8 @@ async fn collect_eviction_candidates(
    const LOG_DURATION_THRESHOLD: std::time::Duration = std::time::Duration::from_secs(10);

    // get a snapshot of the list of tenants
-    let tenants = tenant::mgr::list_tenants()
-        .await
+    let tenants = tenant_manager
+        .list_tenants()
        .context("get list of tenants")?;

    // TODO: avoid listing every layer in every tenant: this loop can block the executor,
@@ -827,8 +826,12 @@ async fn collect_eviction_candidates(
        if cancel.is_cancelled() {
            return Ok(EvictionCandidates::Cancelled);
        }
-        let tenant = match tenant::mgr::get_tenant(tenant_id, true) {
-            Ok(tenant) => tenant,
+        let tenant = match tenant_manager.get_attached_tenant_shard(tenant_id) {
+            Ok(tenant) if tenant.is_active() => tenant,
+            Ok(_) => {
+                debug!(tenant_id=%tenant_id.tenant_id, shard_id=%tenant_id.shard_slug(), "Tenant shard is not active");
+                continue;
+            }
            Err(e) => {
                // this can happen if tenant has lifecycle transition after we fetched it
                debug!("failed to get tenant: {e:#}");
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -1038,7 +1038,7 @@ paths:
                  format: hex
      responses:
        "201":
-          description: TimelineInfo
+          description: Timeline was created, or already existed with matching parameters
          content:
            application/json:
              schema:
@@ -1068,11 +1068,17 @@ paths:
              schema:
                $ref: "#/components/schemas/Error"
        "409":
-          description: Timeline already exists, creation skipped
+          description: Timeline already exists, with different parameters.  Creation cannot proceed.
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ConflictError"
+        "429":
+          description: A creation request was sent for the same Timeline Id while a creation was already in progress.  Back off and retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
        "500":
          description: Generic operation error
          content:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -49,8 +49,8 @@ use crate::task_mgr::TaskKind;
 use crate::tenant::config::{LocationConf, TenantConfOpt};
 use crate::tenant::mgr::GetActiveTenantError;
 use crate::tenant::mgr::{
-    GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError,
-    TenantSlotError, TenantSlotUpsertError, TenantStateError,
+    GetTenantError, TenantManager, TenantMapError, TenantMapInsertError, TenantSlotError,
+    TenantSlotUpsertError, TenantStateError,
 };
 use crate::tenant::mgr::{TenantSlot, UpsertLocationError};
 use crate::tenant::remote_timeline_client;
@@ -249,16 +249,11 @@ impl From<GetTenantError> for ApiError {
    fn from(tse: GetTenantError) -> ApiError {
        match tse {
            GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
-            GetTenantError::Broken(reason) => {
-                ApiError::InternalServerError(anyhow!("tenant is broken: {}", reason))
-            }
            GetTenantError::NotActive(_) => {
                // Why is this not `ApiError::NotFound`?
                // Because we must be careful to never return 404 for a tenant if it does
                // in fact exist locally. If we did, the caller could draw the conclusion
                // that it can attach the tenant to another PS and we'd be in split-brain.
-                //
-                // (We can produce this variant only in `mgr::get_tenant(..., active=true)` calls).
                ApiError::ResourceUnavailable("Tenant not yet active".into())
            }
            GetTenantError::MapState(e) => ApiError::ResourceUnavailable(format!("{e}").into()),
@@ -269,6 +264,9 @@ impl From<GetTenantError> for ApiError {
 impl From<GetActiveTenantError> for ApiError {
    fn from(e: GetActiveTenantError) -> ApiError {
        match e {
+            GetActiveTenantError::Broken(reason) => {
+                ApiError::InternalServerError(anyhow!("tenant is broken: {}", reason))
+            }
            GetActiveTenantError::WillNotBecomeActive(_) => ApiError::Conflict(format!("{}", e)),
            GetActiveTenantError::Cancelled => ApiError::ShuttingDown,
            GetActiveTenantError::NotFound(gte) => gte.into(),
@@ -279,19 +277,6 @@ impl From<GetActiveTenantError> for ApiError {
    }
 }

-impl From<SetNewTenantConfigError> for ApiError {
-    fn from(e: SetNewTenantConfigError) -> ApiError {
-        match e {
-            SetNewTenantConfigError::GetTenant(tid) => {
-                ApiError::NotFound(anyhow!("tenant {}", tid).into())
-            }
-            e @ (SetNewTenantConfigError::Persist(_) | SetNewTenantConfigError::Other(_)) => {
-                ApiError::InternalServerError(anyhow::Error::new(e))
-            }
-        }
-    }
-}
-
 impl From<crate::tenant::DeleteTimelineError> for ApiError {
    fn from(value: crate::tenant::DeleteTimelineError) -> Self {
        use crate::tenant::DeleteTimelineError::*;
@@ -495,7 +480,7 @@ async fn timeline_create_handler(
    async {
        let tenant = state
            .tenant_manager
-            .get_attached_tenant_shard(tenant_shard_id, false)?;
+            .get_attached_tenant_shard(tenant_shard_id)?;

        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;

@@ -535,10 +520,13 @@ async fn timeline_create_handler(
                    HttpErrorBody::from_msg("Tenant shutting down".to_string()),
                )
            }
-            Err(
-                e @ tenant::CreateTimelineError::Conflict
-                | e @ tenant::CreateTimelineError::AlreadyCreating,
-            ) => json_response(StatusCode::CONFLICT, HttpErrorBody::from_msg(e.to_string())),
+            Err(e @ tenant::CreateTimelineError::Conflict) => {
+                json_response(StatusCode::CONFLICT, HttpErrorBody::from_msg(e.to_string()))
+            }
+            Err(e @ tenant::CreateTimelineError::AlreadyCreating) => json_response(
+                StatusCode::TOO_MANY_REQUESTS,
+                HttpErrorBody::from_msg(e.to_string()),
+            ),
            Err(tenant::CreateTimelineError::AncestorLsn(err)) => json_response(
                StatusCode::NOT_ACCEPTABLE,
                HttpErrorBody::from_msg(format!("{err:#}")),
@@ -581,7 +569,7 @@ async fn timeline_list_handler(
    let response_data = async {
        let tenant = state
            .tenant_manager
-            .get_attached_tenant_shard(tenant_shard_id, false)?;
+            .get_attached_tenant_shard(tenant_shard_id)?;

        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;

@@ -619,6 +607,7 @@ async fn timeline_preserve_initdb_handler(
    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+    let state = get_state(&request);

    // Part of the process for disaster recovery from safekeeper-stored WAL:
    // If we don't recover into a new timeline but want to keep the timeline ID,
@@ -626,7 +615,9 @@ async fn timeline_preserve_initdb_handler(
    // location where timeline recreation cand find it.

    async {
-        let tenant = mgr::get_tenant(tenant_shard_id, false)?;
+        let tenant = state
+            .tenant_manager
+            .get_attached_tenant_shard(tenant_shard_id)?;

        let timeline = tenant
            .get_timeline(timeline_id, false)
@@ -668,7 +659,7 @@ async fn timeline_detail_handler(
    let timeline_info = async {
        let tenant = state
            .tenant_manager
-            .get_attached_tenant_shard(tenant_shard_id, false)?;
+            .get_attached_tenant_shard(tenant_shard_id)?;

        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;

@@ -855,7 +846,7 @@ async fn timeline_delete_handler(

    let tenant = state
        .tenant_manager
-        .get_attached_tenant_shard(tenant_shard_id, false)
+        .get_attached_tenant_shard(tenant_shard_id)
        .map_err(|e| {
            match e {
                // GetTenantError has a built-in conversion to ApiError, but in this context we don't
@@ -973,10 +964,11 @@ async fn tenant_list_handler(
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
    check_permission(&request, None)?;
+    let state = get_state(&request);

-    let response_data = mgr::list_tenants()
-        .instrument(info_span!("tenant_list"))
-        .await
+    let response_data = state
+        .tenant_manager
+        .list_tenants()
        .map_err(|_| {
            ApiError::ResourceUnavailable("Tenant map is initializing or shutting down".into())
        })?
@@ -999,9 +991,27 @@ async fn tenant_status(
 ) -> Result<Response<Body>, ApiError> {
    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+    let state = get_state(&request);
+
+    // In tests, sometimes we want to query the state of a tenant without auto-activating it if it's currently waiting.
+    let activate = true;
+    #[cfg(feature = "testing")]
+    let activate = parse_query_param(&request, "activate")?.unwrap_or(activate);

    let tenant_info = async {
-        let tenant = mgr::get_tenant(tenant_shard_id, false)?;
+        let tenant = state
+            .tenant_manager
+            .get_attached_tenant_shard(tenant_shard_id)?;
+
+        if activate {
+            // This is advisory: we prefer to let the tenant activate on-demand when this function is
+            // called, but it is still valid to return 200 and describe the current state of the tenant
+            // if it doesn't make it into an active state.
+            tenant
+                .wait_to_become_active(ACTIVE_TENANT_TIMEOUT)
+                .await
+                .ok();
+        }

        // Calculate total physical size of all timelines
        let mut current_physical_size = 0;
@@ -1074,9 +1084,7 @@ async fn tenant_size_handler(
    let inputs_only: Option<bool> = parse_query_param(&request, "inputs_only")?;
    let retention_period: Option<u64> = parse_query_param(&request, "retention_period")?;
    let headers = request.headers();
-
-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    let tenant = mgr::get_tenant(tenant_shard_id, true)?;
+    let state = get_state(&request);

    if !tenant_shard_id.is_zero() {
        return Err(ApiError::BadRequest(anyhow!(
@@ -1084,6 +1092,12 @@ async fn tenant_size_handler(
        )));
    }

+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
+    let tenant = state
+        .tenant_manager
+        .get_attached_tenant_shard(tenant_shard_id)?;
+    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
+
    // this can be long operation
    let inputs = tenant
        .gather_size_inputs(
@@ -1152,10 +1166,15 @@ async fn tenant_shard_split_handler(
    let state = get_state(&request);
    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);

+    let tenant = state
+        .tenant_manager
+        .get_attached_tenant_shard(tenant_shard_id)?;
+    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
+
    let new_shards = state
        .tenant_manager
        .shard_split(
-            tenant_shard_id,
+            tenant,
            ShardCount::new(req.new_shard_count),
            req.new_stripe_size,
            &ctx,
@@ -1373,8 +1392,11 @@ async fn get_tenant_config_handler(
 ) -> Result<Response<Body>, ApiError> {
    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+    let state = get_state(&request);

-    let tenant = mgr::get_tenant(tenant_shard_id, false)?;
+    let tenant = state
+        .tenant_manager
+        .get_attached_tenant_shard(tenant_shard_id)?;

    let response = HashMap::from([
        (
@@ -1402,15 +1424,31 @@ async fn update_tenant_config_handler(
    let tenant_id = request_data.tenant_id;
    check_permission(&request, Some(tenant_id))?;

-    let tenant_conf =
+    let new_tenant_conf =
        TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?;

    let state = get_state(&request);
-    state
+
+    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+
+    let tenant = state
        .tenant_manager
-        .set_new_tenant_config(tenant_conf, tenant_id)
-        .instrument(info_span!("tenant_config", %tenant_id))
-        .await?;
+        .get_attached_tenant_shard(tenant_shard_id)?;
+    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
+
+    // This is a legacy API that only operates on attached tenants: the preferred
+    // API to use is the location_config/ endpoint, which lets the caller provide
+    // the full LocationConf.
+    let location_conf = LocationConf::attached_single(
+        new_tenant_conf.clone(),
+        tenant.get_generation(),
+        &ShardParameters::default(),
+    );
+
+    crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
+        .await
+        .map_err(ApiError::InternalServerError)?;
+    tenant.set_new_tenant_config(new_tenant_conf);

    json_response(StatusCode::OK, ())
 }
@@ -1634,10 +1672,12 @@ async fn handle_tenant_break(
 ) -> Result<Response<Body>, ApiError> {
    let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?;

-    let tenant = crate::tenant::mgr::get_tenant(tenant_shard_id, true)
-        .map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?;
-
-    tenant.set_broken("broken from test".to_owned()).await;
+    let state = get_state(&r);
+    state
+        .tenant_manager
+        .get_attached_tenant_shard(tenant_shard_id)?
+        .set_broken("broken from test".to_owned())
+        .await;

    json_response(StatusCode::OK, ())
 }
@@ -1881,7 +1921,7 @@ async fn active_timeline_of_active_tenant(
    tenant_shard_id: TenantShardId,
    timeline_id: TimelineId,
 ) -> Result<Arc<Timeline>, ApiError> {
-    let tenant = tenant_manager.get_attached_tenant_shard(tenant_shard_id, false)?;
+    let tenant = tenant_manager.get_attached_tenant_shard(tenant_shard_id)?;

    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;

--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -435,7 +435,7 @@ pub(crate) static RESIDENT_PHYSICAL_SIZE_GLOBAL: Lazy<UIntGauge> = Lazy::new(||
 static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_remote_physical_size",
-        "The size of the layer files present in the remote storage that are listed in the the remote index_part.json.",
+        "The size of the layer files present in the remote storage that are listed in the remote index_part.json.",
        // Corollary: If any files are missing from the index part, they won't be included here.
        &["tenant_id", "shard_id", "timeline_id"]
    )
@@ -699,6 +699,14 @@ pub static STARTUP_IS_LOADING: Lazy<UIntGauge> = Lazy::new(|| {
    .expect("Failed to register pageserver_startup_is_loading")
 });

+pub(crate) static TIMELINE_EPHEMERAL_BYTES: Lazy<UIntGauge> = Lazy::new(|| {
+    register_uint_gauge!(
+        "pageserver_timeline_ephemeral_bytes",
+        "Total number of bytes in ephemeral layers, summed for all timelines.  Approximate, lazily updated."
+    )
+    .expect("Failed to register metric")
+});
+
 /// Metrics related to the lifecycle of a [`crate::tenant::Tenant`] object: things
 /// like how long it took to load.
 ///
@@ -1475,12 +1483,18 @@ pub(crate) static DELETION_QUEUE: Lazy<DeletionQueueMetrics> = Lazy::new(|| {
 });

 pub(crate) struct WalIngestMetrics {
+    pub(crate) bytes_received: IntCounter,
    pub(crate) records_received: IntCounter,
    pub(crate) records_committed: IntCounter,
    pub(crate) records_filtered: IntCounter,
 }

 pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
+    bytes_received: register_int_counter!(
+        "pageserver_wal_ingest_bytes_received",
+        "Bytes of WAL ingested from safekeepers",
+    )
+    .unwrap(),
    records_received: register_int_counter!(
        "pageserver_wal_ingest_records_received",
        "Number of WAL records received from safekeepers"
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -760,6 +760,7 @@ impl PageServerHandler {
        let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &tenant.cancel)));
        timeline
            .import_basebackup_from_tar(
+                tenant.clone(),
                &mut copyin_reader,
                base_lsn,
                self.broker_client.clone(),
@@ -875,7 +876,13 @@ impl PageServerHandler {
            if lsn <= last_record_lsn {
                lsn = last_record_lsn;
            } else {
-                timeline.wait_lsn(lsn, ctx).await?;
+                timeline
+                    .wait_lsn(
+                        lsn,
+                        crate::tenant::timeline::WaitLsnWaiter::PageService,
+                        ctx,
+                    )
+                    .await?;
                // Since we waited for 'lsn' to arrive, that is now the last
                // record LSN. (Or close enough for our purposes; the
                // last-record LSN can advance immediately after we return
@@ -887,7 +894,13 @@ impl PageServerHandler {
                    "invalid LSN(0) in request".into(),
                ));
            }
-            timeline.wait_lsn(lsn, ctx).await?;
+            timeline
+                .wait_lsn(
+                    lsn,
+                    crate::tenant::timeline::WaitLsnWaiter::PageService,
+                    ctx,
+                )
+                .await?;
        }

        if lsn < **latest_gc_cutoff_lsn {
@@ -1214,7 +1227,13 @@ impl PageServerHandler {
        if let Some(lsn) = lsn {
            // Backup was requested at a particular LSN. Wait for it to arrive.
            info!("waiting for {}", lsn);
-            timeline.wait_lsn(lsn, ctx).await?;
+            timeline
+                .wait_lsn(
+                    lsn,
+                    crate::tenant::timeline::WaitLsnWaiter::PageService,
+                    ctx,
+                )
+                .await?;
            timeline
                .check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn)
                .context("invalid basebackup lsn")?;
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -214,13 +214,12 @@ pub enum TaskKind {
    /// Internally, `Client` hands over requests to the `Connection` object.
    /// The `Connection` object is responsible for speaking the wire protocol.
    ///
-    /// Walreceiver uses its own abstraction called `TaskHandle` to represent the activity of establishing and handling a connection.
-    /// That abstraction doesn't use `task_mgr`.
+    /// Walreceiver uses a legacy abstraction called `TaskHandle` to represent the activity of establishing and handling a connection.
    /// The `WalReceiverManager` task ensures that this `TaskHandle` task does not outlive the `WalReceiverManager` task.
    /// For the `RequestContext` that we hand to the TaskHandle, we use the [`WalReceiverConnectionHandler`] task kind.
    ///
-    /// Once the connection is established, the `TaskHandle` task creates a
-    /// [`WalReceiverConnectionPoller`] task_mgr task that is responsible for polling
+    /// Once the connection is established, the `TaskHandle` task spawns a
+    /// [`WalReceiverConnectionPoller`] task that is responsible for polling
    /// the `Connection` object.
    /// A `CancellationToken` created by the `TaskHandle` task ensures
    /// that the [`WalReceiverConnectionPoller`] task will cancel soon after as the `TaskHandle` is dropped.
@@ -230,7 +229,6 @@ pub enum TaskKind {
    WalReceiverManager,

    /// The `TaskHandle` task that executes `handle_walreceiver_connection`.
-    /// Not a `task_mgr` task, but we use this `TaskKind` for its `RequestContext`.
    /// See the comment on [`WalReceiverManager`].
    ///
    /// [`WalReceiverManager`]: Self::WalReceiverManager
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -12,6 +12,7 @@
 //!

 use anyhow::{bail, Context};
+use arc_swap::ArcSwap;
 use camino::Utf8Path;
 use camino::Utf8PathBuf;
 use enumset::EnumSet;
@@ -98,7 +99,7 @@ use std::ops::Bound::Included;
 use std::sync::atomic::AtomicU64;
 use std::sync::atomic::Ordering;
 use std::sync::Arc;
-use std::sync::{Mutex, RwLock};
+use std::sync::Mutex;
 use std::time::{Duration, Instant};

 use crate::span;
@@ -260,7 +261,7 @@ pub struct Tenant {
    // We keep TenantConfOpt sturct here to preserve the information
    // about parameters that are not set.
    // This is necessary to allow global config updates.
-    tenant_conf: Arc<RwLock<AttachedTenantConf>>,
+    tenant_conf: Arc<ArcSwap<AttachedTenantConf>>,

    tenant_shard_id: TenantShardId,

@@ -1411,7 +1412,7 @@ impl Tenant {
    /// the same timeline ID already exists, returns CreateTimelineError::AlreadyExists.
    #[allow(clippy::too_many_arguments)]
    pub(crate) async fn create_timeline(
-        &self,
+        self: &Arc<Tenant>,
        new_timeline_id: TimelineId,
        ancestor_timeline_id: Option<TimelineId>,
        mut ancestor_start_lsn: Option<Lsn>,
@@ -1515,7 +1516,7 @@ impl Tenant {
                    // sizes etc. and that would get confused if the previous page versions
                    // are not in the repository yet.
                    ancestor_timeline
-                        .wait_lsn(*lsn, ctx)
+                        .wait_lsn(*lsn, timeline::WaitLsnWaiter::Tenant, ctx)
                        .await
                        .map_err(|e| match e {
                            e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState) => {
@@ -1559,7 +1560,7 @@ impl Tenant {
            })?;
        }

-        loaded_timeline.activate(broker_client, None, ctx);
+        loaded_timeline.activate(self.clone(), broker_client, None, ctx);

        Ok(loaded_timeline)
    }
@@ -1606,7 +1607,7 @@ impl Tenant {
        );

        {
-            let conf = self.tenant_conf.read().unwrap();
+            let conf = self.tenant_conf.load();

            if !conf.location.may_delete_layers_hint() {
                info!("Skipping GC in location state {:?}", conf.location);
@@ -1633,7 +1634,7 @@ impl Tenant {
        }

        {
-            let conf = self.tenant_conf.read().unwrap();
+            let conf = self.tenant_conf.load();
            if !conf.location.may_delete_layers_hint() || !conf.location.may_upload_layers_hint() {
                info!("Skipping compaction in location state {:?}", conf.location);
                return Ok(());
@@ -1731,7 +1732,12 @@ impl Tenant {
            let mut activated_timelines = 0;

            for timeline in timelines_to_activate {
-                timeline.activate(broker_client.clone(), background_jobs_can_start, ctx);
+                timeline.activate(
+                    self.clone(),
+                    broker_client.clone(),
+                    background_jobs_can_start,
+                    ctx,
+                );
                activated_timelines += 1;
            }

@@ -1777,7 +1783,7 @@ impl Tenant {
    async fn shutdown(
        &self,
        shutdown_progress: completion::Barrier,
-        freeze_and_flush: bool,
+        shutdown_mode: timeline::ShutdownMode,
    ) -> Result<(), completion::Barrier> {
        span::debug_assert_current_span_has_tenant_id();

@@ -1824,16 +1830,8 @@ impl Tenant {
            timelines.values().for_each(|timeline| {
                let timeline = Arc::clone(timeline);
                let timeline_id = timeline.timeline_id;
-
-                let span =
-                    tracing::info_span!("timeline_shutdown", %timeline_id, ?freeze_and_flush);
-                js.spawn(async move {
-                    if freeze_and_flush {
-                        timeline.flush_and_shutdown().instrument(span).await
-                    } else {
-                        timeline.shutdown().instrument(span).await
-                    }
-                });
+                let span = tracing::info_span!("timeline_shutdown", %timeline_id, ?shutdown_mode);
+                js.spawn(async move { timeline.shutdown(shutdown_mode).instrument(span).await });
            })
        };
        // test_long_timeline_create_then_tenant_delete is leaning on this message
@@ -2063,7 +2061,12 @@ impl Tenant {
                TenantState::Active { .. } => {
                    return Ok(());
                }
-                TenantState::Broken { .. } | TenantState::Stopping { .. } => {
+                TenantState::Broken { reason, .. } => {
+                    // This is fatal, and reported distinctly from the general case of "will never be active" because
+                    // it's logically a 500 to external API users (broken is always a bug).
+                    return Err(GetActiveTenantError::Broken(reason));
+                }
+                TenantState::Stopping { .. } => {
                    // There's no chance the tenant can transition back into ::Active
                    return Err(GetActiveTenantError::WillNotBecomeActive(current_state));
                }
@@ -2072,14 +2075,14 @@ impl Tenant {
    }

    pub(crate) fn get_attach_mode(&self) -> AttachmentMode {
-        self.tenant_conf.read().unwrap().location.attach_mode
+        self.tenant_conf.load().location.attach_mode
    }

    /// For API access: generate a LocationConfig equivalent to the one that would be used to
    /// create a Tenant in the same state.  Do not use this in hot paths: it's for relatively
    /// rare external API calls, like a reconciliation at startup.
    pub(crate) fn get_location_conf(&self) -> models::LocationConfig {
-        let conf = self.tenant_conf.read().unwrap();
+        let conf = self.tenant_conf.load();

        let location_config_mode = match conf.location.attach_mode {
            AttachmentMode::Single => models::LocationConfigMode::AttachedSingle,
@@ -2141,7 +2144,7 @@ impl Tenant {

            // Shut down the timeline's remote client: this means that the indices we write
            // for child shards will not be invalidated by the parent shard deleting layers.
-            tl_client.shutdown().await?;
+            tl_client.shutdown().await;

            // Download methods can still be used after shutdown, as they don't flow through the remote client's
            // queue.  In principal the RemoteTimelineClient could provide this without downloading it, but this
@@ -2226,7 +2229,7 @@ where

 impl Tenant {
    pub fn tenant_specific_overrides(&self) -> TenantConfOpt {
-        self.tenant_conf.read().unwrap().tenant_conf.clone()
+        self.tenant_conf.load().tenant_conf.clone()
    }

    pub fn effective_config(&self) -> TenantConf {
@@ -2235,84 +2238,84 @@ impl Tenant {
    }

    pub fn get_checkpoint_distance(&self) -> u64 {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
            .checkpoint_distance
            .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance)
    }

    pub fn get_checkpoint_timeout(&self) -> Duration {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
            .checkpoint_timeout
            .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout)
    }

    pub fn get_compaction_target_size(&self) -> u64 {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
            .compaction_target_size
            .unwrap_or(self.conf.default_tenant_conf.compaction_target_size)
    }

    pub fn get_compaction_period(&self) -> Duration {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
            .compaction_period
            .unwrap_or(self.conf.default_tenant_conf.compaction_period)
    }

    pub fn get_compaction_threshold(&self) -> usize {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
            .compaction_threshold
            .unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
    }

    pub fn get_gc_horizon(&self) -> u64 {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
            .gc_horizon
            .unwrap_or(self.conf.default_tenant_conf.gc_horizon)
    }

    pub fn get_gc_period(&self) -> Duration {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
            .gc_period
            .unwrap_or(self.conf.default_tenant_conf.gc_period)
    }

    pub fn get_image_creation_threshold(&self) -> usize {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
            .image_creation_threshold
            .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
    }

    pub fn get_pitr_interval(&self) -> Duration {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
            .pitr_interval
            .unwrap_or(self.conf.default_tenant_conf.pitr_interval)
    }

    pub fn get_trace_read_requests(&self) -> bool {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
            .trace_read_requests
            .unwrap_or(self.conf.default_tenant_conf.trace_read_requests)
    }

    pub fn get_min_resident_size_override(&self) -> Option<u64> {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
            .min_resident_size_override
            .or(self.conf.default_tenant_conf.min_resident_size_override)
    }

    pub fn get_heatmap_period(&self) -> Option<Duration> {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        let heatmap_period = tenant_conf
            .heatmap_period
            .unwrap_or(self.conf.default_tenant_conf.heatmap_period);
@@ -2324,26 +2327,40 @@ impl Tenant {
    }

    pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
-        self.tenant_conf.write().unwrap().tenant_conf = new_tenant_conf;
-        self.tenant_conf_updated();
+        // Use read-copy-update in order to avoid overwriting the location config
+        // state if this races with [`Tenant::set_new_location_config`]. Note that
+        // this race is not possible if both request types come from the storage
+        // controller (as they should!) because an exclusive op lock is required
+        // on the storage controller side.
+        self.tenant_conf.rcu(|inner| {
+            Arc::new(AttachedTenantConf {
+                tenant_conf: new_tenant_conf.clone(),
+                location: inner.location,
+            })
+        });
+
+        self.tenant_conf_updated(&new_tenant_conf);
        // Don't hold self.timelines.lock() during the notifies.
        // There's no risk of deadlock right now, but there could be if we consolidate
        // mutexes in struct Timeline in the future.
        let timelines = self.list_timelines();
        for timeline in timelines {
-            timeline.tenant_conf_updated();
+            timeline.tenant_conf_updated(&new_tenant_conf);
        }
    }

    pub(crate) fn set_new_location_config(&self, new_conf: AttachedTenantConf) {
-        *self.tenant_conf.write().unwrap() = new_conf;
-        self.tenant_conf_updated();
+        let new_tenant_conf = new_conf.tenant_conf.clone();
+
+        self.tenant_conf.store(Arc::new(new_conf));
+
+        self.tenant_conf_updated(&new_tenant_conf);
        // Don't hold self.timelines.lock() during the notifies.
        // There's no risk of deadlock right now, but there could be if we consolidate
        // mutexes in struct Timeline in the future.
        let timelines = self.list_timelines();
        for timeline in timelines {
-            timeline.tenant_conf_updated();
+            timeline.tenant_conf_updated(&new_tenant_conf);
        }
    }

@@ -2357,11 +2374,8 @@ impl Tenant {
            .unwrap_or(psconf.default_tenant_conf.timeline_get_throttle.clone())
    }

-    pub(crate) fn tenant_conf_updated(&self) {
-        let conf = {
-            let guard = self.tenant_conf.read().unwrap();
-            Self::get_timeline_get_throttle_config(self.conf, &guard.tenant_conf)
-        };
+    pub(crate) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) {
+        let conf = Self::get_timeline_get_throttle_config(self.conf, new_conf);
        self.timeline_get_throttle.reconfigure(conf)
    }

@@ -2509,7 +2523,7 @@ impl Tenant {
                Tenant::get_timeline_get_throttle_config(conf, &attached_conf.tenant_conf),
                &crate::metrics::tenant_throttling::TIMELINE_GET,
            )),
-            tenant_conf: Arc::new(RwLock::new(attached_conf)),
+            tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
        }
    }

@@ -3495,7 +3509,7 @@ impl Tenant {
    }

    pub(crate) fn get_tenant_conf(&self) -> TenantConfOpt {
-        self.tenant_conf.read().unwrap().tenant_conf.clone()
+        self.tenant_conf.load().tenant_conf.clone()
    }
 }

@@ -3643,6 +3657,9 @@ pub(crate) mod harness {
                heatmap_period: Some(tenant_conf.heatmap_period),
                lazy_slru_download: Some(tenant_conf.lazy_slru_download),
                timeline_get_throttle: Some(tenant_conf.timeline_get_throttle),
+                image_layer_creation_check_threshold: Some(
+                    tenant_conf.image_layer_creation_check_threshold,
+                ),
            }
        }
    }
@@ -3841,6 +3858,7 @@ mod tests {
    use hex_literal::hex;
    use pageserver_api::keyspace::KeySpace;
    use rand::{thread_rng, Rng};
+    use tests::timeline::ShutdownMode;

    static TEST_KEY: Lazy<Key> =
        Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));
@@ -4286,7 +4304,7 @@ mod tests {
            make_some_layers(tline.as_ref(), Lsn(0x8000), &ctx).await?;
            // so that all uploads finish & we can call harness.load() below again
            tenant
-                .shutdown(Default::default(), true)
+                .shutdown(Default::default(), ShutdownMode::FreezeAndFlush)
                .instrument(harness.span())
                .await
                .ok()
@@ -4327,7 +4345,7 @@ mod tests {

            // so that all uploads finish & we can call harness.load() below again
            tenant
-                .shutdown(Default::default(), true)
+                .shutdown(Default::default(), ShutdownMode::FreezeAndFlush)
                .instrument(harness.span())
                .await
                .ok()
@@ -5108,7 +5126,7 @@ mod tests {
            // Leave the timeline ID in [`Tenant::timelines_creating`] to exclude attempting to create it again
            let raw_tline = tline.raw_timeline().unwrap();
            raw_tline
-                .shutdown()
+                .shutdown(super::timeline::ShutdownMode::Hard)
                .instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id, shard_id=%raw_tline.tenant_shard_id.shard_slug(), timeline_id=%TIMELINE_ID))
                .await;
            std::mem::forget(tline);
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -57,6 +57,9 @@ pub mod defaults {
    // throughputs up to 1GiB/s per timeline.
    pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024;
    pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
+    // By default ingest enough WAL for two new L0 layers before checking if new image
+    // image layers should be created.
+    pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;

    pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
 }
@@ -362,6 +365,10 @@ pub struct TenantConf {
    pub lazy_slru_download: bool,

    pub timeline_get_throttle: pageserver_api::models::ThrottleConfig,
+
+    // How much WAL must be ingested before checking again whether a new image layer is required.
+    // Expresed in multiples of checkpoint distance.
+    pub image_layer_creation_check_threshold: u8,
 }

 /// Same as TenantConf, but this struct preserves the information about
@@ -454,6 +461,9 @@ pub struct TenantConfOpt {

    #[serde(skip_serializing_if = "Option::is_none")]
    pub timeline_get_throttle: Option<pageserver_api::models::ThrottleConfig>,
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub image_layer_creation_check_threshold: Option<u8>,
 }

 impl TenantConfOpt {
@@ -508,6 +518,9 @@ impl TenantConfOpt {
                .timeline_get_throttle
                .clone()
                .unwrap_or(global_conf.timeline_get_throttle),
+            image_layer_creation_check_threshold: self
+                .image_layer_creation_check_threshold
+                .unwrap_or(global_conf.image_layer_creation_check_threshold),
        }
    }
 }
@@ -548,6 +561,7 @@ impl Default for TenantConf {
            heatmap_period: Duration::ZERO,
            lazy_slru_download: false,
            timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
+            image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
        }
    }
 }
@@ -621,6 +635,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
            heatmap_period: value.heatmap_period.map(humantime),
            lazy_slru_download: value.lazy_slru_download,
            timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
+            image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
        }
    }
 }
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -14,7 +14,10 @@ use crate::{
    config::PageServerConf,
    context::RequestContext,
    task_mgr::{self, TaskKind},
-    tenant::mgr::{TenantSlot, TenantsMapRemoveResult},
+    tenant::{
+        mgr::{TenantSlot, TenantsMapRemoveResult},
+        timeline::ShutdownMode,
+    },
 };

 use super::{
@@ -111,6 +114,7 @@ async fn create_local_delete_mark(
    let _ = std::fs::OpenOptions::new()
        .write(true)
        .create(true)
+        .truncate(true)
        .open(&marker_path)
        .with_context(|| format!("could not create delete marker file {marker_path:?}"))?;

@@ -462,7 +466,7 @@ impl DeleteTenantFlow {
        // tenant.shutdown
        // Its also bad that we're holding tenants.read here.
        // TODO relax set_stopping to be idempotent?
-        if tenant.shutdown(progress, false).await.is_err() {
+        if tenant.shutdown(progress, ShutdownMode::Hard).await.is_err() {
            return Err(DeleteTenantError::Other(anyhow::anyhow!(
                "tenant shutdown is already in progress"
            )));
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -72,6 +72,10 @@ impl EphemeralFile {
        self.len
    }

+    pub(crate) fn id(&self) -> page_cache::FileId {
+        self.page_cache_file_id
+    }
+
    pub(crate) async fn read_blk(
        &self,
        blknum: u32,
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -346,35 +346,6 @@ where
    }
 }

-#[derive(PartialEq, Eq, Hash, Debug, Clone)]
-pub enum InMemoryLayerHandle {
-    Open {
-        lsn_floor: Lsn,
-        end_lsn: Lsn,
-    },
-    Frozen {
-        idx: usize,
-        lsn_floor: Lsn,
-        end_lsn: Lsn,
-    },
-}
-
-impl InMemoryLayerHandle {
-    pub fn get_lsn_floor(&self) -> Lsn {
-        match self {
-            InMemoryLayerHandle::Open { lsn_floor, .. } => *lsn_floor,
-            InMemoryLayerHandle::Frozen { lsn_floor, .. } => *lsn_floor,
-        }
-    }
-
-    pub fn get_end_lsn(&self) -> Lsn {
-        match self {
-            InMemoryLayerHandle::Open { end_lsn, .. } => *end_lsn,
-            InMemoryLayerHandle::Frozen { end_lsn, .. } => *end_lsn,
-        }
-    }
-}
-
 impl LayerMap {
    ///
    /// Find the latest layer (by lsn.end) that covers the given
@@ -576,41 +547,18 @@ impl LayerMap {
        self.historic.iter()
    }

-    /// Get a handle for the first in memory layer that matches the provided predicate.
-    /// The handle should be used with [`Self::get_in_memory_layer`] to retrieve the actual layer.
-    ///
-    /// Note: [`Self::find_in_memory_layer`] and [`Self::get_in_memory_layer`] should be called during
-    /// the same exclusive region established by holding the layer manager lock.
-    pub fn find_in_memory_layer<Pred>(&self, mut pred: Pred) -> Option<InMemoryLayerHandle>
+    /// Get a ref counted pointer for the first in memory layer that matches the provided predicate.
+    pub fn find_in_memory_layer<Pred>(&self, mut pred: Pred) -> Option<Arc<InMemoryLayer>>
    where
        Pred: FnMut(&Arc<InMemoryLayer>) -> bool,
    {
        if let Some(open) = &self.open_layer {
            if pred(open) {
-                return Some(InMemoryLayerHandle::Open {
-                    lsn_floor: open.get_lsn_range().start,
-                    end_lsn: open.get_lsn_range().end,
-                });
+                return Some(open.clone());
            }
        }

-        let pos = self.frozen_layers.iter().rev().position(pred);
-        pos.map(|rev_idx| {
-            let idx = self.frozen_layers.len() - 1 - rev_idx;
-            InMemoryLayerHandle::Frozen {
-                idx,
-                lsn_floor: self.frozen_layers[idx].get_lsn_range().start,
-                end_lsn: self.frozen_layers[idx].get_lsn_range().end,
-            }
-        })
-    }
-
-    /// Get the layer pointed to by the provided handle.
-    pub fn get_in_memory_layer(&self, handle: &InMemoryLayerHandle) -> Option<Arc<InMemoryLayer>> {
-        match handle {
-            InMemoryLayerHandle::Open { .. } => self.open_layer.clone(),
-            InMemoryLayerHandle::Frozen { idx, .. } => self.frozen_layers.get(*idx).cloned(),
-        }
+        self.frozen_layers.iter().rfind(|l| pred(l)).cloned()
    }

    ///
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -4,7 +4,7 @@
 use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
 use itertools::Itertools;
 use pageserver_api::key::Key;
-use pageserver_api::models::{LocationConfigMode, ShardParameters};
+use pageserver_api::models::LocationConfigMode;
 use pageserver_api::shard::{
    ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId,
 };
@@ -16,6 +16,7 @@ use std::collections::{BTreeMap, HashMap};
 use std::ops::Deref;
 use std::sync::Arc;
 use std::time::{Duration, Instant};
+use sysinfo::SystemExt;
 use tokio::fs;
 use utils::timeout::{timeout_cancellable, TimeoutCancellableError};

@@ -39,10 +40,11 @@ use crate::metrics::{TENANT, TENANT_MANAGER as METRICS};
 use crate::task_mgr::{self, TaskKind};
 use crate::tenant::config::{
    AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig,
-    TenantConfOpt,
 };
 use crate::tenant::delete::DeleteTenantFlow;
 use crate::tenant::span::debug_assert_current_span_has_tenant_id;
+use crate::tenant::storage_layer::inmemory_layer;
+use crate::tenant::timeline::ShutdownMode;
 use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState};
 use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TEMP_FILE_SUFFIX};

@@ -543,6 +545,18 @@ pub async fn init_tenant_mgr(

    let ctx = RequestContext::todo_child(TaskKind::Startup, DownloadBehavior::Warn);

+    // Initialize dynamic limits that depend on system resources
+    let system_memory =
+        sysinfo::System::new_with_specifics(sysinfo::RefreshKind::new().with_memory())
+            .total_memory();
+    let max_ephemeral_layer_bytes =
+        conf.ephemeral_bytes_per_memory_kb as u64 * (system_memory / 1024);
+    tracing::info!("Initialized ephemeral layer size limit to {max_ephemeral_layer_bytes}, for {system_memory} bytes of memory");
+    inmemory_layer::GLOBAL_RESOURCES.max_dirty_bytes.store(
+        max_ephemeral_layer_bytes,
+        std::sync::atomic::Ordering::Relaxed,
+    );
+
    // Scan local filesystem for attached tenants
    let tenant_configs = init_load_tenant_configs(conf).await?;

@@ -770,11 +784,9 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
                            shutdown_state.insert(tenant_shard_id, TenantSlot::Attached(t.clone()));
                            join_set.spawn(
                                async move {
-                                    let freeze_and_flush = true;
-
                                    let res = {
                                        let (_guard, shutdown_progress) = completion::channel();
-                                        t.shutdown(shutdown_progress, freeze_and_flush).await
+                                        t.shutdown(shutdown_progress, ShutdownMode::FreezeAndFlush).await
                                    };

                                    if let Err(other_progress) = res {
@@ -875,16 +887,6 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
    // caller will log how long we took
 }

-#[derive(Debug, thiserror::Error)]
-pub(crate) enum SetNewTenantConfigError {
-    #[error(transparent)]
-    GetTenant(#[from] GetTenantError),
-    #[error(transparent)]
-    Persist(anyhow::Error),
-    #[error(transparent)]
-    Other(anyhow::Error),
-}
-
 #[derive(thiserror::Error, Debug)]
 pub(crate) enum UpsertLocationError {
    #[error("Bad config request: {0}")]
@@ -910,32 +912,21 @@ impl TenantManager {
        self.conf
    }

-    /// Gets the attached tenant from the in-memory data, erroring if it's absent, in secondary mode, or is not fitting to the query.
-    /// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants.
+    /// Gets the attached tenant from the in-memory data, erroring if it's absent, in secondary mode, or currently
+    /// undergoing a state change (i.e. slot is InProgress).
+    ///
+    /// The return Tenant is not guaranteed to be active: check its status after obtaing it, or
+    /// use [`Tenant::wait_to_become_active`] before using it if you will do I/O on it.
    pub(crate) fn get_attached_tenant_shard(
        &self,
        tenant_shard_id: TenantShardId,
-        active_only: bool,
    ) -> Result<Arc<Tenant>, GetTenantError> {
        let locked = self.tenants.read().unwrap();

        let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)?;

        match peek_slot {
-            Some(TenantSlot::Attached(tenant)) => match tenant.current_state() {
-                TenantState::Broken {
-                    reason,
-                    backtrace: _,
-                } if active_only => Err(GetTenantError::Broken(reason)),
-                TenantState::Active => Ok(Arc::clone(tenant)),
-                _ => {
-                    if active_only {
-                        Err(GetTenantError::NotActive(tenant_shard_id))
-                    } else {
-                        Ok(Arc::clone(tenant))
-                    }
-                }
-            },
+            Some(TenantSlot::Attached(tenant)) => Ok(Arc::clone(tenant)),
            Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_shard_id)),
            None | Some(TenantSlot::Secondary(_)) => {
                Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
@@ -1115,7 +1106,7 @@ impl TenantManager {
                };

                info!("Shutting down attached tenant");
-                match tenant.shutdown(progress, false).await {
+                match tenant.shutdown(progress, ShutdownMode::Hard).await {
                    Ok(()) => {}
                    Err(barrier) => {
                        info!("Shutdown already in progress, waiting for it to complete");
@@ -1231,7 +1222,7 @@ impl TenantManager {
                    TenantSlot::Attached(tenant) => {
                        let (_guard, progress) = utils::completion::channel();
                        info!("Shutting down just-spawned tenant, because tenant manager is shut down");
-                        match tenant.shutdown(progress, false).await {
+                        match tenant.shutdown(progress, ShutdownMode::Hard).await {
                            Ok(()) => {
                                info!("Finished shutting down just-spawned tenant");
                            }
@@ -1281,7 +1272,7 @@ impl TenantManager {
        };

        let (_guard, progress) = utils::completion::channel();
-        match tenant.shutdown(progress, false).await {
+        match tenant.shutdown(progress, ShutdownMode::Hard).await {
            Ok(()) => {
                slot_guard.drop_old_value()?;
            }
@@ -1428,7 +1419,8 @@ impl TenantManager {
                    .wait_to_become_active(activation_timeout)
                    .await
                    .map_err(|e| match e {
-                        GetActiveTenantError::WillNotBecomeActive(_) => {
+                        GetActiveTenantError::WillNotBecomeActive(_)
+                        | GetActiveTenantError::Broken(_) => {
                            DeleteTenantError::InvalidState(tenant.current_state())
                        }
                        GetActiveTenantError::Cancelled => DeleteTenantError::Cancelled,
@@ -1455,29 +1447,30 @@ impl TenantManager {
        result
    }

-    #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), new_shard_count=%new_shard_count.literal()))]
+    #[instrument(skip_all, fields(tenant_id=%tenant.get_tenant_shard_id().tenant_id, shard_id=%tenant.get_tenant_shard_id().shard_slug(), new_shard_count=%new_shard_count.literal()))]
    pub(crate) async fn shard_split(
        &self,
-        tenant_shard_id: TenantShardId,
+        tenant: Arc<Tenant>,
        new_shard_count: ShardCount,
        new_stripe_size: Option<ShardStripeSize>,
        ctx: &RequestContext,
    ) -> anyhow::Result<Vec<TenantShardId>> {
+        let tenant_shard_id = *tenant.get_tenant_shard_id();
        let r = self
-            .do_shard_split(tenant_shard_id, new_shard_count, new_stripe_size, ctx)
+            .do_shard_split(tenant, new_shard_count, new_stripe_size, ctx)
            .await;
        if r.is_err() {
            // Shard splitting might have left the original shard in a partially shut down state (it
            // stops the shard's remote timeline client).  Reset it to ensure we leave things in
            // a working state.
            if self.get(tenant_shard_id).is_some() {
-                tracing::warn!("Resetting {tenant_shard_id} after shard split failure");
+                tracing::warn!("Resetting after shard split failure");
                if let Err(e) = self.reset_tenant(tenant_shard_id, false, ctx).await {
                    // Log this error because our return value will still be the original error, not this one.  This is
                    // a severe error: if this happens, we might be leaving behind a tenant that is not fully functional
                    // (e.g. has uploads disabled).  We can't do anything else: if reset fails then shutting the tenant down or
                    // setting it broken probably won't help either.
-                    tracing::error!("Failed to reset {tenant_shard_id}: {e}");
+                    tracing::error!("Failed to reset: {e}");
                }
            }
        }
@@ -1487,12 +1480,12 @@ impl TenantManager {

    pub(crate) async fn do_shard_split(
        &self,
-        tenant_shard_id: TenantShardId,
+        tenant: Arc<Tenant>,
        new_shard_count: ShardCount,
        new_stripe_size: Option<ShardStripeSize>,
        ctx: &RequestContext,
    ) -> anyhow::Result<Vec<TenantShardId>> {
-        let tenant = get_tenant(tenant_shard_id, true)?;
+        let tenant_shard_id = *tenant.get_tenant_shard_id();

        // Validate the incoming request
        if new_shard_count.count() <= tenant_shard_id.shard_count.count() {
@@ -1538,7 +1531,6 @@ impl TenantManager {
            // If [`Tenant::split_prepare`] fails, we must reload the tenant, because it might
            // have been left in a partially-shut-down state.
            tracing::warn!("Failed to prepare for split: {e}, reloading Tenant before returning");
-            self.reset_tenant(tenant_shard_id, false, ctx).await?;
            return Err(e);
        }

@@ -1656,7 +1648,14 @@ impl TenantManager {
                    fail::fail_point!("shard-split-lsn-wait", |_| Err(anyhow::anyhow!(
                        "failpoint"
                    )));
-                    if let Err(e) = timeline.wait_lsn(*target_lsn, ctx).await {
+                    if let Err(e) = timeline
+                        .wait_lsn(
+                            *target_lsn,
+                            crate::tenant::timeline::WaitLsnWaiter::Tenant,
+                            ctx,
+                        )
+                        .await
+                    {
                        // Failure here might mean shutdown, in any case this part is an optimization
                        // and we shouldn't hold up the split operation.
                        tracing::warn!(
@@ -1677,7 +1676,7 @@ impl TenantManager {

        // Phase 5: Shut down the parent shard, and erase it from disk
        let (_guard, progress) = completion::channel();
-        match parent.shutdown(progress, false).await {
+        match parent.shutdown(progress, ShutdownMode::Hard).await {
            Ok(()) => {}
            Err(other) => {
                other.wait().await;
@@ -1936,38 +1935,23 @@ impl TenantManager {
        removal_result
    }

-    pub(crate) async fn set_new_tenant_config(
+    pub(crate) fn list_tenants(
        &self,
-        new_tenant_conf: TenantConfOpt,
-        tenant_id: TenantId,
-    ) -> Result<(), SetNewTenantConfigError> {
-        // Legacy API: does not support sharding
-        let tenant_shard_id = TenantShardId::unsharded(tenant_id);
-
-        info!("configuring tenant {tenant_id}");
-        let tenant = get_tenant(tenant_shard_id, true)?;
-
-        if !tenant.tenant_shard_id().shard_count.is_unsharded() {
-            // Note that we use ShardParameters::default below.
-            return Err(SetNewTenantConfigError::Other(anyhow::anyhow!(
-            "This API may only be used on single-sharded tenants, use the /location_config API for sharded tenants"
-        )));
-        }
-
-        // This is a legacy API that only operates on attached tenants: the preferred
-        // API to use is the location_config/ endpoint, which lets the caller provide
-        // the full LocationConf.
-        let location_conf = LocationConf::attached_single(
-            new_tenant_conf.clone(),
-            tenant.generation,
-            &ShardParameters::default(),
-        );
-
-        Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &location_conf)
-            .await
-            .map_err(SetNewTenantConfigError::Persist)?;
-        tenant.set_new_tenant_config(new_tenant_conf);
-        Ok(())
+    ) -> Result<Vec<(TenantShardId, TenantState, Generation)>, TenantMapListError> {
+        let tenants = TENANTS.read().unwrap();
+        let m = match &*tenants {
+            TenantsMap::Initializing => return Err(TenantMapListError::Initializing),
+            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m,
+        };
+        Ok(m.iter()
+            .filter_map(|(id, tenant)| match tenant {
+                TenantSlot::Attached(tenant) => {
+                    Some((*id, tenant.current_state(), tenant.generation()))
+                }
+                TenantSlot::Secondary(_) => None,
+                TenantSlot::InProgress(_) => None,
+            })
+            .collect())
    }
 }

@@ -1980,51 +1964,12 @@ pub(crate) enum GetTenantError {

    #[error("Tenant {0} is not active")]
    NotActive(TenantShardId),
-    /// Broken is logically a subset of NotActive, but a distinct error is useful as
-    /// NotActive is usually a retryable state for API purposes, whereas Broken
-    /// is a stuck error state
-    #[error("Tenant is broken: {0}")]
-    Broken(String),

    // Initializing or shutting down: cannot authoritatively say whether we have this tenant
    #[error("Tenant map is not available: {0}")]
    MapState(#[from] TenantMapError),
 }

-/// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query.
-/// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants.
-///
-/// This method is cancel-safe.
-pub(crate) fn get_tenant(
-    tenant_shard_id: TenantShardId,
-    active_only: bool,
-) -> Result<Arc<Tenant>, GetTenantError> {
-    let locked = TENANTS.read().unwrap();
-
-    let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)?;
-
-    match peek_slot {
-        Some(TenantSlot::Attached(tenant)) => match tenant.current_state() {
-            TenantState::Broken {
-                reason,
-                backtrace: _,
-            } if active_only => Err(GetTenantError::Broken(reason)),
-            TenantState::Active => Ok(Arc::clone(tenant)),
-            _ => {
-                if active_only {
-                    Err(GetTenantError::NotActive(tenant_shard_id))
-                } else {
-                    Ok(Arc::clone(tenant))
-                }
-            }
-        },
-        Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_shard_id)),
-        None | Some(TenantSlot::Secondary(_)) => {
-            Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
-        }
-    }
-}
-
 #[derive(thiserror::Error, Debug)]
 pub(crate) enum GetActiveTenantError {
    /// We may time out either while TenantSlot is InProgress, or while the Tenant
@@ -2048,6 +1993,12 @@ pub(crate) enum GetActiveTenantError {
    /// Tenant exists, but is in a state that cannot become active (e.g. Stopping, Broken)
    #[error("will not become active.  Current state: {0}")]
    WillNotBecomeActive(TenantState),
+
+    /// Broken is logically a subset of WillNotBecomeActive, but a distinct error is useful as
+    /// WillNotBecomeActive is a permitted error under some circumstances, whereas broken should
+    /// never happen.
+    #[error("Tenant is broken: {0}")]
+    Broken(String),
 }

 /// Get a [`Tenant`] in its active state. If the tenant_id is currently in [`TenantSlot::InProgress`]
@@ -2267,27 +2218,6 @@ pub(crate) enum TenantMapListError {
    Initializing,
 }

-///
-/// Get list of tenants, for the mgmt API
-///
-pub(crate) async fn list_tenants(
-) -> Result<Vec<(TenantShardId, TenantState, Generation)>, TenantMapListError> {
-    let tenants = TENANTS.read().unwrap();
-    let m = match &*tenants {
-        TenantsMap::Initializing => return Err(TenantMapListError::Initializing),
-        TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m,
-    };
-    Ok(m.iter()
-        .filter_map(|(id, tenant)| match tenant {
-            TenantSlot::Attached(tenant) => {
-                Some((*id, tenant.current_state(), tenant.generation()))
-            }
-            TenantSlot::Secondary(_) => None,
-            TenantSlot::InProgress(_) => None,
-        })
-        .collect())
-}
-
 #[derive(Debug, thiserror::Error)]
 pub(crate) enum TenantMapInsertError {
    #[error(transparent)]
@@ -2733,11 +2663,11 @@ where
    let attached_tenant = match slot_guard.get_old_value() {
        Some(TenantSlot::Attached(tenant)) => {
            // whenever we remove a tenant from memory, we don't want to flush and wait for upload
-            let freeze_and_flush = false;
+            let shutdown_mode = ShutdownMode::Hard;

            // shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so
            // that we can continue safely to cleanup.
-            match tenant.shutdown(progress, freeze_and_flush).await {
+            match tenant.shutdown(progress, shutdown_mode).await {
                Ok(()) => {}
                Err(_other) => {
                    // if pageserver shutdown or other detach/ignore is already ongoing, we don't want to
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -217,7 +217,7 @@ use crate::task_mgr::shutdown_token;
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::remote_timeline_client::download::download_retry;
 use crate::tenant::storage_layer::AsLayerDesc;
-use crate::tenant::upload_queue::Delete;
+use crate::tenant::upload_queue::{Delete, UploadQueueStoppedDeletable};
 use crate::tenant::TIMELINES_SEGMENT_NAME;
 use crate::{
    config::PageServerConf,
@@ -266,15 +266,6 @@ pub enum MaybeDeletedIndexPart {
    Deleted(IndexPart),
 }

-/// Errors that can arise when calling [`RemoteTimelineClient::stop`].
-#[derive(Debug, thiserror::Error)]
-pub enum StopError {
-    /// Returned if the upload queue was never initialized.
-    /// See [`RemoteTimelineClient::init_upload_queue`] and [`RemoteTimelineClient::init_upload_queue_for_empty_remote`].
-    #[error("queue is not initialized")]
-    QueueUninitialized,
-}
-
 #[derive(Debug, thiserror::Error)]
 pub enum PersistIndexPartWithDeletedFlagError {
    #[error("another task is already setting the deleted_flag, started at {0:?}")]
@@ -399,15 +390,10 @@ impl RemoteTimelineClient {
            "bug: it is responsibility of the caller to provide index part from MaybeDeletedIndexPart::Deleted"
        ))?;

-        {
-            let mut upload_queue = self.upload_queue.lock().unwrap();
-            upload_queue.initialize_with_current_remote_index_part(index_part)?;
-            self.update_remote_physical_size_gauge(Some(index_part));
-        }
-        // also locks upload queue, without dropping the guard above it will be a deadlock
-        self.stop().expect("initialized line above");
-
        let mut upload_queue = self.upload_queue.lock().unwrap();
+        upload_queue.initialize_with_current_remote_index_part(index_part)?;
+        self.update_remote_physical_size_gauge(Some(index_part));
+        self.stop_impl(&mut upload_queue);

        upload_queue
            .stopped_mut()
@@ -421,7 +407,8 @@ impl RemoteTimelineClient {
        match &mut *self.upload_queue.lock().unwrap() {
            UploadQueue::Uninitialized => None,
            UploadQueue::Initialized(q) => q.get_last_remote_consistent_lsn_projected(),
-            UploadQueue::Stopped(q) => q
+            UploadQueue::Stopped(UploadQueueStopped::Uninitialized) => None,
+            UploadQueue::Stopped(UploadQueueStopped::Deletable(q)) => q
                .upload_queue_for_deletion
                .get_last_remote_consistent_lsn_projected(),
        }
@@ -431,7 +418,8 @@ impl RemoteTimelineClient {
        match &mut *self.upload_queue.lock().unwrap() {
            UploadQueue::Uninitialized => None,
            UploadQueue::Initialized(q) => Some(q.get_last_remote_consistent_lsn_visible()),
-            UploadQueue::Stopped(q) => Some(
+            UploadQueue::Stopped(UploadQueueStopped::Uninitialized) => None,
+            UploadQueue::Stopped(UploadQueueStopped::Deletable(q)) => Some(
                q.upload_queue_for_deletion
                    .get_last_remote_consistent_lsn_visible(),
            ),
@@ -898,7 +886,7 @@ impl RemoteTimelineClient {
    /// Wait for all previously scheduled operations to complete, and then stop.
    ///
    /// Not cancellation safe
-    pub(crate) async fn shutdown(self: &Arc<Self>) -> Result<(), StopError> {
+    pub(crate) async fn shutdown(self: &Arc<Self>) {
        // On cancellation the queue is left in ackward state of refusing new operations but
        // proper stop is yet to be called. On cancel the original or some later task must call
        // `stop` or `shutdown`.
@@ -909,8 +897,12 @@ impl RemoteTimelineClient {
        let fut = {
            let mut guard = self.upload_queue.lock().unwrap();
            let upload_queue = match &mut *guard {
-                UploadQueue::Stopped(_) => return Ok(()),
-                UploadQueue::Uninitialized => return Err(StopError::QueueUninitialized),
+                UploadQueue::Stopped(_) => return,
+                UploadQueue::Uninitialized => {
+                    // transition into Stopped state
+                    self.stop_impl(&mut guard);
+                    return;
+                }
                UploadQueue::Initialized(ref mut init) => init,
            };

@@ -942,7 +934,7 @@ impl RemoteTimelineClient {
            }
        }

-        self.stop()
+        self.stop();
    }

    /// Set the deleted_at field in the remote index file.
@@ -1324,12 +1316,7 @@ impl RemoteTimelineClient {
            // upload finishes or times out soon enough.
            if cancel.is_cancelled() {
                info!("upload task cancelled by shutdown request");
-                match self.stop() {
-                    Ok(()) => {}
-                    Err(StopError::QueueUninitialized) => {
-                        unreachable!("we never launch an upload task if the queue is uninitialized, and once it is initialized, we never go back")
-                    }
-                }
+                self.stop();
                return;
            }

@@ -1582,19 +1569,25 @@ impl RemoteTimelineClient {
    /// Use [`RemoteTimelineClient::shutdown`] for graceful stop.
    ///
    /// In-progress operations will still be running after this function returns.
-    /// Use `task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id))`
+    /// Use `task_mgr::shutdown_tasks(Some(TaskKind::RemoteUploadTask), Some(self.tenant_shard_id), Some(timeline_id))`
    /// to wait for them to complete, after calling this function.
-    pub(crate) fn stop(&self) -> Result<(), StopError> {
+    pub(crate) fn stop(&self) {
        // Whichever *task* for this RemoteTimelineClient grabs the mutex first will transition the queue
        // into stopped state, thereby dropping all off the queued *ops* which haven't become *tasks* yet.
        // The other *tasks* will come here and observe an already shut down queue and hence simply wrap up their business.
        let mut guard = self.upload_queue.lock().unwrap();
-        match &mut *guard {
-            UploadQueue::Uninitialized => Err(StopError::QueueUninitialized),
+        self.stop_impl(&mut guard);
+    }
+
+    fn stop_impl(&self, guard: &mut std::sync::MutexGuard<UploadQueue>) {
+        match &mut **guard {
+            UploadQueue::Uninitialized => {
+                info!("UploadQueue is in state Uninitialized, nothing to do");
+                **guard = UploadQueue::Stopped(UploadQueueStopped::Uninitialized);
+            }
            UploadQueue::Stopped(_) => {
                // nothing to do
                info!("another concurrent task already shut down the queue");
-                Ok(())
            }
            UploadQueue::Initialized(initialized) => {
                info!("shutting down upload queue");
@@ -1627,11 +1620,13 @@ impl RemoteTimelineClient {
                    };

                    let upload_queue = std::mem::replace(
-                        &mut *guard,
-                        UploadQueue::Stopped(UploadQueueStopped {
-                            upload_queue_for_deletion,
-                            deleted_at: SetDeletedFlagProgress::NotRunning,
-                        }),
+                        &mut **guard,
+                        UploadQueue::Stopped(UploadQueueStopped::Deletable(
+                            UploadQueueStoppedDeletable {
+                                upload_queue_for_deletion,
+                                deleted_at: SetDeletedFlagProgress::NotRunning,
+                            },
+                        )),
                    );
                    if let UploadQueue::Initialized(qi) = upload_queue {
                        qi
@@ -1660,10 +1655,6 @@ impl RemoteTimelineClient {
                    // which is exactly what we want to happen.
                    drop(op);
                }
-
-                // We're done.
-                drop(guard);
-                Ok(())
            }
        }
    }
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -11,11 +11,11 @@ use crate::{
    disk_usage_eviction_task::{
        finite_f32, DiskUsageEvictionInfo, EvictionCandidate, EvictionLayer, EvictionSecondaryLayer,
    },
-    is_temporary,
    metrics::SECONDARY_MODE,
    tenant::{
        config::SecondaryLocationConfig,
        debug_assert_current_span_has_tenant_and_timeline_id,
+        ephemeral_file::is_ephemeral_file,
        remote_timeline_client::{
            index::LayerFileMetadata, is_temp_download_file, FAILED_DOWNLOAD_WARN_THRESHOLD,
            FAILED_REMOTE_OP_RETRIES,
@@ -786,6 +786,35 @@ impl<'a> TenantDownloader<'a> {
            // Existing on-disk layers: just update their access time.
            if let Some(on_disk) = timeline_state.on_disk_layers.get(&layer.name) {
                tracing::debug!("Layer {} is already on disk", layer.name);
+
+                if cfg!(debug_assertions) {
+                    // Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think
+                    // are already present on disk are really there.
+                    let local_path = self
+                        .conf
+                        .timeline_path(tenant_shard_id, &timeline.timeline_id)
+                        .join(layer.name.file_name());
+                    match tokio::fs::metadata(&local_path).await {
+                        Ok(meta) => {
+                            tracing::debug!(
+                                "Layer {} present at {}, size {}",
+                                layer.name,
+                                local_path,
+                                meta.len(),
+                            );
+                        }
+                        Err(e) => {
+                            tracing::warn!(
+                                "Layer {} not found at {} ({})",
+                                layer.name,
+                                local_path,
+                                e
+                            );
+                            debug_assert!(false);
+                        }
+                    }
+                }
+
                if on_disk.metadata != LayerFileMetadata::from(&layer.metadata)
                    || on_disk.access_time != layer.access_time
                {
@@ -964,7 +993,7 @@ async fn init_timeline_state(
            continue;
        } else if crate::is_temporary(&file_path)
            || is_temp_download_file(&file_path)
-            || is_temporary(&file_path)
+            || is_ephemeral_file(file_name)
        {
            // Temporary files are frequently left behind from restarting during downloads
            tracing::info!("Cleaning up temporary file {file_path}");
--- a/pageserver/src/tenant/secondary/heatmap_uploader.rs
+++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs
@@ -9,6 +9,7 @@ use crate::{
    metrics::SECONDARY_MODE,
    tenant::{
        config::AttachmentMode,
+        mgr::GetTenantError,
        mgr::TenantManager,
        remote_timeline_client::remote_heatmap_path,
        span::debug_assert_current_span_has_tenant_id,
@@ -292,8 +293,11 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
            "Starting heatmap write on command");
        let tenant = self
            .tenant_manager
-            .get_attached_tenant_shard(*tenant_shard_id, true)
+            .get_attached_tenant_shard(*tenant_shard_id)
            .map_err(|e| anyhow::anyhow!(e))?;
+        if !tenant.is_active() {
+            return Err(GetTenantError::NotActive(*tenant_shard_id).into());
+        }

        Ok(UploadPending {
            // Ignore our state for last digest: this forces an upload even if nothing has changed
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -3,7 +3,7 @@
 pub mod delta_layer;
 mod filename;
 pub mod image_layer;
-mod inmemory_layer;
+pub(crate) mod inmemory_layer;
 pub(crate) mod layer;
 mod layer_desc;

@@ -25,7 +25,7 @@ use std::cmp::{Ordering, Reverse};
 use std::collections::hash_map::Entry;
 use std::collections::{BinaryHeap, HashMap};
 use std::ops::Range;
-use std::sync::Mutex;
+use std::sync::{Arc, Mutex};
 use std::time::{Duration, SystemTime, UNIX_EPOCH};
 use tracing::warn;
 use utils::history_buffer::HistoryBufferWithDropCounter;
@@ -41,8 +41,8 @@ pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};

 pub(crate) use layer::{EvictionError, Layer, ResidentLayer};

-use super::layer_map::InMemoryLayerHandle;
-use super::timeline::layer_manager::LayerManager;
+use self::inmemory_layer::InMemoryLayerFileId;
+
 use super::timeline::GetVectoredError;
 use super::PageReconstructError;

@@ -204,23 +204,30 @@ impl Default for ValuesReconstructState {
    }
 }

-/// Description of layer to be read - the layer map can turn
-/// this description into the actual layer.
-#[derive(PartialEq, Eq, Hash, Debug, Clone)]
-pub(crate) enum ReadableLayerDesc {
-    Persistent {
-        desc: PersistentLayerDesc,
-        lsn_range: Range<Lsn>,
-    },
-    InMemory {
-        handle: InMemoryLayerHandle,
-        lsn_ceil: Lsn,
-    },
+/// A key that uniquely identifies a layer in a timeline
+#[derive(Debug, PartialEq, Eq, Clone, Hash)]
+pub(crate) enum LayerId {
+    PersitentLayerId(PersistentLayerKey),
+    InMemoryLayerId(InMemoryLayerFileId),
 }

-/// Wraper for 'ReadableLayerDesc' sorted by Lsn
+/// Layer wrapper for the read path. Note that it is valid
+/// to use these layers even after external operations have
+/// been performed on them (compaction, freeze, etc.).
 #[derive(Debug)]
-struct ReadableLayerDescOrdered(ReadableLayerDesc);
+pub(crate) enum ReadableLayer {
+    PersistentLayer(Layer),
+    InMemoryLayer(Arc<InMemoryLayer>),
+}
+
+/// A partial description of a read to be done.
+#[derive(Debug, Clone)]
+struct ReadDesc {
+    /// An id used to resolve the readable layer within the fringe
+    layer_id: LayerId,
+    /// Lsn range for the read, used for selecting the next read
+    lsn_range: Range<Lsn>,
+}

 /// Data structure which maintains a fringe of layers for the
 /// read path. The fringe is the set of layers which intersects
@@ -231,41 +238,64 @@ struct ReadableLayerDescOrdered(ReadableLayerDesc);
 /// a two layer indexing scheme.
 #[derive(Debug)]
 pub(crate) struct LayerFringe {
-    layers_by_lsn: BinaryHeap<ReadableLayerDescOrdered>,
-    layers: HashMap<ReadableLayerDesc, KeySpace>,
+    planned_reads_by_lsn: BinaryHeap<ReadDesc>,
+    layers: HashMap<LayerId, LayerKeyspace>,
+}
+
+#[derive(Debug)]
+struct LayerKeyspace {
+    layer: ReadableLayer,
+    target_keyspace: KeySpace,
 }

 impl LayerFringe {
    pub(crate) fn new() -> Self {
        LayerFringe {
-            layers_by_lsn: BinaryHeap::new(),
+            planned_reads_by_lsn: BinaryHeap::new(),
            layers: HashMap::new(),
        }
    }

-    pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayerDesc, KeySpace)> {
-        let handle = match self.layers_by_lsn.pop() {
-            Some(h) => h,
+    pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayer, KeySpace, Range<Lsn>)> {
+        let read_desc = match self.planned_reads_by_lsn.pop() {
+            Some(desc) => desc,
            None => return None,
        };

-        let removed = self.layers.remove_entry(&handle.0);
+        let removed = self.layers.remove_entry(&read_desc.layer_id);
        match removed {
-            Some((layer, keyspace)) => Some((layer, keyspace)),
+            Some((
+                _,
+                LayerKeyspace {
+                    layer,
+                    target_keyspace,
+                },
+            )) => Some((layer, target_keyspace, read_desc.lsn_range)),
            None => unreachable!("fringe internals are always consistent"),
        }
    }

-    pub(crate) fn update(&mut self, layer: ReadableLayerDesc, keyspace: KeySpace) {
-        let entry = self.layers.entry(layer.clone());
+    pub(crate) fn update(
+        &mut self,
+        layer: ReadableLayer,
+        keyspace: KeySpace,
+        lsn_range: Range<Lsn>,
+    ) {
+        let layer_id = layer.id();
+        let entry = self.layers.entry(layer_id.clone());
        match entry {
            Entry::Occupied(mut entry) => {
-                entry.get_mut().merge(&keyspace);
+                entry.get_mut().target_keyspace.merge(&keyspace);
            }
            Entry::Vacant(entry) => {
-                self.layers_by_lsn
-                    .push(ReadableLayerDescOrdered(entry.key().clone()));
-                entry.insert(keyspace);
+                self.planned_reads_by_lsn.push(ReadDesc {
+                    lsn_range,
+                    layer_id: layer_id.clone(),
+                });
+                entry.insert(LayerKeyspace {
+                    layer,
+                    target_keyspace: keyspace,
+                });
            }
        }
    }
@@ -277,77 +307,55 @@ impl Default for LayerFringe {
    }
 }

-impl Ord for ReadableLayerDescOrdered {
+impl Ord for ReadDesc {
    fn cmp(&self, other: &Self) -> Ordering {
-        let ord = self.0.get_lsn_ceil().cmp(&other.0.get_lsn_ceil());
+        let ord = self.lsn_range.end.cmp(&other.lsn_range.end);
        if ord == std::cmp::Ordering::Equal {
-            self.0
-                .get_lsn_floor()
-                .cmp(&other.0.get_lsn_floor())
-                .reverse()
+            self.lsn_range.start.cmp(&other.lsn_range.start).reverse()
        } else {
            ord
        }
    }
 }

-impl PartialOrd for ReadableLayerDescOrdered {
+impl PartialOrd for ReadDesc {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
 }

-impl PartialEq for ReadableLayerDescOrdered {
+impl PartialEq for ReadDesc {
    fn eq(&self, other: &Self) -> bool {
-        self.0.get_lsn_floor() == other.0.get_lsn_floor()
-            && self.0.get_lsn_ceil() == other.0.get_lsn_ceil()
+        self.lsn_range == other.lsn_range
    }
 }

-impl Eq for ReadableLayerDescOrdered {}
+impl Eq for ReadDesc {}

-impl ReadableLayerDesc {
-    pub(crate) fn get_lsn_floor(&self) -> Lsn {
+impl ReadableLayer {
+    pub(crate) fn id(&self) -> LayerId {
        match self {
-            ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.start,
-            ReadableLayerDesc::InMemory { handle, .. } => handle.get_lsn_floor(),
-        }
-    }
-
-    pub(crate) fn get_lsn_ceil(&self) -> Lsn {
-        match self {
-            ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.end,
-            ReadableLayerDesc::InMemory { lsn_ceil, .. } => *lsn_ceil,
+            Self::PersistentLayer(layer) => LayerId::PersitentLayerId(layer.layer_desc().key()),
+            Self::InMemoryLayer(layer) => LayerId::InMemoryLayerId(layer.file_id()),
        }
    }

    pub(crate) async fn get_values_reconstruct_data(
        &self,
-        layer_manager: &LayerManager,
        keyspace: KeySpace,
+        lsn_range: Range<Lsn>,
        reconstruct_state: &mut ValuesReconstructState,
        ctx: &RequestContext,
    ) -> Result<(), GetVectoredError> {
        match self {
-            ReadableLayerDesc::Persistent { desc, lsn_range } => {
-                let layer = layer_manager.get_from_desc(desc);
+            ReadableLayer::PersistentLayer(layer) => {
                layer
-                    .get_values_reconstruct_data(
-                        keyspace,
-                        lsn_range.clone(),
-                        reconstruct_state,
-                        ctx,
-                    )
+                    .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, ctx)
                    .await
            }
-            ReadableLayerDesc::InMemory { handle, lsn_ceil } => {
-                let layer = layer_manager
-                    .layer_map()
-                    .get_in_memory_layer(handle)
-                    .unwrap();
-
+            ReadableLayer::InMemoryLayer(layer) => {
                layer
-                    .get_values_reconstruct_data(keyspace, *lsn_ceil, reconstruct_state, ctx)
+                    .get_values_reconstruct_data(keyspace, lsn_range.end, reconstruct_state, ctx)
                    .await
            }
        }
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -47,6 +47,7 @@ use anyhow::{anyhow, bail, ensure, Context, Result};
 use bytes::BytesMut;
 use camino::{Utf8Path, Utf8PathBuf};
 use futures::StreamExt;
+use itertools::Itertools;
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::LayerAccessKind;
 use pageserver_api::shard::TenantShardId;
@@ -946,6 +947,34 @@ impl DeltaLayerInner {
        Ok(planner.finish())
    }

+    fn get_min_read_buffer_size(
+        planned_reads: &[VectoredRead],
+        read_size_soft_max: usize,
+    ) -> usize {
+        let Some(largest_read) = planned_reads.iter().max_by_key(|read| read.size()) else {
+            return read_size_soft_max;
+        };
+
+        let largest_read_size = largest_read.size();
+        if largest_read_size > read_size_soft_max {
+            // If the read is oversized, it should only contain one key.
+            let offenders = largest_read
+                .blobs_at
+                .as_slice()
+                .iter()
+                .map(|(_, blob_meta)| format!("{}@{}", blob_meta.key, blob_meta.lsn))
+                .join(", ");
+            tracing::warn!(
+                "Oversized vectored read ({} > {}) for keys {}",
+                largest_read_size,
+                read_size_soft_max,
+                offenders
+            );
+        }
+
+        largest_read_size
+    }
+
    async fn do_reads_and_update_state(
        &self,
        reads: Vec<VectoredRead>,
@@ -959,7 +988,8 @@ impl DeltaLayerInner {
            .expect("Layer is loaded with max vectored bytes config")
            .0
            .into();
-        let mut buf = Some(BytesMut::with_capacity(max_vectored_read_bytes));
+        let buf_size = Self::get_min_read_buffer_size(&reads, max_vectored_read_bytes);
+        let mut buf = Some(BytesMut::with_capacity(buf_size));

        // Note that reads are processed in reverse order (from highest key+lsn).
        // This is the order that `ReconstructState` requires such that it can
@@ -986,7 +1016,7 @@ impl DeltaLayerInner {

                    // We have "lost" the buffer since the lower level IO api
                    // doesn't return the buffer on error. Allocate a new one.
-                    buf = Some(BytesMut::with_capacity(max_vectored_read_bytes));
+                    buf = Some(BytesMut::with_capacity(buf_size));

                    continue;
                }
@@ -1210,9 +1240,16 @@ impl<'a> pageserver_compaction::interface::CompactionDeltaEntry<'a, Key> for Del
 mod test {
    use std::collections::BTreeMap;

+    use itertools::MinMaxResult;
+    use rand::prelude::{SeedableRng, SliceRandom, StdRng};
+    use rand::RngCore;
+
    use super::*;
    use crate::{
-        context::DownloadBehavior, task_mgr::TaskKind, tenant::disk_btree::tests::TestDisk,
+        context::DownloadBehavior,
+        task_mgr::TaskKind,
+        tenant::{disk_btree::tests::TestDisk, harness::TenantHarness},
+        DEFAULT_PG_VERSION,
    };

    /// Construct an index for a fictional delta layer and and then
@@ -1332,4 +1369,229 @@ mod test {

        assert_eq!(planned_blobs, expected_blobs);
    }
+
+    mod constants {
+        use utils::lsn::Lsn;
+
+        /// Offset used by all lsns in this test
+        pub(super) const LSN_OFFSET: Lsn = Lsn(0x08);
+        /// Number of unique keys including in the test data
+        pub(super) const KEY_COUNT: u8 = 60;
+        /// Max number of different lsns for each key
+        pub(super) const MAX_ENTRIES_PER_KEY: u8 = 20;
+        /// Possible value sizes for each key along with a probability weight
+        pub(super) const VALUE_SIZES: [(usize, u8); 3] = [(100, 2), (1024, 2), (1024 * 1024, 1)];
+        /// Probability that there will be a gap between the current key and the next one (33.3%)
+        pub(super) const KEY_GAP_CHANGES: [(bool, u8); 2] = [(true, 1), (false, 2)];
+        /// The minimum size of a key range in all the generated reads
+        pub(super) const MIN_RANGE_SIZE: i128 = 10;
+        /// The number of ranges included in each vectored read
+        pub(super) const RANGES_COUNT: u8 = 2;
+        /// The number of vectored reads performed
+        pub(super) const READS_COUNT: u8 = 100;
+        /// Soft max size of a vectored read. Will be violated if we have to read keys
+        /// with values larger than the limit
+        pub(super) const MAX_VECTORED_READ_BYTES: usize = 64 * 1024;
+    }
+
+    struct Entry {
+        key: Key,
+        lsn: Lsn,
+        value: Vec<u8>,
+    }
+
+    fn generate_entries(rng: &mut StdRng) -> Vec<Entry> {
+        let mut current_key = Key::MIN;
+
+        let mut entries = Vec::new();
+        for _ in 0..constants::KEY_COUNT {
+            let count = rng.gen_range(1..constants::MAX_ENTRIES_PER_KEY);
+            let mut lsns_iter =
+                std::iter::successors(Some(Lsn(constants::LSN_OFFSET.0 + 0x08)), |lsn| {
+                    Some(Lsn(lsn.0 + 0x08))
+                });
+            let mut lsns = Vec::new();
+            while lsns.len() < count as usize {
+                let take = rng.gen_bool(0.5);
+                let lsn = lsns_iter.next().unwrap();
+                if take {
+                    lsns.push(lsn);
+                }
+            }
+
+            for lsn in lsns {
+                let size = constants::VALUE_SIZES
+                    .choose_weighted(rng, |item| item.1)
+                    .unwrap()
+                    .0;
+                let mut buf = vec![0; size];
+                rng.fill_bytes(&mut buf);
+
+                entries.push(Entry {
+                    key: current_key,
+                    lsn,
+                    value: buf,
+                })
+            }
+
+            let gap = constants::KEY_GAP_CHANGES
+                .choose_weighted(rng, |item| item.1)
+                .unwrap()
+                .0;
+            if gap {
+                current_key = current_key.add(2);
+            } else {
+                current_key = current_key.add(1);
+            }
+        }
+
+        entries
+    }
+
+    struct EntriesMeta {
+        key_range: Range<Key>,
+        lsn_range: Range<Lsn>,
+        index: BTreeMap<(Key, Lsn), Vec<u8>>,
+    }
+
+    fn get_entries_meta(entries: &[Entry]) -> EntriesMeta {
+        let key_range = match entries.iter().minmax_by_key(|e| e.key) {
+            MinMaxResult::MinMax(min, max) => min.key..max.key.next(),
+            _ => panic!("More than one entry is always expected"),
+        };
+
+        let lsn_range = match entries.iter().minmax_by_key(|e| e.lsn) {
+            MinMaxResult::MinMax(min, max) => min.lsn..Lsn(max.lsn.0 + 1),
+            _ => panic!("More than one entry is always expected"),
+        };
+
+        let mut index = BTreeMap::new();
+        for entry in entries.iter() {
+            index.insert((entry.key, entry.lsn), entry.value.clone());
+        }
+
+        EntriesMeta {
+            key_range,
+            lsn_range,
+            index,
+        }
+    }
+
+    fn pick_random_keyspace(rng: &mut StdRng, key_range: &Range<Key>) -> KeySpace {
+        let start = key_range.start.to_i128();
+        let end = key_range.end.to_i128();
+
+        let mut keyspace = KeySpace::default();
+
+        for _ in 0..constants::RANGES_COUNT {
+            let mut range: Option<Range<Key>> = Option::default();
+            while range.is_none() || keyspace.overlaps(range.as_ref().unwrap()) {
+                let range_start = rng.gen_range(start..end);
+                let range_end_offset = range_start + constants::MIN_RANGE_SIZE;
+                if range_end_offset >= end {
+                    range = Some(Key::from_i128(range_start)..Key::from_i128(end));
+                } else {
+                    let range_end = rng.gen_range((range_start + constants::MIN_RANGE_SIZE)..end);
+                    range = Some(Key::from_i128(range_start)..Key::from_i128(range_end));
+                }
+            }
+            keyspace.ranges.push(range.unwrap());
+        }
+
+        keyspace
+    }
+
+    #[tokio::test]
+    async fn test_delta_layer_vectored_read_end_to_end() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_delta_layer_oversized_vectored_read")?;
+        let (tenant, ctx) = harness.load().await;
+
+        let timeline_id = TimelineId::generate();
+        let timeline = tenant
+            .create_test_timeline(timeline_id, constants::LSN_OFFSET, DEFAULT_PG_VERSION, &ctx)
+            .await?;
+
+        tracing::info!("Generating test data ...");
+
+        let rng = &mut StdRng::seed_from_u64(0);
+        let entries = generate_entries(rng);
+        let entries_meta = get_entries_meta(&entries);
+
+        tracing::info!("Done generating {} entries", entries.len());
+
+        tracing::info!("Writing test data to delta layer ...");
+        let mut writer = DeltaLayerWriter::new(
+            harness.conf,
+            timeline_id,
+            harness.tenant_shard_id,
+            entries_meta.key_range.start,
+            entries_meta.lsn_range.clone(),
+        )
+        .await?;
+
+        for entry in entries {
+            let (_, res) = writer
+                .put_value_bytes(entry.key, entry.lsn, entry.value, false)
+                .await;
+            res?;
+        }
+
+        let resident = writer.finish(entries_meta.key_range.end, &timeline).await?;
+
+        let inner = resident.get_inner_delta(&ctx).await?;
+
+        let file_size = inner.file.metadata().await?.len();
+        tracing::info!(
+            "Done writing test data to delta layer. Resulting file size is: {}",
+            file_size
+        );
+
+        for i in 0..constants::READS_COUNT {
+            tracing::info!("Doing vectored read {}/{}", i + 1, constants::READS_COUNT);
+
+            let block_reader = FileBlockReader::new(&inner.file, inner.file_id);
+            let index_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
+                inner.index_start_blk,
+                inner.index_root_blk,
+                block_reader,
+            );
+
+            let planner = VectoredReadPlanner::new(constants::MAX_VECTORED_READ_BYTES);
+            let mut reconstruct_state = ValuesReconstructState::new();
+            let keyspace = pick_random_keyspace(rng, &entries_meta.key_range);
+            let data_end_offset = inner.index_start_blk as u64 * PAGE_SZ as u64;
+
+            let vectored_reads = DeltaLayerInner::plan_reads(
+                keyspace.clone(),
+                entries_meta.lsn_range.clone(),
+                data_end_offset,
+                index_reader,
+                planner,
+                &mut reconstruct_state,
+                &ctx,
+            )
+            .await?;
+
+            let vectored_blob_reader = VectoredBlobReader::new(&inner.file);
+            let buf_size = DeltaLayerInner::get_min_read_buffer_size(
+                &vectored_reads,
+                constants::MAX_VECTORED_READ_BYTES,
+            );
+            let mut buf = Some(BytesMut::with_capacity(buf_size));
+
+            for read in vectored_reads {
+                let blobs_buf = vectored_blob_reader
+                    .read_blobs(&read, buf.take().expect("Should have a buffer"))
+                    .await?;
+                for meta in blobs_buf.blobs.iter() {
+                    let value = &blobs_buf.buf[meta.start..meta.end];
+                    assert_eq!(value, entries_meta.index[&(meta.meta.key, meta.meta.lsn)]);
+                }
+
+                buf = Some(blobs_buf.buf);
+            }
+        }
+
+        Ok(())
+    }
 }
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -44,6 +44,7 @@ use anyhow::{anyhow, bail, ensure, Context, Result};
 use bytes::{Bytes, BytesMut};
 use camino::{Utf8Path, Utf8PathBuf};
 use hex;
+use itertools::Itertools;
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::LayerAccessKind;
 use pageserver_api::shard::TenantShardId;
@@ -540,7 +541,25 @@ impl ImageLayerInner {

        let vectored_blob_reader = VectoredBlobReader::new(&self.file);
        for read in reads.into_iter() {
-            let buf = BytesMut::with_capacity(max_vectored_read_bytes);
+            let buf_size = read.size();
+
+            if buf_size > max_vectored_read_bytes {
+                // If the read is oversized, it should only contain one key.
+                let offenders = read
+                    .blobs_at
+                    .as_slice()
+                    .iter()
+                    .map(|(_, blob_meta)| format!("{}@{}", blob_meta.key, blob_meta.lsn))
+                    .join(", ");
+                tracing::warn!(
+                    "Oversized vectored read ({} > {}) for keys {}",
+                    buf_size,
+                    max_vectored_read_bytes,
+                    offenders
+                );
+            }
+
+            let buf = BytesMut::with_capacity(buf_size);
            let res = vectored_blob_reader.read_blobs(&read, buf).await;

            match res {
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -12,7 +12,7 @@ use crate::tenant::ephemeral_file::EphemeralFile;
 use crate::tenant::storage_layer::ValueReconstructResult;
 use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::{PageReconstructError, Timeline};
-use crate::walrecord;
+use crate::{page_cache, walrecord};
 use anyhow::{anyhow, ensure, Result};
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::InMemoryLayerInfo;
@@ -23,8 +23,12 @@ use tracing::*;
 use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap};
 // avoid binding to Write (conflicts with std::io::Write)
 // while being able to use std::fmt::Write's methods
+use crate::metrics::TIMELINE_EPHEMERAL_BYTES;
+use std::cmp::Ordering;
 use std::fmt::Write as _;
 use std::ops::Range;
+use std::sync::atomic::Ordering as AtomicOrdering;
+use std::sync::atomic::{AtomicU64, AtomicUsize};
 use tokio::sync::{RwLock, RwLockWriteGuard};

 use super::{
@@ -32,10 +36,14 @@ use super::{
    ValuesReconstructState,
 };

+#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)]
+pub(crate) struct InMemoryLayerFileId(page_cache::FileId);
+
 pub struct InMemoryLayer {
    conf: &'static PageServerConf,
    tenant_shard_id: TenantShardId,
    timeline_id: TimelineId,
+    file_id: InMemoryLayerFileId,

    /// This layer contains all the changes from 'start_lsn'. The
    /// start is inclusive.
@@ -70,6 +78,8 @@ pub struct InMemoryLayerInner {
    /// Each serialized Value is preceded by a 'u32' length field.
    /// PerSeg::page_versions map stores offsets into this file.
    file: EphemeralFile,
+
+    resource_units: GlobalResourceUnits,
 }

 impl std::fmt::Debug for InMemoryLayerInner {
@@ -78,7 +88,126 @@ impl std::fmt::Debug for InMemoryLayerInner {
    }
 }

+/// State shared by all in-memory (ephemeral) layers.  Updated infrequently during background ticks in Timeline,
+/// to minimize contention.
+///
+/// This global state is used to implement behaviors that require a global view of the system, e.g.
+/// rolling layers proactively to limit the total amount of dirty data.
+pub(crate) struct GlobalResources {
+    // Limit on how high dirty_bytes may grow before we start freezing layers to reduce it.
+    // Zero means unlimited.
+    pub(crate) max_dirty_bytes: AtomicU64,
+    // How many bytes are in all EphemeralFile objects
+    dirty_bytes: AtomicU64,
+    // How many layers are contributing to dirty_bytes
+    dirty_layers: AtomicUsize,
+}
+
+// Per-timeline RAII struct for its contribution to [`GlobalResources`]
+struct GlobalResourceUnits {
+    // How many dirty bytes have I added to the global dirty_bytes: this guard object is responsible
+    // for decrementing the global counter by this many bytes when dropped.
+    dirty_bytes: u64,
+}
+
+impl GlobalResourceUnits {
+    // Hint for the layer append path to update us when the layer size differs from the last
+    // call to update_size by this much.  If we don't reach this threshold, we'll still get
+    // updated when the Timeline "ticks" in the background.
+    const MAX_SIZE_DRIFT: u64 = 10 * 1024 * 1024;
+
+    fn new() -> Self {
+        GLOBAL_RESOURCES
+            .dirty_layers
+            .fetch_add(1, AtomicOrdering::Relaxed);
+        Self { dirty_bytes: 0 }
+    }
+
+    /// Do not call this frequently: all timelines will write to these same global atomics,
+    /// so this is a relatively expensive operation.  Wait at least a few seconds between calls.
+    ///
+    /// Returns the effective layer size limit that should be applied, if any, to keep
+    /// the total number of dirty bytes below the configured maximum.
+    fn publish_size(&mut self, size: u64) -> Option<u64> {
+        let new_global_dirty_bytes = match size.cmp(&self.dirty_bytes) {
+            Ordering::Equal => GLOBAL_RESOURCES.dirty_bytes.load(AtomicOrdering::Relaxed),
+            Ordering::Greater => {
+                let delta = size - self.dirty_bytes;
+                let old = GLOBAL_RESOURCES
+                    .dirty_bytes
+                    .fetch_add(delta, AtomicOrdering::Relaxed);
+                old + delta
+            }
+            Ordering::Less => {
+                let delta = self.dirty_bytes - size;
+                let old = GLOBAL_RESOURCES
+                    .dirty_bytes
+                    .fetch_sub(delta, AtomicOrdering::Relaxed);
+                old - delta
+            }
+        };
+
+        // This is a sloppy update: concurrent updates to the counter will race, and the exact
+        // value of the metric might not be the exact latest value of GLOBAL_RESOURCES::dirty_bytes.
+        // That's okay: as long as the metric contains some recent value, it doesn't have to always
+        // be literally the last update.
+        TIMELINE_EPHEMERAL_BYTES.set(new_global_dirty_bytes);
+
+        self.dirty_bytes = size;
+
+        let max_dirty_bytes = GLOBAL_RESOURCES
+            .max_dirty_bytes
+            .load(AtomicOrdering::Relaxed);
+        if max_dirty_bytes > 0 && new_global_dirty_bytes > max_dirty_bytes {
+            // Set the layer file limit to the average layer size: this implies that all above-average
+            // sized layers will be elegible for freezing.  They will be frozen in the order they
+            // next enter publish_size.
+            Some(
+                new_global_dirty_bytes
+                    / GLOBAL_RESOURCES.dirty_layers.load(AtomicOrdering::Relaxed) as u64,
+            )
+        } else {
+            None
+        }
+    }
+
+    // Call publish_size if the input size differs from last published size by more than
+    // the drift limit
+    fn maybe_publish_size(&mut self, size: u64) {
+        let publish = match size.cmp(&self.dirty_bytes) {
+            Ordering::Equal => false,
+            Ordering::Greater => size - self.dirty_bytes > Self::MAX_SIZE_DRIFT,
+            Ordering::Less => self.dirty_bytes - size > Self::MAX_SIZE_DRIFT,
+        };
+
+        if publish {
+            self.publish_size(size);
+        }
+    }
+}
+
+impl Drop for GlobalResourceUnits {
+    fn drop(&mut self) {
+        GLOBAL_RESOURCES
+            .dirty_layers
+            .fetch_sub(1, AtomicOrdering::Relaxed);
+
+        // Subtract our contribution to the global total dirty bytes
+        self.publish_size(0);
+    }
+}
+
+pub(crate) static GLOBAL_RESOURCES: GlobalResources = GlobalResources {
+    max_dirty_bytes: AtomicU64::new(0),
+    dirty_bytes: AtomicU64::new(0),
+    dirty_layers: AtomicUsize::new(0),
+};
+
 impl InMemoryLayer {
+    pub(crate) fn file_id(&self) -> InMemoryLayerFileId {
+        self.file_id
+    }
+
    pub(crate) fn get_timeline_id(&self) -> TimelineId {
        self.timeline_id
    }
@@ -93,6 +222,10 @@ impl InMemoryLayer {
        }
    }

+    pub(crate) fn try_len(&self) -> Option<u64> {
+        self.inner.try_read().map(|i| i.file.len()).ok()
+    }
+
    pub(crate) fn assert_writable(&self) {
        assert!(self.end_lsn.get().is_none());
    }
@@ -318,8 +451,10 @@ impl InMemoryLayer {
        trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");

        let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id).await?;
+        let key = InMemoryLayerFileId(file.id());

        Ok(InMemoryLayer {
+            file_id: key,
            conf,
            timeline_id,
            tenant_shard_id,
@@ -328,6 +463,7 @@ impl InMemoryLayer {
            inner: RwLock::new(InMemoryLayerInner {
                index: HashMap::new(),
                file,
+                resource_units: GlobalResourceUnits::new(),
            }),
        })
    }
@@ -378,9 +514,18 @@ impl InMemoryLayer {
            warn!("Key {} at {} already exists", key, lsn);
        }

+        let size = locked_inner.file.len();
+        locked_inner.resource_units.maybe_publish_size(size);
+
        Ok(())
    }

+    pub(crate) async fn tick(&self) -> Option<u64> {
+        let mut inner = self.inner.write().await;
+        let size = inner.file.len();
+        inner.resource_units.publish_size(size)
+    }
+
    pub(crate) async fn put_tombstones(&self, _key_ranges: &[(Range<Key>, Lsn)]) -> Result<()> {
        // TODO: Currently, we just leak the storage for any deleted keys
        Ok(())
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -1759,6 +1759,18 @@ impl ResidentLayer {
    pub(crate) fn metadata(&self) -> LayerFileMetadata {
        self.owner.metadata()
    }
+
+    #[cfg(test)]
+    pub(crate) async fn get_inner_delta<'a>(
+        &'a self,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<&'a delta_layer::DeltaLayerInner> {
+        let owner = &self.owner.0;
+        match self.downloaded.get(owner, ctx).await? {
+            LayerKind::Delta(d) => Ok(d),
+            LayerKind::Image(_) => Err(anyhow::anyhow!("Expected a delta layer")),
+        }
+    }
 }

 impl AsLayerDesc for ResidentLayer {
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -9,6 +9,7 @@ pub mod uninit;
 mod walreceiver;

 use anyhow::{anyhow, bail, ensure, Context, Result};
+use arc_swap::ArcSwap;
 use bytes::Bytes;
 use camino::Utf8Path;
 use enumset::EnumSet;
@@ -19,7 +20,7 @@ use pageserver_api::{
    keyspace::KeySpaceAccum,
    models::{
        CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest,
-        EvictionPolicy, LayerMapInfo, TimelineState,
+        EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, TimelineState,
    },
    reltag::BlockNumber,
    shard::{ShardIdentity, TenantShardId},
@@ -54,6 +55,7 @@ use std::{
    ops::ControlFlow,
 };

+use crate::deletion_queue::DeletionQueueClient;
 use crate::tenant::timeline::logical_size::CurrentLogicalSize;
 use crate::tenant::{
    layer_map::{LayerMap, SearchResult},
@@ -64,7 +66,6 @@ use crate::{
    disk_usage_eviction_task::DiskUsageEvictionInfo,
    pgdatadir_mapping::CollectKeySpaceError,
 };
-use crate::{deletion_queue::DeletionQueueClient, tenant::remote_timeline_client::StopError};
 use crate::{
    disk_usage_eviction_task::finite_f32,
    tenant::storage_layer::{
@@ -118,11 +119,11 @@ use self::layer_manager::LayerManager;
 use self::logical_size::LogicalSize;
 use self::walreceiver::{WalReceiver, WalReceiverConf};

-use super::remote_timeline_client::RemoteTimelineClient;
+use super::config::TenantConf;
 use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline};
-use super::{config::TenantConf, storage_layer::ReadableLayerDesc};
 use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
 use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
+use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer};

 #[derive(Debug, PartialEq, Eq, Clone, Copy)]
 pub(super) enum FlushLoopState {
@@ -183,7 +184,7 @@ pub(crate) struct AuxFilesState {

 pub struct Timeline {
    conf: &'static PageServerConf,
-    tenant_conf: Arc<RwLock<AttachedTenantConf>>,
+    tenant_conf: Arc<ArcSwap<AttachedTenantConf>>,

    myself: Weak<Self>,

@@ -309,6 +310,8 @@ pub struct Timeline {
    /// Configuration: how often should the partitioning be recalculated.
    repartition_threshold: u64,

+    last_image_layer_creation_check_at: AtomicLsn,
+
    /// Current logical size of the "datadir", at the last LSN.
    current_logical_size: LogicalSize,

@@ -610,6 +613,25 @@ pub enum GetVectoredImpl {
    Vectored,
 }

+pub(crate) enum WaitLsnWaiter<'a> {
+    Timeline(&'a Timeline),
+    Tenant,
+    PageService,
+}
+
+/// Argument to [`Timeline::shutdown`].
+#[derive(Debug, Clone, Copy)]
+pub(crate) enum ShutdownMode {
+    /// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then
+    /// also to remote storage.  This method can easily take multiple seconds for a busy timeline.
+    ///
+    /// While we are flushing, we continue to accept read I/O for LSNs ingested before
+    /// the call to [`Timeline::shutdown`].
+    FreezeAndFlush,
+    /// Shut down immediately, without waiting for any open layers to flush.
+    Hard,
+}
+
 /// Public interface functions
 impl Timeline {
    /// Get the LSN where this branch was created
@@ -1058,7 +1080,8 @@ impl Timeline {
    pub(crate) async fn wait_lsn(
        &self,
        lsn: Lsn,
-        _ctx: &RequestContext, /* Prepare for use by cancellation */
+        who_is_waiting: WaitLsnWaiter<'_>,
+        ctx: &RequestContext, /* Prepare for use by cancellation */
    ) -> Result<(), WaitLsnError> {
        if self.cancel.is_cancelled() {
            return Err(WaitLsnError::Shutdown);
@@ -1066,20 +1089,28 @@ impl Timeline {
            return Err(WaitLsnError::BadState);
        }

-        // This should never be called from the WAL receiver, because that could lead
-        // to a deadlock.
-        debug_assert!(
-            task_mgr::current_task_kind() != Some(TaskKind::WalReceiverManager),
-            "wait_lsn cannot be called in WAL receiver"
-        );
-        debug_assert!(
-            task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionHandler),
-            "wait_lsn cannot be called in WAL receiver"
-        );
-        debug_assert!(
-            task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionPoller),
-            "wait_lsn cannot be called in WAL receiver"
-        );
+        if cfg!(debug_assertions) {
+            match ctx.task_kind() {
+                TaskKind::WalReceiverManager
+                | TaskKind::WalReceiverConnectionHandler
+                | TaskKind::WalReceiverConnectionPoller => {
+                    let is_myself = match who_is_waiting {
+                        WaitLsnWaiter::Timeline(waiter) => Weak::ptr_eq(&waiter.myself, &self.myself),
+                        WaitLsnWaiter::Tenant | WaitLsnWaiter::PageService => unreachable!("tenant or page_service context are not expected to have task kind {:?}", ctx.task_kind()),
+                    };
+                    if is_myself {
+                        if let Err(current) = self.last_record_lsn.would_wait_for(lsn) {
+                            // walingest is the only one that can advance last_record_lsn; it should make sure to never reach here
+                            panic!("this timeline's walingest task is calling wait_lsn({lsn}) but we only have last_record_lsn={current}; would deadlock");
+                        }
+                    } else {
+                        // if another  timeline's  is waiting for us, there's no deadlock risk because
+                        // our walreceiver task can make progress independent of theirs
+                    }
+                }
+                _ => {}
+            }
+        }

        let _timer = crate::metrics::WAIT_LSN_TIME.start_timer();

@@ -1142,6 +1173,79 @@ impl Timeline {
        self.flush_frozen_layers_and_wait().await
    }

+    /// If there is no writer, and conditions for rolling the latest layer are met, then freeze it.
+    ///
+    /// This is for use in background housekeeping, to provide guarantees of layers closing eventually
+    /// even if there are no ongoing writes to drive that.
+    async fn maybe_freeze_ephemeral_layer(&self) {
+        let Ok(_write_guard) = self.write_lock.try_lock() else {
+            // If the write lock is held, there is an active wal receiver: rolling open layers
+            // is their responsibility while they hold this lock.
+            return;
+        };
+
+        let Ok(layers_guard) = self.layers.try_read() else {
+            // Don't block if the layer lock is busy
+            return;
+        };
+
+        let Some(open_layer) = &layers_guard.layer_map().open_layer else {
+            // No open layer, no work to do.
+            return;
+        };
+
+        let Some(current_size) = open_layer.try_len() else {
+            // Unexpected: since we hold the write guard, nobody else should be writing to this layer, so
+            // read lock to get size should always succeed.
+            tracing::warn!("Lock conflict while reading size of open layer");
+            return;
+        };
+
+        let current_lsn = self.get_last_record_lsn();
+
+        let checkpoint_distance_override = open_layer.tick().await;
+
+        if let Some(size_override) = checkpoint_distance_override {
+            if current_size > size_override {
+                // This is not harmful, but it only happens in relatively rare cases where
+                // time-based checkpoints are not happening fast enough to keep the amount of
+                // ephemeral data within configured limits.  It's a sign of stress on the system.
+                tracing::info!("Early-rolling open layer at size {current_size} (limit {size_override}) due to dirty data pressure");
+            }
+        }
+
+        let checkpoint_distance =
+            checkpoint_distance_override.unwrap_or(self.get_checkpoint_distance());
+
+        if self.should_roll(
+            current_size,
+            current_size,
+            checkpoint_distance,
+            self.get_last_record_lsn(),
+            self.last_freeze_at.load(),
+            *self.last_freeze_ts.read().unwrap(),
+        ) {
+            match open_layer.info() {
+                InMemoryLayerInfo::Frozen { lsn_start, lsn_end } => {
+                    // We may reach this point if the layer was already frozen by not yet flushed: flushing
+                    // happens asynchronously in the background.
+                    tracing::debug!(
+                        "Not freezing open layer, it's already frozen ({lsn_start}..{lsn_end})"
+                    );
+                }
+                InMemoryLayerInfo::Open { .. } => {
+                    // Upgrade to a write lock and freeze the layer
+                    drop(layers_guard);
+                    let mut layers_guard = self.layers.write().await;
+                    layers_guard
+                        .try_freeze_in_memory_layer(current_lsn, &self.last_freeze_at)
+                        .await;
+                }
+            }
+            self.flush_frozen_layers();
+        }
+    }
+
    /// Outermost timeline compaction operation; downloads needed layers.
    pub(crate) async fn compact(
        self: &Arc<Self>,
@@ -1164,6 +1268,11 @@ impl Timeline {
            (guard, permit)
        };

+        // Prior to compaction, check if an open ephemeral layer should be closed: this provides
+        // background enforcement of checkpoint interval if there is no active WAL receiver, to avoid keeping
+        // an ephemeral layer open forever when idle.
+        self.maybe_freeze_ephemeral_layer().await;
+
        // this wait probably never needs any "long time spent" logging, because we already nag if
        // compaction task goes over it's period (20s) which is quite often in production.
        let (_guard, _permit) = tokio::select! {
@@ -1196,6 +1305,7 @@ impl Timeline {

    pub(crate) fn activate(
        self: &Arc<Self>,
+        parent: Arc<crate::tenant::Tenant>,
        broker_client: BrokerClientChannel,
        background_jobs_can_start: Option<&completion::Barrier>,
        ctx: &RequestContext,
@@ -1206,95 +1316,122 @@ impl Timeline {
        }
        self.launch_wal_receiver(ctx, broker_client);
        self.set_state(TimelineState::Active);
-        self.launch_eviction_task(background_jobs_can_start);
+        self.launch_eviction_task(parent, background_jobs_can_start);
    }

-    /// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then
-    /// also to remote storage.  This method can easily take multiple seconds for a busy timeline.
+    /// After this function returns, there are no timeline-scoped tasks are left running.
    ///
-    /// While we are flushing, we continue to accept read I/O.
-    pub(crate) async fn flush_and_shutdown(&self) {
+    /// The preferred pattern for is:
+    /// - in any spawned tasks, keep Timeline::guard open + Timeline::cancel / child token
+    /// - if early shutdown (not just cancellation) of a sub-tree of tasks is required,
+    ///   go the extra mile and keep track of JoinHandles
+    /// - Keep track of JoinHandles using a passed-down `Arc<Mutex<Option<JoinSet>>>` or similar,
+    ///   instead of spawning directly on a runtime. It is a more composable / testable pattern.
+    ///
+    /// For legacy reasons, we still have multiple tasks spawned using
+    /// `task_mgr::spawn(X, Some(tenant_id), Some(timeline_id))`.
+    /// We refer to these as "timeline-scoped task_mgr tasks".
+    /// Some of these tasks are already sensitive to Timeline::cancel while others are
+    /// not sensitive to Timeline::cancel and instead respect [`task_mgr::shutdown_token`]
+    /// or [`task_mgr::shutdown_watcher`].
+    /// We want to gradually convert the code base away from these.
+    ///
+    /// Here is an inventory of timeline-scoped task_mgr tasks that are still sensitive to
+    /// `task_mgr::shutdown_{token,watcher}` (there are also tenant-scoped and global-scoped
+    /// ones that aren't mentioned here):
+    /// - [`TaskKind::TimelineDeletionWorker`]
+    ///    - NB: also used for tenant deletion
+    /// - [`TaskKind::RemoteUploadTask`]`
+    /// - [`TaskKind::InitialLogicalSizeCalculation`]
+    /// - [`TaskKind::DownloadAllRemoteLayers`] (can we get rid of it?)
+    // Inventory of timeline-scoped task_mgr tasks that use spawn but aren't sensitive:
+    /// - [`TaskKind::Eviction`]
+    /// - [`TaskKind::LayerFlushTask`]
+    /// - [`TaskKind::OndemandLogicalSizeCalculation`]
+    /// - [`TaskKind::GarbageCollector`] (immediate_gc is timeline-scoped)
+    pub(crate) async fn shutdown(&self, mode: ShutdownMode) {
        debug_assert_current_span_has_tenant_and_timeline_id();

-        // Stop ingesting data, so that we are not still writing to an InMemoryLayer while
-        // trying to flush
-        tracing::debug!("Waiting for WalReceiverManager...");
-        task_mgr::shutdown_tasks(
-            Some(TaskKind::WalReceiverManager),
-            Some(self.tenant_shard_id),
-            Some(self.timeline_id),
-        )
-        .await;
+        let try_freeze_and_flush = match mode {
+            ShutdownMode::FreezeAndFlush => true,
+            ShutdownMode::Hard => false,
+        };

-        // Since we have shut down WAL ingest, we should not let anyone start waiting for the LSN to advance
+        // Regardless of whether we're going to try_freeze_and_flush
+        // or not, stop ingesting any more data. Walreceiver only provides
+        // cancellation but no "wait until gone", because it uses the Timeline::gate.
+        // So, only after the self.gate.close() below will we know for sure that
+        // no walreceiver tasks are left.
+        // For `try_freeze_and_flush=true`, this means that we might still be ingesting
+        // data during the call to `self.freeze_and_flush()` below.
+        // That's not ideal, but, we don't have the concept of a ChildGuard,
+        // which is what we'd need to properly model early shutdown of the walreceiver
+        // task sub-tree before the other Timeline task sub-trees.
+        let walreceiver = self.walreceiver.lock().unwrap().take();
+        tracing::debug!(
+            is_some = walreceiver.is_some(),
+            "Waiting for WalReceiverManager..."
+        );
+        if let Some(walreceiver) = walreceiver {
+            walreceiver.cancel();
+        }
+        // ... and inform any waiters for newer LSNs that there won't be any.
        self.last_record_lsn.shutdown();

-        // now all writers to InMemory layer are gone, do the final flush if requested
-        match self.freeze_and_flush().await {
-            Ok(_) => {
-                // drain the upload queue
-                if let Some(client) = self.remote_client.as_ref() {
-                    // if we did not wait for completion here, it might be our shutdown process
-                    // didn't wait for remote uploads to complete at all, as new tasks can forever
-                    // be spawned.
-                    //
-                    // what is problematic is the shutting down of RemoteTimelineClient, because
-                    // obviously it does not make sense to stop while we wait for it, but what
-                    // about corner cases like s3 suddenly hanging up?
-                    if let Err(e) = client.shutdown().await {
-                        // Non-fatal.  Shutdown is infallible.  Failures to flush just mean that
-                        // we have some extra WAL replay to do next time the timeline starts.
-                        warn!("failed to flush to remote storage: {e:#}");
+        if try_freeze_and_flush {
+            // we shut down walreceiver above, so, we won't add anything more
+            // to the InMemoryLayer; freeze it and wait for all frozen layers
+            // to reach the disk & upload queue, then shut the upload queue and
+            // wait for it to drain.
+            match self.freeze_and_flush().await {
+                Ok(_) => {
+                    // drain the upload queue
+                    if let Some(client) = self.remote_client.as_ref() {
+                        // if we did not wait for completion here, it might be our shutdown process
+                        // didn't wait for remote uploads to complete at all, as new tasks can forever
+                        // be spawned.
+                        //
+                        // what is problematic is the shutting down of RemoteTimelineClient, because
+                        // obviously it does not make sense to stop while we wait for it, but what
+                        // about corner cases like s3 suddenly hanging up?
+                        client.shutdown().await;
                    }
                }
-            }
-            Err(e) => {
-                // Non-fatal.  Shutdown is infallible.  Failures to flush just mean that
-                // we have some extra WAL replay to do next time the timeline starts.
-                warn!("failed to freeze and flush: {e:#}");
+                Err(e) => {
+                    // Non-fatal.  Shutdown is infallible.  Failures to flush just mean that
+                    // we have some extra WAL replay to do next time the timeline starts.
+                    warn!("failed to freeze and flush: {e:#}");
+                }
            }
        }

-        self.shutdown().await;
-    }
-
-    /// Shut down immediately, without waiting for any open layers to flush to disk.  This is a subset of
-    /// the graceful [`Timeline::flush_and_shutdown`] function.
-    pub(crate) async fn shutdown(&self) {
-        debug_assert_current_span_has_tenant_and_timeline_id();
-
        // Signal any subscribers to our cancellation token to drop out
        tracing::debug!("Cancelling CancellationToken");
        self.cancel.cancel();

-        // Page request handlers might be waiting for LSN to advance: they do not respect Timeline::cancel
-        // while doing so.
-        self.last_record_lsn.shutdown();
-
-        // Shut down the layer flush task before the remote client, as one depends on the other
-        task_mgr::shutdown_tasks(
-            Some(TaskKind::LayerFlushTask),
-            Some(self.tenant_shard_id),
-            Some(self.timeline_id),
-        )
-        .await;
-
-        // Shut down remote timeline client: this gracefully moves its metadata into its Stopping state in
-        // case our caller wants to use that for a deletion
+        // Transition the remote_client into a state where it's only useful for timeline deletion.
+        // (The deletion use case is why we can't just hook up remote_client to Self::cancel).)
        if let Some(remote_client) = self.remote_client.as_ref() {
-            match remote_client.stop() {
-                Ok(()) => {}
-                Err(StopError::QueueUninitialized) => {
-                    // Shutting down during initialization is legal
-                }
-            }
+            remote_client.stop();
+            // As documented in remote_client.stop()'s doc comment, it's our responsibility
+            // to shut down the upload queue tasks.
+            // TODO: fix that, task management should be encapsulated inside remote_client.
+            task_mgr::shutdown_tasks(
+                Some(TaskKind::RemoteUploadTask),
+                Some(self.tenant_shard_id),
+                Some(self.timeline_id),
+            )
+            .await;
        }

+        // TODO: work toward making this a no-op. See this funciton's doc comment for more context.
        tracing::debug!("Waiting for tasks...");
-
        task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), Some(self.timeline_id)).await;

-        // Finally wait until any gate-holders are complete
+        // Finally wait until any gate-holders are complete.
+        //
+        // TODO: once above shutdown_tasks is a no-op, we can close the gate before calling shutdown_tasks
+        // and use a TBD variant of shutdown_tasks that asserts that there were no tasks left.
        self.gate.close().await;

        self.metrics.shutdown();
@@ -1443,6 +1580,53 @@ impl Timeline {
            Err(EvictionError::Timeout) => Ok(Some(false)),
        }
    }
+
+    fn should_roll(
+        &self,
+        layer_size: u64,
+        projected_layer_size: u64,
+        checkpoint_distance: u64,
+        projected_lsn: Lsn,
+        last_freeze_at: Lsn,
+        last_freeze_ts: Instant,
+    ) -> bool {
+        let distance = projected_lsn.widening_sub(last_freeze_at);
+
+        // Rolling the open layer can be triggered by:
+        // 1. The distance from the last LSN we rolled at. This bounds the amount of WAL that
+        //    the safekeepers need to store.  For sharded tenants, we multiply by shard count to
+        //    account for how writes are distributed across shards: we expect each node to consume
+        //    1/count of the LSN on average.
+        // 2. The size of the currently open layer.
+        // 3. The time since the last roll. It helps safekeepers to regard pageserver as caught
+        //    up and suspend activity.
+        if distance >= checkpoint_distance as i128 * self.shard_identity.count.count() as i128 {
+            info!(
+                "Will roll layer at {} with layer size {} due to LSN distance ({})",
+                projected_lsn, layer_size, distance
+            );
+
+            true
+        } else if projected_layer_size >= checkpoint_distance {
+            info!(
+                "Will roll layer at {} with layer size {} due to layer size ({})",
+                projected_lsn, layer_size, projected_layer_size
+            );
+
+            true
+        } else if distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout() {
+            info!(
+                "Will roll layer at {} with layer size {} due to time since last flush ({:?})",
+                projected_lsn,
+                layer_size,
+                last_freeze_ts.elapsed()
+            );
+
+            true
+        } else {
+            false
+        }
+    }
 }

 /// Number of times we will compute partition within a checkpoint distance.
@@ -1451,57 +1635,65 @@ const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10;
 // Private functions
 impl Timeline {
    pub(crate) fn get_lazy_slru_download(&self) -> bool {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load();
        tenant_conf
+            .tenant_conf
            .lazy_slru_download
            .unwrap_or(self.conf.default_tenant_conf.lazy_slru_download)
    }

    fn get_checkpoint_distance(&self) -> u64 {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load();
        tenant_conf
+            .tenant_conf
            .checkpoint_distance
            .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance)
    }

    fn get_checkpoint_timeout(&self) -> Duration {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load();
        tenant_conf
+            .tenant_conf
            .checkpoint_timeout
            .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout)
    }

    fn get_compaction_target_size(&self) -> u64 {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load();
        tenant_conf
+            .tenant_conf
            .compaction_target_size
            .unwrap_or(self.conf.default_tenant_conf.compaction_target_size)
    }

    fn get_compaction_threshold(&self) -> usize {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load();
        tenant_conf
+            .tenant_conf
            .compaction_threshold
            .unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
    }

    fn get_image_creation_threshold(&self) -> usize {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load();
        tenant_conf
+            .tenant_conf
            .image_creation_threshold
            .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
    }

    fn get_compaction_algorithm(&self) -> CompactionAlgorithm {
-        let tenant_conf = &self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = &self.tenant_conf.load();
        tenant_conf
+            .tenant_conf
            .compaction_algorithm
            .unwrap_or(self.conf.default_tenant_conf.compaction_algorithm)
    }

    fn get_eviction_policy(&self) -> EvictionPolicy {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load();
        tenant_conf
+            .tenant_conf
            .eviction_policy
            .unwrap_or(self.conf.default_tenant_conf.eviction_policy)
    }
@@ -1515,14 +1707,26 @@ impl Timeline {
            .unwrap_or(default_tenant_conf.evictions_low_residence_duration_metric_threshold)
    }

-    pub(super) fn tenant_conf_updated(&self) {
+    fn get_image_layer_creation_check_threshold(&self) -> u8 {
+        let tenant_conf = self.tenant_conf.load();
+        tenant_conf
+            .tenant_conf
+            .image_layer_creation_check_threshold
+            .unwrap_or(
+                self.conf
+                    .default_tenant_conf
+                    .image_layer_creation_check_threshold,
+            )
+    }
+
+    pub(super) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) {
        // NB: Most tenant conf options are read by background loops, so,
        // changes will automatically be picked up.

        // The threshold is embedded in the metric. So, we need to update it.
        {
            let new_threshold = Self::get_evictions_low_residence_duration_metric_threshold(
-                &self.tenant_conf.read().unwrap().tenant_conf,
+                new_conf,
                &self.conf.default_tenant_conf,
            );

@@ -1549,7 +1753,7 @@ impl Timeline {
    #[allow(clippy::too_many_arguments)]
    pub(super) fn new(
        conf: &'static PageServerConf,
-        tenant_conf: Arc<RwLock<AttachedTenantConf>>,
+        tenant_conf: Arc<ArcSwap<AttachedTenantConf>>,
        metadata: &TimelineMetadata,
        ancestor: Option<Arc<Timeline>>,
        timeline_id: TimelineId,
@@ -1568,14 +1772,13 @@ impl Timeline {
        let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0);
        let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(())));

-        let tenant_conf_guard = tenant_conf.read().unwrap();
-
-        let evictions_low_residence_duration_metric_threshold =
+        let evictions_low_residence_duration_metric_threshold = {
+            let loaded_tenant_conf = tenant_conf.load();
            Self::get_evictions_low_residence_duration_metric_threshold(
-                &tenant_conf_guard.tenant_conf,
+                &loaded_tenant_conf.tenant_conf,
                &conf.default_tenant_conf,
-            );
-        drop(tenant_conf_guard);
+            )
+        };

        Arc::new_cyclic(|myself| {
            let mut result = Timeline {
@@ -1652,6 +1855,7 @@ impl Timeline {
                },
                partitioning: tokio::sync::Mutex::new((KeyPartitioning::new(), Lsn(0))),
                repartition_threshold: 0,
+                last_image_layer_creation_check_at: AtomicLsn::new(0),

                last_received_wal: Mutex::new(None),
                rel_size_cache: RwLock::new(HashMap::new()),
@@ -1680,6 +1884,7 @@ impl Timeline {
            };
            result.repartition_threshold =
                result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
+
            result
                .metrics
                .last_record_gauge
@@ -1756,20 +1961,19 @@ impl Timeline {
            self.timeline_id, self.tenant_shard_id
        );

-        let tenant_conf_guard = self.tenant_conf.read().unwrap();
-        let wal_connect_timeout = tenant_conf_guard
+        let tenant_conf = self.tenant_conf.load();
+        let wal_connect_timeout = tenant_conf
            .tenant_conf
            .walreceiver_connect_timeout
            .unwrap_or(self.conf.default_tenant_conf.walreceiver_connect_timeout);
-        let lagging_wal_timeout = tenant_conf_guard
+        let lagging_wal_timeout = tenant_conf
            .tenant_conf
            .lagging_wal_timeout
            .unwrap_or(self.conf.default_tenant_conf.lagging_wal_timeout);
-        let max_lsn_wal_lag = tenant_conf_guard
+        let max_lsn_wal_lag = tenant_conf
            .tenant_conf
            .max_lsn_wal_lag
            .unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag);
-        drop(tenant_conf_guard);

        let mut guard = self.walreceiver.lock().unwrap();
        assert!(
@@ -2317,10 +2521,6 @@ impl Timeline {
                debug!("cancelling logical size calculation for timeline shutdown");
                calculation.await
            }
-            _ = task_mgr::shutdown_watcher() => {
-                debug!("cancelling logical size calculation for task shutdown");
-                calculation.await
-            }
        }
    }

@@ -2596,6 +2796,10 @@ impl Timeline {
                    // Get all the data needed to reconstruct the page version from this layer.
                    // But if we have an older cached page image, no need to go past that.
                    let lsn_floor = max(cached_lsn + 1, start_lsn);
+
+                    let open_layer = open_layer.clone();
+                    drop(guard);
+
                    result = match open_layer
                        .get_value_reconstruct_data(
                            key,
@@ -2613,10 +2817,7 @@ impl Timeline {
                    traversal_path.push((
                        result,
                        cont_lsn,
-                        Box::new({
-                            let open_layer = Arc::clone(open_layer);
-                            move || open_layer.traversal_id()
-                        }),
+                        Box::new(move || open_layer.traversal_id()),
                    ));
                    continue 'outer;
                }
@@ -2626,6 +2827,10 @@ impl Timeline {
                if cont_lsn > start_lsn {
                    //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display());
                    let lsn_floor = max(cached_lsn + 1, start_lsn);
+
+                    let frozen_layer = frozen_layer.clone();
+                    drop(guard);
+
                    result = match frozen_layer
                        .get_value_reconstruct_data(
                            key,
@@ -2643,10 +2848,7 @@ impl Timeline {
                    traversal_path.push((
                        result,
                        cont_lsn,
-                        Box::new({
-                            let frozen_layer = Arc::clone(frozen_layer);
-                            move || frozen_layer.traversal_id()
-                        }),
+                        Box::new(move || frozen_layer.traversal_id()),
                    ));
                    continue 'outer;
                }
@@ -2654,6 +2856,8 @@ impl Timeline {

            if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) {
                let layer = guard.get_from_desc(&layer);
+                drop(guard);
+
                // Get all the data needed to reconstruct the page version from this layer.
                // But if we have an older cached page image, no need to go past that.
                let lsn_floor = max(cached_lsn + 1, lsn_floor);
@@ -2771,16 +2975,6 @@ impl Timeline {

        let mut completed_keyspace = KeySpace::default();

-        // Hold the layer map whilst visiting the timeline to prevent
-        // compaction, eviction and flushes from rendering the layers unreadable.
-        //
-        // TODO: Do we actually need to do this? In theory holding on
-        // to [`tenant::storage_layer::Layer`] should be enough. However,
-        // [`Timeline::get`] also holds the lock during IO, so more investigation
-        // is needed.
-        let guard = timeline.layers.read().await;
-        let layers = guard.layer_map();
-
        loop {
            if cancel.is_cancelled() {
                return Err(GetVectoredError::Cancelled);
@@ -2790,6 +2984,9 @@ impl Timeline {
            unmapped_keyspace.remove_overlapping_with(&keys_done_last_step);
            completed_keyspace.merge(&keys_done_last_step);

+            let guard = timeline.layers.read().await;
+            let layers = guard.layer_map();
+
            let in_memory_layer = layers.find_in_memory_layer(|l| {
                let start_lsn = l.get_lsn_range().start;
                cont_lsn > start_lsn
@@ -2797,12 +2994,11 @@ impl Timeline {

            match in_memory_layer {
                Some(l) => {
+                    let lsn_range = l.get_lsn_range().start..cont_lsn;
                    fringe.update(
-                        ReadableLayerDesc::InMemory {
-                            handle: l,
-                            lsn_ceil: cont_lsn,
-                        },
+                        ReadableLayer::InMemoryLayer(l),
                        unmapped_keyspace.clone(),
+                        lsn_range,
                    );
                }
                None => {
@@ -2814,30 +3010,43 @@ impl Timeline {
                            .into_iter()
                            .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
                                (
-                                    ReadableLayerDesc::Persistent {
-                                        desc: (*layer).clone(),
-                                        lsn_range: lsn_floor..cont_lsn,
-                                    },
+                                    ReadableLayer::PersistentLayer(guard.get_from_desc(&layer)),
                                    keyspace_accum.to_keyspace(),
+                                    lsn_floor..cont_lsn,
                                )
                            })
-                            .for_each(|(layer, keyspace)| fringe.update(layer, keyspace));
+                            .for_each(|(layer, keyspace, lsn_range)| {
+                                fringe.update(layer, keyspace, lsn_range)
+                            });
                    }
                }
            }

-            if let Some((layer_to_read, keyspace_to_read)) = fringe.next_layer() {
+            // It's safe to drop the layer map lock after planning the next round of reads.
+            // The fringe keeps readable handles for the layers which are safe to read even
+            // if layers were compacted or flushed.
+            //
+            // The more interesting consideration is: "Why is the read algorithm still correct
+            // if the layer map changes while it is operating?". Doing a vectored read on a
+            // timeline boils down to pushing an imaginary lsn boundary downwards for each range
+            // covered by the read. The layer map tells us how to move the lsn downwards for a
+            // range at *a particular point in time*. It is fine for the answer to be different
+            // at two different time points.
+            drop(guard);
+
+            if let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() {
+                let next_cont_lsn = lsn_range.start;
                layer_to_read
                    .get_values_reconstruct_data(
-                        &guard,
                        keyspace_to_read.clone(),
+                        lsn_range,
                        reconstruct_state,
                        ctx,
                    )
                    .await?;

                unmapped_keyspace = keyspace_to_read;
-                cont_lsn = layer_to_read.get_lsn_floor();
+                cont_lsn = next_cont_lsn;
            } else {
                break;
            }
@@ -2915,7 +3124,7 @@ impl Timeline {
            }
        }
        ancestor
-            .wait_lsn(self.ancestor_lsn, ctx)
+            .wait_lsn(self.ancestor_lsn, WaitLsnWaiter::Timeline(self), ctx)
            .await
            .map_err(|e| match e {
                e @ WaitLsnError::Timeout(_) => GetReadyAncestorError::AncestorLsnTimeout(e),
@@ -2995,16 +3204,11 @@ impl Timeline {
        loop {
            tokio::select! {
                _ = self.cancel.cancelled() => {
-                    info!("shutting down layer flush task");
-                    break;
-                },
-                _ = task_mgr::shutdown_watcher() => {
-                    info!("shutting down layer flush task");
+                    info!("shutting down layer flush task due to Timeline::cancel");
                    break;
                },
                _ = layer_flush_start_rx.changed() => {}
            }
-
            trace!("waking up");
            let flush_counter = *layer_flush_start_rx.borrow();
            let result = loop {
@@ -3380,6 +3584,24 @@ impl Timeline {

    // Is it time to create a new image layer for the given partition?
    async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool {
+        let last = self.last_image_layer_creation_check_at.load();
+        if lsn != Lsn(0) {
+            let distance = lsn
+                .checked_sub(last)
+                .expect("Attempt to compact with LSN going backwards");
+
+            let min_distance = self.get_image_layer_creation_check_threshold() as u64
+                * self.get_checkpoint_distance();
+
+            // Skip the expensive delta layer counting below if we've not ingested
+            // sufficient WAL since the last check.
+            if distance.0 < min_distance {
+                return false;
+            }
+        }
+
+        self.last_image_layer_creation_check_at.store(lsn);
+
        let threshold = self.get_image_creation_threshold();

        let guard = self.layers.read().await;
@@ -3721,6 +3943,24 @@ impl Timeline {
        Ok(())
    }

+    /// Schedules the uploads of the given image layers
+    fn upload_new_image_layers(
+        self: &Arc<Self>,
+        new_images: impl IntoIterator<Item = ResidentLayer>,
+    ) -> anyhow::Result<()> {
+        let Some(remote_client) = &self.remote_client else {
+            return Ok(());
+        };
+        for layer in new_images {
+            remote_client.schedule_layer_file_upload(layer)?;
+        }
+        // should any new image layer been created, not uploading index_part will
+        // result in a mismatch between remote_physical_size and layermap calculated
+        // size, which will fail some tests, but should not be an issue otherwise.
+        remote_client.schedule_index_upload_for_file_changes()?;
+        Ok(())
+    }
+
    /// Update information about which layer files need to be retained on
    /// garbage collection. This is separate from actually performing the GC,
    /// and is updated more frequently, so that compaction can remove obsolete
@@ -4460,49 +4700,6 @@ impl<'a> TimelineWriter<'a> {
        res
    }

-    /// "Tick" the timeline writer: it will roll the open layer if required
-    /// and do nothing else.
-    pub(crate) async fn tick(&mut self) -> anyhow::Result<()> {
-        self.open_layer_if_present().await?;
-
-        let last_record_lsn = self.get_last_record_lsn();
-        let action = self.get_open_layer_action(last_record_lsn, 0);
-        if action == OpenLayerAction::Roll {
-            self.roll_layer(last_record_lsn).await?;
-        }
-
-        Ok(())
-    }
-
-    /// Populate the timeline writer state only if an in-memory layer
-    /// is already open.
-    async fn open_layer_if_present(&mut self) -> anyhow::Result<()> {
-        assert!(self.write_guard.is_none());
-
-        let open_layer = {
-            let guard = self.layers.read().await;
-            let layers = guard.layer_map();
-            match layers.open_layer {
-                Some(ref open_layer) => open_layer.clone(),
-                None => {
-                    return Ok(());
-                }
-            }
-        };
-
-        let initial_size = open_layer.size().await?;
-        let last_freeze_at = self.last_freeze_at.load();
-        let last_freeze_ts = *self.last_freeze_ts.read().unwrap();
-        self.write_guard.replace(TimelineWriterState::new(
-            open_layer,
-            initial_size,
-            last_freeze_at,
-            last_freeze_ts,
-        ));
-
-        Ok(())
-    }
-
    async fn handle_open_layer_action(
        &mut self,
        at: Lsn,
@@ -4574,43 +4771,14 @@ impl<'a> TimelineWriter<'a> {
            return OpenLayerAction::None;
        }

-        let distance = lsn.widening_sub(state.cached_last_freeze_at);
-        let proposed_open_layer_size = state.current_size + new_value_size;
-
-        // Rolling the open layer can be triggered by:
-        // 1. The distance from the last LSN we rolled at. This bounds the amount of WAL that
-        //    the safekeepers need to store.  For sharded tenants, we multiply by shard count to
-        //    account for how writes are distributed across shards: we expect each node to consume
-        //    1/count of the LSN on average.
-        // 2. The size of the currently open layer.
-        // 3. The time since the last roll. It helps safekeepers to regard pageserver as caught
-        //    up and suspend activity.
-        if distance
-            >= self.get_checkpoint_distance() as i128 * self.shard_identity.count.count() as i128
-        {
-            info!(
-                "Will roll layer at {} with layer size {} due to LSN distance ({})",
-                lsn, state.current_size, distance
-            );
-
-            OpenLayerAction::Roll
-        } else if proposed_open_layer_size >= self.get_checkpoint_distance() {
-            info!(
-                "Will roll layer at {} with layer size {} due to layer size ({})",
-                lsn, state.current_size, proposed_open_layer_size
-            );
-
-            OpenLayerAction::Roll
-        } else if distance > 0
-            && state.cached_last_freeze_ts.elapsed() >= self.get_checkpoint_timeout()
-        {
-            info!(
-                "Will roll layer at {} with layer size {} due to time since last flush ({:?})",
-                lsn,
-                state.current_size,
-                state.cached_last_freeze_ts.elapsed()
-            );
-
+        if self.tl.should_roll(
+            state.current_size,
+            state.current_size + new_value_size,
+            self.get_checkpoint_distance(),
+            lsn,
+            state.cached_last_freeze_at,
+            state.cached_last_freeze_ts,
+        ) {
            OpenLayerAction::Roll
        } else {
            OpenLayerAction::None
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -125,18 +125,8 @@ impl Timeline {
                    )
                    .await
                    .map_err(anyhow::Error::from)?;
-                if let Some(remote_client) = &self.remote_client {
-                    for layer in layers {
-                        remote_client.schedule_layer_file_upload(layer)?;
-                    }
-                }

-                if let Some(remote_client) = &self.remote_client {
-                    // should any new image layer been created, not uploading index_part will
-                    // result in a mismatch between remote_physical_size and layermap calculated
-                    // size, which will fail some tests, but should not be an issue otherwise.
-                    remote_client.schedule_index_upload_for_file_changes()?;
-                }
+                self.upload_new_image_layers(layers)?;
            }
            Err(err) => {
                // no partitioning? This is normal, if the timeline was just created
@@ -818,7 +808,10 @@ impl TimelineAdaptor {
        self.timeline
            .finish_compact_batch(&self.new_deltas, &self.new_images, &layers_to_delete)
            .await?;
-        self.new_images.clear();
+
+        self.timeline
+            .upload_new_image_layers(std::mem::take(&mut self.new_images))?;
+
        self.new_deltas.clear();
        self.layers_to_delete.clear();
        Ok(())
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -6,7 +6,7 @@ use std::{
 use anyhow::Context;
 use pageserver_api::{models::TimelineState, shard::TenantShardId};
 use tokio::sync::OwnedMutexGuard;
-use tracing::{debug, error, info, instrument, Instrument};
+use tracing::{error, info, instrument, Instrument};
 use utils::{crashsafe, fs_ext, id::TimelineId};

 use crate::{
@@ -14,81 +14,14 @@ use crate::{
    deletion_queue::DeletionQueueClient,
    task_mgr::{self, TaskKind},
    tenant::{
-        debug_assert_current_span_has_tenant_and_timeline_id,
        metadata::TimelineMetadata,
-        remote_timeline_client::{
-            self, PersistIndexPartWithDeletedFlagError, RemoteTimelineClient,
-        },
+        remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient},
        CreateTimelineCause, DeleteTimelineError, Tenant,
    },
 };

 use super::{Timeline, TimelineResources};

-/// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
-async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
-    debug_assert_current_span_has_tenant_and_timeline_id();
-    // Notify any timeline work to drop out of loops/requests
-    tracing::debug!("Cancelling CancellationToken");
-    timeline.cancel.cancel();
-
-    // Stop the walreceiver first.
-    debug!("waiting for wal receiver to shutdown");
-    let maybe_started_walreceiver = { timeline.walreceiver.lock().unwrap().take() };
-    if let Some(walreceiver) = maybe_started_walreceiver {
-        walreceiver.stop().await;
-    }
-    debug!("wal receiver shutdown confirmed");
-
-    // Shut down the layer flush task before the remote client, as one depends on the other
-    task_mgr::shutdown_tasks(
-        Some(TaskKind::LayerFlushTask),
-        Some(timeline.tenant_shard_id),
-        Some(timeline.timeline_id),
-    )
-    .await;
-
-    // Prevent new uploads from starting.
-    if let Some(remote_client) = timeline.remote_client.as_ref() {
-        let res = remote_client.stop();
-        match res {
-            Ok(()) => {}
-            Err(e) => match e {
-                remote_timeline_client::StopError::QueueUninitialized => {
-                    // This case shouldn't happen currently because the
-                    // load and attach code bails out if _any_ of the timeline fails to fetch its IndexPart.
-                    // That is, before we declare the Tenant as Active.
-                    // But we only allow calls to delete_timeline on Active tenants.
-                    return Err(DeleteTimelineError::Other(anyhow::anyhow!("upload queue is uninitialized, likely the timeline was in Broken state prior to this call because it failed to fetch IndexPart during load or attach, check the logs")));
-                }
-            },
-        }
-    }
-
-    // Stop & wait for the remaining timeline tasks, including upload tasks.
-    // NB: This and other delete_timeline calls do not run as a task_mgr task,
-    //     so, they are not affected by this shutdown_tasks() call.
-    info!("waiting for timeline tasks to shutdown");
-    task_mgr::shutdown_tasks(
-        None,
-        Some(timeline.tenant_shard_id),
-        Some(timeline.timeline_id),
-    )
-    .await;
-
-    fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
-        Err(anyhow::anyhow!(
-            "failpoint: timeline-delete-before-index-deleted-at"
-        ))?
-    });
-
-    tracing::debug!("Waiting for gate...");
-    timeline.gate.close().await;
-    tracing::debug!("Shutdown complete");
-
-    Ok(())
-}
-
 /// Mark timeline as deleted in S3 so we won't pick it up next time
 /// during attach or pageserver restart.
 /// See comment in persist_index_part_with_deleted_flag.
@@ -282,7 +215,14 @@ impl DeleteTimelineFlow {

        guard.mark_in_progress()?;

-        stop_tasks(&timeline).await?;
+        // Now that the Timeline is in Stopping state, request all the related tasks to shut down.
+        timeline.shutdown(super::ShutdownMode::Hard).await;
+
+        fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
+            Err(anyhow::anyhow!(
+                "failpoint: timeline-delete-before-index-deleted-at"
+            ))?
+        });

        set_deleted_in_remote_index(&timeline).await?;

--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -51,6 +51,7 @@ pub struct EvictionTaskTenantState {
 impl Timeline {
    pub(super) fn launch_eviction_task(
        self: &Arc<Self>,
+        parent: Arc<Tenant>,
        background_tasks_can_start: Option<&completion::Barrier>,
    ) {
        let self_clone = Arc::clone(self);
@@ -66,20 +67,19 @@ impl Timeline {
            ),
            false,
            async move {
-                let cancel = task_mgr::shutdown_token();
                tokio::select! {
-                    _ = cancel.cancelled() => { return Ok(()); }
+                    _ = self_clone.cancel.cancelled() => { return Ok(()); }
                    _ = completion::Barrier::maybe_wait(background_tasks_can_start) => {}
                };

-                self_clone.eviction_task(cancel).await;
+                self_clone.eviction_task(parent).await;
                Ok(())
            },
        );
    }

    #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))]
-    async fn eviction_task(self: Arc<Self>, cancel: CancellationToken) {
+    async fn eviction_task(self: Arc<Self>, tenant: Arc<Tenant>) {
        use crate::tenant::tasks::random_init_delay;

        // acquire the gate guard only once within a useful span
@@ -94,7 +94,7 @@ impl Timeline {
                EvictionPolicy::OnlyImitiate(lat) => lat.period,
                EvictionPolicy::NoEviction => Duration::from_secs(10),
            };
-            if random_init_delay(period, &cancel).await.is_err() {
+            if random_init_delay(period, &self.cancel).await.is_err() {
                return;
            }
        }
@@ -103,13 +103,13 @@ impl Timeline {
        loop {
            let policy = self.get_eviction_policy();
            let cf = self
-                .eviction_iteration(&policy, &cancel, &guard, &ctx)
+                .eviction_iteration(&tenant, &policy, &self.cancel, &guard, &ctx)
                .await;

            match cf {
                ControlFlow::Break(()) => break,
                ControlFlow::Continue(sleep_until) => {
-                    if tokio::time::timeout_at(sleep_until, cancel.cancelled())
+                    if tokio::time::timeout_at(sleep_until, self.cancel.cancelled())
                        .await
                        .is_ok()
                    {
@@ -123,6 +123,7 @@ impl Timeline {
    #[instrument(skip_all, fields(policy_kind = policy.discriminant_str()))]
    async fn eviction_iteration(
        self: &Arc<Self>,
+        tenant: &Tenant,
        policy: &EvictionPolicy,
        cancel: &CancellationToken,
        gate: &GateGuard,
@@ -137,7 +138,7 @@ impl Timeline {
            }
            EvictionPolicy::LayerAccessThreshold(p) => {
                match self
-                    .eviction_iteration_threshold(p, cancel, gate, ctx)
+                    .eviction_iteration_threshold(tenant, p, cancel, gate, ctx)
                    .await
                {
                    ControlFlow::Break(()) => return ControlFlow::Break(()),
@@ -146,7 +147,11 @@ impl Timeline {
                (p.period, p.threshold)
            }
            EvictionPolicy::OnlyImitiate(p) => {
-                if self.imitiate_only(p, cancel, gate, ctx).await.is_break() {
+                if self
+                    .imitiate_only(tenant, p, cancel, gate, ctx)
+                    .await
+                    .is_break()
+                {
                    return ControlFlow::Break(());
                }
                (p.period, p.threshold)
@@ -175,6 +180,7 @@ impl Timeline {

    async fn eviction_iteration_threshold(
        self: &Arc<Self>,
+        tenant: &Tenant,
        p: &EvictionPolicyLayerAccessThreshold,
        cancel: &CancellationToken,
        gate: &GateGuard,
@@ -193,7 +199,10 @@ impl Timeline {
            _ = self.cancel.cancelled() => return ControlFlow::Break(()),
        };

-        match self.imitate_layer_accesses(p, cancel, gate, ctx).await {
+        match self
+            .imitate_layer_accesses(tenant, p, cancel, gate, ctx)
+            .await
+        {
            ControlFlow::Break(()) => return ControlFlow::Break(()),
            ControlFlow::Continue(()) => (),
        }
@@ -315,6 +324,7 @@ impl Timeline {
    /// disk usage based eviction task.
    async fn imitiate_only(
        self: &Arc<Self>,
+        tenant: &Tenant,
        p: &EvictionPolicyLayerAccessThreshold,
        cancel: &CancellationToken,
        gate: &GateGuard,
@@ -331,7 +341,8 @@ impl Timeline {
            _ = self.cancel.cancelled() => return ControlFlow::Break(()),
        };

-        self.imitate_layer_accesses(p, cancel, gate, ctx).await
+        self.imitate_layer_accesses(tenant, p, cancel, gate, ctx)
+            .await
    }

    /// If we evict layers but keep cached values derived from those layers, then
@@ -361,6 +372,7 @@ impl Timeline {
    #[instrument(skip_all)]
    async fn imitate_layer_accesses(
        &self,
+        tenant: &Tenant,
        p: &EvictionPolicyLayerAccessThreshold,
        cancel: &CancellationToken,
        gate: &GateGuard,
@@ -396,17 +408,11 @@ impl Timeline {
        // Make one of the tenant's timelines draw the short straw and run the calculation.
        // The others wait until the calculation is done so that they take into account the
        // imitated accesses that the winner made.
-        let tenant = match crate::tenant::mgr::get_tenant(self.tenant_shard_id, true) {
-            Ok(t) => t,
-            Err(_) => {
-                return ControlFlow::Break(());
-            }
-        };
        let mut state = tenant.eviction_task_tenant_state.lock().await;
        match state.last_layer_access_imitation {
            Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
            _ => {
-                self.imitate_synthetic_size_calculation_worker(&tenant, cancel, ctx)
+                self.imitate_synthetic_size_calculation_worker(tenant, cancel, ctx)
                    .await;
                state.last_layer_access_imitation = Some(tokio::time::Instant::now());
            }
@@ -480,7 +486,7 @@ impl Timeline {
    #[instrument(skip_all)]
    async fn imitate_synthetic_size_calculation_worker(
        &self,
-        tenant: &Arc<Tenant>,
+        tenant: &Tenant,
        cancel: &CancellationToken,
        ctx: &RequestContext,
    ) {
--- a/pageserver/src/tenant/timeline/uninit.rs
+++ b/pageserver/src/tenant/timeline/uninit.rs
@@ -86,6 +86,7 @@ impl<'t> UninitializedTimeline<'t> {
    /// Prepares timeline data by loading it from the basebackup archive.
    pub(crate) async fn import_basebackup_from_tar(
        self,
+        tenant: Arc<Tenant>,
        copyin_read: &mut (impl tokio::io::AsyncRead + Send + Sync + Unpin),
        base_lsn: Lsn,
        broker_client: storage_broker::BrokerClientChannel,
@@ -114,7 +115,7 @@ impl<'t> UninitializedTimeline<'t> {

        // All the data has been imported. Insert the Timeline into the tenant's timelines map
        let tl = self.finish_creation()?;
-        tl.activate(broker_client, None, ctx);
+        tl.activate(tenant, broker_client, None, ctx);
        Ok(tl)
    }

--- a/pageserver/src/tenant/timeline/walreceiver.rs
+++ b/pageserver/src/tenant/timeline/walreceiver.rs
@@ -24,26 +24,21 @@ mod connection_manager;
 mod walreceiver_connection;

 use crate::context::{DownloadBehavior, RequestContext};
-use crate::task_mgr::{self, TaskKind, WALRECEIVER_RUNTIME};
+use crate::task_mgr::{TaskKind, WALRECEIVER_RUNTIME};
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::timeline::walreceiver::connection_manager::{
    connection_manager_loop_step, ConnectionManagerState,
 };

-use pageserver_api::shard::TenantShardId;
 use std::future::Future;
 use std::num::NonZeroU64;
-use std::ops::ControlFlow;
 use std::sync::Arc;
 use std::time::Duration;
 use storage_broker::BrokerClientChannel;
-use tokio::select;
 use tokio::sync::watch;
 use tokio_util::sync::CancellationToken;
 use tracing::*;

-use utils::id::TimelineId;
-
 use self::connection_manager::ConnectionManagerStatus;

 use super::Timeline;
@@ -62,9 +57,10 @@ pub struct WalReceiverConf {
 }

 pub struct WalReceiver {
-    tenant_shard_id: TenantShardId,
-    timeline_id: TimelineId,
    manager_status: Arc<std::sync::RwLock<Option<ConnectionManagerStatus>>>,
+    /// All task spawned by [`WalReceiver::start`] and its children are sensitive to this token.
+    /// It's a child token of [`Timeline`] so that timeline shutdown can cancel WalReceiver tasks early for `freeze_and_flush=true`.
+    cancel: CancellationToken,
 }

 impl WalReceiver {
@@ -78,65 +74,58 @@ impl WalReceiver {
        let timeline_id = timeline.timeline_id;
        let walreceiver_ctx =
            ctx.detached_child(TaskKind::WalReceiverManager, DownloadBehavior::Error);
-
        let loop_status = Arc::new(std::sync::RwLock::new(None));
        let manager_status = Arc::clone(&loop_status);
-        task_mgr::spawn(
-            WALRECEIVER_RUNTIME.handle(),
-            TaskKind::WalReceiverManager,
-            Some(timeline.tenant_shard_id),
-            Some(timeline_id),
-            &format!("walreceiver for timeline {tenant_shard_id}/{timeline_id}"),
-            false,
+        let cancel = timeline.cancel.child_token();
+        WALRECEIVER_RUNTIME.spawn({
+            let cancel = cancel.clone();
            async move {
                debug_assert_current_span_has_tenant_and_timeline_id();
+                // acquire timeline gate so we know the task doesn't outlive the Timeline
+                let Ok(_guard) = timeline.gate.enter() else {
+                    debug!("WAL receiver manager could not enter the gate timeline gate, it's closed already");
+                    return;
+                };
                debug!("WAL receiver manager started, connecting to broker");
                let mut connection_manager_state = ConnectionManagerState::new(
                    timeline,
                    conf,
+                    cancel.clone(),
                );
-                loop {
-                    select! {
-                        _ = task_mgr::shutdown_watcher() => {
-                            trace!("WAL receiver shutdown requested, shutting down");
+                while !cancel.is_cancelled() {
+                    let loop_step_result = connection_manager_loop_step(
+                        &mut broker_client,
+                        &mut connection_manager_state,
+                        &walreceiver_ctx,
+                        &cancel,
+                        &loop_status,
+                    ).await;
+                    match loop_step_result {
+                        Ok(()) => continue,
+                        Err(_cancelled) => {
+                            trace!("Connection manager loop ended, shutting down");
                            break;
-                        },
-                        loop_step_result = connection_manager_loop_step(
-                            &mut broker_client,
-                            &mut connection_manager_state,
-                            &walreceiver_ctx,
-                            &loop_status,
-                        ) => match loop_step_result {
-                            ControlFlow::Continue(()) => continue,
-                            ControlFlow::Break(()) => {
-                                trace!("Connection manager loop ended, shutting down");
-                                break;
-                            }
-                        },
+                        }
                    }
                }
-
                connection_manager_state.shutdown().await;
                *loop_status.write().unwrap() = None;
-                Ok(())
+                debug!("task exits");
            }
            .instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), timeline_id = %timeline_id))
-        );
+        });

        Self {
-            tenant_shard_id,
-            timeline_id,
            manager_status,
+            cancel,
        }
    }

-    pub async fn stop(self) {
-        task_mgr::shutdown_tasks(
-            Some(TaskKind::WalReceiverManager),
-            Some(self.tenant_shard_id),
-            Some(self.timeline_id),
-        )
-        .await;
+    #[instrument(skip_all, level = tracing::Level::DEBUG)]
+    pub fn cancel(&self) {
+        debug_assert_current_span_has_tenant_and_timeline_id();
+        debug!("cancelling walreceiver tasks");
+        self.cancel.cancel();
    }

    pub(crate) fn status(&self) -> Option<ConnectionManagerStatus> {
@@ -170,14 +159,18 @@ enum TaskStateUpdate<E> {

 impl<E: Clone> TaskHandle<E> {
    /// Initializes the task, starting it immediately after the creation.
+    ///
+    /// The second argument to `task` is a child token of `cancel_parent` ([`CancellationToken::child_token`]).
+    /// It being a child token enables us to provide a [`Self::shutdown`] method.
    fn spawn<Fut>(
+        cancel_parent: &CancellationToken,
        task: impl FnOnce(watch::Sender<TaskStateUpdate<E>>, CancellationToken) -> Fut + Send + 'static,
    ) -> Self
    where
        Fut: Future<Output = anyhow::Result<()>> + Send,
        E: Send + Sync + 'static,
    {
-        let cancellation = CancellationToken::new();
+        let cancellation = cancel_parent.child_token();
        let (events_sender, events_receiver) = watch::channel(TaskStateUpdate::Started);

        let cancellation_clone = cancellation.clone();
@@ -197,6 +190,9 @@ impl<E: Clone> TaskHandle<E> {
        }
    }

+    /// # Cancel-Safety
+    ///
+    /// Cancellation-safe.
    async fn next_task_event(&mut self) -> TaskEvent<E> {
        match self.events_receiver.changed().await {
            Ok(()) => TaskEvent::Update((self.events_receiver.borrow()).clone()),
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -17,7 +17,7 @@ use crate::metrics::{
    WALRECEIVER_ACTIVE_MANAGERS, WALRECEIVER_BROKER_UPDATES, WALRECEIVER_CANDIDATES_ADDED,
    WALRECEIVER_CANDIDATES_REMOVED, WALRECEIVER_SWITCHES,
 };
-use crate::task_mgr::{shutdown_token, TaskKind};
+use crate::task_mgr::TaskKind;
 use crate::tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline};
 use anyhow::Context;
 use chrono::{NaiveDateTime, Utc};
@@ -27,7 +27,7 @@ use storage_broker::proto::SafekeeperTimelineInfo;
 use storage_broker::proto::SubscribeSafekeeperInfoRequest;
 use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
 use storage_broker::{BrokerClientChannel, Code, Streaming};
-use tokio::select;
+use tokio_util::sync::CancellationToken;
 use tracing::*;

 use postgres_connection::PgConnectionConfig;
@@ -45,27 +45,33 @@ use super::{
    TaskEvent, TaskHandle,
 };

+pub(crate) struct Cancelled;
+
 /// Attempts to subscribe for timeline updates, pushed by safekeepers into the broker.
 /// Based on the updates, desides whether to start, keep or stop a WAL receiver task.
 /// If storage broker subscription is cancelled, exits.
+///
+/// # Cancel-Safety
+///
+/// Not cancellation-safe. Use `cancel` token to request cancellation.
 pub(super) async fn connection_manager_loop_step(
    broker_client: &mut BrokerClientChannel,
    connection_manager_state: &mut ConnectionManagerState,
    ctx: &RequestContext,
+    cancel: &CancellationToken,
    manager_status: &std::sync::RwLock<Option<ConnectionManagerStatus>>,
-) -> ControlFlow<(), ()> {
-    match connection_manager_state
-        .timeline
-        .wait_to_become_active(ctx)
-        .await
-    {
+) -> Result<(), Cancelled> {
+    match tokio::select! {
+        _ = cancel.cancelled() => { return Err(Cancelled); },
+        st = connection_manager_state.timeline.wait_to_become_active(ctx) => { st }
+    } {
        Ok(()) => {}
        Err(new_state) => {
            debug!(
                ?new_state,
                "state changed, stopping wal connection manager loop"
            );
-            return ControlFlow::Break(());
+            return Err(Cancelled);
        }
    }

@@ -86,7 +92,7 @@ pub(super) async fn connection_manager_loop_step(
    // Subscribe to the broker updates. Stream shares underlying TCP connection
    // with other streams on this client (other connection managers). When
    // object goes out of scope, stream finishes in drop() automatically.
-    let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id).await;
+    let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id, cancel).await?;
    debug!("Subscribed for broker timeline updates");

    loop {
@@ -94,6 +100,7 @@ pub(super) async fn connection_manager_loop_step(

        // These things are happening concurrently:
        //
+        // - cancellation request
        //  - keep receiving WAL on the current connection
        //      - if the shared state says we need to change connection, disconnect and return
        //      - this runs in a separate task and we receive updates via a watch channel
@@ -101,7 +108,11 @@ pub(super) async fn connection_manager_loop_step(
        //  - receive updates from broker
        //      - this might change the current desired connection
        //  - timeline state changes to something that does not allow walreceiver to run concurrently
-        select! {
+
+        // NB: make sure each of the select expressions are cancellation-safe
+        // (no need for arms to be cancellation-safe).
+        tokio::select! {
+            _ = cancel.cancelled() => { return Err(Cancelled); }
            Some(wal_connection_update) = async {
                match connection_manager_state.wal_connection.as_mut() {
                    Some(wal_connection) => Some(wal_connection.connection_task.next_task_event().await),
@@ -133,7 +144,7 @@ pub(super) async fn connection_manager_loop_step(
            },

            // Got a new update from the broker
-            broker_update = broker_subscription.message() => {
+            broker_update = broker_subscription.message() /* TODO: review cancellation-safety */ => {
                match broker_update {
                    Ok(Some(broker_update)) => connection_manager_state.register_timeline_update(broker_update),
                    Err(status) => {
@@ -147,16 +158,17 @@ pub(super) async fn connection_manager_loop_step(
                                warn!("broker subscription failed: {status}");
                            }
                        }
-                        return ControlFlow::Continue(());
+                        return Ok(());
                    }
                    Ok(None) => {
                        error!("broker subscription stream ended"); // can't happen
-                        return ControlFlow::Continue(());
+                        return Ok(());
                    }
                }
            },

            new_event = async {
+                // Reminder: this match arm needs to be cancellation-safe.
                loop {
                    if connection_manager_state.timeline.current_state() == TimelineState::Loading {
                        warn!("wal connection manager should only be launched after timeline has become active");
@@ -182,11 +194,11 @@ pub(super) async fn connection_manager_loop_step(
                }
            } => match new_event {
                ControlFlow::Continue(()) => {
-                    return ControlFlow::Continue(());
+                    return Ok(());
                }
                ControlFlow::Break(()) => {
                    debug!("Timeline is no longer active, stopping wal connection manager loop");
-                    return ControlFlow::Break(());
+                    return Err(Cancelled);
                }
            },

@@ -218,16 +230,15 @@ pub(super) async fn connection_manager_loop_step(
 async fn subscribe_for_timeline_updates(
    broker_client: &mut BrokerClientChannel,
    id: TenantTimelineId,
-) -> Streaming<SafekeeperTimelineInfo> {
+    cancel: &CancellationToken,
+) -> Result<Streaming<SafekeeperTimelineInfo>, Cancelled> {
    let mut attempt = 0;
-    let cancel = shutdown_token();
-
    loop {
        exponential_backoff(
            attempt,
            DEFAULT_BASE_BACKOFF_SECONDS,
            DEFAULT_MAX_BACKOFF_SECONDS,
-            &cancel,
+            cancel,
        )
        .await;
        attempt += 1;
@@ -241,9 +252,14 @@ async fn subscribe_for_timeline_updates(
            subscription_key: Some(key),
        };

-        match broker_client.subscribe_safekeeper_info(request).await {
+        match {
+            tokio::select! {
+                r = broker_client.subscribe_safekeeper_info(request) => { r }
+                _ = cancel.cancelled() => { return Err(Cancelled); }
+            }
+        } {
            Ok(resp) => {
-                return resp.into_inner();
+                return Ok(resp.into_inner());
            }
            Err(e) => {
                // Safekeeper nodes can stop pushing timeline updates to the broker, when no new writes happen and
@@ -264,6 +280,8 @@ pub(super) struct ConnectionManagerState {
    id: TenantTimelineId,
    /// Use pageserver data about the timeline to filter out some of the safekeepers.
    timeline: Arc<Timeline>,
+    /// Child token of [`super::WalReceiver::cancel`], inherited to all tasks we spawn.
+    cancel: CancellationToken,
    conf: WalReceiverConf,
    /// Current connection to safekeeper for WAL streaming.
    wal_connection: Option<WalConnection>,
@@ -386,7 +404,11 @@ struct BrokerSkTimeline {
 }

 impl ConnectionManagerState {
-    pub(super) fn new(timeline: Arc<Timeline>, conf: WalReceiverConf) -> Self {
+    pub(super) fn new(
+        timeline: Arc<Timeline>,
+        conf: WalReceiverConf,
+        cancel: CancellationToken,
+    ) -> Self {
        let id = TenantTimelineId {
            tenant_id: timeline.tenant_shard_id.tenant_id,
            timeline_id: timeline.timeline_id,
@@ -394,6 +416,7 @@ impl ConnectionManagerState {
        Self {
            id,
            timeline,
+            cancel,
            conf,
            wal_connection: None,
            wal_stream_candidates: HashMap::new(),
@@ -401,6 +424,22 @@ impl ConnectionManagerState {
        }
    }

+    fn spawn<Fut>(
+        &self,
+        task: impl FnOnce(
+                tokio::sync::watch::Sender<TaskStateUpdate<WalConnectionStatus>>,
+                CancellationToken,
+            ) -> Fut
+            + Send
+            + 'static,
+    ) -> TaskHandle<WalConnectionStatus>
+    where
+        Fut: std::future::Future<Output = anyhow::Result<()>> + Send,
+    {
+        // TODO: get rid of TaskHandle
+        super::TaskHandle::spawn(&self.cancel, task)
+    }
+
    /// Shuts down the current connection (if any) and immediately starts another one with the given connection string.
    async fn change_connection(&mut self, new_sk: NewWalConnectionCandidate, ctx: &RequestContext) {
        WALRECEIVER_SWITCHES
@@ -419,7 +458,7 @@ impl ConnectionManagerState {
        );

        let span = info_span!("connection", %node_id);
-        let connection_handle = TaskHandle::spawn(move |events_sender, cancellation| {
+        let connection_handle = self.spawn(move |events_sender, cancellation| {
            async move {
                debug_assert_current_span_has_tenant_and_timeline_id();

@@ -447,6 +486,12 @@ impl ConnectionManagerState {
                                info!("walreceiver connection handling ended: {e}");
                                Ok(())
                            }
+                            WalReceiverError::ClosedGate => {
+                                info!(
+                                    "walreceiver connection handling ended because of closed gate"
+                                );
+                                Ok(())
+                            }
                            WalReceiverError::Other(e) => {
                                // give out an error to have task_mgr give it a really verbose logging
                                if cancellation.is_cancelled() {
@@ -486,6 +531,10 @@ impl ConnectionManagerState {

    /// Drops the current connection (if any) and updates retry timeout for the next
    /// connection attempt to the same safekeeper.
+    ///
+    /// # Cancel-Safety
+    ///
+    /// Not cancellation-safe.
    async fn drop_old_connection(&mut self, needs_shutdown: bool) {
        let wal_connection = match self.wal_connection.take() {
            Some(wal_connection) => wal_connection,
@@ -493,7 +542,14 @@ impl ConnectionManagerState {
        };

        if needs_shutdown {
-            wal_connection.connection_task.shutdown().await;
+            wal_connection
+                .connection_task
+                .shutdown()
+                // This here is why this function isn't cancellation-safe.
+                // If we got cancelled here, then self.wal_connection is already None and we lose track of the task.
+                // Even if our caller diligently calls Self::shutdown(), it will find a self.wal_connection=None
+                // and thus be ineffective.
+                .await;
        }

        let retry = self
@@ -838,6 +894,9 @@ impl ConnectionManagerState {
        }
    }

+    /// # Cancel-Safety
+    ///
+    /// Not cancellation-safe.
    pub(super) async fn shutdown(mut self) {
        if let Some(wal_connection) = self.wal_connection.take() {
            wal_connection.connection_task.shutdown().await;
@@ -986,7 +1045,7 @@ mod tests {
            sk_id: connected_sk_id,
            availability_zone: None,
            status: connection_status,
-            connection_task: TaskHandle::spawn(move |sender, _| async move {
+            connection_task: state.spawn(move |sender, _| async move {
                sender
                    .send(TaskStateUpdate::Progress(connection_status))
                    .ok();
@@ -1154,7 +1213,7 @@ mod tests {
            sk_id: connected_sk_id,
            availability_zone: None,
            status: connection_status,
-            connection_task: TaskHandle::spawn(move |sender, _| async move {
+            connection_task: state.spawn(move |sender, _| async move {
                sender
                    .send(TaskStateUpdate::Progress(connection_status))
                    .ok();
@@ -1221,7 +1280,7 @@ mod tests {
            sk_id: NodeId(1),
            availability_zone: None,
            status: connection_status,
-            connection_task: TaskHandle::spawn(move |sender, _| async move {
+            connection_task: state.spawn(move |sender, _| async move {
                sender
                    .send(TaskStateUpdate::Progress(connection_status))
                    .ok();
@@ -1285,7 +1344,7 @@ mod tests {
            sk_id: NodeId(1),
            availability_zone: None,
            status: connection_status,
-            connection_task: TaskHandle::spawn(move |_, _| async move { Ok(()) }),
+            connection_task: state.spawn(move |_, _| async move { Ok(()) }),
            discovered_new_wal: Some(NewCommittedWAL {
                discovered_at: time_over_threshold,
                lsn: new_lsn,
@@ -1341,6 +1400,7 @@ mod tests {
                timeline_id: TIMELINE_ID,
            },
            timeline,
+            cancel: CancellationToken::new(),
            conf: WalReceiverConf {
                wal_connect_timeout: Duration::from_secs(1),
                lagging_wal_timeout: Duration::from_secs(1),
@@ -1384,7 +1444,7 @@ mod tests {
            sk_id: connected_sk_id,
            availability_zone: None,
            status: connection_status,
-            connection_task: TaskHandle::spawn(move |sender, _| async move {
+            connection_task: state.spawn(move |sender, _| async move {
                sender
                    .send(TaskStateUpdate::Progress(connection_status))
                    .ok();
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -27,7 +27,6 @@ use super::TaskStateUpdate;
 use crate::{
    context::RequestContext,
    metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST},
-    task_mgr,
    task_mgr::TaskKind,
    task_mgr::WALRECEIVER_RUNTIME,
    tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo},
@@ -37,8 +36,8 @@ use crate::{
 use postgres_backend::is_expected_io_error;
 use postgres_connection::PgConnectionConfig;
 use postgres_ffi::waldecoder::WalStreamDecoder;
-use utils::pageserver_feedback::PageserverFeedback;
 use utils::{id::NodeId, lsn::Lsn};
+use utils::{pageserver_feedback::PageserverFeedback, sync::gate::GateError};

 /// Status of the connection.
 #[derive(Debug, Clone, Copy)]
@@ -68,6 +67,7 @@ pub(super) enum WalReceiverError {
    SuccessfulCompletion(String),
    /// Generic error
    Other(anyhow::Error),
+    ClosedGate,
 }

 impl From<tokio_postgres::Error> for WalReceiverError {
@@ -119,6 +119,16 @@ pub(super) async fn handle_walreceiver_connection(
 ) -> Result<(), WalReceiverError> {
    debug_assert_current_span_has_tenant_and_timeline_id();

+    // prevent timeline shutdown from finishing until we have exited
+    let _guard = timeline.gate.enter().map_err(|e| match e {
+        GateError::GateClosed => WalReceiverError::ClosedGate,
+    })?;
+    // This function spawns a side-car task (WalReceiverConnectionPoller).
+    // Get its gate guard now as well.
+    let poller_guard = timeline.gate.enter().map_err(|e| match e {
+        GateError::GateClosed => WalReceiverError::ClosedGate,
+    })?;
+
    WALRECEIVER_STARTED_CONNECTIONS.inc();

    // Connect to the database in replication mode.
@@ -156,22 +166,19 @@ pub(super) async fn handle_walreceiver_connection(
    }

    // The connection object performs the actual communication with the database,
-    // so spawn it off to run on its own.
+    // so spawn it off to run on its own. It shouldn't outlive this function, but,
+    // due to lack of async drop, we can't enforce that. However, we ensure that
+    // 1. it is sensitive to `cancellation` and
+    // 2. holds the Timeline gate open so that after timeline shutdown,
+    //    we know this task is gone.
    let _connection_ctx = ctx.detached_child(
        TaskKind::WalReceiverConnectionPoller,
        ctx.download_behavior(),
    );
    let connection_cancellation = cancellation.clone();
-    task_mgr::spawn(
-        WALRECEIVER_RUNTIME.handle(),
-        TaskKind::WalReceiverConnectionPoller,
-        Some(timeline.tenant_shard_id),
-        Some(timeline.timeline_id),
-        "walreceiver connection",
-        false,
+    WALRECEIVER_RUNTIME.spawn(
        async move {
            debug_assert_current_span_has_tenant_and_timeline_id();
-
            select! {
                connection_result = connection => match connection_result {
                    Ok(()) => debug!("Walreceiver db connection closed"),
@@ -182,6 +189,9 @@ pub(super) async fn handle_walreceiver_connection(
                                // with a similar error.
                            },
                            WalReceiverError::SuccessfulCompletion(_) => {}
+                            WalReceiverError::ClosedGate => {
+                                // doesn't happen at runtime
+                            }
                            WalReceiverError::Other(err) => {
                                warn!("Connection aborted: {err:#}")
                            }
@@ -190,7 +200,7 @@ pub(super) async fn handle_walreceiver_connection(
                },
                _ = connection_cancellation.cancelled() => debug!("Connection cancelled"),
            }
-            Ok(())
+            drop(poller_guard);
        }
        // Enrich the log lines emitted by this closure with meaningful context.
        // TODO: technically, this task outlives the surrounding function, so, the
@@ -303,6 +313,7 @@ pub(super) async fn handle_walreceiver_connection(

                trace!("received XLogData between {startlsn} and {endlsn}");

+                WAL_INGEST.bytes_received.inc_by(data.len() as u64);
                waldecoder.feed_bytes(data);

                {
@@ -389,17 +400,6 @@ pub(super) async fn handle_walreceiver_connection(
            }
        }

-        {
-            // This is a hack. It piggybacks on the keepalive messages sent by the
-            // safekeeper in order to enforce `checkpoint_timeout` on the currently
-            // open layer. This hack doesn't provide a bound on the total size of
-            // in-memory layers on a pageserver. See https://github.com/neondatabase/neon/issues/6916.
-            let mut writer = timeline.writer().await;
-            if let Err(err) = writer.tick().await {
-                warn!("Timeline writer tick failed: {err}");
-            }
-        }
-
        if let Some(last_lsn) = status_update {
            let timeline_remote_consistent_lsn = timeline
                .get_remote_consistent_lsn_visible()
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -121,11 +121,16 @@ pub(super) enum SetDeletedFlagProgress {
    Successful(NaiveDateTime),
 }

-pub(super) struct UploadQueueStopped {
+pub(super) struct UploadQueueStoppedDeletable {
    pub(super) upload_queue_for_deletion: UploadQueueInitialized,
    pub(super) deleted_at: SetDeletedFlagProgress,
 }

+pub(super) enum UploadQueueStopped {
+    Deletable(UploadQueueStoppedDeletable),
+    Uninitialized,
+}
+
 #[derive(thiserror::Error, Debug)]
 pub(crate) enum NotInitialized {
    #[error("queue is in state Uninitialized")]
@@ -249,12 +254,15 @@ impl UploadQueue {
        }
    }

-    pub(crate) fn stopped_mut(&mut self) -> anyhow::Result<&mut UploadQueueStopped> {
+    pub(crate) fn stopped_mut(&mut self) -> anyhow::Result<&mut UploadQueueStoppedDeletable> {
        match self {
            UploadQueue::Initialized(_) | UploadQueue::Uninitialized => {
                anyhow::bail!("queue is in state {}", self.as_str())
            }
-            UploadQueue::Stopped(stopped) => Ok(stopped),
+            UploadQueue::Stopped(UploadQueueStopped::Uninitialized) => {
+                anyhow::bail!("queue is in state Stopped(Uninitialized)")
+            }
+            UploadQueue::Stopped(UploadQueueStopped::Deletable(deletable)) => Ok(deletable),
        }
    }
 }
--- a/pageserver/src/tenant/vectored_blob_io.rs
+++ b/pageserver/src/tenant/vectored_blob_io.rs
@@ -61,7 +61,7 @@ pub struct VectoredRead {
 }

 impl VectoredRead {
-    fn size(&self) -> usize {
+    pub fn size(&self) -> usize {
        (self.end - self.start) as usize
    }
 }
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -782,7 +782,7 @@ where
        }
    }
    // NB: don't use `buf.is_empty()` here; it is from the
-    // `impl Deref for Slice { Target = [u8] }`; the the &[u8]
+    // `impl Deref for Slice { Target = [u8] }`; the &[u8]
    // returned by it only covers the initialized portion of `buf`.
    // Whereas we're interested in ensuring that we filled the entire
    // buffer that the user passed in.
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -111,6 +111,7 @@ static PageServer page_servers[MAX_SHARDS];

 static bool pageserver_flush(shardno_t shard_no);
 static void pageserver_disconnect(shardno_t shard_no);
+static void pageserver_disconnect_shard(shardno_t shard_no);

 static bool
 PagestoreShmemIsValid(void)
@@ -487,9 +488,31 @@ retry:
 	return ret;
 }

-
+/*
+ * Reset prefetch and drop connection to the shard.
+ * It also drops connection to all other shards involved in prefetch.
+ */
 static void
 pageserver_disconnect(shardno_t shard_no)
+{
+	if (page_servers[shard_no].conn)
+	{
+		/*
+		 * If the connection to any pageserver is lost, we throw away the
+		 * whole prefetch queue, even for other pageservers. It should not
+		 * cause big problems, because connection loss is supposed to be a
+		 * rare event.
+		 */
+		prefetch_on_ps_disconnect();
+	}
+	pageserver_disconnect_shard(shard_no);
+}
+
+/*
+ * Disconnect from specified shard
+ */
+static void
+pageserver_disconnect_shard(shardno_t shard_no)
 {
 	/*
 	 * If anything goes wrong while we were sending a request, it's not clear
@@ -503,14 +526,6 @@ pageserver_disconnect(shardno_t shard_no)
 		neon_shard_log(shard_no, LOG, "dropping connection to page server due to error");
 		PQfinish(page_servers[shard_no].conn);
 		page_servers[shard_no].conn = NULL;
-
-		/*
-		 * If the connection to any pageserver is lost, we throw away the
-		 * whole prefetch queue, even for other pageservers. It should not
-		 * cause big problems, because connection loss is supposed to be a
-		 * rare event.
-		 */
-		prefetch_on_ps_disconnect();
 	}
 	if (page_servers[shard_no].wes != NULL)
 	{
@@ -676,7 +691,8 @@ page_server_api api =
 {
 	.send = pageserver_send,
 	.flush = pageserver_flush,
-	.receive = pageserver_receive
+	.receive = pageserver_receive,
+	.disconnect = pageserver_disconnect_shard
 };

 static bool
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -312,7 +312,7 @@ pg_cluster_size(PG_FUNCTION_ARGS)
 {
 	int64		size;

-	size = GetZenithCurrentClusterSize();
+	size = GetNeonCurrentClusterSize();

 	if (size == 0)
 		PG_RETURN_NULL();
--- a/pgxn/neon/neon.h
+++ b/pgxn/neon/neon.h
@@ -26,6 +26,8 @@ extern void pg_init_libpagestore(void);
 extern void pg_init_walproposer(void);

 extern uint64 BackpressureThrottlingTime(void);
+extern void SetNeonCurrentClusterSize(uint64 size);
+extern uint64 GetNeonCurrentClusterSize(void);
 extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn);

 extern void PGDLLEXPORT WalProposerSync(int argc, char *argv[]);
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -180,6 +180,7 @@ typedef struct
 	bool		(*send) (shardno_t  shard_no, NeonRequest * request);
 	NeonResponse *(*receive) (shardno_t shard_no);
 	bool		(*flush) (shardno_t shard_no);
+	void        (*disconnect) (shardno_t shard_no);
 } page_server_api;

 extern void prefetch_on_ps_disconnect(void);
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -613,6 +613,14 @@ prefetch_on_ps_disconnect(void)
 		Assert(slot->status == PRFS_REQUESTED);
 		Assert(slot->my_ring_index == ring_index);

+		/*
+		 * Drop connection to all shards which have prefetch requests.
+		 * It is not a problem to call disconnect multiple times on the same connection
+		 * because disconnect implementation in libpagestore.c will check if connection
+		 * is alive and do nothing of connection was already dropped.
+		 */
+		page_server->disconnect(slot->shard_no);
+
 		/* clean up the request */
 		slot->status = PRFS_TAG_REMAINS;
 		MyPState->n_requests_inflight -= 1;
@@ -1680,7 +1688,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
 			break;

 		default:
-			neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
+			neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_exists", resp->tag);
 	}
 	pfree(resp);
 	return exists;
@@ -1831,7 +1839,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 		reln->smgr_relpersistence == RELPERSISTENCE_PERMANENT &&
 		!IsAutoVacuumWorkerProcess())
 	{
-		uint64		current_size = GetZenithCurrentClusterSize();
+		uint64		current_size = GetNeonCurrentClusterSize();

 		if (current_size >= ((uint64) max_cluster_size) * 1024 * 1024)
 			ereport(ERROR,
@@ -1912,7 +1920,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
 		reln->smgr_relpersistence == RELPERSISTENCE_PERMANENT &&
 		!IsAutoVacuumWorkerProcess())
 	{
-		uint64		current_size = GetZenithCurrentClusterSize();
+		uint64		current_size = GetNeonCurrentClusterSize();

 		if (current_size >= ((uint64) max_cluster_size) * 1024 * 1024)
 			ereport(ERROR,
@@ -2216,7 +2224,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 							   ((NeonErrorResponse *) resp)->message)));
 			break;
 		default:
-			neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
+			neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_read_at_lsn", resp->tag);
 	}

 	/* buffer was used, clean up for later reuse */
@@ -2489,7 +2497,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 			break;

 		default:
-			neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
+			neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_nblocks", resp->tag);
 	}
 	update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);

@@ -2544,7 +2552,7 @@ neon_dbsize(Oid dbNode)
 			break;

 		default:
-			neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
+			neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_dbsize", resp->tag);
 	}

 	neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
@@ -2849,7 +2857,7 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
 			break;

 		default:
-			neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
+			neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_read_slru_segment", resp->tag);
 	}
 	pfree(resp);

--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -287,6 +287,7 @@ typedef struct WalproposerShmemState
 	slock_t		mutex;
 	term_t		mineLastElectedTerm;
 	pg_atomic_uint64 backpressureThrottlingTime;
+	pg_atomic_uint64 currentClusterSize;

 	/* last feedback from each shard */
 	PageserverFeedback shard_ps_feedback[MAX_SHARDS];
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -282,6 +282,7 @@ WalproposerShmemInit(void)
 		memset(walprop_shared, 0, WalproposerShmemSize());
 		SpinLockInit(&walprop_shared->mutex);
 		pg_atomic_init_u64(&walprop_shared->backpressureThrottlingTime, 0);
+		pg_atomic_init_u64(&walprop_shared->currentClusterSize, 0);
 	}
 	LWLockRelease(AddinShmemInitLock);

@@ -1972,7 +1973,7 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk)

 		/* Only one main shard sends non-zero currentClusterSize */
 		if (sk->appendResponse.ps_feedback.currentClusterSize > 0)
-			SetZenithCurrentClusterSize(sk->appendResponse.ps_feedback.currentClusterSize);
+			SetNeonCurrentClusterSize(sk->appendResponse.ps_feedback.currentClusterSize);

 		if (min_feedback.disk_consistent_lsn != standby_apply_lsn)
 		{
@@ -2094,6 +2095,18 @@ GetLogRepRestartLSN(WalProposer *wp)
 	return lrRestartLsn;
 }

+void SetNeonCurrentClusterSize(uint64 size)
+{
+	pg_atomic_write_u64(&walprop_shared->currentClusterSize, size);
+}
+
+uint64 GetNeonCurrentClusterSize(void)
+{
+	return pg_atomic_read_u64(&walprop_shared->currentClusterSize);
+}
+uint64 GetNeonCurrentClusterSize(void);
+
+
 static const walproposer_api walprop_pg = {
 	.get_shmem_state = walprop_pg_get_shmem_state,
 	.start_streaming = walprop_pg_start_streaming,
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -10,7 +10,12 @@ testing = []

 [dependencies]
 anyhow.workspace = true
+async-compression.workspace = true
 async-trait.workspace = true
+aws-config.workspace = true
+aws-sdk-iam.workspace = true
+aws-sigv4.workspace = true
+aws-types.workspace = true
 base64.workspace = true
 bstr.workspace = true
 bytes = { workspace = true, features = ["serde"] }
@@ -27,6 +32,7 @@ hashlink.workspace = true
 hex.workspace = true
 hmac.workspace = true
 hostname.workspace = true
+http.workspace = true
 humantime.workspace = true
 hyper-tungstenite.workspace = true
 hyper.workspace = true
@@ -92,6 +98,7 @@ workspace_hack.workspace = true

 [dev-dependencies]
 camino-tempfile.workspace = true
+fallible-iterator.workspace = true
 rcgen.workspace = true
 rstest.workspace = true
 tokio-postgres-rustls.workspace = true
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -12,6 +12,8 @@ use crate::console::errors::GetAuthInfoError;
 use crate::console::provider::{CachedRoleSecret, ConsoleBackend};
 use crate::console::{AuthSecret, NodeInfo};
 use crate::context::RequestMonitoring;
+use crate::intern::EndpointIdInt;
+use crate::metrics::{AUTH_RATE_LIMIT_HITS, ENDPOINTS_AUTH_RATE_LIMITED};
 use crate::proxy::connect_compute::ComputeConnectBackend;
 use crate::proxy::NeonOptions;
 use crate::stream::Stream;
@@ -28,7 +30,7 @@ use crate::{
 use crate::{scram, EndpointCacheKey, EndpointId, RoleName};
 use std::sync::Arc;
 use tokio::io::{AsyncRead, AsyncWrite};
-use tracing::info;
+use tracing::{info, warn};

 /// Alternative to [`std::borrow::Cow`] but doesn't need `T: ToOwned` as we don't need that functionality
 pub enum MaybeOwned<'a, T> {
@@ -174,6 +176,52 @@ impl TryFrom<ComputeUserInfoMaybeEndpoint> for ComputeUserInfo {
    }
 }

+impl AuthenticationConfig {
+    pub fn check_rate_limit(
+        &self,
+
+        ctx: &mut RequestMonitoring,
+        secret: AuthSecret,
+        endpoint: &EndpointId,
+        is_cleartext: bool,
+    ) -> auth::Result<AuthSecret> {
+        // we have validated the endpoint exists, so let's intern it.
+        let endpoint_int = EndpointIdInt::from(endpoint);
+
+        // only count the full hash count if password hack or websocket flow.
+        // in other words, if proxy needs to run the hashing
+        let password_weight = if is_cleartext {
+            match &secret {
+                #[cfg(any(test, feature = "testing"))]
+                AuthSecret::Md5(_) => 1,
+                AuthSecret::Scram(s) => s.iterations + 1,
+            }
+        } else {
+            // validating scram takes just 1 hmac_sha_256 operation.
+            1
+        };
+
+        let limit_not_exceeded = self
+            .rate_limiter
+            .check((endpoint_int, ctx.peer_addr), password_weight);
+
+        if !limit_not_exceeded {
+            warn!(
+                enabled = self.rate_limiter_enabled,
+                "rate limiting authentication"
+            );
+            AUTH_RATE_LIMIT_HITS.inc();
+            ENDPOINTS_AUTH_RATE_LIMITED.measure(endpoint);
+
+            if self.rate_limiter_enabled {
+                return Err(auth::AuthError::too_many_connections());
+            }
+        }
+
+        Ok(secret)
+    }
+}
+
 /// True to its name, this function encapsulates our current auth trade-offs.
 /// Here, we choose the appropriate auth flow based on circumstances.
 ///
@@ -214,14 +262,24 @@ async fn auth_quirks(
        Some(secret) => secret,
        None => api.get_role_secret(ctx, &info).await?,
    };
+    let (cached_entry, secret) = cached_secret.take_value();
+
+    let secret = match secret {
+        Some(secret) => config.check_rate_limit(
+            ctx,
+            secret,
+            &info.endpoint,
+            unauthenticated_password.is_some() || allow_cleartext,
+        )?,
+        None => {
+            // If we don't have an authentication secret, we mock one to
+            // prevent malicious probing (possible due to missing protocol steps).
+            // This mocked secret will never lead to successful authentication.
+            info!("authentication info not found, mocking it");
+            AuthSecret::Scram(scram::ServerSecret::mock(rand::random()))
+        }
+    };

-    let secret = cached_secret.value.clone().unwrap_or_else(|| {
-        // If we don't have an authentication secret, we mock one to
-        // prevent malicious probing (possible due to missing protocol steps).
-        // This mocked secret will never lead to successful authentication.
-        info!("authentication info not found, mocking it");
-        AuthSecret::Scram(scram::ServerSecret::mock(&info.user, rand::random()))
-    });
    match authenticate_with_secret(
        ctx,
        secret,
@@ -237,7 +295,7 @@ async fn auth_quirks(
        Err(e) => {
            if e.is_auth_failed() {
                // The password could have been changed, so we invalidate the cache.
-                cached_secret.invalidate();
+                cached_entry.invalidate();
            }
            Err(e)
        }
@@ -408,3 +466,232 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> {
        }
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+
+    use bytes::BytesMut;
+    use fallible_iterator::FallibleIterator;
+    use once_cell::sync::Lazy;
+    use postgres_protocol::{
+        authentication::sasl::{ChannelBinding, ScramSha256},
+        message::{backend::Message as PgMessage, frontend},
+    };
+    use provider::AuthSecret;
+    use tokio::io::{AsyncRead, AsyncReadExt, AsyncWriteExt};
+
+    use crate::{
+        auth::{ComputeUserInfoMaybeEndpoint, IpPattern},
+        config::AuthenticationConfig,
+        console::{
+            self,
+            provider::{self, CachedAllowedIps, CachedRoleSecret},
+            CachedNodeInfo,
+        },
+        context::RequestMonitoring,
+        proxy::NeonOptions,
+        rate_limiter::{AuthRateLimiter, RateBucketInfo},
+        scram::ServerSecret,
+        stream::{PqStream, Stream},
+    };
+
+    use super::auth_quirks;
+
+    struct Auth {
+        ips: Vec<IpPattern>,
+        secret: AuthSecret,
+    }
+
+    impl console::Api for Auth {
+        async fn get_role_secret(
+            &self,
+            _ctx: &mut RequestMonitoring,
+            _user_info: &super::ComputeUserInfo,
+        ) -> Result<CachedRoleSecret, console::errors::GetAuthInfoError> {
+            Ok(CachedRoleSecret::new_uncached(Some(self.secret.clone())))
+        }
+
+        async fn get_allowed_ips_and_secret(
+            &self,
+            _ctx: &mut RequestMonitoring,
+            _user_info: &super::ComputeUserInfo,
+        ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), console::errors::GetAuthInfoError>
+        {
+            Ok((
+                CachedAllowedIps::new_uncached(Arc::new(self.ips.clone())),
+                Some(CachedRoleSecret::new_uncached(Some(self.secret.clone()))),
+            ))
+        }
+
+        async fn wake_compute(
+            &self,
+            _ctx: &mut RequestMonitoring,
+            _user_info: &super::ComputeUserInfo,
+        ) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
+            unimplemented!()
+        }
+    }
+
+    static CONFIG: Lazy<AuthenticationConfig> = Lazy::new(|| AuthenticationConfig {
+        scram_protocol_timeout: std::time::Duration::from_secs(5),
+        rate_limiter_enabled: true,
+        rate_limiter: AuthRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET),
+    });
+
+    async fn read_message(r: &mut (impl AsyncRead + Unpin), b: &mut BytesMut) -> PgMessage {
+        loop {
+            r.read_buf(&mut *b).await.unwrap();
+            if let Some(m) = PgMessage::parse(&mut *b).unwrap() {
+                break m;
+            }
+        }
+    }
+
+    #[tokio::test]
+    async fn auth_quirks_scram() {
+        let (mut client, server) = tokio::io::duplex(1024);
+        let mut stream = PqStream::new(Stream::from_raw(server));
+
+        let mut ctx = RequestMonitoring::test();
+        let api = Auth {
+            ips: vec![],
+            secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
+        };
+
+        let user_info = ComputeUserInfoMaybeEndpoint {
+            user: "conrad".into(),
+            endpoint_id: Some("endpoint".into()),
+            options: NeonOptions::default(),
+        };
+
+        let handle = tokio::spawn(async move {
+            let mut scram = ScramSha256::new(b"my-secret-password", ChannelBinding::unsupported());
+
+            let mut read = BytesMut::new();
+
+            // server should offer scram
+            match read_message(&mut client, &mut read).await {
+                PgMessage::AuthenticationSasl(a) => {
+                    let options: Vec<&str> = a.mechanisms().collect().unwrap();
+                    assert_eq!(options, ["SCRAM-SHA-256"]);
+                }
+                _ => panic!("wrong message"),
+            }
+
+            // client sends client-first-message
+            let mut write = BytesMut::new();
+            frontend::sasl_initial_response("SCRAM-SHA-256", scram.message(), &mut write).unwrap();
+            client.write_all(&write).await.unwrap();
+
+            // server response with server-first-message
+            match read_message(&mut client, &mut read).await {
+                PgMessage::AuthenticationSaslContinue(a) => {
+                    scram.update(a.data()).await.unwrap();
+                }
+                _ => panic!("wrong message"),
+            }
+
+            // client response with client-final-message
+            write.clear();
+            frontend::sasl_response(scram.message(), &mut write).unwrap();
+            client.write_all(&write).await.unwrap();
+
+            // server response with server-final-message
+            match read_message(&mut client, &mut read).await {
+                PgMessage::AuthenticationSaslFinal(a) => {
+                    scram.finish(a.data()).unwrap();
+                }
+                _ => panic!("wrong message"),
+            }
+        });
+
+        let _creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, false, &CONFIG)
+            .await
+            .unwrap();
+
+        handle.await.unwrap();
+    }
+
+    #[tokio::test]
+    async fn auth_quirks_cleartext() {
+        let (mut client, server) = tokio::io::duplex(1024);
+        let mut stream = PqStream::new(Stream::from_raw(server));
+
+        let mut ctx = RequestMonitoring::test();
+        let api = Auth {
+            ips: vec![],
+            secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
+        };
+
+        let user_info = ComputeUserInfoMaybeEndpoint {
+            user: "conrad".into(),
+            endpoint_id: Some("endpoint".into()),
+            options: NeonOptions::default(),
+        };
+
+        let handle = tokio::spawn(async move {
+            let mut read = BytesMut::new();
+            let mut write = BytesMut::new();
+
+            // server should offer cleartext
+            match read_message(&mut client, &mut read).await {
+                PgMessage::AuthenticationCleartextPassword => {}
+                _ => panic!("wrong message"),
+            }
+
+            // client responds with password
+            write.clear();
+            frontend::password_message(b"my-secret-password", &mut write).unwrap();
+            client.write_all(&write).await.unwrap();
+        });
+
+        let _creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, true, &CONFIG)
+            .await
+            .unwrap();
+
+        handle.await.unwrap();
+    }
+
+    #[tokio::test]
+    async fn auth_quirks_password_hack() {
+        let (mut client, server) = tokio::io::duplex(1024);
+        let mut stream = PqStream::new(Stream::from_raw(server));
+
+        let mut ctx = RequestMonitoring::test();
+        let api = Auth {
+            ips: vec![],
+            secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
+        };
+
+        let user_info = ComputeUserInfoMaybeEndpoint {
+            user: "conrad".into(),
+            endpoint_id: None,
+            options: NeonOptions::default(),
+        };
+
+        let handle = tokio::spawn(async move {
+            let mut read = BytesMut::new();
+
+            // server should offer cleartext
+            match read_message(&mut client, &mut read).await {
+                PgMessage::AuthenticationCleartextPassword => {}
+                _ => panic!("wrong message"),
+            }
+
+            // client responds with password
+            let mut write = BytesMut::new();
+            frontend::password_message(b"endpoint=my-endpoint;my-secret-password", &mut write)
+                .unwrap();
+            client.write_all(&write).await.unwrap();
+        });
+
+        let creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, true, &CONFIG)
+            .await
+            .unwrap();
+
+        assert_eq!(creds.info.endpoint, "my-endpoint");
+
+        handle.await.unwrap();
+    }
+}
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -1,8 +1,16 @@
+use aws_config::environment::EnvironmentVariableCredentialsProvider;
+use aws_config::imds::credentials::ImdsCredentialsProvider;
+use aws_config::meta::credentials::CredentialsProviderChain;
+use aws_config::meta::region::RegionProviderChain;
+use aws_config::profile::ProfileFileCredentialsProvider;
+use aws_config::provider_config::ProviderConfig;
+use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider;
 use futures::future::Either;
 use proxy::auth;
 use proxy::auth::backend::MaybeOwned;
 use proxy::cancellation::CancelMap;
 use proxy::cancellation::CancellationHandler;
+use proxy::config::remote_storage_from_toml;
 use proxy::config::AuthenticationConfig;
 use proxy::config::CacheOptions;
 use proxy::config::HttpConfig;
@@ -10,11 +18,15 @@ use proxy::config::ProjectInfoCacheOptions;
 use proxy::console;
 use proxy::context::parquet::ParquetUploadArgs;
 use proxy::http;
+use proxy::metrics::NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT;
+use proxy::rate_limiter::AuthRateLimiter;
 use proxy::rate_limiter::EndpointRateLimiter;
 use proxy::rate_limiter::RateBucketInfo;
 use proxy::rate_limiter::RateLimiterConfig;
+use proxy::redis::cancellation_publisher::RedisPublisherClient;
+use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
+use proxy::redis::elasticache;
 use proxy::redis::notifications;
-use proxy::redis::publisher::RedisPublisherClient;
 use proxy::serverless::GlobalConnPoolOptions;
 use proxy::usage_metrics;

@@ -131,10 +143,16 @@ struct ProxyCliArgs {
    ///
    /// Provided in the form '<Requests Per Second>@<Bucket Duration Size>'.
    /// Can be given multiple times for different bucket sizes.
-    #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)]
+    #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)]
    endpoint_rps_limit: Vec<RateBucketInfo>,
+    /// Whether the auth rate limiter actually takes effect (for testing)
+    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
+    auth_rate_limit_enabled: bool,
+    /// Authentication rate limiter max number of hashes per second.
+    #[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)]
+    auth_rate_limit: Vec<RateBucketInfo>,
    /// Redis rate limiter max number of requests per second.
-    #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)]
+    #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)]
    redis_rps_limit: Vec<RateBucketInfo>,
    /// Initial limit for dynamic rate limiter. Makes sense only if `rate_limit_algorithm` is *not* `None`.
    #[clap(long, default_value_t = 100)]
@@ -150,15 +168,43 @@ struct ProxyCliArgs {
    /// disable ip check for http requests. If it is too time consuming, it could be turned off.
    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
    disable_ip_check_for_http: bool,
-    /// redis url for notifications.
+    /// redis url for notifications (if empty, redis_host:port will be used for both notifications and streaming connections)
    #[clap(long)]
    redis_notifications: Option<String>,
+    /// redis host for streaming connections (might be different from the notifications host)
+    #[clap(long)]
+    redis_host: Option<String>,
+    /// redis port for streaming connections (might be different from the notifications host)
+    #[clap(long)]
+    redis_port: Option<u16>,
+    /// redis cluster name, used in aws elasticache
+    #[clap(long)]
+    redis_cluster_name: Option<String>,
+    /// redis user_id, used in aws elasticache
+    #[clap(long)]
+    redis_user_id: Option<String>,
+    /// aws region to retrieve credentials
+    #[clap(long, default_value_t = String::new())]
+    aws_region: String,
    /// cache for `project_info` (use `size=0` to disable)
    #[clap(long, default_value = config::ProjectInfoCacheOptions::CACHE_DEFAULT_OPTIONS)]
    project_info_cache: String,

    #[clap(flatten)]
    parquet_upload: ParquetUploadArgs,
+
+    /// interval for backup metric collection
+    #[clap(long, default_value = "10m", value_parser = humantime::parse_duration)]
+    metric_backup_collection_interval: std::time::Duration,
+    /// remote storage configuration for backup metric collection
+    /// Encoded as toml (same format as pageservers), eg
+    /// `{bucket_name='the-bucket',bucket_region='us-east-1',prefix_in_bucket='proxy',endpoint='http://minio:9000'}`
+    #[clap(long, default_value = "{}")]
+    metric_backup_collection_remote_storage: String,
+    /// chunk size for backup metric collection
+    /// Size of each event is no more than 400 bytes, so 2**22 is about 200MB before the compression.
+    #[clap(long, default_value = "4194304")]
+    metric_backup_collection_chunk_size: usize,
 }

 #[derive(clap::Args, Clone, Copy, Debug)]
@@ -216,6 +262,61 @@ async fn main() -> anyhow::Result<()> {
    let config = build_config(&args)?;

    info!("Authentication backend: {}", config.auth_backend);
+    info!("Using region: {}", config.aws_region);
+
+    let region_provider = RegionProviderChain::default_provider().or_else(&*config.aws_region); // Replace with your Redis region if needed
+    let provider_conf =
+        ProviderConfig::without_region().with_region(region_provider.region().await);
+    let aws_credentials_provider = {
+        // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
+        CredentialsProviderChain::first_try("env", EnvironmentVariableCredentialsProvider::new())
+            // uses "AWS_PROFILE" / `aws sso login --profile <profile>`
+            .or_else(
+                "profile-sso",
+                ProfileFileCredentialsProvider::builder()
+                    .configure(&provider_conf)
+                    .build(),
+            )
+            // uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME"
+            // needed to access remote extensions bucket
+            .or_else(
+                "token",
+                WebIdentityTokenCredentialsProvider::builder()
+                    .configure(&provider_conf)
+                    .build(),
+            )
+            // uses imds v2
+            .or_else("imds", ImdsCredentialsProvider::builder().build())
+    };
+    let elasticache_credentials_provider = Arc::new(elasticache::CredentialsProvider::new(
+        elasticache::AWSIRSAConfig::new(
+            config.aws_region.clone(),
+            args.redis_cluster_name,
+            args.redis_user_id,
+        ),
+        aws_credentials_provider,
+    ));
+    let redis_notifications_client =
+        match (args.redis_notifications, (args.redis_host, args.redis_port)) {
+            (Some(url), _) => {
+                info!("Starting redis notifications listener ({url})");
+                Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url))
+            }
+            (None, (Some(host), Some(port))) => Some(
+                ConnectionWithCredentialsProvider::new_with_credentials_provider(
+                    host,
+                    port,
+                    elasticache_credentials_provider.clone(),
+                ),
+            ),
+            (None, (None, None)) => {
+                warn!("Redis is disabled");
+                None
+            }
+            _ => {
+                bail!("redis-host and redis-port must be specified together");
+            }
+        };

    // Check that we can bind to address before further initialization
    let http_address: SocketAddr = args.http.parse()?;
@@ -233,17 +334,22 @@ async fn main() -> anyhow::Result<()> {

    let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(&config.endpoint_rps_limit));
    let cancel_map = CancelMap::default();
-    let redis_publisher = match &args.redis_notifications {
-        Some(url) => Some(Arc::new(Mutex::new(RedisPublisherClient::new(
-            url,
+
+    // let redis_notifications_client = redis_notifications_client.map(|x| Box::leak(Box::new(x)));
+    let redis_publisher = match &redis_notifications_client {
+        Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new(
+            redis_publisher.clone(),
            args.region.clone(),
            &config.redis_rps_limit,
        )?))),
        None => None,
    };
-    let cancellation_handler = Arc::new(CancellationHandler::new(
+    let cancellation_handler = Arc::new(CancellationHandler::<
+        Option<Arc<tokio::sync::Mutex<RedisPublisherClient>>>,
+    >::new(
        cancel_map.clone(),
        redis_publisher,
+        NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT,
    ));

    // client facing tasks. these will exit on error or on cancellation
@@ -280,27 +386,31 @@ async fn main() -> anyhow::Result<()> {

    // maintenance tasks. these never return unless there's an error
    let mut maintenance_tasks = JoinSet::new();
-    maintenance_tasks.spawn(proxy::handle_signals(cancellation_token));
+    maintenance_tasks.spawn(proxy::handle_signals(cancellation_token.clone()));
    maintenance_tasks.spawn(http::health_server::task_main(http_listener));
    maintenance_tasks.spawn(console::mgmt::task_main(mgmt_listener));

    if let Some(metrics_config) = &config.metric_collection {
+        // TODO: Add gc regardles of the metric collection being enabled.
        maintenance_tasks.spawn(usage_metrics::task_main(metrics_config));
+        client_tasks.spawn(usage_metrics::task_backup(
+            &metrics_config.backup_metric_collection_config,
+            cancellation_token,
+        ));
    }

    if let auth::BackendType::Console(api, _) = &config.auth_backend {
        if let proxy::console::provider::ConsoleBackend::Console(api) = &**api {
-            let cache = api.caches.project_info.clone();
-            if let Some(url) = args.redis_notifications {
-                info!("Starting redis notifications listener ({url})");
+            if let Some(redis_notifications_client) = redis_notifications_client {
+                let cache = api.caches.project_info.clone();
                maintenance_tasks.spawn(notifications::task_main(
-                    url.to_owned(),
+                    redis_notifications_client.clone(),
                    cache.clone(),
                    cancel_map.clone(),
                    args.region.clone(),
                ));
+                maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });
            }
-            maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });
        }
    }

@@ -343,6 +453,13 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
    if args.allow_self_signed_compute {
        warn!("allowing self-signed compute certificates");
    }
+    let backup_metric_collection_config = config::MetricBackupCollectionConfig {
+        interval: args.metric_backup_collection_interval,
+        remote_storage_config: remote_storage_from_toml(
+            &args.metric_backup_collection_remote_storage,
+        )?,
+        chunk_size: args.metric_backup_collection_chunk_size,
+    };

    let metric_collection = match (
        &args.metric_collection_endpoint,
@@ -351,6 +468,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
        (Some(endpoint), Some(interval)) => Some(config::MetricCollectionConfig {
            endpoint: endpoint.parse()?,
            interval: humantime::parse_duration(interval)?,
+            backup_metric_collection_config,
        }),
        (None, None) => None,
        _ => bail!(
@@ -426,6 +544,8 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
    };
    let authentication_config = AuthenticationConfig {
        scram_protocol_timeout: args.scram_protocol_timeout,
+        rate_limiter_enabled: args.auth_rate_limit_enabled,
+        rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()),
    };

    let mut endpoint_rps_limit = args.endpoint_rps_limit.clone();
@@ -445,8 +565,8 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
        endpoint_rps_limit,
        redis_rps_limit,
        handshake_timeout: args.handshake_timeout,
-        // TODO: add this argument
        region: args.region.clone(),
+        aws_region: args.aws_region.clone(),
    }));

    Ok(config)
--- a/proxy/src/cache/common.rs
+++ b/proxy/src/cache/common.rs
@@ -43,6 +43,16 @@ impl<C: Cache, V> Cached<C, V> {
        Self { token: None, value }
    }

+    pub fn take_value(self) -> (Cached<C, ()>, V) {
+        (
+            Cached {
+                token: self.token,
+                value: (),
+            },
+            self.value,
+        )
+    }
+
    /// Drop this entry from a cache if it's still there.
    pub fn invalidate(self) -> V {
        if let Some((cache, info)) = &self.token {
--- a/proxy/src/cache/project_info.rs
+++ b/proxy/src/cache/project_info.rs
@@ -373,10 +373,7 @@ mod tests {
        let endpoint_id = "endpoint".into();
        let user1: RoleName = "user1".into();
        let user2: RoleName = "user2".into();
-        let secret1 = Some(AuthSecret::Scram(ServerSecret::mock(
-            user1.as_str(),
-            [1; 32],
-        )));
+        let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32])));
        let secret2 = None;
        let allowed_ips = Arc::new(vec![
            "127.0.0.1".parse().unwrap(),
@@ -395,10 +392,7 @@ mod tests {

        // Shouldn't add more than 2 roles.
        let user3: RoleName = "user3".into();
-        let secret3 = Some(AuthSecret::Scram(ServerSecret::mock(
-            user3.as_str(),
-            [3; 32],
-        )));
+        let secret3 = Some(AuthSecret::Scram(ServerSecret::mock([3; 32])));
        cache.insert_role_secret(&project_id, &endpoint_id, &user3, secret3.clone());
        assert!(cache.get_role_secret(&endpoint_id, &user3).is_none());

@@ -431,14 +425,8 @@ mod tests {
        let endpoint_id = "endpoint".into();
        let user1: RoleName = "user1".into();
        let user2: RoleName = "user2".into();
-        let secret1 = Some(AuthSecret::Scram(ServerSecret::mock(
-            user1.as_str(),
-            [1; 32],
-        )));
-        let secret2 = Some(AuthSecret::Scram(ServerSecret::mock(
-            user2.as_str(),
-            [2; 32],
-        )));
+        let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32])));
+        let secret2 = Some(AuthSecret::Scram(ServerSecret::mock([2; 32])));
        let allowed_ips = Arc::new(vec![
            "127.0.0.1".parse().unwrap(),
            "127.0.0.2".parse().unwrap(),
@@ -486,14 +474,8 @@ mod tests {
        let endpoint_id = "endpoint".into();
        let user1: RoleName = "user1".into();
        let user2: RoleName = "user2".into();
-        let secret1 = Some(AuthSecret::Scram(ServerSecret::mock(
-            user1.as_str(),
-            [1; 32],
-        )));
-        let secret2 = Some(AuthSecret::Scram(ServerSecret::mock(
-            user2.as_str(),
-            [2; 32],
-        )));
+        let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32])));
+        let secret2 = Some(AuthSecret::Scram(ServerSecret::mock([2; 32])));
        let allowed_ips = Arc::new(vec![
            "127.0.0.1".parse().unwrap(),
            "127.0.0.2".parse().unwrap(),
--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -1,4 +1,3 @@
-use async_trait::async_trait;
 use dashmap::DashMap;
 use pq_proto::CancelKeyData;
 use std::{net::SocketAddr, sync::Arc};
@@ -10,18 +9,26 @@ use tracing::info;
 use uuid::Uuid;

 use crate::{
-    error::ReportableError, metrics::NUM_CANCELLATION_REQUESTS,
-    redis::publisher::RedisPublisherClient,
+    error::ReportableError,
+    metrics::NUM_CANCELLATION_REQUESTS,
+    redis::cancellation_publisher::{
+        CancellationPublisher, CancellationPublisherMut, RedisPublisherClient,
+    },
 };

 pub type CancelMap = Arc<DashMap<CancelKeyData, Option<CancelClosure>>>;
+pub type CancellationHandlerMain = CancellationHandler<Option<Arc<Mutex<RedisPublisherClient>>>>;
+pub type CancellationHandlerMainInternal = Option<Arc<Mutex<RedisPublisherClient>>>;

 /// Enables serving `CancelRequest`s.
 ///
-/// If there is a `RedisPublisherClient` available, it will be used to publish the cancellation key to other proxy instances.
-pub struct CancellationHandler {
+/// If `CancellationPublisher` is available, cancel request will be used to publish the cancellation key to other proxy instances.
+pub struct CancellationHandler<P> {
    map: CancelMap,
-    redis_client: Option<Arc<Mutex<RedisPublisherClient>>>,
+    client: P,
+    /// This field used for the monitoring purposes.
+    /// Represents the source of the cancellation request.
+    from: &'static str,
 }

 #[derive(Debug, Error)]
@@ -44,49 +51,9 @@ impl ReportableError for CancelError {
    }
 }

-impl CancellationHandler {
-    pub fn new(map: CancelMap, redis_client: Option<Arc<Mutex<RedisPublisherClient>>>) -> Self {
-        Self { map, redis_client }
-    }
-    /// Cancel a running query for the corresponding connection.
-    pub async fn cancel_session(
-        &self,
-        key: CancelKeyData,
-        session_id: Uuid,
-    ) -> Result<(), CancelError> {
-        let from = "from_client";
-        // NB: we should immediately release the lock after cloning the token.
-        let Some(cancel_closure) = self.map.get(&key).and_then(|x| x.clone()) else {
-            tracing::warn!("query cancellation key not found: {key}");
-            if let Some(redis_client) = &self.redis_client {
-                NUM_CANCELLATION_REQUESTS
-                    .with_label_values(&[from, "not_found"])
-                    .inc();
-                info!("publishing cancellation key to Redis");
-                match redis_client.lock().await.try_publish(key, session_id).await {
-                    Ok(()) => {
-                        info!("cancellation key successfuly published to Redis");
-                    }
-                    Err(e) => {
-                        tracing::error!("failed to publish a message: {e}");
-                        return Err(CancelError::IO(std::io::Error::new(
-                            std::io::ErrorKind::Other,
-                            e.to_string(),
-                        )));
-                    }
-                }
-            }
-            return Ok(());
-        };
-        NUM_CANCELLATION_REQUESTS
-            .with_label_values(&[from, "found"])
-            .inc();
-        info!("cancelling query per user's request using key {key}");
-        cancel_closure.try_cancel_query().await
-    }
-
+impl<P: CancellationPublisher> CancellationHandler<P> {
    /// Run async action within an ephemeral session identified by [`CancelKeyData`].
-    pub fn get_session(self: Arc<Self>) -> Session {
+    pub fn get_session(self: Arc<Self>) -> Session<P> {
        // HACK: We'd rather get the real backend_pid but tokio_postgres doesn't
        // expose it and we don't want to do another roundtrip to query
        // for it. The client will be able to notice that this is not the
@@ -112,9 +79,39 @@ impl CancellationHandler {
            cancellation_handler: self,
        }
    }
+    /// Try to cancel a running query for the corresponding connection.
+    /// If the cancellation key is not found, it will be published to Redis.
+    pub async fn cancel_session(
+        &self,
+        key: CancelKeyData,
+        session_id: Uuid,
+    ) -> Result<(), CancelError> {
+        // NB: we should immediately release the lock after cloning the token.
+        let Some(cancel_closure) = self.map.get(&key).and_then(|x| x.clone()) else {
+            tracing::warn!("query cancellation key not found: {key}");
+            NUM_CANCELLATION_REQUESTS
+                .with_label_values(&[self.from, "not_found"])
+                .inc();
+            match self.client.try_publish(key, session_id).await {
+                Ok(()) => {} // do nothing
+                Err(e) => {
+                    return Err(CancelError::IO(std::io::Error::new(
+                        std::io::ErrorKind::Other,
+                        e.to_string(),
+                    )));
+                }
+            }
+            return Ok(());
+        };
+        NUM_CANCELLATION_REQUESTS
+            .with_label_values(&[self.from, "found"])
+            .inc();
+        info!("cancelling query per user's request using key {key}");
+        cancel_closure.try_cancel_query().await
+    }

    #[cfg(test)]
-    fn contains(&self, session: &Session) -> bool {
+    fn contains(&self, session: &Session<P>) -> bool {
        self.map.contains_key(&session.key)
    }

@@ -124,31 +121,19 @@ impl CancellationHandler {
    }
 }

-#[async_trait]
-pub trait NotificationsCancellationHandler {
-    async fn cancel_session_no_publish(&self, key: CancelKeyData) -> Result<(), CancelError>;
+impl CancellationHandler<()> {
+    pub fn new(map: CancelMap, from: &'static str) -> Self {
+        Self {
+            map,
+            client: (),
+            from,
+        }
+    }
 }

-#[async_trait]
-impl NotificationsCancellationHandler for CancellationHandler {
-    async fn cancel_session_no_publish(&self, key: CancelKeyData) -> Result<(), CancelError> {
-        let from = "from_redis";
-        let cancel_closure = self.map.get(&key).and_then(|x| x.clone());
-        match cancel_closure {
-            Some(cancel_closure) => {
-                NUM_CANCELLATION_REQUESTS
-                    .with_label_values(&[from, "found"])
-                    .inc();
-                cancel_closure.try_cancel_query().await
-            }
-            None => {
-                NUM_CANCELLATION_REQUESTS
-                    .with_label_values(&[from, "not_found"])
-                    .inc();
-                tracing::warn!("query cancellation key not found: {key}");
-                Ok(())
-            }
-        }
+impl<P: CancellationPublisherMut> CancellationHandler<Option<Arc<Mutex<P>>>> {
+    pub fn new(map: CancelMap, client: Option<Arc<Mutex<P>>>, from: &'static str) -> Self {
+        Self { map, client, from }
    }
 }

@@ -178,14 +163,14 @@ impl CancelClosure {
 }

 /// Helper for registering query cancellation tokens.
-pub struct Session {
+pub struct Session<P> {
    /// The user-facing key identifying this session.
    key: CancelKeyData,
    /// The [`CancelMap`] this session belongs to.
-    cancellation_handler: Arc<CancellationHandler>,
+    cancellation_handler: Arc<CancellationHandler<P>>,
 }

-impl Session {
+impl<P> Session<P> {
    /// Store the cancel token for the given session.
    /// This enables query cancellation in `crate::proxy::prepare_client_connection`.
    pub fn enable_query_cancellation(&self, cancel_closure: CancelClosure) -> CancelKeyData {
@@ -198,7 +183,7 @@ impl Session {
    }
 }

-impl Drop for Session {
+impl<P> Drop for Session<P> {
    fn drop(&mut self) {
        self.cancellation_handler.map.remove(&self.key);
        info!("dropped query cancellation key {}", &self.key);
@@ -207,14 +192,16 @@ impl Drop for Session {

 #[cfg(test)]
 mod tests {
+    use crate::metrics::NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS;
+
    use super::*;

    #[tokio::test]
    async fn check_session_drop() -> anyhow::Result<()> {
-        let cancellation_handler = Arc::new(CancellationHandler {
-            map: CancelMap::default(),
-            redis_client: None,
-        });
+        let cancellation_handler = Arc::new(CancellationHandler::<()>::new(
+            CancelMap::default(),
+            NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS,
+        ));

        let session = cancellation_handler.clone().get_session();
        assert!(cancellation_handler.contains(&session));
@@ -224,4 +211,19 @@ mod tests {

        Ok(())
    }
+
+    #[tokio::test]
+    async fn cancel_session_noop_regression() {
+        let handler = CancellationHandler::<()>::new(Default::default(), "local");
+        handler
+            .cancel_session(
+                CancelKeyData {
+                    backend_pid: 0,
+                    cancel_key: 0,
+                },
+                Uuid::new_v4(),
+            )
+            .await
+            .unwrap();
+    }
 }
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -82,14 +82,13 @@ pub type ScramKeys = tokio_postgres::config::ScramKeys<32>;
 /// A config for establishing a connection to compute node.
 /// Eventually, `tokio_postgres` will be replaced with something better.
 /// Newtype allows us to implement methods on top of it.
-#[derive(Clone)]
-#[repr(transparent)]
+#[derive(Clone, Default)]
 pub struct ConnCfg(Box<tokio_postgres::Config>);

 /// Creation and initialization routines.
 impl ConnCfg {
    pub fn new() -> Self {
-        Self(Default::default())
+        Self::default()
    }

    /// Reuse password or auth keys from the other config.
@@ -165,12 +164,6 @@ impl std::ops::DerefMut for ConnCfg {
    }
 }

-impl Default for ConnCfg {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
 impl ConnCfg {
    /// Establish a raw TCP connection to the compute node.
    async fn connect_raw(&self, timeout: Duration) -> io::Result<(SocketAddr, TcpStream, &str)> {
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -1,6 +1,11 @@
-use crate::{auth, rate_limiter::RateBucketInfo, serverless::GlobalConnPoolOptions};
+use crate::{
+    auth,
+    rate_limiter::{AuthRateLimiter, RateBucketInfo},
+    serverless::GlobalConnPoolOptions,
+};
 use anyhow::{bail, ensure, Context, Ok};
 use itertools::Itertools;
+use remote_storage::RemoteStorageConfig;
 use rustls::{
    crypto::ring::sign,
    pki_types::{CertificateDer, PrivateKeyDer},
@@ -28,12 +33,14 @@ pub struct ProxyConfig {
    pub redis_rps_limit: Vec<RateBucketInfo>,
    pub region: String,
    pub handshake_timeout: Duration,
+    pub aws_region: String,
 }

 #[derive(Debug)]
 pub struct MetricCollectionConfig {
    pub endpoint: reqwest::Url,
    pub interval: Duration,
+    pub backup_metric_collection_config: MetricBackupCollectionConfig,
 }

 pub struct TlsConfig {
@@ -49,6 +56,8 @@ pub struct HttpConfig {

 pub struct AuthenticationConfig {
    pub scram_protocol_timeout: tokio::time::Duration,
+    pub rate_limiter_enabled: bool,
+    pub rate_limiter: AuthRateLimiter,
 }

 impl TlsConfig {
@@ -304,6 +313,21 @@ impl CertResolver {
    }
 }

+#[derive(Debug)]
+pub struct MetricBackupCollectionConfig {
+    pub interval: Duration,
+    pub remote_storage_config: OptRemoteStorageConfig,
+    pub chunk_size: usize,
+}
+
+/// Hack to avoid clap being smarter. If you don't use this type alias, clap assumes more about the optional state and you get
+/// runtime type errors from the value parser we use.
+pub type OptRemoteStorageConfig = Option<RemoteStorageConfig>;
+
+pub fn remote_storage_from_toml(s: &str) -> anyhow::Result<OptRemoteStorageConfig> {
+    RemoteStorageConfig::from_toml(&s.parse()?)
+}
+
 /// Helper for cmdline cache options parsing.
 #[derive(Debug)]
 pub struct CacheOptions {
--- a/proxy/src/console.rs
+++ b/proxy/src/console.rs
@@ -6,7 +6,7 @@ pub mod messages;

 /// Wrappers for console APIs and their mocks.
 pub mod provider;
-pub use provider::{errors, Api, AuthSecret, CachedNodeInfo, NodeInfo};
+pub(crate) use provider::{errors, Api, AuthSecret, CachedNodeInfo, NodeInfo};

 /// Various cache-related types.
 pub mod caches {
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -14,7 +14,6 @@ use crate::{
    context::RequestMonitoring,
    scram, EndpointCacheKey, ProjectId,
 };
-use async_trait::async_trait;
 use dashmap::DashMap;
 use std::{sync::Arc, time::Duration};
 use tokio::sync::{OwnedSemaphorePermit, Semaphore};
@@ -326,8 +325,7 @@ pub type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc<Vec<IpPatt

 /// This will allocate per each call, but the http requests alone
 /// already require a few allocations, so it should be fine.
-#[async_trait]
-pub trait Api {
+pub(crate) trait Api {
    /// Get the client's auth secret for authentication.
    /// Returns option because user not found situation is special.
    /// We still have to mock the scram to avoid leaking information that user doesn't exist.
@@ -363,7 +361,6 @@ pub enum ConsoleBackend {
    Test(Box<dyn crate::auth::backend::TestBackend>),
 }

-#[async_trait]
 impl Api for ConsoleBackend {
    async fn get_role_secret(
        &self,
--- a/proxy/src/console/provider/mock.rs
+++ b/proxy/src/console/provider/mock.rs
@@ -8,7 +8,6 @@ use crate::console::provider::{CachedAllowedIps, CachedRoleSecret};
 use crate::context::RequestMonitoring;
 use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl};
 use crate::{auth::IpPattern, cache::Cached};
-use async_trait::async_trait;
 use futures::TryFutureExt;
 use std::{str::FromStr, sync::Arc};
 use thiserror::Error;
@@ -144,7 +143,6 @@ async fn get_execute_postgres_query(
    Ok(Some(entry))
 }

-#[async_trait]
 impl super::Api for Api {
    #[tracing::instrument(skip_all)]
    async fn get_role_secret(
--- a/Show More
+++ b/Show More
				`@@ -0,0 +1,2 @@`

				`ALTER TABLE tenant_shards add scheduling_policy VARCHAR NOT NULL DEFAULT '"Active"';`