Prepare for the first stage of deployment: do not bump the format version and do not write data in the new format, but recognize the new format

Make ruff happy
Add test for compression
2026-03-07 18:30:37 +00:00 · 2024-03-15 10:02:51 +02:00 · 2024-03-14 18:05:30 +02:00 · 2024-03-14 16:45:45 +02:00 · 2024-03-14 14:21:35 +02:00 · 2024-03-14 08:33:37 +02:00
292 changed files with 5943 additions and 18551 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -22,7 +22,6 @@
 !s3_scrubber/
 !safekeeper/
 !storage_broker/
-!storage_controller/
 !trace/
 !vendor/postgres-*/
 !workspace_hack/
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -147,16 +147,15 @@ jobs:
            "neonvm-captest-new"
          ],
          "db_size": [ "10gb" ],
-          "include": [{ "platform": "neon-captest-freetier",         "db_size": "3gb"  },
-                      { "platform": "neon-captest-new",              "db_size": "50gb" },
-                      { "platform": "neonvm-captest-freetier",       "db_size": "3gb"  },
-                      { "platform": "neonvm-captest-new",            "db_size": "50gb" },
-                      { "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
+          "include": [{ "platform": "neon-captest-freetier",   "db_size": "3gb"  },
+                      { "platform": "neon-captest-new",        "db_size": "50gb" },
+                      { "platform": "neonvm-captest-freetier", "db_size": "3gb"  },
+                      { "platform": "neonvm-captest-new",      "db_size": "50gb" }]
        }'

        if [ "$(date +%A)" = "Saturday" ]; then
          matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
-                                                     { "platform": "rds-aurora",   "db_size": "50gb"}]')
+                                                   { "platform": "rds-aurora",   "db_size": "50gb"}]')
        fi

        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -172,7 +171,7 @@ jobs:

        if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
          matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" },
-                                                     { "platform": "rds-aurora"   }]')
+                                                   { "platform": "rds-aurora"   }]')
        fi

        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -191,7 +190,7 @@ jobs:

        if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
          matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" },
-                                                     { "platform": "rds-aurora",   "scale": "10" }]')
+                                                    { "platform": "rds-aurora",   "scale": "10" }]')
        fi

        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -254,9 +253,6 @@ jobs:
          neon-captest-reuse)
            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
            ;;
-          neonvm-captest-sharding-reuse)
-            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }}
-            ;;
          neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier)
            CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
            ;;
@@ -274,15 +270,11 @@ jobs:

        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-        QUERIES=("SELECT version()")
+        QUERY="SELECT version();"
        if [[ "${PLATFORM}" = "neon"* ]]; then
-          QUERIES+=("SHOW neon.tenant_id")
-          QUERIES+=("SHOW neon.timeline_id")
+          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
        fi
-
-        for q in "${QUERIES[@]}"; do
-          psql ${CONNSTR} -c "${q}"
-        done
+        psql ${CONNSTR} -c "${QUERY}"

    - name: Benchmark init
      uses: ./.github/actions/run-python-test-set
@@ -409,15 +401,11 @@ jobs:

        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-        QUERIES=("SELECT version()")
+        QUERY="SELECT version();"
        if [[ "${PLATFORM}" = "neon"* ]]; then
-          QUERIES+=("SHOW neon.tenant_id")
-          QUERIES+=("SHOW neon.timeline_id")
+          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
        fi
-
-        for q in "${QUERIES[@]}"; do
-          psql ${CONNSTR} -c "${q}"
-        done
+        psql ${CONNSTR} -c "${QUERY}"

    - name: ClickBench benchmark
      uses: ./.github/actions/run-python-test-set
@@ -519,15 +507,11 @@ jobs:

        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-        QUERIES=("SELECT version()")
+        QUERY="SELECT version();"
        if [[ "${PLATFORM}" = "neon"* ]]; then
-          QUERIES+=("SHOW neon.tenant_id")
-          QUERIES+=("SHOW neon.timeline_id")
+          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
        fi
-
-        for q in "${QUERIES[@]}"; do
-          psql ${CONNSTR} -c "${q}"
-        done
+        psql ${CONNSTR} -c "${QUERY}"

    - name: Run TPC-H benchmark
      uses: ./.github/actions/run-python-test-set
@@ -613,15 +597,11 @@ jobs:

        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-        QUERIES=("SELECT version()")
+        QUERY="SELECT version();"
        if [[ "${PLATFORM}" = "neon"* ]]; then
-          QUERIES+=("SHOW neon.tenant_id")
-          QUERIES+=("SHOW neon.timeline_id")
+          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
        fi
-
-        for q in "${QUERIES[@]}"; do
-          psql ${CONNSTR} -c "${q}"
-        done
+        psql ${CONNSTR} -c "${QUERY}"

    - name: Run user examples
      uses: ./.github/actions/run-python-test-set
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -461,7 +461,6 @@ jobs:

      - name: Pytest regression tests
        uses: ./.github/actions/run-python-test-set
-        timeout-minutes: 60
        with:
          build_type: ${{ matrix.build_type }}
          test_selection: regress
@@ -1121,36 +1120,18 @@ jobs:
        run: |
          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
-          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
-            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
-              -f deployPgSniRouter=false \
-              -f deployProxy=false \
-              -f deployStorage=true \
-              -f deployStorageBroker=true \
-              -f deployStorageController=true \
-              -f branch=main \
-              -f dockerTag=${{needs.tag.outputs.build-tag}} \
-              -f deployPreprodRegion=true

+            # TODO: move deployPreprodRegion to release (`"$GITHUB_REF_NAME" == "release"` block), once Staging support different compute tag prefixes for different regions
+            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true
+          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
            gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
              -f deployPgSniRouter=false \
              -f deployProxy=false \
              -f deployStorage=true \
              -f deployStorageBroker=true \
-              -f deployStorageController=true \
              -f branch=main \
              -f dockerTag=${{needs.tag.outputs.build-tag}}
          elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
-            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
-              -f deployPgSniRouter=true \
-              -f deployProxy=true \
-              -f deployStorage=false \
-              -f deployStorageBroker=false \
-              -f deployStorageController=false \
-              -f branch=main \
-              -f dockerTag=${{needs.tag.outputs.build-tag}} \
-              -f deployPreprodRegion=true
-
            gh workflow --repo neondatabase/aws run deploy-proxy-prod.yml --ref main \
              -f deployPgSniRouter=true \
              -f deployProxy=true \
--- a/.github/workflows/trigger-e2e-tests.yml
+++ b/.github/workflows/trigger-e2e-tests.yml
@@ -62,14 +62,14 @@ jobs:

  trigger-e2e-tests:
    needs: [ tag ]
-    runs-on: ubuntu-latest
+    runs-on: [ self-hosted, gen3, small ]
    env:
      TAG: ${{ needs.tag.outputs.build-tag }}
+    container:
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
+      options: --init
    steps:
      - name: check if ecr image are present
-        env:
-          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
        run: |
          for REPO in neon compute-tools compute-node-v14 vm-compute-node-v14 compute-node-v15 vm-compute-node-v15 compute-node-v16 vm-compute-node-v16; do
            OUTPUT=$(aws ecr describe-images --repository-name ${REPO} --region eu-central-1 --query "imageDetails[?imageTags[?contains(@, '${TAG}')]]" --output text)
@@ -79,55 +79,41 @@ jobs:
            fi
          done

-      - name: Set e2e-platforms
-        id: e2e-platforms
-        env:
-          PR_NUMBER: ${{ github.event.pull_request.number }}
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          # Default set of platforms to run e2e tests on
-          platforms='["docker", "k8s"]'
-
-          # If the PR changes vendor/, pgxn/ or libs/vm_monitor/ directories, or Dockerfile.compute-node, add k8s-neonvm to the list of platforms.
-          # If the workflow run is not a pull request, add k8s-neonvm to the list.
-          if [ "$GITHUB_EVENT_NAME" == "pull_request" ]; then
-            for f in $(gh api "/repos/${GITHUB_REPOSITORY}/pulls/${PR_NUMBER}/files" --paginate --jq '.[].filename'); do
-              case "$f" in
-                vendor/*|pgxn/*|libs/vm_monitor/*|Dockerfile.compute-node)
-                  platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
-                  ;;
-                *)
-                  # no-op
-                  ;;
-              esac
-            done
-          else
-            platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
-          fi
-
-          echo "e2e-platforms=${platforms}" | tee -a $GITHUB_OUTPUT
-
      - name: Set PR's status to pending and request a remote CI test
-        env:
-          E2E_PLATFORMS: ${{ steps.e2e-platforms.outputs.e2e-platforms }}
-          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
-          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
        run: |
-          REMOTE_REPO="${GITHUB_REPOSITORY_OWNER}/cloud"
+          # For pull requests, GH Actions set "github.sha" variable to point at a fake merge commit
+          # but we need to use a real sha of a latest commit in the PR's branch for the e2e job,
+          # to place a job run status update later.
+          COMMIT_SHA=${{ github.event.pull_request.head.sha }}
+          # For non-PR kinds of runs, the above will produce an empty variable, pick the original sha value for those
+          COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}

-          gh api "/repos/${GITHUB_REPOSITORY}/statuses/${COMMIT_SHA}" \
-            --method POST \
-            --raw-field "state=pending" \
-            --raw-field "description=[$REMOTE_REPO] Remote CI job is about to start" \
-            --raw-field "context=neon-cloud-e2e"
+          REMOTE_REPO="${{ github.repository_owner }}/cloud"

-          gh workflow --repo ${REMOTE_REPO} \
-            run testing.yml \
-              --ref "main" \
-              --raw-field "ci_job_name=neon-cloud-e2e" \
-              --raw-field "commit_hash=$COMMIT_SHA" \
-              --raw-field "remote_repo=${GITHUB_REPOSITORY}" \
-              --raw-field "storage_image_tag=${TAG}" \
-              --raw-field "compute_image_tag=${TAG}" \
-              --raw-field "concurrency_group=${E2E_CONCURRENCY_GROUP}" \
-              --raw-field "e2e-platforms=${E2E_PLATFORMS}"
+          curl -f -X POST \
+          https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
+          -H "Accept: application/vnd.github.v3+json" \
+          --user "${{ secrets.CI_ACCESS_TOKEN }}" \
+          --data \
+            "{
+              \"state\": \"pending\",
+              \"context\": \"neon-cloud-e2e\",
+              \"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
+            }"
+
+          curl -f -X POST \
+          https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \
+          -H "Accept: application/vnd.github.v3+json" \
+          --user "${{ secrets.CI_ACCESS_TOKEN }}" \
+          --data \
+            "{
+              \"ref\": \"main\",
+              \"inputs\": {
+                \"ci_job_name\": \"neon-cloud-e2e\",
+                \"commit_hash\": \"$COMMIT_SHA\",
+                \"remote_repo\": \"${{ github.repository }}\",
+                \"storage_image_tag\": \"${TAG}\",
+                \"compute_image_tag\": \"${TAG}\",
+                \"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\"
+              }
+            }"
--- a/5
+++ b/5
@@ -1,13 +1,12 @@
 /compute_tools/ @neondatabase/control-plane @neondatabase/compute
-/storage_controller @neondatabase/storage
+/control_plane/attachment_service @neondatabase/storage
 /libs/pageserver_api/ @neondatabase/storage
-/libs/postgres_ffi/ @neondatabase/compute @neondatabase/safekeepers
+/libs/postgres_ffi/ @neondatabase/compute
 /libs/remote_storage/ @neondatabase/storage
 /libs/safekeeper_api/ @neondatabase/safekeepers
 /libs/vm_monitor/ @neondatabase/autoscaling
 /pageserver/ @neondatabase/storage
 /pgxn/ @neondatabase/compute
-/pgxn/neon/ @neondatabase/compute @neondatabase/safekeepers
 /proxy/ @neondatabase/proxy
 /safekeeper/ @neondatabase/safekeepers
 /vendor/ @neondatabase/compute
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -270,6 +270,39 @@ dependencies = [
 "critical-section",
 ]

+[[package]]
+name = "attachment_service"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "aws-config",
+ "aws-sdk-secretsmanager",
+ "camino",
+ "clap",
+ "control_plane",
+ "diesel",
+ "diesel_migrations",
+ "futures",
+ "git-version",
+ "humantime",
+ "hyper",
+ "metrics",
+ "once_cell",
+ "pageserver_api",
+ "pageserver_client",
+ "postgres_connection",
+ "r2d2",
+ "reqwest",
+ "serde",
+ "serde_json",
+ "thiserror",
+ "tokio",
+ "tokio-util",
+ "tracing",
+ "utils",
+ "workspace_hack",
+]
+
 [[package]]
 name = "autocfg"
 version = "1.1.0"
@@ -308,9 +341,9 @@ dependencies = [

 [[package]]
 name = "aws-credential-types"
-version = "1.1.8"
+version = "1.1.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fa8587ae17c8e967e4b05a62d495be2fb7701bec52a97f7acfe8a29f938384c8"
+checksum = "33cc49dcdd31c8b6e79850a179af4c367669150c7ac0135f176c61bec81a70f7"
 dependencies = [
 "aws-smithy-async",
 "aws-smithy-runtime-api",
@@ -320,9 +353,9 @@ dependencies = [

 [[package]]
 name = "aws-runtime"
-version = "1.1.8"
+version = "1.1.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b13dc54b4b49f8288532334bba8f87386a40571c47c37b1304979b556dc613c8"
+checksum = "eb031bff99877c26c28895766f7bb8484a05e24547e370768d6cc9db514662aa"
 dependencies = [
 "aws-credential-types",
 "aws-sigv4",
@@ -342,29 +375,6 @@ dependencies = [
 "uuid",
 ]

-[[package]]
-name = "aws-sdk-iam"
-version = "1.17.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b8ae76026bfb1b80a6aed0bb400c1139cd9c0563e26bce1986cd021c6a968c7b"
-dependencies = [
- "aws-credential-types",
- "aws-runtime",
- "aws-smithy-async",
- "aws-smithy-http",
- "aws-smithy-json",
- "aws-smithy-query",
- "aws-smithy-runtime",
- "aws-smithy-runtime-api",
- "aws-smithy-types",
- "aws-smithy-xml",
- "aws-types",
- "http 0.2.9",
- "once_cell",
- "regex-lite",
- "tracing",
-]
-
 [[package]]
 name = "aws-sdk-s3"
 version = "1.14.0"
@@ -394,6 +404,29 @@ dependencies = [
 "url",
 ]

+[[package]]
+name = "aws-sdk-secretsmanager"
+version = "1.14.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0a0b64e61e7d632d9df90a2e0f32630c68c24960cab1d27d848718180af883d3"
+dependencies = [
+ "aws-credential-types",
+ "aws-runtime",
+ "aws-smithy-async",
+ "aws-smithy-http",
+ "aws-smithy-json",
+ "aws-smithy-runtime",
+ "aws-smithy-runtime-api",
+ "aws-smithy-types",
+ "aws-types",
+ "bytes",
+ "fastrand 2.0.0",
+ "http 0.2.9",
+ "once_cell",
+ "regex-lite",
+ "tracing",
+]
+
 [[package]]
 name = "aws-sdk-sso"
 version = "1.12.0"
@@ -463,9 +496,9 @@ dependencies = [

 [[package]]
 name = "aws-sigv4"
-version = "1.2.0"
+version = "1.1.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "11d6f29688a4be9895c0ba8bef861ad0c0dac5c15e9618b9b7a6c233990fc263"
+checksum = "c371c6b0ac54d4605eb6f016624fb5c7c2925d315fdf600ac1bf21b19d5f1742"
 dependencies = [
 "aws-credential-types",
 "aws-smithy-eventstream",
@@ -478,7 +511,7 @@ dependencies = [
 "hex",
 "hmac",
 "http 0.2.9",
- "http 1.1.0",
+ "http 1.0.0",
 "once_cell",
 "p256",
 "percent-encoding",
@@ -492,9 +525,9 @@ dependencies = [

 [[package]]
 name = "aws-smithy-async"
-version = "1.1.8"
+version = "1.1.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d26ea8fa03025b2face2b3038a63525a10891e3d8829901d502e5384a0d8cd46"
+checksum = "72ee2d09cce0ef3ae526679b522835d63e75fb427aca5413cd371e490d52dcc6"
 dependencies = [
 "futures-util",
 "pin-project-lite",
@@ -535,9 +568,9 @@ dependencies = [

 [[package]]
 name = "aws-smithy-http"
-version = "0.60.7"
+version = "0.60.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3f10fa66956f01540051b0aa7ad54574640f748f9839e843442d99b970d3aff9"
+checksum = "dab56aea3cd9e1101a0a999447fb346afb680ab1406cebc44b32346e25b4117d"
 dependencies = [
 "aws-smithy-eventstream",
 "aws-smithy-runtime-api",
@@ -556,18 +589,18 @@ dependencies = [

 [[package]]
 name = "aws-smithy-json"
-version = "0.60.7"
+version = "0.60.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4683df9469ef09468dad3473d129960119a0d3593617542b7d52086c8486f2d6"
+checksum = "fd3898ca6518f9215f62678870064398f00031912390efd03f1f6ef56d83aa8e"
 dependencies = [
 "aws-smithy-types",
 ]

 [[package]]
 name = "aws-smithy-query"
-version = "0.60.7"
+version = "0.60.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f2fbd61ceb3fe8a1cb7352e42689cec5335833cd9f94103a61e98f9bb61c64bb"
+checksum = "bda4b1dfc9810e35fba8a620e900522cd1bd4f9578c446e82f49d1ce41d2e9f9"
 dependencies = [
 "aws-smithy-types",
 "urlencoding",
@@ -575,9 +608,9 @@ dependencies = [

 [[package]]
 name = "aws-smithy-runtime"
-version = "1.1.8"
+version = "1.1.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ec81002d883e5a7fd2bb063d6fb51c4999eb55d404f4fff3dd878bf4733b9f01"
+checksum = "fafdab38f40ad7816e7da5dec279400dd505160780083759f01441af1bbb10ea"
 dependencies = [
 "aws-smithy-async",
 "aws-smithy-http",
@@ -600,15 +633,14 @@ dependencies = [

 [[package]]
 name = "aws-smithy-runtime-api"
-version = "1.2.0"
+version = "1.1.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9acb931e0adaf5132de878f1398d83f8677f90ba70f01f65ff87f6d7244be1c5"
+checksum = "c18276dd28852f34b3bf501f4f3719781f4999a51c7bff1a5c6dc8c4529adc29"
 dependencies = [
 "aws-smithy-async",
 "aws-smithy-types",
 "bytes",
 "http 0.2.9",
- "http 1.1.0",
 "pin-project-lite",
 "tokio",
 "tracing",
@@ -617,9 +649,9 @@ dependencies = [

 [[package]]
 name = "aws-smithy-types"
-version = "1.1.8"
+version = "1.1.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "abe14dceea1e70101d38fbf2a99e6a34159477c0fb95e68e05c66bd7ae4c3729"
+checksum = "bb3e134004170d3303718baa2a4eb4ca64ee0a1c0a7041dca31b38be0fb414f3"
 dependencies = [
 "base64-simd",
 "bytes",
@@ -640,18 +672,18 @@ dependencies = [

 [[package]]
 name = "aws-smithy-xml"
-version = "0.60.7"
+version = "0.60.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "872c68cf019c0e4afc5de7753c4f7288ce4b71663212771bf5e4542eb9346ca9"
+checksum = "8604a11b25e9ecaf32f9aa56b9fe253c5e2f606a3477f0071e96d3155a5ed218"
 dependencies = [
 "xmlparser",
 ]

 [[package]]
 name = "aws-types"
-version = "1.1.8"
+version = "1.1.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0dbf2f3da841a8930f159163175cf6a3d16ddde517c1b0fba7aa776822800f40"
+checksum = "789bbe008e65636fe1b6dbbb374c40c8960d1232b96af5ff4aec349f9c4accf4"
 dependencies = [
 "aws-credential-types",
 "aws-smithy-async",
@@ -1312,7 +1344,6 @@ dependencies = [
 "futures",
 "git-version",
 "hex",
- "humantime",
 "hyper",
 "nix 0.27.1",
 "once_cell",
@@ -2196,9 +2227,9 @@ dependencies = [

 [[package]]
 name = "h2"
-version = "0.3.26"
+version = "0.3.24"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "81fe527a889e1532da5c525686d96d4c2e74cdd345badf8dfef9f6b39dd5f5e8"
+checksum = "bb2c4422095b67ee78da96fbb51a4cc413b3b25883c7717ff7ca1ab31022c9c9"
 dependencies = [
 "bytes",
 "fnv",
@@ -2358,9 +2389,9 @@ dependencies = [

 [[package]]
 name = "http"
-version = "1.1.0"
+version = "1.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "21b9ddb458710bc376481b842f5da65cdf31522de232c1ca8146abce2a358258"
+checksum = "b32afd38673a8016f7c9ae69e5af41a58f81b1d31689040f2f1959594ce194ea"
 dependencies = [
 "bytes",
 "fnv",
@@ -2460,7 +2491,7 @@ dependencies = [
 "hyper",
 "log",
 "rustls 0.21.9",
- "rustls-native-certs 0.6.2",
+ "rustls-native-certs",
 "tokio",
 "tokio-rustls 0.24.0",
 ]
@@ -2810,6 +2841,15 @@ version = "0.4.20"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f"

+[[package]]
+name = "lz4_flex"
+version = "0.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3ea9b256699eda7b0387ffbc776dd625e28bde3918446381781245b7a50349d8"
+dependencies = [
+ "twox-hash",
+]
+
 [[package]]
 name = "match_cfg"
 version = "0.1.0"
@@ -2846,35 +2886,6 @@ version = "0.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"

-[[package]]
-name = "measured"
-version = "0.0.13"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f246648d027839a34b420e27c7de1165ace96e19ef894985d0a6ff89a7840a9f"
-dependencies = [
- "bytes",
- "hashbrown 0.14.0",
- "itoa",
- "lasso",
- "measured-derive",
- "memchr",
- "parking_lot 0.12.1",
- "rustc-hash",
- "ryu",
-]
-
-[[package]]
-name = "measured-derive"
-version = "0.0.13"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "edaa5cc22d99d5d6d7d99c3b5b5f7e7f8034c22f1b5d62a1adecd2ed005d9b80"
-dependencies = [
- "heck",
- "proc-macro2",
- "quote",
- "syn 2.0.52",
-]
-
 [[package]]
 name = "memchr"
 version = "2.6.4"
@@ -3397,9 +3408,9 @@ dependencies = [

 [[package]]
 name = "ordered-multimap"
-version = "0.7.3"
+version = "0.7.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "49203cdcae0030493bad186b28da2fa25645fa276a51b6fec8010d281e02ef79"
+checksum = "a4d6a8c22fc714f0c2373e6091bf6f5e9b37b1bc0b1184874b7e0a4e303d318f"
 dependencies = [
 "dlv-list",
 "hashbrown 0.14.0",
@@ -3509,6 +3520,7 @@ dependencies = [
 "hyper",
 "itertools",
 "leaky-bucket",
+ "lz4_flex",
 "md5",
 "metrics",
 "nix 0.27.1",
@@ -3525,7 +3537,6 @@ dependencies = [
 "postgres_connection",
 "postgres_ffi",
 "pq_proto",
- "procfs",
 "rand 0.8.5",
 "regex",
 "remote_storage",
@@ -3543,7 +3554,6 @@ dependencies = [
 "strum_macros",
 "svg_fmt",
 "sync_wrapper",
- "sysinfo",
 "tenant_size_model",
 "thiserror",
 "tokio",
@@ -3897,7 +3907,7 @@ dependencies = [
 [[package]]
 name = "postgres"
 version = "0.19.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -3910,7 +3920,7 @@ dependencies = [
 [[package]]
 name = "postgres-native-tls"
 version = "0.5.0"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
 dependencies = [
 "native-tls",
 "tokio",
@@ -3921,7 +3931,7 @@ dependencies = [
 [[package]]
 name = "postgres-protocol"
 version = "0.6.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
 dependencies = [
 "base64 0.20.0",
 "byteorder",
@@ -3934,13 +3944,12 @@ dependencies = [
 "rand 0.8.5",
 "sha2",
 "stringprep",
- "tokio",
 ]

 [[package]]
 name = "postgres-types"
 version = "0.2.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -4161,12 +4170,7 @@ name = "proxy"
 version = "0.1.0"
 dependencies = [
 "anyhow",
- "async-compression",
 "async-trait",
- "aws-config",
- "aws-sdk-iam",
- "aws-sigv4",
- "aws-types",
 "base64 0.13.1",
 "bstr",
 "bytes",
@@ -4177,7 +4181,6 @@ dependencies = [
 "consumption_metrics",
 "dashmap",
 "env_logger",
- "fallible-iterator",
 "futures",
 "git-version",
 "hashbrown 0.13.2",
@@ -4185,7 +4188,6 @@ dependencies = [
 "hex",
 "hmac",
 "hostname",
- "http 1.1.0",
 "humantime",
 "hyper",
 "hyper-tungstenite",
@@ -4229,7 +4231,6 @@ dependencies = [
 "smallvec",
 "smol_str",
 "socket2 0.5.5",
- "subtle",
 "sync_wrapper",
 "task-local-extensions",
 "thiserror",
@@ -4401,9 +4402,9 @@ dependencies = [

 [[package]]
 name = "redis"
-version = "0.25.2"
+version = "0.24.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "71d64e978fd98a0e6b105d066ba4889a7301fca65aeac850a877d8797343feeb"
+checksum = "c580d9cbbe1d1b479e8d67cf9daf6a62c957e6846048408b80b43ac3f6af84cd"
 dependencies = [
 "async-trait",
 "bytes",
@@ -4412,15 +4413,15 @@ dependencies = [
 "itoa",
 "percent-encoding",
 "pin-project-lite",
- "rustls 0.22.2",
- "rustls-native-certs 0.7.0",
- "rustls-pemfile 2.1.1",
- "rustls-pki-types",
+ "rustls 0.21.9",
+ "rustls-native-certs",
+ "rustls-pemfile 1.0.2",
+ "rustls-webpki 0.101.7",
 "ryu",
 "sha1_smol",
- "socket2 0.5.5",
+ "socket2 0.4.9",
 "tokio",
- "tokio-rustls 0.25.0",
+ "tokio-rustls 0.24.0",
 "tokio-util",
 "url",
 ]
@@ -4849,19 +4850,6 @@ dependencies = [
 "security-framework",
 ]

-[[package]]
-name = "rustls-native-certs"
-version = "0.7.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8f1fb85efa936c42c6d5fc28d2629bb51e4b2f4b8a5211e297d599cc5a093792"
-dependencies = [
- "openssl-probe",
- "rustls-pemfile 2.1.1",
- "rustls-pki-types",
- "schannel",
- "security-framework",
-]
-
 [[package]]
 name = "rustls-pemfile"
 version = "1.0.2"
@@ -5364,23 +5352,13 @@ checksum = "ae1a47186c03a32177042e55dbc5fd5aee900b8e0069a8d70fba96a9375cd012"

 [[package]]
 name = "sha2"
-version = "0.10.8"
+version = "0.10.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8"
+checksum = "82e6b795fe2e3b1e845bafcb27aa35405c4d47cdfc92af5fc8d3002f76cebdc0"
 dependencies = [
 "cfg-if",
 "cpufeatures",
 "digest",
- "sha2-asm",
-]
-
-[[package]]
-name = "sha2-asm"
-version = "0.6.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f27ba7066011e3fb30d808b51affff34f0a66d3a03a58edd787c6e420e40e44e"
-dependencies = [
- "cc",
 ]

 [[package]]
@@ -5584,65 +5562,6 @@ dependencies = [
 "workspace_hack",
 ]

-[[package]]
-name = "storage_controller"
-version = "0.1.0"
-dependencies = [
- "anyhow",
- "aws-config",
- "bytes",
- "camino",
- "clap",
- "control_plane",
- "diesel",
- "diesel_migrations",
- "fail",
- "futures",
- "git-version",
- "hex",
- "humantime",
- "hyper",
- "itertools",
- "lasso",
- "measured",
- "metrics",
- "once_cell",
- "pageserver_api",
- "pageserver_client",
- "postgres_connection",
- "r2d2",
- "reqwest",
- "routerify",
- "serde",
- "serde_json",
- "thiserror",
- "tokio",
- "tokio-util",
- "tracing",
- "utils",
- "workspace_hack",
-]
-
-[[package]]
-name = "storcon_cli"
-version = "0.1.0"
-dependencies = [
- "anyhow",
- "clap",
- "comfy-table",
- "hyper",
- "pageserver_api",
- "pageserver_client",
- "reqwest",
- "serde",
- "serde_json",
- "thiserror",
- "tokio",
- "tracing",
- "utils",
- "workspace_hack",
-]
-
 [[package]]
 name = "stringprep"
 version = "0.1.2"
@@ -5799,23 +5718,23 @@ dependencies = [

 [[package]]
 name = "test-context"
-version = "0.3.0"
+version = "0.1.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6676ab8513edfd2601a108621103fdb45cac9098305ca25ec93f7023b06b05d9"
+checksum = "055831a02a4f5aa28fede67f2902014273eb8c21b958ac5ebbd59b71ef30dbc3"
 dependencies = [
+ "async-trait",
 "futures",
 "test-context-macros",
 ]

 [[package]]
 name = "test-context-macros"
-version = "0.3.0"
+version = "0.1.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "78ea17a2dc368aeca6f554343ced1b1e31f76d63683fa8016e5844bd7a5144a1"
+checksum = "8901a55b0a7a06ebc4a674dcca925170da8e613fa3b163a1df804ed10afb154d"
 dependencies = [
- "proc-macro2",
 "quote",
- "syn 2.0.52",
+ "syn 1.0.109",
 ]

 [[package]]
@@ -5956,9 +5875,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"

 [[package]]
 name = "tokio"
-version = "1.37.0"
+version = "1.36.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1adbebffeca75fcfd058afa480fb6c0b81e165a0323f9c9d39c9697e37c46787"
+checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931"
 dependencies = [
 "backtrace",
 "bytes",
@@ -5975,7 +5894,7 @@ dependencies = [
 [[package]]
 name = "tokio-epoll-uring"
 version = "0.1.0"
-source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939"
+source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#868d2c42b5d54ca82fead6e8f2f233b69a540d3e"
 dependencies = [
 "futures",
 "nix 0.26.4",
@@ -6022,7 +5941,7 @@ dependencies = [
 [[package]]
 name = "tokio-postgres"
 version = "0.7.7"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
 dependencies = [
 "async-trait",
 "byteorder",
@@ -6188,7 +6107,7 @@ dependencies = [
 "percent-encoding",
 "pin-project",
 "prost",
- "rustls-native-certs 0.6.2",
+ "rustls-native-certs",
 "rustls-pemfile 1.0.2",
 "tokio",
 "tokio-rustls 0.24.0",
@@ -6512,7 +6431,7 @@ dependencies = [
 [[package]]
 name = "uring-common"
 version = "0.1.0"
-source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939"
+source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#868d2c42b5d54ca82fead6e8f2f233b69a540d3e"
 dependencies = [
 "bytes",
 "io-uring",
@@ -6555,7 +6474,6 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "arc-swap",
- "async-compression",
 "async-trait",
 "bincode",
 "byteorder",
@@ -6594,14 +6512,12 @@ dependencies = [
 "thiserror",
 "tokio",
 "tokio-stream",
- "tokio-tar",
 "tokio-util",
 "tracing",
 "tracing-error",
 "tracing-subscriber",
 "url",
 "uuid",
- "walkdir",
 "workspace_hack",
 ]

@@ -7073,6 +6989,7 @@ dependencies = [
 "aws-sigv4",
 "aws-smithy-async",
 "aws-smithy-http",
+ "aws-smithy-runtime-api",
 "aws-smithy-types",
 "axum",
 "base64 0.21.1",
@@ -7118,7 +7035,6 @@ dependencies = [
 "scopeguard",
 "serde",
 "serde_json",
- "sha2",
 "smallvec",
 "subtle",
 "syn 1.0.109",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,7 +3,7 @@ resolver = "2"
 members = [
    "compute_tools",
    "control_plane",
-    "control_plane/storcon_cli",
+    "control_plane/attachment_service",
    "pageserver",
    "pageserver/compaction",
    "pageserver/ctl",
@@ -12,7 +12,6 @@ members = [
    "proxy",
    "safekeeper",
    "storage_broker",
-    "storage_controller",
    "s3_scrubber",
    "workspace_hack",
    "trace",
@@ -53,12 +52,10 @@ async-stream = "0.3"
 async-trait = "0.1"
 aws-config = { version = "1.1.4", default-features = false, features=["rustls"] }
 aws-sdk-s3 = "1.14"
-aws-sdk-iam = "1.15.0"
+aws-sdk-secretsmanager = { version = "1.14.0" }
 aws-smithy-async = { version = "1.1.4", default-features = false, features=["rt-tokio"] }
 aws-smithy-types = "1.1.4"
 aws-credential-types = "1.1.4"
-aws-sigv4 = { version = "1.2.0", features = ["sign-http"] }
-aws-types = "1.1.7"
 axum = { version = "0.6.20", features = ["ws"] }
 base64 = "0.13.0"
 bincode = "1.3"
@@ -79,7 +76,6 @@ either = "1.8"
 enum-map = "2.4.2"
 enumset = "1.0.12"
 fail = "0.5.0"
-fallible-iterator = "0.2"
 fs2 = "0.4.3"
 futures = "0.3"
 futures-core = "0.3"
@@ -92,7 +88,6 @@ hex = "0.4"
 hex-literal = "0.4"
 hmac = "0.12.1"
 hostname = "0.3.1"
-http = {version = "1.1.0", features = ["std"]}
 http-types = { version = "2", default-features = false }
 humantime = "2.1"
 humantime-serde = "1.1.1"
@@ -105,8 +100,8 @@ jsonwebtoken = "9"
 lasso = "0.7"
 leaky-bucket = "1.0.1"
 libc = "0.2"
+lz4_flex = "0.11.1"
 md5 = "0.7.0"
-measured = { version = "0.0.13", features=["default", "lasso"] }
 memoffset = "0.8"
 native-tls = "0.2"
 nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
@@ -126,7 +121,7 @@ procfs = "0.14"
 prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
 prost = "0.11"
 rand = "0.8"
-redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] }
+redis = { version = "0.24.0", features = ["tokio-rustls-comp", "keep-alive"] }
 regex = "1.10.2"
 reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
 reqwest-tracing = { version = "0.4.7", features = ["opentelemetry_0_20"] }
@@ -154,12 +149,11 @@ smol_str = { version = "0.2.0", features = ["serde"] }
 socket2 = "0.5"
 strum = "0.24"
 strum_macros = "0.24"
-"subtle"  = "2.5.0"
 svg_fmt = "0.4.1"
 sync_wrapper = "0.1.2"
 tar = "0.4"
 task-local-extensions = "0.1.4"
-test-context = "0.3"
+test-context = "0.1"
 thiserror = "1.0"
 tikv-jemallocator = "0.5"
 tikv-jemalloc-ctl = "0.5"
--- a/Dockerfile.build-tools
+++ b/Dockerfile.build-tools
@@ -135,7 +135,7 @@ WORKDIR /home/nonroot

 # Rust
 # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.77.0
+ENV RUSTC_VERSION=1.76.0
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
@@ -149,7 +149,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
    cargo install --git https://github.com/paritytech/cachepot && \
    cargo install rustfilt && \
    cargo install cargo-hakari && \
-    cargo install cargo-deny --locked && \
+    cargo install cargo-deny && \
    cargo install cargo-hack && \
    cargo install cargo-nextest && \
    rm -rf /home/nonroot/.cargo/registry && \
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -944,9 +944,6 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
 COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local
 COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl

-# Create remote extension download directory
-RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/local/download_extensions
-
 # Install:
 # libreadline8 for psql
 # libicu67, locales for collations (including ICU and plpgsql_check)
--- a/README.md
+++ b/README.md
@@ -238,14 +238,6 @@ If you encounter errors during setting up the initial tenant, it's best to stop

 ## Running tests

-### Rust unit tests
-
-We are using [`cargo-nextest`](https://nexte.st/) to run the tests in Github Workflows.
-Some crates do not support running plain `cargo test` anymore, prefer `cargo nextest run` instead.
-You can install `cargo-nextest` with `cargo install cargo-nextest`.
-
-### Integration tests
-
 Ensure your dependencies are installed as described [here](https://github.com/neondatabase/neon#dependency-installation-notes).

 ```sh
--- a/clippy.toml
+++ b/clippy.toml
@@ -2,8 +2,6 @@ disallowed-methods = [
    "tokio::task::block_in_place",
    # Allow this for now, to deny it later once we stop using Handle::block_on completely
    # "tokio::runtime::Handle::block_on",
-    # use tokio_epoll_uring_ext instead
-    "tokio_epoll_uring::thread_local_system",
 ]

 disallowed-macros = [
--- a/compute_tools/README.md
+++ b/compute_tools/README.md
@@ -32,29 +32,6 @@ compute_ctl -D /var/db/postgres/compute \
            -b /usr/local/bin/postgres
 ```

-## State Diagram
-
-Computes can be in various states. Below is a diagram that details how a
-compute moves between states.
-
-```mermaid
-%% https://mermaid.js.org/syntax/stateDiagram.html
-stateDiagram-v2
-  [*] --> Empty : Compute spawned
-  Empty --> ConfigurationPending : Waiting for compute spec
-  ConfigurationPending --> Configuration : Received compute spec
-  Configuration --> Failed : Failed to configure the compute
-  Configuration --> Running : Compute has been configured
-  Empty --> Init : Compute spec is immediately available
-  Empty --> TerminationPending : Requested termination
-  Init --> Failed : Failed to start Postgres
-  Init --> Running : Started Postgres
-  Running --> TerminationPending : Requested termination
-  TerminationPending --> Terminated : Terminated compute
-  Failed --> [*] : Compute exited
-  Terminated --> [*] : Compute exited
-```
-
 ## Tests

 Cargo formatter:
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1262,12 +1262,10 @@ LIMIT 100",
        .await
        .map_err(DownloadError::Other);

-        if download_size.is_ok() {
-            self.ext_download_progress
-                .write()
-                .expect("bad lock")
-                .insert(ext_archive_name.to_string(), (download_start, true));
-        }
+        self.ext_download_progress
+            .write()
+            .expect("bad lock")
+            .insert(ext_archive_name.to_string(), (download_start, true));

        download_size
    }
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -17,7 +17,6 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
        .write(true)
        .create(true)
        .append(false)
-        .truncate(false)
        .open(path)?;
    let buf = io::BufReader::new(&file);
    let mut count: usize = 0;
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -302,9 +302,9 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
            RoleAction::Create => {
                // This branch only runs when roles are created through the console, so it is
                // safe to add more permissions here. BYPASSRLS and REPLICATION are inherited
-                // from neon_superuser.
+                // from neon_superuser. (NOTE: REPLICATION is no longer set by the CREATE ROLE below, for now).
                let mut query: String = format!(
-                    "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
+                    "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser",
                    name.pg_quote()
                );
                info!("running role create query: '{}'", &query);
@@ -745,12 +745,7 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
    // - extension was already installed and is up to date
    let query = "ALTER EXTENSION neon UPDATE";
    info!("update neon extension version with query: {}", query);
-    if let Err(e) = client.simple_query(query) {
-        error!(
-            "failed to upgrade neon extension during `handle_extension_neon`: {}",
-            e
-        );
-    }
+    client.simple_query(query)?;

    Ok(())
 }
@@ -809,8 +804,19 @@ $$;"#,
        "",
        "",
        "",
-        "",
        // Add new migrations below.
+        r#"
+DO $$
+DECLARE
+    role_name TEXT;
+BEGIN
+    FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE
+    LOOP
+        RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name);
+        EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION';
+    END LOOP;
+END
+$$;"#,
    ];

    let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -12,7 +12,6 @@ clap.workspace = true
 comfy-table.workspace = true
 futures.workspace = true
 git-version.workspace = true
-humantime.workspace = true
 nix.workspace = true
 once_cell.workspace = true
 postgres.workspace = true
--- a/control_plane/attachment_service/Cargo.toml
+++ b/control_plane/attachment_service/Cargo.toml
@@ -1,5 +1,5 @@
 [package]
-name = "storage_controller"
+name = "attachment_service"
 version = "0.1.0"
 edition.workspace = true
 license.workspace = true
@@ -16,37 +16,31 @@ testing = []
 [dependencies]
 anyhow.workspace = true
 aws-config.workspace = true
-bytes.workspace = true
+aws-sdk-secretsmanager.workspace = true
 camino.workspace = true
 clap.workspace = true
-fail.workspace = true
 futures.workspace = true
 git-version.workspace = true
-hex.workspace = true
 hyper.workspace = true
 humantime.workspace = true
-itertools.workspace = true
-lasso.workspace = true
 once_cell.workspace = true
 pageserver_api.workspace = true
 pageserver_client.workspace = true
 postgres_connection.workspace = true
 reqwest.workspace = true
-routerify.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 thiserror.workspace = true
 tokio.workspace = true
 tokio-util.workspace = true
 tracing.workspace = true
-measured.workspace = true

 diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] }
 diesel_migrations = { version = "2.1.0" }
 r2d2 = { version = "0.8.10" }

-utils = { path = "../libs/utils/" }
-metrics = { path = "../libs/metrics/" }
-control_plane = { path = "../control_plane" }
-workspace_hack = { version = "0.1", path = "../workspace_hack" }
+utils = { path = "../../libs/utils/" }
+metrics = { path = "../../libs/metrics/" }
+control_plane = { path = ".." }
+workspace_hack = { version = "0.1", path = "../../workspace_hack" }

--- a/control_plane/attachment_service/migrations/.keep
+++ b/control_plane/attachment_service/migrations/.keep
--- a/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/down.sql
+++ b/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/down.sql
--- a/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/up.sql
+++ b/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/up.sql
--- a/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/down.sql
+++ b/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/down.sql
--- a/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql
+++ b/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql
--- a/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/down.sql
+++ b/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/down.sql
--- a/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/up.sql
+++ b/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/up.sql
--- a/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/down.sql
+++ b/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/down.sql
--- a/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/up.sql
+++ b/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/up.sql
--- a/control_plane/attachment_service/src/auth.rs
+++ b/control_plane/attachment_service/src/auth.rs
--- a/control_plane/attachment_service/src/compute_hook.rs
+++ b/control_plane/attachment_service/src/compute_hook.rs
@@ -1,4 +1,3 @@
-use std::sync::Arc;
 use std::{collections::HashMap, time::Duration};

 use control_plane::endpoint::{ComputeControlPlane, EndpointStatus};
@@ -15,30 +14,19 @@ use utils::{

 use crate::service::Config;

+const BUSY_DELAY: Duration = Duration::from_secs(1);
 const SLOWDOWN_DELAY: Duration = Duration::from_secs(5);

 pub(crate) const API_CONCURRENCY: usize = 32;

-struct UnshardedComputeHookTenant {
-    // Which node is this tenant attached to
-    node_id: NodeId,
-
-    // Must hold this lock to send a notification.
-    send_lock: Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>>,
-}
 struct ShardedComputeHookTenant {
    stripe_size: ShardStripeSize,
    shard_count: ShardCount,
    shards: Vec<(ShardNumber, NodeId)>,
-
-    // Must hold this lock to send a notification.  The contents represent
-    // the last successfully sent notification, and are used to coalesce multiple
-    // updates by only sending when there is a chance since our last successful send.
-    send_lock: Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>>,
 }

 enum ComputeHookTenant {
-    Unsharded(UnshardedComputeHookTenant),
+    Unsharded(NodeId),
    Sharded(ShardedComputeHookTenant),
 }

@@ -50,20 +38,9 @@ impl ComputeHookTenant {
                shards: vec![(tenant_shard_id.shard_number, node_id)],
                stripe_size,
                shard_count: tenant_shard_id.shard_count,
-                send_lock: Arc::default(),
            })
        } else {
-            Self::Unsharded(UnshardedComputeHookTenant {
-                node_id,
-                send_lock: Arc::default(),
-            })
-        }
-    }
-
-    fn get_send_lock(&self) -> &Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>> {
-        match self {
-            Self::Unsharded(unsharded_tenant) => &unsharded_tenant.send_lock,
-            Self::Sharded(sharded_tenant) => &sharded_tenant.send_lock,
+            Self::Unsharded(node_id)
        }
    }

@@ -76,8 +53,8 @@ impl ComputeHookTenant {
        node_id: NodeId,
    ) {
        match self {
-            Self::Unsharded(unsharded_tenant) if tenant_shard_id.shard_count.count() == 1 => {
-                unsharded_tenant.node_id = node_id
+            Self::Unsharded(existing_node_id) if tenant_shard_id.shard_count.count() == 1 => {
+                *existing_node_id = node_id
            }
            Self::Sharded(sharded_tenant)
                if sharded_tenant.stripe_size == stripe_size
@@ -104,14 +81,14 @@ impl ComputeHookTenant {
    }
 }

-#[derive(Serialize, Deserialize, Debug, Eq, PartialEq)]
+#[derive(Serialize, Deserialize, Debug)]
 struct ComputeHookNotifyRequestShard {
    node_id: NodeId,
    shard_number: ShardNumber,
 }

 /// Request body that we send to the control plane to notify it of where a tenant is attached
-#[derive(Serialize, Deserialize, Debug, Eq, PartialEq)]
+#[derive(Serialize, Deserialize, Debug)]
 struct ComputeHookNotifyRequest {
    tenant_id: TenantId,
    stripe_size: Option<ShardStripeSize>,
@@ -144,44 +121,14 @@ pub(crate) enum NotifyError {
    Fatal(StatusCode),
 }

-enum MaybeSendResult {
-    // Please send this request while holding the lock, and if you succeed then write
-    // the request into the lock.
-    Transmit(
-        (
-            ComputeHookNotifyRequest,
-            tokio::sync::OwnedMutexGuard<Option<ComputeHookNotifyRequest>>,
-        ),
-    ),
-    // Something requires sending, but you must wait for a current sender then call again
-    AwaitLock(Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>>),
-    // Nothing requires sending
-    Noop,
-}
-
 impl ComputeHookTenant {
-    fn maybe_send(
-        &self,
-        tenant_id: TenantId,
-        lock: Option<tokio::sync::OwnedMutexGuard<Option<ComputeHookNotifyRequest>>>,
-    ) -> MaybeSendResult {
-        let locked = match lock {
-            Some(already_locked) => already_locked,
-            None => {
-                // Lock order: this _must_ be only a try_lock, because we are called inside of the [`ComputeHook::state`] lock.
-                let Ok(locked) = self.get_send_lock().clone().try_lock_owned() else {
-                    return MaybeSendResult::AwaitLock(self.get_send_lock().clone());
-                };
-                locked
-            }
-        };
-
-        let request = match self {
-            Self::Unsharded(unsharded_tenant) => Some(ComputeHookNotifyRequest {
+    fn maybe_reconfigure(&self, tenant_id: TenantId) -> Option<ComputeHookNotifyRequest> {
+        match self {
+            Self::Unsharded(node_id) => Some(ComputeHookNotifyRequest {
                tenant_id,
                shards: vec![ComputeHookNotifyRequestShard {
                    shard_number: ShardNumber(0),
-                    node_id: unsharded_tenant.node_id,
+                    node_id: *node_id,
                }],
                stripe_size: None,
            }),
@@ -205,25 +152,12 @@ impl ComputeHookTenant {
                // Sharded tenant doesn't yet have information for all its shards

                tracing::info!(
-                    "ComputeHookTenant::maybe_send: not enough shards ({}/{})",
+                    "ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})",
                    sharded_tenant.shards.len(),
                    sharded_tenant.shard_count.count()
                );
                None
            }
-        };
-
-        match request {
-            None => {
-                // Not yet ready to emit a notification
-                tracing::info!("Tenant isn't yet ready to emit a notification");
-                MaybeSendResult::Noop
-            }
-            Some(request) if Some(&request) == locked.as_ref() => {
-                // No change from the last value successfully sent
-                MaybeSendResult::Noop
-            }
-            Some(request) => MaybeSendResult::Transmit((request, locked)),
        }
    }
 }
@@ -233,15 +167,8 @@ impl ComputeHookTenant {
 /// the compute connection string.
 pub(super) struct ComputeHook {
    config: Config,
-    state: std::sync::Mutex<HashMap<TenantId, ComputeHookTenant>>,
+    state: tokio::sync::Mutex<HashMap<TenantId, ComputeHookTenant>>,
    authorization_header: Option<String>,
-
-    // Concurrency limiter, so that we do not overload the cloud control plane when updating
-    // large numbers of tenants (e.g. when failing over after a node failure)
-    api_concurrency: tokio::sync::Semaphore,
-
-    // This lock is only used in testing enviroments, to serialize calls into neon_lock
-    neon_local_lock: tokio::sync::Mutex<()>,
 }

 impl ComputeHook {
@@ -255,20 +182,14 @@ impl ComputeHook {
            state: Default::default(),
            config,
            authorization_header,
-            neon_local_lock: Default::default(),
-            api_concurrency: tokio::sync::Semaphore::new(API_CONCURRENCY),
        }
    }

    /// For test environments: use neon_local's LocalEnv to update compute
    async fn do_notify_local(
        &self,
-        reconfigure_request: &ComputeHookNotifyRequest,
+        reconfigure_request: ComputeHookNotifyRequest,
    ) -> anyhow::Result<()> {
-        // neon_local updates are not safe to call concurrently, use a lock to serialize
-        // all calls to this function
-        let _locked = self.neon_local_lock.lock().await;
-
        let env = match LocalEnv::load_config() {
            Ok(e) => e,
            Err(e) => {
@@ -285,7 +206,7 @@ impl ComputeHook {
        } = reconfigure_request;

        let compute_pageservers = shards
-            .iter()
+            .into_iter()
            .map(|shard| {
                let ps_conf = env
                    .get_pageserver_conf(shard.node_id)
@@ -297,10 +218,10 @@ impl ComputeHook {
            .collect::<Vec<_>>();

        for (endpoint_name, endpoint) in &cplane.endpoints {
-            if endpoint.tenant_id == *tenant_id && endpoint.status() == EndpointStatus::Running {
+            if endpoint.tenant_id == tenant_id && endpoint.status() == EndpointStatus::Running {
                tracing::info!("Reconfiguring endpoint {}", endpoint_name,);
                endpoint
-                    .reconfigure(compute_pageservers.clone(), *stripe_size)
+                    .reconfigure(compute_pageservers.clone(), stripe_size)
                    .await?;
            }
        }
@@ -359,10 +280,11 @@ impl ComputeHook {
                Err(NotifyError::SlowDown)
            }
            StatusCode::LOCKED => {
-                // We consider this fatal, because it's possible that the operation blocking the control one is
-                // also the one that is waiting for this reconcile.  We should let the reconciler calling
-                // this hook fail, to give control plane a chance to un-lock.
-                tracing::info!("Control plane reports tenant is locked, dropping out of notify");
+                // Delay our retry if busy: the usual fast exponential backoff in backoff::retry
+                // is not appropriate. Wait up to BUSY_DELAY, or less if `cancel` fires first.
+                tokio::time::timeout(BUSY_DELAY, cancel.cancelled())
+                    .await
+                    .ok();
                Err(NotifyError::Busy)
            }
            StatusCode::SERVICE_UNAVAILABLE
@@ -378,29 +300,13 @@ impl ComputeHook {
    async fn do_notify(
        &self,
        url: &String,
-        reconfigure_request: &ComputeHookNotifyRequest,
+        reconfigure_request: ComputeHookNotifyRequest,
        cancel: &CancellationToken,
    ) -> Result<(), NotifyError> {
        let client = reqwest::Client::new();
-
-        // We hold these semaphore units across all retries, rather than only across each
-        // HTTP request: this is to preserve fairness and avoid a situation where a retry might
-        // time out waiting for a semaphore.
-        let _units = self
-            .api_concurrency
-            .acquire()
-            .await
-            // Interpret closed semaphore as shutdown
-            .map_err(|_| NotifyError::ShuttingDown)?;
-
        backoff::retry(
-            || self.do_notify_iteration(&client, url, reconfigure_request, cancel),
-            |e| {
-                matches!(
-                    e,
-                    NotifyError::Fatal(_) | NotifyError::Unexpected(_) | NotifyError::Busy
-                )
-            },
+            || self.do_notify_iteration(&client, url, &reconfigure_request, cancel),
+            |e| matches!(e, NotifyError::Fatal(_) | NotifyError::Unexpected(_)),
            3,
            10,
            "Send compute notification",
@@ -434,70 +340,42 @@ impl ComputeHook {
        stripe_size: ShardStripeSize,
        cancel: &CancellationToken,
    ) -> Result<(), NotifyError> {
-        let maybe_send_result = {
-            let mut state_locked = self.state.lock().unwrap();
+        let mut locked = self.state.lock().await;

-            use std::collections::hash_map::Entry;
-            let tenant = match state_locked.entry(tenant_shard_id.tenant_id) {
-                Entry::Vacant(e) => e.insert(ComputeHookTenant::new(
-                    tenant_shard_id,
-                    stripe_size,
-                    node_id,
-                )),
-                Entry::Occupied(e) => {
-                    let tenant = e.into_mut();
-                    tenant.update(tenant_shard_id, stripe_size, node_id);
-                    tenant
-                }
-            };
-            tenant.maybe_send(tenant_shard_id.tenant_id, None)
+        use std::collections::hash_map::Entry;
+        let tenant = match locked.entry(tenant_shard_id.tenant_id) {
+            Entry::Vacant(e) => e.insert(ComputeHookTenant::new(
+                tenant_shard_id,
+                stripe_size,
+                node_id,
+            )),
+            Entry::Occupied(e) => {
+                let tenant = e.into_mut();
+                tenant.update(tenant_shard_id, stripe_size, node_id);
+                tenant
+            }
        };

-        // Process result: we may get an update to send, or we may have to wait for a lock
-        // before trying again.
-        let (request, mut send_lock_guard) = match maybe_send_result {
-            MaybeSendResult::Noop => {
-                return Ok(());
-            }
-            MaybeSendResult::AwaitLock(send_lock) => {
-                let send_locked = send_lock.lock_owned().await;
-
-                // Lock order: maybe_send is called within the `[Self::state]` lock, and takes the send lock, but here
-                // we have acquired the send lock and take `[Self::state]` lock.  This is safe because maybe_send only uses
-                // try_lock.
-                let state_locked = self.state.lock().unwrap();
-                let Some(tenant) = state_locked.get(&tenant_shard_id.tenant_id) else {
-                    return Ok(());
-                };
-                match tenant.maybe_send(tenant_shard_id.tenant_id, Some(send_locked)) {
-                    MaybeSendResult::AwaitLock(_) => {
-                        unreachable!("We supplied lock guard")
-                    }
-                    MaybeSendResult::Noop => {
-                        return Ok(());
-                    }
-                    MaybeSendResult::Transmit((request, lock)) => (request, lock),
-                }
-            }
-            MaybeSendResult::Transmit((request, lock)) => (request, lock),
+        let reconfigure_request = tenant.maybe_reconfigure(tenant_shard_id.tenant_id);
+        let Some(reconfigure_request) = reconfigure_request else {
+            // The tenant doesn't yet have pageservers for all its shards: we won't notify anything
+            // until it does.
+            tracing::info!("Tenant isn't yet ready to emit a notification");
+            return Ok(());
        };

-        let result = if let Some(notify_url) = &self.config.compute_hook_url {
-            self.do_notify(notify_url, &request, cancel).await
+        if let Some(notify_url) = &self.config.compute_hook_url {
+            self.do_notify(notify_url, reconfigure_request, cancel)
+                .await
        } else {
-            self.do_notify_local(&request).await.map_err(|e| {
-                // This path is for testing only, so munge the error into our prod-style error type.
-                tracing::error!("Local notification hook failed: {e}");
-                NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR)
-            })
-        };
-
-        if result.is_ok() {
-            // Before dropping the send lock, stash the request we just sent so that
-            // subsequent callers can avoid redundantly re-sending the same thing.
-            *send_lock_guard = Some(request);
+            self.do_notify_local(reconfigure_request)
+                .await
+                .map_err(|e| {
+                    // This path is for testing only, so munge the error into our prod-style error type.
+                    tracing::error!("Local notification hook failed: {e}");
+                    NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR)
+                })
        }
-        result
    }
 }

@@ -521,22 +399,21 @@ pub(crate) mod tests {
            NodeId(1),
        );

-        // An unsharded tenant is always ready to emit a notification, but won't
-        // send the same one twice
-        let send_result = tenant_state.maybe_send(tenant_id, None);
-        let MaybeSendResult::Transmit((request, mut guard)) = send_result else {
-            anyhow::bail!("Wrong send result");
-        };
-        assert_eq!(request.shards.len(), 1);
-        assert!(request.stripe_size.is_none());
-
-        // Simulate successful send
-        *guard = Some(request);
-        drop(guard);
-
-        // Try asking again: this should be a no-op
-        let send_result = tenant_state.maybe_send(tenant_id, None);
-        assert!(matches!(send_result, MaybeSendResult::Noop));
+        // An unsharded tenant is always ready to emit a notification
+        assert!(tenant_state.maybe_reconfigure(tenant_id).is_some());
+        assert_eq!(
+            tenant_state
+                .maybe_reconfigure(tenant_id)
+                .unwrap()
+                .shards
+                .len(),
+            1
+        );
+        assert!(tenant_state
+            .maybe_reconfigure(tenant_id)
+            .unwrap()
+            .stripe_size
+            .is_none());

        // Writing the first shard of a multi-sharded situation (i.e. in a split)
        // resets the tenant state and puts it in an non-notifying state (need to
@@ -550,10 +427,7 @@ pub(crate) mod tests {
            ShardStripeSize(32768),
            NodeId(1),
        );
-        assert!(matches!(
-            tenant_state.maybe_send(tenant_id, None),
-            MaybeSendResult::Noop
-        ));
+        assert!(tenant_state.maybe_reconfigure(tenant_id).is_none());

        // Writing the second shard makes it ready to notify
        tenant_state.update(
@@ -566,16 +440,22 @@ pub(crate) mod tests {
            NodeId(1),
        );

-        let send_result = tenant_state.maybe_send(tenant_id, None);
-        let MaybeSendResult::Transmit((request, mut guard)) = send_result else {
-            anyhow::bail!("Wrong send result");
-        };
-        assert_eq!(request.shards.len(), 2);
-        assert_eq!(request.stripe_size, Some(ShardStripeSize(32768)));
-
-        // Simulate successful send
-        *guard = Some(request);
-        drop(guard);
+        assert!(tenant_state.maybe_reconfigure(tenant_id).is_some());
+        assert_eq!(
+            tenant_state
+                .maybe_reconfigure(tenant_id)
+                .unwrap()
+                .shards
+                .len(),
+            2
+        );
+        assert_eq!(
+            tenant_state
+                .maybe_reconfigure(tenant_id)
+                .unwrap()
+                .stripe_size,
+            Some(ShardStripeSize(32768))
+        );

        Ok(())
    }
--- a/control_plane/attachment_service/src/http.rs
+++ b/control_plane/attachment_service/src/http.rs
@@ -1,11 +1,5 @@
-use crate::metrics::{
-    HttpRequestLatencyLabelGroup, HttpRequestStatusLabelGroup, PageserverRequestLabelGroup,
-    METRICS_REGISTRY,
-};
 use crate::reconciler::ReconcileError;
 use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
-use futures::Future;
-use hyper::header::CONTENT_TYPE;
 use hyper::{Body, Request, Response};
 use hyper::{StatusCode, Uri};
 use pageserver_api::models::{
@@ -16,11 +10,9 @@ use pageserver_api::shard::TenantShardId;
 use pageserver_client::mgmt_api;
 use std::sync::Arc;
 use std::time::{Duration, Instant};
-use tokio_util::sync::CancellationToken;
 use utils::auth::{Scope, SwappableJwtAuth};
-use utils::failpoint_support::failpoints_handler;
 use utils::http::endpoint::{auth_middleware, check_permission_with, request_span};
-use utils::http::request::{must_get_query_param, parse_query_param, parse_request_param};
+use utils::http::request::{must_get_query_param, parse_request_param};
 use utils::id::{TenantId, TimelineId};

 use utils::{
@@ -34,15 +26,12 @@ use utils::{
 };

 use pageserver_api::controller_api::{
-    NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantPolicyRequest,
-    TenantShardMigrateRequest,
+    NodeConfigureRequest, NodeRegisterRequest, TenantShardMigrateRequest,
 };
 use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest};

 use control_plane::storage_controller::{AttachHookRequest, InspectRequest};

-use routerify::Middleware;
-
 /// State available to HTTP request handlers
 #[derive(Clone)]
 pub struct HttpState {
@@ -257,10 +246,8 @@ async fn handle_tenant_secondary_download(
    req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
-    let wait = parse_query_param(&req, "wait_ms")?.map(Duration::from_millis);
-
-    let (status, progress) = service.tenant_secondary_download(tenant_id, wait).await?;
-    json_response(status, progress)
+    service.tenant_secondary_download(tenant_id).await?;
+    json_response(StatusCode::OK, ())
 }

 async fn handle_tenant_delete(
@@ -322,7 +309,7 @@ async fn handle_tenant_timeline_passthrough(
    tracing::info!("Proxying request for tenant {} ({})", tenant_id, path);

    // Find the node that holds shard zero
-    let (node, tenant_shard_id) = service.tenant_shard0_node(tenant_id)?;
+    let (base_url, tenant_shard_id) = service.tenant_shard0_baseurl(tenant_id)?;

    // Callers will always pass an unsharded tenant ID.  Before proxying, we must
    // rewrite this to a shard-aware shard zero ID.
@@ -331,39 +318,12 @@ async fn handle_tenant_timeline_passthrough(
    let tenant_shard_str = format!("{}", tenant_shard_id);
    let path = path.replace(&tenant_str, &tenant_shard_str);

-    let latency = &METRICS_REGISTRY
-        .metrics_group
-        .storage_controller_passthrough_request_latency;
-
-    // This is a bit awkward. We remove the param from the request
-    // and join the words by '_' to get a label for the request.
-    let just_path = path.replace(&tenant_shard_str, "");
-    let path_label = just_path
-        .split('/')
-        .filter(|token| !token.is_empty())
-        .collect::<Vec<_>>()
-        .join("_");
-    let labels = PageserverRequestLabelGroup {
-        pageserver_id: &node.get_id().to_string(),
-        path: &path_label,
-        method: crate::metrics::Method::Get,
-    };
-
-    let _timer = latency.start_timer(labels.clone());
-
-    let client = mgmt_api::Client::new(node.base_url(), service.get_config().jwt_token.as_deref());
+    let client = mgmt_api::Client::new(base_url, service.get_config().jwt_token.as_deref());
    let resp = client.get_raw(path).await.map_err(|_e|
        // FIXME: give APiError a proper Unavailable variant.  We return 503 here because
        // if we can't successfully send a request to the pageserver, we aren't available.
        ApiError::ShuttingDown)?;

-    if !resp.status().is_success() {
-        let error_counter = &METRICS_REGISTRY
-            .metrics_group
-            .storage_controller_passthrough_request_error;
-        error_counter.inc(labels);
-    }
-
    // We have a reqest::Response, would like a http::Response
    let mut builder = hyper::Response::builder()
        .status(resp.status())
@@ -389,25 +349,6 @@ async fn handle_tenant_locate(
    json_response(StatusCode::OK, service.tenant_locate(tenant_id)?)
 }

-async fn handle_tenant_describe(
-    service: Arc<Service>,
-    req: Request<Body>,
-) -> Result<Response<Body>, ApiError> {
-    check_permissions(&req, Scope::Admin)?;
-
-    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
-    json_response(StatusCode::OK, service.tenant_describe(tenant_id)?)
-}
-
-async fn handle_tenant_list(
-    service: Arc<Service>,
-    req: Request<Body>,
-) -> Result<Response<Body>, ApiError> {
-    check_permissions(&req, Scope::Admin)?;
-
-    json_response(StatusCode::OK, service.tenant_list())
-}
-
 async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;

@@ -421,10 +362,7 @@ async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError
    check_permissions(&req, Scope::Admin)?;

    let state = get_state(&req);
-    let nodes = state.service.node_list().await?;
-    let api_nodes = nodes.into_iter().map(|n| n.describe()).collect::<Vec<_>>();
-
-    json_response(StatusCode::OK, api_nodes)
+    json_response(StatusCode::OK, state.service.node_list().await?)
 }

 async fn handle_node_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -449,14 +387,7 @@ async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>,

    json_response(
        StatusCode::OK,
-        state
-            .service
-            .node_configure(
-                config_req.node_id,
-                config_req.availability.map(NodeAvailability::from),
-                config_req.scheduling,
-            )
-            .await?,
+        state.service.node_configure(config_req).await?,
    )
 }

@@ -491,22 +422,6 @@ async fn handle_tenant_shard_migrate(
    )
 }

-async fn handle_tenant_update_policy(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    check_permissions(&req, Scope::Admin)?;
-
-    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
-    let update_req = json_request::<TenantPolicyRequest>(&mut req).await?;
-    let state = get_state(&req);
-
-    json_response(
-        StatusCode::OK,
-        state
-            .service
-            .tenant_update_policy(tenant_id, update_req)
-            .await?,
-    )
-}
-
 async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
    check_permissions(&req, Scope::PageServerApi)?;
@@ -538,14 +453,6 @@ async fn handle_consistency_check(req: Request<Body>) -> Result<Response<Body>,
    json_response(StatusCode::OK, state.service.consistency_check().await?)
 }

-async fn handle_reconcile_all(req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    check_permissions(&req, Scope::Admin)?;
-
-    let state = get_state(&req);
-
-    json_response(StatusCode::OK, state.service.reconcile_all_now().await?)
-}
-
 /// Status endpoint is just used for checking that our HTTP listener is up
 async fn handle_status(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
    json_response(StatusCode::OK, ())
@@ -570,11 +477,7 @@ impl From<ReconcileError> for ApiError {

 /// Common wrapper for request handlers that call into Service and will operate on tenants: they must only
 /// be allowed to run if Service has finished its initial reconciliation.
-async fn tenant_service_handler<R, H>(
-    request: Request<Body>,
-    handler: H,
-    request_name: RequestName,
-) -> R::Output
+async fn tenant_service_handler<R, H>(request: Request<Body>, handler: H) -> R::Output
 where
    R: std::future::Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
    H: FnOnce(Arc<Service>, Request<Body>) -> R + Send + Sync + 'static,
@@ -594,120 +497,24 @@ where
        ));
    }

-    named_request_span(
+    request_span(
        request,
        |request| async move { handler(service, request).await },
-        request_name,
    )
    .await
 }

-/// Check if the required scope is held in the request's token, or if the request has
-/// a token with 'admin' scope then always permit it.
 fn check_permissions(request: &Request<Body>, required_scope: Scope) -> Result<(), ApiError> {
    check_permission_with(request, |claims| {
-        match crate::auth::check_permission(claims, required_scope) {
-            Err(e) => match crate::auth::check_permission(claims, Scope::Admin) {
-                Ok(()) => Ok(()),
-                Err(_) => Err(e),
-            },
-            Ok(()) => Ok(()),
-        }
+        crate::auth::check_permission(claims, required_scope)
    })
 }

-#[derive(Clone, Debug)]
-struct RequestMeta {
-    method: hyper::http::Method,
-    at: Instant,
-}
-
-fn prologue_metrics_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
-) -> Middleware<B, ApiError> {
-    Middleware::pre(move |req| async move {
-        let meta = RequestMeta {
-            method: req.method().clone(),
-            at: Instant::now(),
-        };
-
-        req.set_context(meta);
-
-        Ok(req)
-    })
-}
-
-fn epilogue_metrics_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
-) -> Middleware<B, ApiError> {
-    Middleware::post_with_info(move |resp, req_info| async move {
-        let request_name = match req_info.context::<RequestName>() {
-            Some(name) => name,
-            None => {
-                return Ok(resp);
-            }
-        };
-
-        if let Some(meta) = req_info.context::<RequestMeta>() {
-            let status = &crate::metrics::METRICS_REGISTRY
-                .metrics_group
-                .storage_controller_http_request_status;
-            let latency = &crate::metrics::METRICS_REGISTRY
-                .metrics_group
-                .storage_controller_http_request_latency;
-
-            status.inc(HttpRequestStatusLabelGroup {
-                path: request_name.0,
-                method: meta.method.clone().into(),
-                status: crate::metrics::StatusCode(resp.status()),
-            });
-
-            latency.observe(
-                HttpRequestLatencyLabelGroup {
-                    path: request_name.0,
-                    method: meta.method.into(),
-                },
-                meta.at.elapsed().as_secs_f64(),
-            );
-        }
-        Ok(resp)
-    })
-}
-
-pub async fn measured_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    pub const TEXT_FORMAT: &str = "text/plain; version=0.0.4";
-
-    let payload = crate::metrics::METRICS_REGISTRY.encode();
-    let response = Response::builder()
-        .status(200)
-        .header(CONTENT_TYPE, TEXT_FORMAT)
-        .body(payload.into())
-        .unwrap();
-
-    Ok(response)
-}
-
-#[derive(Clone)]
-struct RequestName(&'static str);
-
-async fn named_request_span<R, H>(
-    request: Request<Body>,
-    handler: H,
-    name: RequestName,
-) -> R::Output
-where
-    R: Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
-    H: FnOnce(Request<Body>) -> R + Send + Sync + 'static,
-{
-    request.set_context(name);
-    request_span(request, handler).await
-}
-
 pub fn make_router(
    service: Arc<Service>,
    auth: Option<Arc<SwappableJwtAuth>>,
 ) -> RouterBuilder<hyper::Body, ApiError> {
-    let mut router = endpoint::make_router()
-        .middleware(prologue_metrics_middleware())
-        .middleware(epilogue_metrics_middleware());
+    let mut router = endpoint::make_router();
    if auth.is_some() {
        router = router.middleware(auth_middleware(|request| {
            let state = get_state(request);
@@ -716,179 +523,93 @@ pub fn make_router(
            } else {
                state.auth.as_deref()
            }
-        }));
+        }))
    }

    router
        .data(Arc::new(HttpState::new(service, auth)))
-        .get("/metrics", |r| {
-            named_request_span(r, measured_metrics_handler, RequestName("metrics"))
-        })
        // Non-prefixed generic endpoints (status, metrics)
-        .get("/status", |r| {
-            named_request_span(r, handle_status, RequestName("status"))
-        })
-        .get("/ready", |r| {
-            named_request_span(r, handle_ready, RequestName("ready"))
-        })
+        .get("/status", |r| request_span(r, handle_status))
+        .get("/ready", |r| request_span(r, handle_ready))
        // Upcalls for the pageserver: point the pageserver's `control_plane_api` config to this prefix
        .post("/upcall/v1/re-attach", |r| {
-            named_request_span(r, handle_re_attach, RequestName("upcall_v1_reattach"))
-        })
-        .post("/upcall/v1/validate", |r| {
-            named_request_span(r, handle_validate, RequestName("upcall_v1_validate"))
+            request_span(r, handle_re_attach)
        })
+        .post("/upcall/v1/validate", |r| request_span(r, handle_validate))
        // Test/dev/debug endpoints
        .post("/debug/v1/attach-hook", |r| {
-            named_request_span(r, handle_attach_hook, RequestName("debug_v1_attach_hook"))
-        })
-        .post("/debug/v1/inspect", |r| {
-            named_request_span(r, handle_inspect, RequestName("debug_v1_inspect"))
+            request_span(r, handle_attach_hook)
        })
+        .post("/debug/v1/inspect", |r| request_span(r, handle_inspect))
        .post("/debug/v1/tenant/:tenant_id/drop", |r| {
-            named_request_span(r, handle_tenant_drop, RequestName("debug_v1_tenant_drop"))
+            request_span(r, handle_tenant_drop)
        })
        .post("/debug/v1/node/:node_id/drop", |r| {
-            named_request_span(r, handle_node_drop, RequestName("debug_v1_node_drop"))
-        })
-        .get("/debug/v1/tenant", |r| {
-            named_request_span(r, handle_tenants_dump, RequestName("debug_v1_tenant"))
-        })
-        .get("/debug/v1/tenant/:tenant_id/locate", |r| {
-            tenant_service_handler(
-                r,
-                handle_tenant_locate,
-                RequestName("debug_v1_tenant_locate"),
-            )
+            request_span(r, handle_node_drop)
        })
+        .get("/debug/v1/tenant", |r| request_span(r, handle_tenants_dump))
        .get("/debug/v1/scheduler", |r| {
-            named_request_span(r, handle_scheduler_dump, RequestName("debug_v1_scheduler"))
+            request_span(r, handle_scheduler_dump)
        })
        .post("/debug/v1/consistency_check", |r| {
-            named_request_span(
-                r,
-                handle_consistency_check,
-                RequestName("debug_v1_consistency_check"),
-            )
+            request_span(r, handle_consistency_check)
        })
-        .post("/debug/v1/reconcile_all", |r| {
-            request_span(r, handle_reconcile_all)
-        })
-        .put("/debug/v1/failpoints", |r| {
-            request_span(r, |r| failpoints_handler(r, CancellationToken::new()))
+        .get("/control/v1/tenant/:tenant_id/locate", |r| {
+            tenant_service_handler(r, handle_tenant_locate)
        })
        // Node operations
        .post("/control/v1/node", |r| {
-            named_request_span(r, handle_node_register, RequestName("control_v1_node"))
-        })
-        .get("/control/v1/node", |r| {
-            named_request_span(r, handle_node_list, RequestName("control_v1_node"))
+            request_span(r, handle_node_register)
        })
+        .get("/control/v1/node", |r| request_span(r, handle_node_list))
        .put("/control/v1/node/:node_id/config", |r| {
-            named_request_span(
-                r,
-                handle_node_configure,
-                RequestName("control_v1_node_config"),
-            )
+            request_span(r, handle_node_configure)
        })
        // Tenant Shard operations
        .put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
-            tenant_service_handler(
-                r,
-                handle_tenant_shard_migrate,
-                RequestName("control_v1_tenant_migrate"),
-            )
+            tenant_service_handler(r, handle_tenant_shard_migrate)
        })
        .put("/control/v1/tenant/:tenant_id/shard_split", |r| {
-            tenant_service_handler(
-                r,
-                handle_tenant_shard_split,
-                RequestName("control_v1_tenant_shard_split"),
-            )
-        })
-        .get("/control/v1/tenant/:tenant_id", |r| {
-            tenant_service_handler(
-                r,
-                handle_tenant_describe,
-                RequestName("control_v1_tenant_describe"),
-            )
-        })
-        .get("/control/v1/tenant", |r| {
-            tenant_service_handler(r, handle_tenant_list, RequestName("control_v1_tenant_list"))
-        })
-        .put("/control/v1/tenant/:tenant_id/policy", |r| {
-            named_request_span(
-                r,
-                handle_tenant_update_policy,
-                RequestName("control_v1_tenant_policy"),
-            )
+            tenant_service_handler(r, handle_tenant_shard_split)
        })
        // Tenant operations
        // The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
        // this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
        .post("/v1/tenant", |r| {
-            tenant_service_handler(r, handle_tenant_create, RequestName("v1_tenant"))
+            tenant_service_handler(r, handle_tenant_create)
        })
        .delete("/v1/tenant/:tenant_id", |r| {
-            tenant_service_handler(r, handle_tenant_delete, RequestName("v1_tenant"))
+            tenant_service_handler(r, handle_tenant_delete)
        })
        .put("/v1/tenant/config", |r| {
-            tenant_service_handler(r, handle_tenant_config_set, RequestName("v1_tenant_config"))
+            tenant_service_handler(r, handle_tenant_config_set)
        })
        .get("/v1/tenant/:tenant_id/config", |r| {
-            tenant_service_handler(r, handle_tenant_config_get, RequestName("v1_tenant_config"))
+            tenant_service_handler(r, handle_tenant_config_get)
        })
        .put("/v1/tenant/:tenant_shard_id/location_config", |r| {
-            tenant_service_handler(
-                r,
-                handle_tenant_location_config,
-                RequestName("v1_tenant_location_config"),
-            )
+            tenant_service_handler(r, handle_tenant_location_config)
        })
        .put("/v1/tenant/:tenant_id/time_travel_remote_storage", |r| {
-            tenant_service_handler(
-                r,
-                handle_tenant_time_travel_remote_storage,
-                RequestName("v1_tenant_time_travel_remote_storage"),
-            )
+            tenant_service_handler(r, handle_tenant_time_travel_remote_storage)
        })
        .post("/v1/tenant/:tenant_id/secondary/download", |r| {
-            tenant_service_handler(
-                r,
-                handle_tenant_secondary_download,
-                RequestName("v1_tenant_secondary_download"),
-            )
+            tenant_service_handler(r, handle_tenant_secondary_download)
        })
        // Timeline operations
        .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
-            tenant_service_handler(
-                r,
-                handle_tenant_timeline_delete,
-                RequestName("v1_tenant_timeline"),
-            )
+            tenant_service_handler(r, handle_tenant_timeline_delete)
        })
        .post("/v1/tenant/:tenant_id/timeline", |r| {
-            tenant_service_handler(
-                r,
-                handle_tenant_timeline_create,
-                RequestName("v1_tenant_timeline"),
-            )
+            tenant_service_handler(r, handle_tenant_timeline_create)
        })
        // Tenant detail GET passthrough to shard zero
        .get("/v1/tenant/:tenant_id", |r| {
-            tenant_service_handler(
-                r,
-                handle_tenant_timeline_passthrough,
-                RequestName("v1_tenant_passthrough"),
-            )
+            tenant_service_handler(r, handle_tenant_timeline_passthrough)
        })
        // Timeline GET passthrough to shard zero.  Note that the `*` in the URL is a wildcard: any future
        // timeline GET APIs will be implicitly included.
        .get("/v1/tenant/:tenant_id/timeline*", |r| {
-            tenant_service_handler(
-                r,
-                handle_tenant_timeline_passthrough,
-                RequestName("v1_tenant_timeline_passthrough"),
-            )
+            tenant_service_handler(r, handle_tenant_timeline_passthrough)
        })
 }
--- a/control_plane/attachment_service/src/lib.rs
+++ b/control_plane/attachment_service/src/lib.rs
@@ -3,18 +3,15 @@ use utils::seqwait::MonotonicCounter;

 mod auth;
 mod compute_hook;
-mod heartbeater;
 pub mod http;
-mod id_lock_map;
 pub mod metrics;
 mod node;
-mod pageserver_client;
 pub mod persistence;
 mod reconciler;
 mod scheduler;
 mod schema;
 pub mod service;
-mod tenant_shard;
+mod tenant_state;

 #[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone, Serialize)]
 struct Sequence(u64);
--- a/control_plane/attachment_service/src/main.rs
+++ b/control_plane/attachment_service/src/main.rs
@@ -1,19 +1,19 @@
 use anyhow::{anyhow, Context};
+use attachment_service::http::make_router;
+use attachment_service::metrics::preinitialize_metrics;
+use attachment_service::persistence::Persistence;
+use attachment_service::service::{Config, Service};
+use aws_config::{BehaviorVersion, Region};
 use camino::Utf8PathBuf;
 use clap::Parser;
 use diesel::Connection;
 use metrics::launch_timestamp::LaunchTimestamp;
 use std::sync::Arc;
-use storage_controller::http::make_router;
-use storage_controller::metrics::preinitialize_metrics;
-use storage_controller::persistence::Persistence;
-use storage_controller::service::{Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT};
 use tokio::signal::unix::SignalKind;
 use tokio_util::sync::CancellationToken;
 use utils::auth::{JwtAuth, SwappableJwtAuth};
 use utils::logging::{self, LogFormat};

-use utils::sentry_init::init_sentry;
 use utils::{project_build_tag, project_git_version, tcp_listener};

 project_git_version!(GIT_VERSION);
@@ -51,33 +51,9 @@ struct Cli {
    #[arg(short, long)]
    path: Option<Utf8PathBuf>,

-    /// URL to connect to postgres, like postgresql://localhost:1234/storage_controller
+    /// URL to connect to postgres, like postgresql://localhost:1234/attachment_service
    #[arg(long)]
    database_url: Option<String>,
-
-    /// Flag to enable dev mode, which permits running without auth
-    #[arg(long, default_value = "false")]
-    dev: bool,
-
-    /// Grace period before marking unresponsive pageserver offline
-    #[arg(long)]
-    max_unavailable_interval: Option<humantime::Duration>,
-}
-
-enum StrictMode {
-    /// In strict mode, we will require that all secrets are loaded, i.e. security features
-    /// may not be implicitly turned off by omitting secrets in the environment.
-    Strict,
-    /// In dev mode, secrets are optional, and omitting a particular secret will implicitly
-    /// disable the auth related to it (e.g. no pageserver jwt key -> send unauthenticated
-    /// requests, no public key -> don't authenticate incoming requests).
-    Dev,
-}
-
-impl Default for StrictMode {
-    fn default() -> Self {
-        Self::Strict
-    }
 }

 /// Secrets may either be provided on the command line (for testing), or loaded from AWS SecretManager: this
@@ -90,6 +66,13 @@ struct Secrets {
 }

 impl Secrets {
+    const DATABASE_URL_SECRET: &'static str = "rds-neon-storage-controller-url";
+    const PAGESERVER_JWT_TOKEN_SECRET: &'static str =
+        "neon-storage-controller-pageserver-jwt-token";
+    const CONTROL_PLANE_JWT_TOKEN_SECRET: &'static str =
+        "neon-storage-controller-control-plane-jwt-token";
+    const PUBLIC_KEY_SECRET: &'static str = "neon-storage-controller-public-key";
+
    const DATABASE_URL_ENV: &'static str = "DATABASE_URL";
    const PAGESERVER_JWT_TOKEN_ENV: &'static str = "PAGESERVER_JWT_TOKEN";
    const CONTROL_PLANE_JWT_TOKEN_ENV: &'static str = "CONTROL_PLANE_JWT_TOKEN";
@@ -100,41 +83,111 @@ impl Secrets {
    /// - Environment variables if DATABASE_URL is set.
    /// - AWS Secrets Manager secrets
    async fn load(args: &Cli) -> anyhow::Result<Self> {
-        let Some(database_url) =
-            Self::load_secret(&args.database_url, Self::DATABASE_URL_ENV).await
-        else {
-            anyhow::bail!(
-                "Database URL is not set (set `--database-url`, or `DATABASE_URL` environment)"
-            )
-        };
-
-        let public_key = match Self::load_secret(&args.public_key, Self::PUBLIC_KEY_ENV).await {
-            Some(v) => Some(JwtAuth::from_key(v).context("Loading public key")?),
-            None => None,
-        };
-
-        let this = Self {
-            database_url,
-            public_key,
-            jwt_token: Self::load_secret(&args.jwt_token, Self::PAGESERVER_JWT_TOKEN_ENV).await,
-            control_plane_jwt_token: Self::load_secret(
-                &args.control_plane_jwt_token,
-                Self::CONTROL_PLANE_JWT_TOKEN_ENV,
-            )
-            .await,
-        };
-
-        Ok(this)
+        match &args.database_url {
+            Some(url) => Self::load_cli(url, args),
+            None => match std::env::var(Self::DATABASE_URL_ENV) {
+                Ok(database_url) => Self::load_env(database_url),
+                Err(_) => Self::load_aws_sm().await,
+            },
+        }
    }

-    async fn load_secret(cli: &Option<String>, env_name: &str) -> Option<String> {
-        if let Some(v) = cli {
-            Some(v.clone())
-        } else if let Ok(v) = std::env::var(env_name) {
-            Some(v)
-        } else {
-            None
+    fn load_env(database_url: String) -> anyhow::Result<Self> {
+        let public_key = match std::env::var(Self::PUBLIC_KEY_ENV) {
+            Ok(public_key) => Some(JwtAuth::from_key(public_key).context("Loading public key")?),
+            Err(_) => None,
+        };
+        Ok(Self {
+            database_url,
+            public_key,
+            jwt_token: std::env::var(Self::PAGESERVER_JWT_TOKEN_ENV).ok(),
+            control_plane_jwt_token: std::env::var(Self::CONTROL_PLANE_JWT_TOKEN_ENV).ok(),
+        })
+    }
+
+    async fn load_aws_sm() -> anyhow::Result<Self> {
+        let Ok(region) = std::env::var("AWS_REGION") else {
+            anyhow::bail!("AWS_REGION is not set, cannot load secrets automatically: either set this, or use CLI args to supply secrets");
+        };
+        let config = aws_config::defaults(BehaviorVersion::v2023_11_09())
+            .region(Region::new(region.clone()))
+            .load()
+            .await;
+
+        let asm = aws_sdk_secretsmanager::Client::new(&config);
+
+        let Some(database_url) = asm
+            .get_secret_value()
+            .secret_id(Self::DATABASE_URL_SECRET)
+            .send()
+            .await?
+            .secret_string()
+            .map(str::to_string)
+        else {
+            anyhow::bail!(
+                "Database URL secret not found at {region}/{}",
+                Self::DATABASE_URL_SECRET
+            )
+        };
+
+        let jwt_token = asm
+            .get_secret_value()
+            .secret_id(Self::PAGESERVER_JWT_TOKEN_SECRET)
+            .send()
+            .await?
+            .secret_string()
+            .map(str::to_string);
+        if jwt_token.is_none() {
+            tracing::warn!("No pageserver JWT token set: this will only work if authentication is disabled on the pageserver");
        }
+
+        let control_plane_jwt_token = asm
+            .get_secret_value()
+            .secret_id(Self::CONTROL_PLANE_JWT_TOKEN_SECRET)
+            .send()
+            .await?
+            .secret_string()
+            .map(str::to_string);
+        if control_plane_jwt_token.is_none() {
+            tracing::warn!("No control plane JWT token set: this will only work if authentication is disabled on the control plane");
+        }
+
+        let public_key = asm
+            .get_secret_value()
+            .secret_id(Self::PUBLIC_KEY_SECRET)
+            .send()
+            .await?
+            .secret_string()
+            .map(str::to_string);
+        let public_key = match public_key {
+            Some(key) => Some(JwtAuth::from_key(key)?),
+            None => {
+                tracing::warn!(
+                    "No public key set: incoming HTTP requests will not be authenticated"
+                );
+                None
+            }
+        };
+
+        Ok(Self {
+            database_url,
+            public_key,
+            jwt_token,
+            control_plane_jwt_token,
+        })
+    }
+
+    fn load_cli(database_url: &str, args: &Cli) -> anyhow::Result<Self> {
+        let public_key = match &args.public_key {
+            None => None,
+            Some(key) => Some(JwtAuth::from_key(key.clone()).context("Loading public key")?),
+        };
+        Ok(Self {
+            database_url: database_url.to_owned(),
+            public_key,
+            jwt_token: args.jwt_token.clone(),
+            control_plane_jwt_token: args.control_plane_jwt_token.clone(),
+        })
    }
 }

@@ -153,14 +206,6 @@ async fn migration_run(database_url: &str) -> anyhow::Result<()> {
 }

 fn main() -> anyhow::Result<()> {
-    let default_panic = std::panic::take_hook();
-    std::panic::set_hook(Box::new(move |info| {
-        default_panic(info);
-        std::process::exit(1);
-    }));
-
-    let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
-
    tokio::runtime::Builder::new_current_thread()
        // We use spawn_blocking for database operations, so require approximately
        // as many blocking threads as we will open database connections.
@@ -192,50 +237,12 @@ async fn async_main() -> anyhow::Result<()> {
        args.listen
    );

-    let strict_mode = if args.dev {
-        StrictMode::Dev
-    } else {
-        StrictMode::Strict
-    };
-
    let secrets = Secrets::load(&args).await?;

-    // Validate required secrets and arguments are provided in strict mode
-    match strict_mode {
-        StrictMode::Strict
-            if (secrets.public_key.is_none()
-                || secrets.jwt_token.is_none()
-                || secrets.control_plane_jwt_token.is_none()) =>
-        {
-            // Production systems should always have secrets configured: if public_key was not set
-            // then we would implicitly disable auth.
-            anyhow::bail!(
-                    "Insecure config!  One or more secrets is not set.  This is only permitted in `--dev` mode"
-                );
-        }
-        StrictMode::Strict if args.compute_hook_url.is_none() => {
-            // Production systems should always have a compute hook set, to prevent falling
-            // back to trying to use neon_local.
-            anyhow::bail!(
-                "`--compute-hook-url` is not set: this is only permitted in `--dev` mode"
-            );
-        }
-        StrictMode::Strict => {
-            tracing::info!("Starting in strict mode: configuration is OK.")
-        }
-        StrictMode::Dev => {
-            tracing::warn!("Starting in dev mode: this may be an insecure configuration.")
-        }
-    }
-
    let config = Config {
        jwt_token: secrets.jwt_token,
        control_plane_jwt_token: secrets.control_plane_jwt_token,
        compute_hook_url: args.compute_hook_url,
-        max_unavailable_interval: args
-            .max_unavailable_interval
-            .map(humantime::Duration::into)
-            .unwrap_or(MAX_UNAVAILABLE_INTERVAL_DEFAULT),
    };

    // After loading secrets & config, but before starting anything else, apply database migrations
--- a/control_plane/attachment_service/src/metrics.rs
+++ b/control_plane/attachment_service/src/metrics.rs
@@ -0,0 +1,32 @@
+use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
+use once_cell::sync::Lazy;
+
+pub(crate) struct ReconcilerMetrics {
+    pub(crate) spawned: IntCounter,
+    pub(crate) complete: IntCounterVec,
+}
+
+impl ReconcilerMetrics {
+    // Values for the "status" label on [`Self::complete`]; each outcome needs its own distinct value.
+    pub(crate) const SUCCESS: &'static str = "ok";
+    pub(crate) const ERROR: &'static str = "error";
+    pub(crate) const CANCEL: &'static str = "cancel";
+}
+
+pub(crate) static RECONCILER: Lazy<ReconcilerMetrics> = Lazy::new(|| ReconcilerMetrics {
+    spawned: register_int_counter!(
+        "storage_controller_reconcile_spawn",
+        "Count of how many times we spawn a reconcile task",
+    )
+    .expect("failed to define a metric"),
+    complete: register_int_counter_vec!(
+        "storage_controller_reconcile_complete",
+        "Reconciler tasks completed, broken down by success/failure/cancelled",
+        &["status"],
+    )
+    .expect("failed to define a metric"),
+});
+
+pub fn preinitialize_metrics() {
+    Lazy::force(&RECONCILER);
+}
--- a/control_plane/attachment_service/src/node.rs
+++ b/control_plane/attachment_service/src/node.rs
@@ -3,8 +3,7 @@ use std::{str::FromStr, time::Duration};
 use hyper::StatusCode;
 use pageserver_api::{
    controller_api::{
-        NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy,
-        TenantLocateResponseShard,
+        NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, TenantLocateResponseShard,
    },
    shard::TenantShardId,
 };
@@ -13,9 +12,7 @@ use serde::Serialize;
 use tokio_util::sync::CancellationToken;
 use utils::{backoff, id::NodeId};

-use crate::{
-    pageserver_client::PageserverClient, persistence::NodePersistence, scheduler::MaySchedule,
-};
+use crate::persistence::NodePersistence;

 /// Represents the in-memory description of a Node.
 ///
@@ -86,38 +83,29 @@ impl Node {
        }
    }

-    pub(crate) fn set_availability(&mut self, availability: NodeAvailability) {
-        match self.get_availability_transition(availability) {
-            AvailabilityTransition::ToActive => {
+    pub(crate) fn set_availability(
+        &mut self,
+        availability: NodeAvailability,
+    ) -> AvailabilityTransition {
+        use NodeAvailability::*;
+        let transition = match (self.availability, availability) {
+            (Offline, Active) => {
                // Give the node a new cancellation token, effectively resetting it to un-cancelled.  Any
                // users of previously-cloned copies of the node will still see the old cancellation
                // state.  For example, Reconcilers in flight will have to complete and be spawned
                // again to realize that the node has become available.
                self.cancel = CancellationToken::new();
+                AvailabilityTransition::ToActive
            }
-            AvailabilityTransition::ToOffline => {
+            (Active, Offline) => {
                // Fire the node's cancellation token to cancel any in-flight API requests to it
                self.cancel.cancel();
+                AvailabilityTransition::ToOffline
            }
-            AvailabilityTransition::Unchanged => {}
-        }
+            _ => AvailabilityTransition::Unchanged,
+        };
        self.availability = availability;
-    }
-
-    /// Without modifying the availability of the node, convert the intended availability
-    /// into a description of the transition.
-    pub(crate) fn get_availability_transition(
-        &self,
-        availability: NodeAvailability,
-    ) -> AvailabilityTransition {
-        use AvailabilityTransition::*;
-        use NodeAvailability::*;
-
-        match (self.availability, availability) {
-            (Offline, Active(_)) => ToActive,
-            (Active(_), Offline) => ToOffline,
-            _ => Unchanged,
-        }
+        transition
    }

    /// Whether we may send API requests to this node.
@@ -126,21 +114,21 @@ impl Node {
        // a reference to the original Node's cancellation status.  Checking both of these results
        // in a "pessimistic" check where we will consider a Node instance unavailable if it was unavailable
        // when we cloned it, or if the original Node instance's cancellation token was fired.
-        matches!(self.availability, NodeAvailability::Active(_)) && !self.cancel.is_cancelled()
+        matches!(self.availability, NodeAvailability::Active) && !self.cancel.is_cancelled()
    }

    /// Is this node elegible to have work scheduled onto it?
-    pub(crate) fn may_schedule(&self) -> MaySchedule {
-        let score = match self.availability {
-            NodeAvailability::Active(score) => score,
-            NodeAvailability::Offline => return MaySchedule::No,
-        };
+    pub(crate) fn may_schedule(&self) -> bool {
+        match self.availability {
+            NodeAvailability::Active => {}
+            NodeAvailability::Offline => return false,
+        }

        match self.scheduling {
-            NodeSchedulingPolicy::Active => MaySchedule::Yes(score),
-            NodeSchedulingPolicy::Draining => MaySchedule::No,
-            NodeSchedulingPolicy::Filling => MaySchedule::Yes(score),
-            NodeSchedulingPolicy::Pause => MaySchedule::No,
+            NodeSchedulingPolicy::Active => true,
+            NodeSchedulingPolicy::Draining => false,
+            NodeSchedulingPolicy::Filling => true,
+            NodeSchedulingPolicy::Pause => false,
        }
    }

@@ -158,7 +146,8 @@ impl Node {
            listen_pg_addr,
            listen_pg_port,
            scheduling: NodeSchedulingPolicy::Filling,
-            availability: NodeAvailability::Offline,
+            // TODO: we shouldn't really call this Active until we've heartbeated it.
+            availability: NodeAvailability::Active,
            cancel: CancellationToken::new(),
        }
    }
@@ -205,7 +194,7 @@ impl Node {
        cancel: &CancellationToken,
    ) -> Option<mgmt_api::Result<T>>
    where
-        O: FnMut(PageserverClient) -> F,
+        O: FnMut(mgmt_api::Client) -> F,
        F: std::future::Future<Output = mgmt_api::Result<T>>,
    {
        fn is_fatal(e: &mgmt_api::Error) -> bool {
@@ -227,12 +216,8 @@ impl Node {
                    .build()
                    .expect("Failed to construct HTTP client");

-                let client = PageserverClient::from_client(
-                    self.get_id(),
-                    http_client,
-                    self.base_url(),
-                    jwt.as_deref(),
-                );
+                let client =
+                    mgmt_api::Client::from_client(http_client, self.base_url(), jwt.as_deref());

                let node_cancel_fut = self.cancel.cancelled();

@@ -257,19 +242,6 @@ impl Node {
        )
        .await
    }
-
-    /// Generate the simplified API-friendly description of a node's state
-    pub(crate) fn describe(&self) -> NodeDescribeResponse {
-        NodeDescribeResponse {
-            id: self.id,
-            availability: self.availability.into(),
-            scheduling: self.scheduling,
-            listen_http_addr: self.listen_http_addr.clone(),
-            listen_http_port: self.listen_http_port,
-            listen_pg_addr: self.listen_pg_addr.clone(),
-            listen_pg_port: self.listen_pg_port,
-        }
-    }
 }

 impl std::fmt::Display for Node {
--- a/control_plane/attachment_service/src/persistence.rs
+++ b/control_plane/attachment_service/src/persistence.rs
@@ -9,20 +9,13 @@ use camino::Utf8PathBuf;
 use diesel::pg::PgConnection;
 use diesel::prelude::*;
 use diesel::Connection;
-use pageserver_api::controller_api::ShardSchedulingPolicy;
 use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy};
 use pageserver_api::models::TenantConfig;
-use pageserver_api::shard::ShardConfigError;
-use pageserver_api::shard::ShardIdentity;
-use pageserver_api::shard::ShardStripeSize;
 use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};
 use serde::{Deserialize, Serialize};
 use utils::generation::Generation;
 use utils::id::{NodeId, TenantId};

-use crate::metrics::{
-    DatabaseQueryErrorLabelGroup, DatabaseQueryLatencyLabelGroup, METRICS_REGISTRY,
-};
 use crate::node::Node;

 /// ## What do we store?
@@ -79,41 +72,8 @@ pub(crate) enum DatabaseError {
    Logical(String),
 }

-#[derive(measured::FixedCardinalityLabel, Clone)]
-pub(crate) enum DatabaseOperation {
-    InsertNode,
-    UpdateNode,
-    DeleteNode,
-    ListNodes,
-    BeginShardSplit,
-    CompleteShardSplit,
-    AbortShardSplit,
-    Detach,
-    ReAttach,
-    IncrementGeneration,
-    ListTenantShards,
-    InsertTenantShards,
-    UpdateTenantShard,
-    DeleteTenant,
-    UpdateTenantConfig,
-}
-
-#[must_use]
-pub(crate) enum AbortShardSplitStatus {
-    /// We aborted the split in the database by reverting to the parent shards
-    Aborted,
-    /// The split had already been persisted.
-    Complete,
-}
-
 pub(crate) type DatabaseResult<T> = Result<T, DatabaseError>;

-/// Some methods can operate on either a whole tenant or a single shard
-pub(crate) enum TenantFilter {
-    Tenant(TenantId),
-    Shard(TenantShardId),
-}
-
 impl Persistence {
    // The default postgres connection limit is 100.  We use up to 99, to leave one free for a human admin under
    // normal circumstances.  This assumes we have exclusive use of the database cluster to which we connect.
@@ -144,38 +104,10 @@ impl Persistence {
        }
    }

-    /// Wraps `with_conn` in order to collect latency and error metrics
-    async fn with_measured_conn<F, R>(&self, op: DatabaseOperation, func: F) -> DatabaseResult<R>
-    where
-        F: FnOnce(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
-        R: Send + 'static,
-    {
-        let latency = &METRICS_REGISTRY
-            .metrics_group
-            .storage_controller_database_query_latency;
-        let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup {
-            operation: op.clone(),
-        });
-
-        let res = self.with_conn(func).await;
-
-        if let Err(err) = &res {
-            let error_counter = &METRICS_REGISTRY
-                .metrics_group
-                .storage_controller_database_query_error;
-            error_counter.inc(DatabaseQueryErrorLabelGroup {
-                error_type: err.error_label(),
-                operation: op,
-            })
-        }
-
-        res
-    }
-
    /// Call the provided function in a tokio blocking thread, with a Diesel database connection.
    async fn with_conn<F, R>(&self, func: F) -> DatabaseResult<R>
    where
-        F: FnOnce(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
+        F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
        R: Send + 'static,
    {
        let mut conn = self.connection_pool.get()?;
@@ -187,27 +119,21 @@ impl Persistence {
    /// When a node is first registered, persist it before using it for anything
    pub(crate) async fn insert_node(&self, node: &Node) -> DatabaseResult<()> {
        let np = node.to_persistent();
-        self.with_measured_conn(
-            DatabaseOperation::InsertNode,
-            move |conn| -> DatabaseResult<()> {
-                diesel::insert_into(crate::schema::nodes::table)
-                    .values(&np)
-                    .execute(conn)?;
-                Ok(())
-            },
-        )
+        self.with_conn(move |conn| -> DatabaseResult<()> {
+            diesel::insert_into(crate::schema::nodes::table)
+                .values(&np)
+                .execute(conn)?;
+            Ok(())
+        })
        .await
    }

    /// At startup, populate the list of nodes which our shards may be placed on
    pub(crate) async fn list_nodes(&self) -> DatabaseResult<Vec<NodePersistence>> {
        let nodes: Vec<NodePersistence> = self
-            .with_measured_conn(
-                DatabaseOperation::ListNodes,
-                move |conn| -> DatabaseResult<_> {
-                    Ok(crate::schema::nodes::table.load::<NodePersistence>(conn)?)
-                },
-            )
+            .with_conn(move |conn| -> DatabaseResult<_> {
+                Ok(crate::schema::nodes::table.load::<NodePersistence>(conn)?)
+            })
            .await?;

        tracing::info!("list_nodes: loaded {} nodes", nodes.len());
@@ -222,7 +148,7 @@ impl Persistence {
    ) -> DatabaseResult<()> {
        use crate::schema::nodes::dsl::*;
        let updated = self
-            .with_measured_conn(DatabaseOperation::UpdateNode, move |conn| {
+            .with_conn(move |conn| {
                let updated = diesel::update(nodes)
                    .filter(node_id.eq(input_node_id.0 as i64))
                    .set((scheduling_policy.eq(String::from(input_scheduling)),))
@@ -244,12 +170,9 @@ impl Persistence {
    /// be enriched at runtime with state discovered on pageservers.
    pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult<Vec<TenantShardPersistence>> {
        let loaded = self
-            .with_measured_conn(
-                DatabaseOperation::ListTenantShards,
-                move |conn| -> DatabaseResult<_> {
-                    Ok(crate::schema::tenant_shards::table.load::<TenantShardPersistence>(conn)?)
-                },
-            )
+            .with_conn(move |conn| -> DatabaseResult<_> {
+                Ok(crate::schema::tenant_shards::table.load::<TenantShardPersistence>(conn)?)
+            })
            .await?;

        if loaded.is_empty() {
@@ -277,15 +200,15 @@ impl Persistence {

        let mut decoded = serde_json::from_slice::<JsonPersistence>(&bytes)
            .map_err(|e| DatabaseError::Logical(format!("Deserialization error: {e}")))?;
-        for shard in decoded.tenants.values_mut() {
-            if shard.placement_policy == "\"Single\"" {
-                // Backward compat for test data after PR https://github.com/neondatabase/neon/pull/7165
-                shard.placement_policy = "{\"Attached\":0}".to_string();
-            }
-
-            if shard.scheduling_policy.is_empty() {
-                shard.scheduling_policy =
-                    serde_json::to_string(&ShardSchedulingPolicy::default()).unwrap();
+        for (tenant_id, tenant) in &mut decoded.tenants {
+            // Backward compat: an old attachments.json from before PR #6251, replace
+            // empty strings with proper defaults.
+            if tenant.tenant_id.is_empty() {
+                tenant.tenant_id = tenant_id.to_string();
+                tenant.config = serde_json::to_string(&TenantConfig::default())
+                    .map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
+                tenant.placement_policy = serde_json::to_string(&PlacementPolicy::Single)
+                    .map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
            }
        }

@@ -331,20 +254,17 @@ impl Persistence {
        shards: Vec<TenantShardPersistence>,
    ) -> DatabaseResult<()> {
        use crate::schema::tenant_shards::dsl::*;
-        self.with_measured_conn(
-            DatabaseOperation::InsertTenantShards,
-            move |conn| -> DatabaseResult<()> {
-                conn.transaction(|conn| -> QueryResult<()> {
-                    for tenant in &shards {
-                        diesel::insert_into(tenant_shards)
-                            .values(tenant)
-                            .execute(conn)?;
-                    }
-                    Ok(())
-                })?;
+        self.with_conn(move |conn| -> DatabaseResult<()> {
+            conn.transaction(|conn| -> QueryResult<()> {
+                for tenant in &shards {
+                    diesel::insert_into(tenant_shards)
+                        .values(tenant)
+                        .execute(conn)?;
+                }
                Ok(())
-            },
-        )
+            })?;
+            Ok(())
+        })
        .await
    }

@@ -352,31 +272,25 @@ impl Persistence {
    /// the tenant from memory on this server.
    pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> {
        use crate::schema::tenant_shards::dsl::*;
-        self.with_measured_conn(
-            DatabaseOperation::DeleteTenant,
-            move |conn| -> DatabaseResult<()> {
-                diesel::delete(tenant_shards)
-                    .filter(tenant_id.eq(del_tenant_id.to_string()))
-                    .execute(conn)?;
+        self.with_conn(move |conn| -> DatabaseResult<()> {
+            diesel::delete(tenant_shards)
+                .filter(tenant_id.eq(del_tenant_id.to_string()))
+                .execute(conn)?;

-                Ok(())
-            },
-        )
+            Ok(())
+        })
        .await
    }

    pub(crate) async fn delete_node(&self, del_node_id: NodeId) -> DatabaseResult<()> {
        use crate::schema::nodes::dsl::*;
-        self.with_measured_conn(
-            DatabaseOperation::DeleteNode,
-            move |conn| -> DatabaseResult<()> {
-                diesel::delete(nodes)
-                    .filter(node_id.eq(del_node_id.0 as i64))
-                    .execute(conn)?;
+        self.with_conn(move |conn| -> DatabaseResult<()> {
+            diesel::delete(nodes)
+                .filter(node_id.eq(del_node_id.0 as i64))
+                .execute(conn)?;

-                Ok(())
-            },
-        )
+            Ok(())
+        })
        .await
    }

@@ -390,7 +304,7 @@ impl Persistence {
    ) -> DatabaseResult<HashMap<TenantShardId, Generation>> {
        use crate::schema::tenant_shards::dsl::*;
        let updated = self
-            .with_measured_conn(DatabaseOperation::ReAttach, move |conn| {
+            .with_conn(move |conn| {
                let rows_updated = diesel::update(tenant_shards)
                    .filter(generation_pageserver.eq(node_id.0 as i64))
                    .set(generation.eq(generation + 1))
@@ -440,7 +354,7 @@ impl Persistence {
    ) -> anyhow::Result<Generation> {
        use crate::schema::tenant_shards::dsl::*;
        let updated = self
-            .with_measured_conn(DatabaseOperation::IncrementGeneration, move |conn| {
+            .with_conn(move |conn| {
                let updated = diesel::update(tenant_shards)
                    .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
                    .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
@@ -477,45 +391,59 @@ impl Persistence {
    /// that we only do the first time a tenant is set to an attached policy via /location_config.
    pub(crate) async fn update_tenant_shard(
        &self,
-        tenant: TenantFilter,
-        input_placement_policy: Option<PlacementPolicy>,
-        input_config: Option<TenantConfig>,
+        tenant_shard_id: TenantShardId,
+        input_placement_policy: PlacementPolicy,
+        input_config: TenantConfig,
        input_generation: Option<Generation>,
-        input_scheduling_policy: Option<ShardSchedulingPolicy>,
    ) -> DatabaseResult<()> {
        use crate::schema::tenant_shards::dsl::*;

-        self.with_measured_conn(DatabaseOperation::UpdateTenantShard, move |conn| {
-            let query = match tenant {
-                TenantFilter::Shard(tenant_shard_id) => diesel::update(tenant_shards)
-                    .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
-                    .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
-                    .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
-                    .into_boxed(),
-                TenantFilter::Tenant(input_tenant_id) => diesel::update(tenant_shards)
-                    .filter(tenant_id.eq(input_tenant_id.to_string()))
-                    .into_boxed(),
-            };
+        self.with_conn(move |conn| {
+            let query = diesel::update(tenant_shards)
+                .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
+                .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
+                .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32));

-            #[derive(AsChangeset)]
-            #[diesel(table_name = crate::schema::tenant_shards)]
-            struct ShardUpdate {
-                generation: Option<i32>,
-                placement_policy: Option<String>,
-                config: Option<String>,
-                scheduling_policy: Option<String>,
+            if let Some(input_generation) = input_generation {
+                // Update includes generation column
+                query
+                    .set((
+                        generation.eq(Some(input_generation.into().unwrap() as i32)),
+                        placement_policy
+                            .eq(serde_json::to_string(&input_placement_policy).unwrap()),
+                        config.eq(serde_json::to_string(&input_config).unwrap()),
+                    ))
+                    .execute(conn)?;
+            } else {
+                // Update does not include generation column
+                query
+                    .set((
+                        placement_policy
+                            .eq(serde_json::to_string(&input_placement_policy).unwrap()),
+                        config.eq(serde_json::to_string(&input_config).unwrap()),
+                    ))
+                    .execute(conn)?;
            }

-            let update = ShardUpdate {
-                generation: input_generation.map(|g| g.into().unwrap() as i32),
-                placement_policy: input_placement_policy
-                    .map(|p| serde_json::to_string(&p).unwrap()),
-                config: input_config.map(|c| serde_json::to_string(&c).unwrap()),
-                scheduling_policy: input_scheduling_policy
-                    .map(|p| serde_json::to_string(&p).unwrap()),
-            };
+            Ok(())
+        })
+        .await?;

-            query.set(update).execute(conn)?;
+        Ok(())
+    }
+
+    pub(crate) async fn update_tenant_config(
+        &self,
+        input_tenant_id: TenantId,
+        input_config: TenantConfig,
+    ) -> DatabaseResult<()> {
+        use crate::schema::tenant_shards::dsl::*;
+
+        self.with_conn(move |conn| {
+            diesel::update(tenant_shards)
+                .filter(tenant_id.eq(input_tenant_id.to_string()))
+                .set((config.eq(serde_json::to_string(&input_config).unwrap()),))
+                .execute(conn)?;

            Ok(())
        })
@@ -526,7 +454,7 @@ impl Persistence {

    pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
        use crate::schema::tenant_shards::dsl::*;
-        self.with_measured_conn(DatabaseOperation::Detach, move |conn| {
+        self.with_conn(move |conn| {
            let updated = diesel::update(tenant_shards)
                .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
                .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
@@ -556,7 +484,7 @@ impl Persistence {
        parent_to_children: Vec<(TenantShardId, Vec<TenantShardPersistence>)>,
    ) -> DatabaseResult<()> {
        use crate::schema::tenant_shards::dsl::*;
-        self.with_measured_conn(DatabaseOperation::BeginShardSplit, move |conn| -> DatabaseResult<()> {
+        self.with_conn(move |conn| -> DatabaseResult<()> {
            conn.transaction(|conn| -> DatabaseResult<()> {
                // Mark parent shards as splitting

@@ -620,83 +548,31 @@ impl Persistence {
        old_shard_count: ShardCount,
    ) -> DatabaseResult<()> {
        use crate::schema::tenant_shards::dsl::*;
-        self.with_measured_conn(
-            DatabaseOperation::CompleteShardSplit,
-            move |conn| -> DatabaseResult<()> {
-                conn.transaction(|conn| -> QueryResult<()> {
-                    // Drop parent shards
-                    diesel::delete(tenant_shards)
-                        .filter(tenant_id.eq(split_tenant_id.to_string()))
-                        .filter(shard_count.eq(old_shard_count.literal() as i32))
-                        .execute(conn)?;
+        self.with_conn(move |conn| -> DatabaseResult<()> {
+            conn.transaction(|conn| -> QueryResult<()> {
+                // Drop parent shards
+                diesel::delete(tenant_shards)
+                    .filter(tenant_id.eq(split_tenant_id.to_string()))
+                    .filter(shard_count.eq(old_shard_count.literal() as i32))
+                    .execute(conn)?;

-                    // Clear sharding flag
-                    let updated = diesel::update(tenant_shards)
-                        .filter(tenant_id.eq(split_tenant_id.to_string()))
-                        .set((splitting.eq(0),))
-                        .execute(conn)?;
-                    debug_assert!(updated > 0);
-
-                    Ok(())
-                })?;
+                // Clear sharding flag
+                let updated = diesel::update(tenant_shards)
+                    .filter(tenant_id.eq(split_tenant_id.to_string()))
+                    .set((splitting.eq(0),))
+                    .execute(conn)?;
+                debug_assert!(updated > 0);

                Ok(())
-            },
-        )
-        .await
-    }
+            })?;

-    /// Used when the remote part of a shard split failed: we will revert the database state to have only
-    /// the parent shards, with SplitState::Idle.
-    pub(crate) async fn abort_shard_split(
-        &self,
-        split_tenant_id: TenantId,
-        new_shard_count: ShardCount,
-    ) -> DatabaseResult<AbortShardSplitStatus> {
-        use crate::schema::tenant_shards::dsl::*;
-        self.with_measured_conn(
-            DatabaseOperation::AbortShardSplit,
-            move |conn| -> DatabaseResult<AbortShardSplitStatus> {
-                let aborted =
-                    conn.transaction(|conn| -> DatabaseResult<AbortShardSplitStatus> {
-                        // Clear the splitting state on parent shards
-                        let updated = diesel::update(tenant_shards)
-                            .filter(tenant_id.eq(split_tenant_id.to_string()))
-                            .filter(shard_count.ne(new_shard_count.literal() as i32))
-                            .set((splitting.eq(0),))
-                            .execute(conn)?;
-
-                        // Parent shards are already gone: we cannot abort.
-                        if updated == 0 {
-                            return Ok(AbortShardSplitStatus::Complete);
-                        }
-
-                        // Sanity check: if parent shards were present, their cardinality should
-                        // be less than the number of child shards.
-                        if updated >= new_shard_count.count() as usize {
-                            return Err(DatabaseError::Logical(format!(
-                                "Unexpected parent shard count {updated} while aborting split to \
-                            count {new_shard_count:?} on tenant {split_tenant_id}"
-                            )));
-                        }
-
-                        // Erase child shards
-                        diesel::delete(tenant_shards)
-                            .filter(tenant_id.eq(split_tenant_id.to_string()))
-                            .filter(shard_count.eq(new_shard_count.literal() as i32))
-                            .execute(conn)?;
-
-                        Ok(AbortShardSplitStatus::Aborted)
-                    })?;
-
-                Ok(aborted)
-            },
-        )
+            Ok(())
+        })
        .await
    }
 }

-/// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably
+/// Parts of [`crate::tenant_state::TenantState`] that are stored durably
 #[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)]
 #[diesel(table_name = crate::schema::tenant_shards)]
 pub(crate) struct TenantShardPersistence {
@@ -726,30 +602,6 @@ pub(crate) struct TenantShardPersistence {
    pub(crate) splitting: SplitState,
    #[serde(default)]
    pub(crate) config: String,
-    #[serde(default)]
-    pub(crate) scheduling_policy: String,
-}
-
-impl TenantShardPersistence {
-    pub(crate) fn get_shard_identity(&self) -> Result<ShardIdentity, ShardConfigError> {
-        if self.shard_count == 0 {
-            Ok(ShardIdentity::unsharded())
-        } else {
-            Ok(ShardIdentity::new(
-                ShardNumber(self.shard_number as u8),
-                ShardCount::new(self.shard_count as u8),
-                ShardStripeSize(self.shard_stripe_size as u32),
-            )?)
-        }
-    }
-
-    pub(crate) fn get_tenant_shard_id(&self) -> Result<TenantShardId, hex::FromHexError> {
-        Ok(TenantShardId {
-            tenant_id: TenantId::from_str(self.tenant_id.as_str())?,
-            shard_number: ShardNumber(self.shard_number as u8),
-            shard_count: ShardCount::new(self.shard_count as u8),
-        })
-    }
 }

 /// Parts of [`crate::node::Node`] that are stored durably
--- a/control_plane/attachment_service/src/persistence/split_state.rs
+++ b/control_plane/attachment_service/src/persistence/split_state.rs
--- a/control_plane/attachment_service/src/reconciler.rs
+++ b/control_plane/attachment_service/src/reconciler.rs
@@ -1,7 +1,5 @@
-use crate::pageserver_client::PageserverClient;
 use crate::persistence::Persistence;
 use crate::service;
-use hyper::StatusCode;
 use pageserver_api::models::{
    LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
 };
@@ -9,7 +7,7 @@ use pageserver_api::shard::{ShardIdentity, TenantShardId};
 use pageserver_client::mgmt_api;
 use std::collections::HashMap;
 use std::sync::Arc;
-use std::time::{Duration, Instant};
+use std::time::Duration;
 use tokio_util::sync::CancellationToken;
 use utils::generation::Generation;
 use utils::id::{NodeId, TimelineId};
@@ -18,14 +16,12 @@ use utils::sync::gate::GateGuard;

 use crate::compute_hook::{ComputeHook, NotifyError};
 use crate::node::Node;
-use crate::tenant_shard::{IntentState, ObservedState, ObservedStateLocation};
-
-const DEFAULT_HEATMAP_PERIOD: &str = "60s";
+use crate::tenant_state::{IntentState, ObservedState, ObservedStateLocation};

 /// Object with the lifetime of the background reconcile task that is created
 /// for tenants which have a difference between their intent and observed states.
 pub(super) struct Reconciler {
-    /// See [`crate::tenant_shard::TenantShard`] for the meanings of these fields: they are a snapshot
+    /// See [`crate::tenant_state::TenantState`] for the meanings of these fields: they are a snapshot
    /// of a tenant's state from when we spawned a reconcile task.
    pub(super) tenant_shard_id: TenantShardId,
    pub(crate) shard: ShardIdentity,
@@ -48,11 +44,11 @@ pub(super) struct Reconciler {

    /// To avoid stalling if the cloud control plane is unavailable, we may proceed
    /// past failures in [`ComputeHook::notify`], but we _must_ remember that we failed
-    /// so that we can set [`crate::tenant_shard::TenantShard::pending_compute_notification`] to ensure a later retry.
+    /// so that we can set [`crate::tenant_state::TenantState::pending_compute_notification`] to ensure a later retry.
    pub(crate) compute_notify_failure: bool,

    /// A means to abort background reconciliation: it is essential to
-    /// call this when something changes in the original TenantShard that
+    /// call this when something changes in the original TenantState that
    /// will make this reconciliation impossible or unnecessary, for
    /// example when a pageserver node goes offline, or the PlacementPolicy for
    /// the tenant is changed.
@@ -66,7 +62,7 @@ pub(super) struct Reconciler {
    pub(crate) persistence: Arc<Persistence>,
 }

-/// This is a snapshot of [`crate::tenant_shard::IntentState`], but it does not do any
+/// This is a snapshot of [`crate::tenant_state::IntentState`], but it does not do any
 /// reference counting for Scheduler.  The IntentState is what the scheduler works with,
 /// and the TargetState is just the instruction for a particular Reconciler run.
 #[derive(Debug)]
@@ -118,15 +114,6 @@ impl Reconciler {
        flush_ms: Option<Duration>,
        lazy: bool,
    ) -> Result<(), ReconcileError> {
-        if !node.is_available() && config.mode == LocationConfigMode::Detached {
-            // Attempts to detach from offline nodes may be imitated without doing I/O: a node which is offline
-            // will get fully reconciled wrt the shard's intent state when it is reactivated, irrespective of
-            // what we put into `observed`, in [`crate::service::Service::node_activate_reconcile`]
-            tracing::info!("Node {node} is unavailable during detach: proceeding anyway, it will be detached on next activation");
-            self.observed.locations.remove(&node.get_id());
-            return Ok(());
-        }
-
        self.observed
            .locations
            .insert(node.get_id(), ObservedStateLocation { conf: None });
@@ -159,16 +146,9 @@ impl Reconciler {
        };
        tracing::info!("location_config({node}) complete: {:?}", config);

-        match config.mode {
-            LocationConfigMode::Detached => {
-                self.observed.locations.remove(&node.get_id());
-            }
-            _ => {
-                self.observed
-                    .locations
-                    .insert(node.get_id(), ObservedStateLocation { conf: Some(config) });
-            }
-        }
+        self.observed
+            .locations
+            .insert(node.get_id(), ObservedStateLocation { conf: Some(config) });

        Ok(())
    }
@@ -260,11 +240,8 @@ impl Reconciler {
        tenant_shard_id: TenantShardId,
        node: &Node,
    ) -> anyhow::Result<HashMap<TimelineId, Lsn>> {
-        let client = PageserverClient::new(
-            node.get_id(),
-            node.base_url(),
-            self.service_config.jwt_token.as_deref(),
-        );
+        let client =
+            mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref());

        let timelines = client.timeline_list(&tenant_shard_id).await?;
        Ok(timelines
@@ -278,81 +255,22 @@ impl Reconciler {
        tenant_shard_id: TenantShardId,
        node: &Node,
    ) -> Result<(), ReconcileError> {
-        // This is not the timeout for a request, but the total amount of time we're willing to wait
-        // for a secondary location to get up to date before
-        const TOTAL_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(300);
-
-        // This the long-polling interval for the secondary download requests we send to destination pageserver
-        // during a migration.
-        const REQUEST_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(20);
-
-        let started_at = Instant::now();
-
-        loop {
-            let (status, progress) = match node
-                .with_client_retries(
-                    |client| async move {
-                        client
-                            .tenant_secondary_download(
-                                tenant_shard_id,
-                                Some(REQUEST_DOWNLOAD_TIMEOUT),
-                            )
-                            .await
-                    },
-                    &self.service_config.jwt_token,
-                    1,
-                    3,
-                    REQUEST_DOWNLOAD_TIMEOUT * 2,
-                    &self.cancel,
-                )
-                .await
-            {
-                None => Err(ReconcileError::Cancel),
-                Some(Ok(v)) => Ok(v),
-                Some(Err(e)) => {
-                    // Give up, but proceed: it's unfortunate if we couldn't freshen the destination before
-                    // attaching, but we should not let an issue with a secondary location stop us proceeding
-                    // with a live migration.
-                    tracing::warn!("Failed to prepare by downloading layers on node {node}: {e})");
-                    return Ok(());
-                }
-            }?;
-
-            if status == StatusCode::OK {
-                tracing::info!(
-                    "Downloads to {} complete: {}/{} layers, {}/{} bytes",
-                    node,
-                    progress.layers_downloaded,
-                    progress.layers_total,
-                    progress.bytes_downloaded,
-                    progress.bytes_total
-                );
-                return Ok(());
-            } else if status == StatusCode::ACCEPTED {
-                let total_runtime = started_at.elapsed();
-                if total_runtime > TOTAL_DOWNLOAD_TIMEOUT {
-                    tracing::warn!("Timed out after {}ms downloading layers to {node}.  Progress so far: {}/{} layers, {}/{} bytes",
-                        total_runtime.as_millis(),
-                        progress.layers_downloaded,
-                        progress.layers_total,
-                        progress.bytes_downloaded,
-                        progress.bytes_total
-                    );
-                    // Give up, but proceed: an incompletely warmed destination doesn't prevent migration working,
-                    // it just makes the I/O performance for users less good.
-                    return Ok(());
-                }
-
-                // Log and proceed around the loop to retry.  We don't sleep between requests, because our HTTP call
-                // to the pageserver is a long-poll.
-                tracing::info!(
-                    "Downloads to {} not yet complete: {}/{} layers, {}/{} bytes",
-                    node,
-                    progress.layers_downloaded,
-                    progress.layers_total,
-                    progress.bytes_downloaded,
-                    progress.bytes_total
-                );
+        match node
+            .with_client_retries(
+                |client| async move { client.tenant_secondary_download(tenant_shard_id).await },
+                &self.service_config.jwt_token,
+                1,
+                1,
+                Duration::from_secs(60),
+                &self.cancel,
+            )
+            .await
+        {
+            None => Err(ReconcileError::Cancel),
+            Some(Ok(_)) => Ok(()),
+            Some(Err(e)) => {
+                tracing::info!("  (skipping destination download: {})", e);
+                Ok(())
            }
        }
    }
@@ -487,7 +405,6 @@ impl Reconciler {
        while let Err(e) = self.compute_notify().await {
            match e {
                NotifyError::Fatal(_) => return Err(ReconcileError::Notify(e)),
-                NotifyError::ShuttingDown => return Err(ReconcileError::Cancel),
                _ => {
                    tracing::warn!(
                        "Live migration blocked by compute notification error, retrying: {e}"
@@ -496,7 +413,7 @@ impl Reconciler {
            }
        }

-        // Downgrade the origin to secondary.  If the tenant's policy is PlacementPolicy::Attached(0), then
+        // Downgrade the origin to secondary.  If the tenant's policy is PlacementPolicy::Single, then
        // this location will be deleted in the general case reconciliation that runs after this.
        let origin_secondary_conf = build_location_config(
            &self.shard,
@@ -568,29 +485,17 @@ impl Reconciler {
                )
                .await
            {
-                Some(Ok(observed)) => Some(observed),
-                Some(Err(mgmt_api::Error::ApiError(status, _msg)))
-                    if status == StatusCode::NOT_FOUND =>
-                {
-                    None
-                }
+                Some(Ok(observed)) => observed,
                Some(Err(e)) => return Err(e.into()),
                None => return Err(ReconcileError::Cancel),
            };
            tracing::info!("Scanned location configuration on {attached_node}: {observed_conf:?}");
-            match observed_conf {
-                Some(conf) => {
-                    // Pageserver returned a state: update it in observed.  This may still be an indeterminate (None) state,
-                    // if internally the pageserver's TenantSlot was being mutated (e.g. some long running API call is still running)
-                    self.observed
-                        .locations
-                        .insert(attached_node.get_id(), ObservedStateLocation { conf });
-                }
-                None => {
-                    // Pageserver returned 404: we have confirmation that there is no state for this shard on that pageserver.
-                    self.observed.locations.remove(&attached_node.get_id());
-                }
-            }
+            self.observed.locations.insert(
+                attached_node.get_id(),
+                ObservedStateLocation {
+                    conf: observed_conf,
+                },
+            );
        }

        Ok(())
@@ -620,12 +525,7 @@ impl Reconciler {
                )));
            };

-            let mut wanted_conf = attached_location_conf(
-                generation,
-                &self.shard,
-                &self.config,
-                !self.intent.secondary.is_empty(),
-            );
+            let mut wanted_conf = attached_location_conf(generation, &self.shard, &self.config);
            match self.observed.locations.get(&node.get_id()) {
                Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
                    // Nothing to do
@@ -762,26 +662,10 @@ impl Reconciler {
    }
 }

-/// We tweak the externally-set TenantConfig while configuring
-/// locations, using our awareness of whether secondary locations
-/// are in use to automatically enable/disable heatmap uploads.
-fn ha_aware_config(config: &TenantConfig, has_secondaries: bool) -> TenantConfig {
-    let mut config = config.clone();
-    if has_secondaries {
-        if config.heatmap_period.is_none() {
-            config.heatmap_period = Some(DEFAULT_HEATMAP_PERIOD.to_string());
-        }
-    } else {
-        config.heatmap_period = None;
-    }
-    config
-}
-
 pub(crate) fn attached_location_conf(
    generation: Generation,
    shard: &ShardIdentity,
    config: &TenantConfig,
-    has_secondaries: bool,
 ) -> LocationConfig {
    LocationConfig {
        mode: LocationConfigMode::AttachedSingle,
@@ -790,7 +674,7 @@ pub(crate) fn attached_location_conf(
        shard_number: shard.number.0,
        shard_count: shard.count.literal(),
        shard_stripe_size: shard.stripe_size.0,
-        tenant_conf: ha_aware_config(config, has_secondaries),
+        tenant_conf: config.clone(),
    }
 }

@@ -805,6 +689,6 @@ pub(crate) fn secondary_location_conf(
        shard_number: shard.number.0,
        shard_count: shard.count.literal(),
        shard_stripe_size: shard.stripe_size.0,
-        tenant_conf: ha_aware_config(config, true),
+        tenant_conf: config.clone(),
    }
 }
--- a/control_plane/attachment_service/src/scheduler.rs
+++ b/control_plane/attachment_service/src/scheduler.rs
@@ -1,5 +1,4 @@
-use crate::{node::Node, tenant_shard::TenantShard};
-use pageserver_api::controller_api::UtilizationScore;
+use crate::{node::Node, tenant_state::TenantState};
 use serde::Serialize;
 use std::collections::HashMap;
 use utils::{http::error::ApiError, id::NodeId};
@@ -20,34 +19,15 @@ impl From<ScheduleError> for ApiError {
 }

 #[derive(Serialize, Eq, PartialEq)]
-pub enum MaySchedule {
-    Yes(UtilizationScore),
-    No,
-}
-
-#[derive(Serialize)]
 struct SchedulerNode {
-    /// How many shards are currently scheduled on this node, via their [`crate::tenant_shard::IntentState`].
+    /// How many shards are currently scheduled on this node, via their [`crate::tenant_state::IntentState`].
    shard_count: usize,

    /// Whether this node is currently elegible to have new shards scheduled (this is derived
    /// from a node's availability state and scheduling policy).
-    may_schedule: MaySchedule,
+    may_schedule: bool,
 }

-impl PartialEq for SchedulerNode {
-    fn eq(&self, other: &Self) -> bool {
-        let may_schedule_matches = matches!(
-            (&self.may_schedule, &other.may_schedule),
-            (MaySchedule::Yes(_), MaySchedule::Yes(_)) | (MaySchedule::No, MaySchedule::No)
-        );
-
-        may_schedule_matches && self.shard_count == other.shard_count
-    }
-}
-
-impl Eq for SchedulerNode {}
-
 /// This type is responsible for selecting which node is used when a tenant shard needs to choose a pageserver
 /// on which to run.
 ///
@@ -58,70 +38,6 @@ pub(crate) struct Scheduler {
    nodes: HashMap<NodeId, SchedulerNode>,
 }

-/// Score for soft constraint scheduling: lower scores are preferred to higher scores.
-///
-/// For example, we may set an affinity score based on the number of shards from the same
-/// tenant already on a node, to implicitly prefer to balance out shards.
-#[derive(Copy, Clone, Debug, Eq, PartialEq, PartialOrd, Ord)]
-pub(crate) struct AffinityScore(pub(crate) usize);
-
-impl AffinityScore {
-    /// If we have no anti-affinity at all toward a node, this is its score.  It means
-    /// the scheduler has a free choice amongst nodes with this score, and may pick a node
-    /// based on other information such as total utilization.
-    pub(crate) const FREE: Self = Self(0);
-
-    pub(crate) fn inc(&mut self) {
-        self.0 += 1;
-    }
-}
-
-impl std::ops::Add for AffinityScore {
-    type Output = Self;
-
-    fn add(self, rhs: Self) -> Self::Output {
-        Self(self.0 + rhs.0)
-    }
-}
-
-// For carrying state between multiple calls to [`TenantShard::schedule`], e.g. when calling
-// it for many shards in the same tenant.
-#[derive(Debug, Default)]
-pub(crate) struct ScheduleContext {
-    /// Sparse map of nodes: omitting a node implicitly makes its affinity [`AffinityScore::FREE`]
-    pub(crate) nodes: HashMap<NodeId, AffinityScore>,
-
-    /// Specifically how many _attached_ locations are on each node
-    pub(crate) attached_nodes: HashMap<NodeId, usize>,
-}
-
-impl ScheduleContext {
-    /// Input is a list of nodes we would like to avoid using again within this context.  The more
-    /// times a node is passed into this call, the less inclined we are to use it.
-    pub(crate) fn avoid(&mut self, nodes: &[NodeId]) {
-        for node_id in nodes {
-            let entry = self.nodes.entry(*node_id).or_insert(AffinityScore::FREE);
-            entry.inc()
-        }
-    }
-
-    pub(crate) fn push_attached(&mut self, node_id: NodeId) {
-        let entry = self.attached_nodes.entry(node_id).or_default();
-        *entry += 1;
-    }
-
-    pub(crate) fn get_node_affinity(&self, node_id: NodeId) -> AffinityScore {
-        self.nodes
-            .get(&node_id)
-            .copied()
-            .unwrap_or(AffinityScore::FREE)
-    }
-
-    pub(crate) fn get_node_attachments(&self, node_id: NodeId) -> usize {
-        self.attached_nodes.get(&node_id).copied().unwrap_or(0)
-    }
-}
-
 impl Scheduler {
    pub(crate) fn new<'a>(nodes: impl Iterator<Item = &'a Node>) -> Self {
        let mut scheduler_nodes = HashMap::new();
@@ -147,7 +63,7 @@ impl Scheduler {
    pub(crate) fn consistency_check<'a>(
        &self,
        nodes: impl Iterator<Item = &'a Node>,
-        shards: impl Iterator<Item = &'a TenantShard>,
+        shards: impl Iterator<Item = &'a TenantState>,
    ) -> anyhow::Result<()> {
        let mut expect_nodes: HashMap<NodeId, SchedulerNode> = HashMap::new();
        for node in nodes {
@@ -270,15 +186,13 @@ impl Scheduler {
            return None;
        }

-        // TODO: When the utilization score returned by the pageserver becomes meaningful,
-        // schedule based on that instead of the shard count.
        let node = nodes
            .iter()
            .map(|node_id| {
                let may_schedule = self
                    .nodes
                    .get(node_id)
-                    .map(|n| n.may_schedule != MaySchedule::No)
+                    .map(|n| n.may_schedule)
                    .unwrap_or(false);
                (*node_id, may_schedule)
            })
@@ -288,47 +202,27 @@ impl Scheduler {
        node.and_then(|(node_id, may_schedule)| if may_schedule { Some(node_id) } else { None })
    }

-    /// hard_exclude: it is forbidden to use nodes in this list, typically becacuse they
-    /// are already in use by this shard -- we use this to avoid picking the same node
-    /// as both attached and secondary location.  This is a hard constraint: if we cannot
-    /// find any nodes that aren't in this list, then we will return a [`ScheduleError::ImpossibleConstraint`].
-    ///
-    /// context: we prefer to avoid using nodes identified in the context, according
-    /// to their anti-affinity score.  We use this to prefeer to avoid placing shards in
-    /// the same tenant on the same node.  This is a soft constraint: the context will never
-    /// cause us to fail to schedule a shard.
-    pub(crate) fn schedule_shard(
-        &self,
-        hard_exclude: &[NodeId],
-        context: &ScheduleContext,
-    ) -> Result<NodeId, ScheduleError> {
+    pub(crate) fn schedule_shard(&self, hard_exclude: &[NodeId]) -> Result<NodeId, ScheduleError> {
        if self.nodes.is_empty() {
            return Err(ScheduleError::NoPageservers);
        }

-        let mut scores: Vec<(NodeId, AffinityScore, usize)> = self
+        let mut tenant_counts: Vec<(NodeId, usize)> = self
            .nodes
            .iter()
            .filter_map(|(k, v)| {
-                if hard_exclude.contains(k) || v.may_schedule == MaySchedule::No {
+                if hard_exclude.contains(k) || !v.may_schedule {
                    None
                } else {
-                    Some((
-                        *k,
-                        context.nodes.get(k).copied().unwrap_or(AffinityScore::FREE),
-                        v.shard_count,
-                    ))
+                    Some((*k, v.shard_count))
                }
            })
            .collect();

-        // Sort by, in order of precedence:
-        //  1st: Affinity score.  We should never pick a higher-score node if a lower-score node is available
-        //  2nd: Utilization.  Within nodes with the same affinity, use the least loaded nodes.
-        //  3rd: Node ID.  This is a convenience to make selection deterministic in tests and empty systems.
-        scores.sort_by_key(|i| (i.1, i.2, i.0));
+        // Sort by tenant count.  Nodes with the same tenant count are sorted by ID.
+        tenant_counts.sort_by_key(|i| (i.1, i.0));

-        if scores.is_empty() {
+        if tenant_counts.is_empty() {
            // After applying constraints, no pageservers were left.  We log some detail about
            // the state of nodes to help understand why this happened.  This is not logged as an error because
            // it is legitimately possible for enough nodes to be Offline to prevent scheduling a shard.
@@ -336,7 +230,7 @@ impl Scheduler {
            for (node_id, node) in &self.nodes {
                tracing::info!(
                    "Node {node_id}: may_schedule={} shards={}",
-                    node.may_schedule != MaySchedule::No,
+                    node.may_schedule,
                    node.shard_count
                );
            }
@@ -344,11 +238,10 @@ impl Scheduler {
            return Err(ScheduleError::ImpossibleConstraint);
        }

-        // Lowest score wins
-        let node_id = scores.first().unwrap().0;
+        let node_id = tenant_counts.first().unwrap().0;
        tracing::info!(
-            "scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?})",
-            scores.iter().map(|i| i.0 .0).collect::<Vec<_>>()
+            "scheduler selected node {node_id} (elegible nodes {:?}, exclude: {hard_exclude:?})",
+            tenant_counts.iter().map(|i| i.0 .0).collect::<Vec<_>>()
        );

        // Note that we do not update shard count here to reflect the scheduling: that
@@ -356,19 +249,12 @@ impl Scheduler {

        Ok(node_id)
    }
-
-    /// Unit test access to internal state
-    #[cfg(test)]
-    pub(crate) fn get_node_shard_count(&self, node_id: NodeId) -> usize {
-        self.nodes.get(&node_id).unwrap().shard_count
-    }
 }

 #[cfg(test)]
 pub(crate) mod test_utils {

    use crate::node::Node;
-    use pageserver_api::controller_api::{NodeAvailability, UtilizationScore};
    use std::collections::HashMap;
    use utils::id::NodeId;
    /// Test helper: synthesize the requested number of nodes, all in active state.
@@ -378,14 +264,13 @@ pub(crate) mod test_utils {
        (1..n + 1)
            .map(|i| {
                (NodeId(i), {
-                    let mut node = Node::new(
+                    let node = Node::new(
                        NodeId(i),
                        format!("httphost-{i}"),
                        80 + i as u16,
                        format!("pghost-{i}"),
                        5432 + i as u16,
                    );
-                    node.set_availability(NodeAvailability::Active(UtilizationScore::worst()));
                    assert!(node.is_available());
                    node
                })
@@ -398,7 +283,7 @@ pub(crate) mod test_utils {
 mod tests {
    use super::*;

-    use crate::tenant_shard::IntentState;
+    use crate::tenant_state::IntentState;
    #[test]
    fn scheduler_basic() -> anyhow::Result<()> {
        let nodes = test_utils::make_test_nodes(2);
@@ -407,17 +292,15 @@ mod tests {
        let mut t1_intent = IntentState::new();
        let mut t2_intent = IntentState::new();

-        let context = ScheduleContext::default();
-
-        let scheduled = scheduler.schedule_shard(&[], &context)?;
+        let scheduled = scheduler.schedule_shard(&[])?;
        t1_intent.set_attached(&mut scheduler, Some(scheduled));
-        let scheduled = scheduler.schedule_shard(&[], &context)?;
+        let scheduled = scheduler.schedule_shard(&[])?;
        t2_intent.set_attached(&mut scheduler, Some(scheduled));

        assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);
        assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1);

-        let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers(), &context)?;
+        let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers())?;
        t1_intent.push_secondary(&mut scheduler, scheduled);

        assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);
--- a/control_plane/attachment_service/src/schema.rs
+++ b/control_plane/attachment_service/src/schema.rs
@@ -22,7 +22,6 @@ diesel::table! {
        placement_policy -> Varchar,
        splitting -> Int2,
        config -> Text,
-        scheduling_policy -> Varchar,
    }
 }

--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
--- a/control_plane/attachment_service/src/tenant_state.rs
+++ b/control_plane/attachment_service/src/tenant_state.rs
@@ -4,12 +4,8 @@ use std::{
    time::Duration,
 };

-use crate::{
-    metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome},
-    persistence::TenantShardPersistence,
-    scheduler::{AffinityScore, MaySchedule, ScheduleContext},
-};
-use pageserver_api::controller_api::{PlacementPolicy, ShardSchedulingPolicy};
+use crate::{metrics, persistence::TenantShardPersistence};
+use pageserver_api::controller_api::PlacementPolicy;
 use pageserver_api::{
    models::{LocationConfig, LocationConfigMode, TenantConfig},
    shard::{ShardIdentity, TenantShardId},
@@ -50,7 +46,7 @@ where
 /// This struct implement Serialize for debugging purposes, but is _not_ persisted
 /// itself: see [`crate::persistence`] for the subset of tenant shard state that is persisted.
 #[derive(Serialize)]
-pub(crate) struct TenantShard {
+pub(crate) struct TenantState {
    pub(crate) tenant_shard_id: TenantShardId,

    pub(crate) shard: ShardIdentity,
@@ -117,10 +113,6 @@ pub(crate) struct TenantShard {
    /// sending it.  This is the mechanism by which compute notifications are included in the scope
    /// of state that we publish externally in an eventually consistent way.
    pub(crate) pending_compute_notification: bool,
-
-    // Support/debug tool: if something is going wrong or flapping with scheduling, this may
-    // be set to a non-active state to avoid making changes while the issue is fixed.
-    scheduling_policy: ShardSchedulingPolicy,
 }

 #[derive(Default, Clone, Debug, Serialize)]
@@ -251,13 +243,8 @@ impl IntentState {

 impl Drop for IntentState {
    fn drop(&mut self) {
-        // Must clear before dropping, to avoid leaving stale refcounts in the Scheduler.
-        // We do not check this while panicking, to avoid polluting unit test failures or
-        // other assertions with this assertion's output.  It's still wrong to leak these,
-        // but if we already have a panic then we don't need to independently flag this case.
-        if !(std::thread::panicking()) {
-            debug_assert!(self.attached.is_none() && self.secondary.is_empty());
-        }
+        // Must clear before dropping, to avoid leaving stale refcounts in the Scheduler
+        debug_assert!(self.attached.is_none() && self.secondary.is_empty());
    }
 }

@@ -302,26 +289,6 @@ pub enum ReconcileWaitError {
    Failed(TenantShardId, String),
 }

-#[derive(Eq, PartialEq, Debug)]
-pub(crate) struct ReplaceSecondary {
-    old_node_id: NodeId,
-    new_node_id: NodeId,
-}
-
-#[derive(Eq, PartialEq, Debug)]
-pub(crate) struct MigrateAttachment {
-    old_attached_node_id: NodeId,
-    new_attached_node_id: NodeId,
-}
-
-#[derive(Eq, PartialEq, Debug)]
-pub(crate) enum ScheduleOptimization {
-    // Replace one of our secondary locations with a different node
-    ReplaceSecondary(ReplaceSecondary),
-    // Migrate attachment to an existing secondary location
-    MigrateAttachment(MigrateAttachment),
-}
-
 impl ReconcilerWaiter {
    pub(crate) async fn wait_timeout(&self, timeout: Duration) -> Result<(), ReconcileWaitError> {
        tokio::select! {
@@ -354,7 +321,7 @@ pub(crate) struct ReconcilerHandle {
 }

 /// When a reconcile task completes, it sends this result object
-/// to be applied to the primary TenantShard.
+/// to be applied to the primary TenantState.
 pub(crate) struct ReconcileResult {
    pub(crate) sequence: Sequence,
    /// On errors, `observed` should be treated as an incompleted description
@@ -367,7 +334,7 @@ pub(crate) struct ReconcileResult {
    pub(crate) generation: Option<Generation>,
    pub(crate) observed: ObservedState,

-    /// Set [`TenantShard::pending_compute_notification`] from this flag
+    /// Set [`TenantState::pending_compute_notification`] from this flag
    pub(crate) pending_compute_notification: bool,
 }

@@ -379,7 +346,7 @@ impl ObservedState {
    }
 }

-impl TenantShard {
+impl TenantState {
    pub(crate) fn new(
        tenant_shard_id: TenantShardId,
        shard: ShardIdentity,
@@ -400,7 +367,6 @@ impl TenantShard {
            error_waiter: Arc::new(SeqWait::new(Sequence(0))),
            last_error: Arc::default(),
            pending_compute_notification: false,
-            scheduling_policy: ShardSchedulingPolicy::default(),
        }
    }

@@ -456,7 +422,6 @@ impl TenantShard {
    fn schedule_attached(
        &mut self,
        scheduler: &mut Scheduler,
-        context: &ScheduleContext,
    ) -> Result<(bool, NodeId), ScheduleError> {
        // No work to do if we already have an attached tenant
        if let Some(node_id) = self.intent.attached {
@@ -470,33 +435,14 @@ impl TenantShard {
            Ok((true, promote_secondary))
        } else {
            // Pick a fresh node: either we had no secondaries or none were schedulable
-            let node_id = scheduler.schedule_shard(&self.intent.secondary, context)?;
+            let node_id = scheduler.schedule_shard(&self.intent.secondary)?;
            tracing::debug!("Selected {} as attached", node_id);
            self.intent.set_attached(scheduler, Some(node_id));
            Ok((true, node_id))
        }
    }

-    pub(crate) fn schedule(
-        &mut self,
-        scheduler: &mut Scheduler,
-        context: &mut ScheduleContext,
-    ) -> Result<(), ScheduleError> {
-        let r = self.do_schedule(scheduler, context);
-
-        context.avoid(&self.intent.all_pageservers());
-        if let Some(attached) = self.intent.get_attached() {
-            context.push_attached(*attached);
-        }
-
-        r
-    }
-
-    pub(crate) fn do_schedule(
-        &mut self,
-        scheduler: &mut Scheduler,
-        context: &ScheduleContext,
-    ) -> Result<(), ScheduleError> {
+    pub(crate) fn schedule(&mut self, scheduler: &mut Scheduler) -> Result<(), ScheduleError> {
        // TODO: before scheduling new nodes, check if any existing content in
        // self.intent refers to pageservers that are offline, and pick other
        // pageservers if so.
@@ -504,16 +450,6 @@ impl TenantShard {
        // TODO: respect the splitting bit on tenants: if they are currently splitting then we may not
        // change their attach location.

-        match self.scheduling_policy {
-            ShardSchedulingPolicy::Active | ShardSchedulingPolicy::Essential => {}
-            ShardSchedulingPolicy::Pause | ShardSchedulingPolicy::Stop => {
-                // Warn to make it obvious why other things aren't happening/working, if we skip scheduling
-                tracing::warn!(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(),
-                    "Scheduling is disabled by policy {:?}", self.scheduling_policy);
-                return Ok(());
-            }
-        }
-
        // Build the set of pageservers already in use by this tenant, to avoid scheduling
        // more work on the same pageservers we're already using.
        let mut modified = false;
@@ -521,7 +457,22 @@ impl TenantShard {
        // Add/remove nodes to fulfil policy
        use PlacementPolicy::*;
        match self.policy {
-            Attached(secondary_count) => {
+            Single => {
+                // Should have exactly one attached, and zero secondaries
+                if !self.intent.secondary.is_empty() {
+                    self.intent.clear_secondary(scheduler);
+                    modified = true;
+                }
+
+                let (modified_attached, _attached_node_id) = self.schedule_attached(scheduler)?;
+                modified |= modified_attached;
+
+                if !self.intent.secondary.is_empty() {
+                    self.intent.clear_secondary(scheduler);
+                    modified = true;
+                }
+            }
+            Double(secondary_count) => {
                let retain_secondaries = if self.intent.attached.is_none()
                    && scheduler.node_preferred(&self.intent.secondary).is_some()
                {
@@ -540,13 +491,12 @@ impl TenantShard {
                }

                // Should have exactly one attached, and N secondaries
-                let (modified_attached, attached_node_id) =
-                    self.schedule_attached(scheduler, context)?;
+                let (modified_attached, attached_node_id) = self.schedule_attached(scheduler)?;
                modified |= modified_attached;

                let mut used_pageservers = vec![attached_node_id];
                while self.intent.secondary.len() < secondary_count {
-                    let node_id = scheduler.schedule_shard(&used_pageservers, context)?;
+                    let node_id = scheduler.schedule_shard(&used_pageservers)?;
                    self.intent.push_secondary(scheduler, node_id);
                    used_pageservers.push(node_id);
                    modified = true;
@@ -559,7 +509,7 @@ impl TenantShard {
                    modified = true;
                } else if self.intent.secondary.is_empty() {
                    // Populate secondary by scheduling a fresh node
-                    let node_id = scheduler.schedule_shard(&[], context)?;
+                    let node_id = scheduler.schedule_shard(&[])?;
                    self.intent.push_secondary(scheduler, node_id);
                    modified = true;
                }
@@ -586,167 +536,6 @@ impl TenantShard {
        Ok(())
    }

-    /// Optimize attachments: if a shard has a secondary location that is preferable to
-    /// its primary location based on soft constraints, switch that secondary location
-    /// to be attached.
-    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
-    pub(crate) fn optimize_attachment(
-        &self,
-        nodes: &HashMap<NodeId, Node>,
-        schedule_context: &ScheduleContext,
-    ) -> Option<ScheduleOptimization> {
-        let attached = (*self.intent.get_attached())?;
-        if self.intent.secondary.is_empty() {
-            // We can only do useful work if we have both attached and secondary locations: this
-            // function doesn't schedule new locations, only swaps between attached and secondaries.
-            return None;
-        }
-
-        let current_affinity_score = schedule_context.get_node_affinity(attached);
-        let current_attachment_count = schedule_context.get_node_attachments(attached);
-
-        // Generate score for each node, dropping any un-schedulable nodes.
-        let all_pageservers = self.intent.all_pageservers();
-        let mut scores = all_pageservers
-            .iter()
-            .flat_map(|node_id| {
-                if matches!(
-                    nodes
-                        .get(node_id)
-                        .map(|n| n.may_schedule())
-                        .unwrap_or(MaySchedule::No),
-                    MaySchedule::No
-                ) {
-                    None
-                } else {
-                    let affinity_score = schedule_context.get_node_affinity(*node_id);
-                    let attachment_count = schedule_context.get_node_attachments(*node_id);
-                    Some((*node_id, affinity_score, attachment_count))
-                }
-            })
-            .collect::<Vec<_>>();
-
-        // Sort precedence:
-        //  1st - prefer nodes with the lowest total affinity score
-        //  2nd - prefer nodes with the lowest number of attachments in this context
-        //  3rd - if all else is equal, sort by node ID for determinism in tests.
-        scores.sort_by_key(|i| (i.1, i.2, i.0));
-
-        if let Some((preferred_node, preferred_affinity_score, preferred_attachment_count)) =
-            scores.first()
-        {
-            if attached != *preferred_node {
-                // The best alternative must be more than 1 better than us, otherwise we could end
-                // up flapping back next time we're called (e.g. there's no point migrating from
-                // a location with score 1 to a score zero, because on next location the situation
-                // would be the same, but in reverse).
-                if current_affinity_score > *preferred_affinity_score + AffinityScore(1)
-                    || current_attachment_count > *preferred_attachment_count + 1
-                {
-                    tracing::info!(
-                        "Identified optimization: migrate attachment {attached}->{preferred_node} (secondaries {:?})",
-                        self.intent.get_secondary()
-                    );
-                    return Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment {
-                        old_attached_node_id: attached,
-                        new_attached_node_id: *preferred_node,
-                    }));
-                }
-            } else {
-                tracing::debug!(
-                    "Node {} is already preferred (score {:?})",
-                    preferred_node,
-                    preferred_affinity_score
-                );
-            }
-        }
-
-        // Fall-through: we didn't find an optimization
-        None
-    }
-
-    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
-    pub(crate) fn optimize_secondary(
-        &self,
-        scheduler: &Scheduler,
-        schedule_context: &ScheduleContext,
-    ) -> Option<ScheduleOptimization> {
-        if self.intent.secondary.is_empty() {
-            // We can only do useful work if we have both attached and secondary locations: this
-            // function doesn't schedule new locations, only swaps between attached and secondaries.
-            return None;
-        }
-
-        for secondary in self.intent.get_secondary() {
-            let Some(affinity_score) = schedule_context.nodes.get(secondary) else {
-                // We're already on a node unaffected any affinity constraints,
-                // so we won't change it.
-                continue;
-            };
-
-            // Let the scheduler suggest a node, where it would put us if we were scheduling afresh
-            // This implicitly limits the choice to nodes that are available, and prefers nodes
-            // with lower utilization.
-            let Ok(candidate_node) =
-                scheduler.schedule_shard(&self.intent.all_pageservers(), schedule_context)
-            else {
-                // A scheduling error means we have no possible candidate replacements
-                continue;
-            };
-
-            let candidate_affinity_score = schedule_context
-                .nodes
-                .get(&candidate_node)
-                .unwrap_or(&AffinityScore::FREE);
-
-            // The best alternative must be more than 1 better than us, otherwise we could end
-            // up flapping back next time we're called.
-            if *candidate_affinity_score + AffinityScore(1) < *affinity_score {
-                // If some other node is available and has a lower score than this node, then
-                // that other node is a good place to migrate to.
-                tracing::info!(
-                    "Identified optimization: replace secondary {secondary}->{candidate_node} (current secondaries {:?})",
-                    self.intent.get_secondary()
-                );
-                return Some(ScheduleOptimization::ReplaceSecondary(ReplaceSecondary {
-                    old_node_id: *secondary,
-                    new_node_id: candidate_node,
-                }));
-            }
-        }
-
-        None
-    }
-
-    pub(crate) fn apply_optimization(
-        &mut self,
-        scheduler: &mut Scheduler,
-        optimization: ScheduleOptimization,
-    ) {
-        metrics::METRICS_REGISTRY
-            .metrics_group
-            .storage_controller_schedule_optimization
-            .inc();
-
-        match optimization {
-            ScheduleOptimization::MigrateAttachment(MigrateAttachment {
-                old_attached_node_id,
-                new_attached_node_id,
-            }) => {
-                self.intent.demote_attached(old_attached_node_id);
-                self.intent
-                    .promote_attached(scheduler, new_attached_node_id);
-            }
-            ScheduleOptimization::ReplaceSecondary(ReplaceSecondary {
-                old_node_id,
-                new_node_id,
-            }) => {
-                self.intent.remove_secondary(scheduler, old_node_id);
-                self.intent.push_secondary(scheduler, new_node_id);
-            }
-        }
-    }
-
    /// Query whether the tenant's observed state for attached node matches its intent state, and if so,
    /// yield the node ID.  This is appropriate for emitting compute hook notifications: we are checking that
    /// the node in question is not only where we intend to attach, but that the tenant is indeed already attached there.
@@ -788,12 +577,7 @@ impl TenantShard {
                .generation
                .expect("Attempted to enter attached state without a generation");

-            let wanted_conf = attached_location_conf(
-                generation,
-                &self.shard,
-                &self.config,
-                !self.intent.secondary.is_empty(),
-            );
+            let wanted_conf = attached_location_conf(generation, &self.shard, &self.config);
            match self.observed.locations.get(&node_id) {
                Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {}
                Some(_) | None => {
@@ -891,19 +675,6 @@ impl TenantShard {
            }
        }

-        // Pre-checks done: finally check whether we may actually do the work
-        match self.scheduling_policy {
-            ShardSchedulingPolicy::Active
-            | ShardSchedulingPolicy::Essential
-            | ShardSchedulingPolicy::Pause => {}
-            ShardSchedulingPolicy::Stop => {
-                // We only reach this point if there is work to do and we're going to skip
-                // doing it: warn it obvious why this tenant isn't doing what it ought to.
-                tracing::warn!("Skipping reconcile for policy {:?}", self.scheduling_policy);
-                return None;
-            }
-        }
-
        // Build list of nodes from which the reconciler should detach
        let mut detach = Vec::new();
        for node_id in self.observed.locations.keys() {
@@ -957,10 +728,7 @@ impl TenantShard {
        let reconciler_span = tracing::info_span!(parent: None, "reconciler", seq=%reconcile_seq,
                                                        tenant_id=%reconciler.tenant_shard_id.tenant_id,
                                                        shard_id=%reconciler.tenant_shard_id.shard_slug());
-        metrics::METRICS_REGISTRY
-            .metrics_group
-            .storage_controller_reconcile_spawn
-            .inc();
+        metrics::RECONCILER.spawned.inc();
        let result_tx = result_tx.clone();
        let join_handle = tokio::task::spawn(
            async move {
@@ -978,12 +746,10 @@ impl TenantShard {
                // TODO: wrap all remote API operations in cancellation check
                // as well.
                if reconciler.cancel.is_cancelled() {
-                    metrics::METRICS_REGISTRY
-                        .metrics_group
-                        .storage_controller_reconcile_complete
-                        .inc(ReconcileCompleteLabelGroup {
-                            status: ReconcileOutcome::Cancel,
-                        });
+                    metrics::RECONCILER
+                        .complete
+                        .with_label_values(&[metrics::ReconcilerMetrics::CANCEL])
+                        .inc();
                    return;
                }

@@ -998,18 +764,18 @@ impl TenantShard {
                }

                // Update result counter
-                let outcome_label = match &result {
-                    Ok(_) => ReconcileOutcome::Success,
-                    Err(ReconcileError::Cancel) => ReconcileOutcome::Cancel,
-                    Err(_) => ReconcileOutcome::Error,
-                };
-
-                metrics::METRICS_REGISTRY
-                    .metrics_group
-                    .storage_controller_reconcile_complete
-                    .inc(ReconcileCompleteLabelGroup {
-                        status: outcome_label,
-                    });
+                match &result {
+                    Ok(_) => metrics::RECONCILER
+                        .complete
+                        .with_label_values(&[metrics::ReconcilerMetrics::SUCCESS]),
+                    Err(ReconcileError::Cancel) => metrics::RECONCILER
+                        .complete
+                        .with_label_values(&[metrics::ReconcilerMetrics::CANCEL]),
+                    Err(_) => metrics::RECONCILER
+                        .complete
+                        .with_label_values(&[metrics::ReconcilerMetrics::ERROR]),
+                }
+                .inc();

                result_tx
                    .send(ReconcileResult {
@@ -1040,22 +806,6 @@ impl TenantShard {
        })
    }

-    /// Get a waiter for any reconciliation in flight, but do not start reconciliation
-    /// if it is not already running
-    pub(crate) fn get_waiter(&self) -> Option<ReconcilerWaiter> {
-        if self.reconciler.is_some() {
-            Some(ReconcilerWaiter {
-                tenant_shard_id: self.tenant_shard_id,
-                seq_wait: self.waiter.clone(),
-                error_seq_wait: self.error_waiter.clone(),
-                error: self.last_error.clone(),
-                seq: self.sequence,
-            })
-        } else {
-            None
-        }
-    }
-
    /// Called when a ReconcileResult has been emitted and the service is updating
    /// our state: if the result is from a sequence >= my ReconcileHandle, then drop
    /// the handle to indicate there is no longer a reconciliation in progress.
@@ -1081,40 +831,6 @@ impl TenantShard {
        debug_assert!(!self.intent.all_pageservers().contains(&node_id));
    }

-    pub(crate) fn set_scheduling_policy(&mut self, p: ShardSchedulingPolicy) {
-        self.scheduling_policy = p;
-    }
-
-    pub(crate) fn get_scheduling_policy(&self) -> &ShardSchedulingPolicy {
-        &self.scheduling_policy
-    }
-
-    pub(crate) fn from_persistent(
-        tsp: TenantShardPersistence,
-        intent: IntentState,
-    ) -> anyhow::Result<Self> {
-        let tenant_shard_id = tsp.get_tenant_shard_id()?;
-        let shard_identity = tsp.get_shard_identity()?;
-
-        Ok(Self {
-            tenant_shard_id,
-            shard: shard_identity,
-            sequence: Sequence::initial(),
-            generation: tsp.generation.map(|g| Generation::new(g as u32)),
-            policy: serde_json::from_str(&tsp.placement_policy).unwrap(),
-            intent,
-            observed: ObservedState::new(),
-            config: serde_json::from_str(&tsp.config).unwrap(),
-            reconciler: None,
-            splitting: tsp.splitting,
-            waiter: Arc::new(SeqWait::new(Sequence::initial())),
-            error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
-            last_error: Arc::default(),
-            pending_compute_notification: false,
-            scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(),
-        })
-    }
-
    pub(crate) fn to_persistent(&self) -> TenantShardPersistence {
        TenantShardPersistence {
            tenant_id: self.tenant_shard_id.tenant_id.to_string(),
@@ -1126,7 +842,6 @@ impl TenantShard {
            placement_policy: serde_json::to_string(&self.policy).unwrap(),
            config: serde_json::to_string(&self.config).unwrap(),
            splitting: SplitState::default(),
-            scheduling_policy: serde_json::to_string(&self.scheduling_policy).unwrap(),
        }
    }
 }
@@ -1143,7 +858,7 @@ pub(crate) mod tests {

    use super::*;

-    fn make_test_tenant_shard(policy: PlacementPolicy) -> TenantShard {
+    fn make_test_tenant_shard(policy: PlacementPolicy) -> TenantState {
        let tenant_id = TenantId::generate();
        let shard_number = ShardNumber(0);
        let shard_count = ShardCount::new(1);
@@ -1153,7 +868,7 @@ pub(crate) mod tests {
            shard_number,
            shard_count,
        };
-        TenantShard::new(
+        TenantState::new(
            tenant_shard_id,
            ShardIdentity::new(
                shard_number,
@@ -1165,32 +880,6 @@ pub(crate) mod tests {
        )
    }

-    fn make_test_tenant(policy: PlacementPolicy, shard_count: ShardCount) -> Vec<TenantShard> {
-        let tenant_id = TenantId::generate();
-
-        (0..shard_count.count())
-            .map(|i| {
-                let shard_number = ShardNumber(i);
-
-                let tenant_shard_id = TenantShardId {
-                    tenant_id,
-                    shard_number,
-                    shard_count,
-                };
-                TenantShard::new(
-                    tenant_shard_id,
-                    ShardIdentity::new(
-                        shard_number,
-                        shard_count,
-                        pageserver_api::shard::ShardStripeSize(32768),
-                    )
-                    .unwrap(),
-                    policy.clone(),
-                )
-            })
-            .collect()
-    }
-
    /// Test the scheduling behaviors used when a tenant configured for HA is subject
    /// to nodes being marked offline.
    #[test]
@@ -1200,26 +889,25 @@ pub(crate) mod tests {
        let mut nodes = make_test_nodes(3);

        let mut scheduler = Scheduler::new(nodes.values());
-        let mut context = ScheduleContext::default();

-        let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1));
-        tenant_shard
-            .schedule(&mut scheduler, &mut context)
+        let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1));
+        tenant_state
+            .schedule(&mut scheduler)
            .expect("we have enough nodes, scheduling should work");

        // Expect to initially be schedule on to different nodes
-        assert_eq!(tenant_shard.intent.secondary.len(), 1);
-        assert!(tenant_shard.intent.attached.is_some());
+        assert_eq!(tenant_state.intent.secondary.len(), 1);
+        assert!(tenant_state.intent.attached.is_some());

-        let attached_node_id = tenant_shard.intent.attached.unwrap();
-        let secondary_node_id = *tenant_shard.intent.secondary.iter().last().unwrap();
+        let attached_node_id = tenant_state.intent.attached.unwrap();
+        let secondary_node_id = *tenant_state.intent.secondary.iter().last().unwrap();
        assert_ne!(attached_node_id, secondary_node_id);

        // Notifying the attached node is offline should demote it to a secondary
-        let changed = tenant_shard.intent.demote_attached(attached_node_id);
+        let changed = tenant_state.intent.demote_attached(attached_node_id);
        assert!(changed);
-        assert!(tenant_shard.intent.attached.is_none());
-        assert_eq!(tenant_shard.intent.secondary.len(), 2);
+        assert!(tenant_state.intent.attached.is_none());
+        assert_eq!(tenant_state.intent.secondary.len(), 2);

        // Update the scheduler state to indicate the node is offline
        nodes
@@ -1229,18 +917,18 @@ pub(crate) mod tests {
        scheduler.node_upsert(nodes.get(&attached_node_id).unwrap());

        // Scheduling the node should promote the still-available secondary node to attached
-        tenant_shard
-            .schedule(&mut scheduler, &mut context)
+        tenant_state
+            .schedule(&mut scheduler)
            .expect("active nodes are available");
-        assert_eq!(tenant_shard.intent.attached.unwrap(), secondary_node_id);
+        assert_eq!(tenant_state.intent.attached.unwrap(), secondary_node_id);

        // The original attached node should have been retained as a secondary
        assert_eq!(
-            *tenant_shard.intent.secondary.iter().last().unwrap(),
+            *tenant_state.intent.secondary.iter().last().unwrap(),
            attached_node_id
        );

-        tenant_shard.intent.clear(&mut scheduler);
+        tenant_state.intent.clear(&mut scheduler);

        Ok(())
    }
@@ -1250,263 +938,48 @@ pub(crate) mod tests {
        let nodes = make_test_nodes(3);
        let mut scheduler = Scheduler::new(nodes.values());

-        let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1));
+        let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1));

-        tenant_shard.observed.locations.insert(
+        tenant_state.observed.locations.insert(
            NodeId(3),
            ObservedStateLocation {
                conf: Some(LocationConfig {
                    mode: LocationConfigMode::AttachedMulti,
                    generation: Some(2),
                    secondary_conf: None,
-                    shard_number: tenant_shard.shard.number.0,
-                    shard_count: tenant_shard.shard.count.literal(),
-                    shard_stripe_size: tenant_shard.shard.stripe_size.0,
+                    shard_number: tenant_state.shard.number.0,
+                    shard_count: tenant_state.shard.count.literal(),
+                    shard_stripe_size: tenant_state.shard.stripe_size.0,
                    tenant_conf: TenantConfig::default(),
                }),
            },
        );

-        tenant_shard.observed.locations.insert(
+        tenant_state.observed.locations.insert(
            NodeId(2),
            ObservedStateLocation {
                conf: Some(LocationConfig {
                    mode: LocationConfigMode::AttachedStale,
                    generation: Some(1),
                    secondary_conf: None,
-                    shard_number: tenant_shard.shard.number.0,
-                    shard_count: tenant_shard.shard.count.literal(),
-                    shard_stripe_size: tenant_shard.shard.stripe_size.0,
+                    shard_number: tenant_state.shard.number.0,
+                    shard_count: tenant_state.shard.count.literal(),
+                    shard_stripe_size: tenant_state.shard.stripe_size.0,
                    tenant_conf: TenantConfig::default(),
                }),
            },
        );

-        tenant_shard.intent_from_observed(&mut scheduler);
+        tenant_state.intent_from_observed(&mut scheduler);

        // The highest generationed attached location gets used as attached
-        assert_eq!(tenant_shard.intent.attached, Some(NodeId(3)));
+        assert_eq!(tenant_state.intent.attached, Some(NodeId(3)));
        // Other locations get used as secondary
-        assert_eq!(tenant_shard.intent.secondary, vec![NodeId(2)]);
+        assert_eq!(tenant_state.intent.secondary, vec![NodeId(2)]);

-        scheduler.consistency_check(nodes.values(), [&tenant_shard].into_iter())?;
-
-        tenant_shard.intent.clear(&mut scheduler);
-        Ok(())
-    }
-
-    #[test]
-    fn scheduling_mode() -> anyhow::Result<()> {
-        let nodes = make_test_nodes(3);
-        let mut scheduler = Scheduler::new(nodes.values());
-
-        let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1));
-
-        // In pause mode, schedule() shouldn't do anything
-        tenant_shard.scheduling_policy = ShardSchedulingPolicy::Pause;
-        assert!(tenant_shard
-            .schedule(&mut scheduler, &mut ScheduleContext::default())
-            .is_ok());
-        assert!(tenant_shard.intent.all_pageservers().is_empty());
-
-        // In active mode, schedule() works
-        tenant_shard.scheduling_policy = ShardSchedulingPolicy::Active;
-        assert!(tenant_shard
-            .schedule(&mut scheduler, &mut ScheduleContext::default())
-            .is_ok());
-        assert!(!tenant_shard.intent.all_pageservers().is_empty());
-
-        tenant_shard.intent.clear(&mut scheduler);
-        Ok(())
-    }
-
-    #[test]
-    fn optimize_attachment() -> anyhow::Result<()> {
-        let nodes = make_test_nodes(3);
-        let mut scheduler = Scheduler::new(nodes.values());
-
-        let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1));
-        let mut shard_b = make_test_tenant_shard(PlacementPolicy::Attached(1));
-
-        // Initially: both nodes attached on shard 1, and both have secondary locations
-        // on different nodes.
-        shard_a.intent.set_attached(&mut scheduler, Some(NodeId(1)));
-        shard_a.intent.push_secondary(&mut scheduler, NodeId(2));
-        shard_b.intent.set_attached(&mut scheduler, Some(NodeId(1)));
-        shard_b.intent.push_secondary(&mut scheduler, NodeId(3));
-
-        let mut schedule_context = ScheduleContext::default();
-        schedule_context.avoid(&shard_a.intent.all_pageservers());
-        schedule_context.push_attached(shard_a.intent.get_attached().unwrap());
-        schedule_context.avoid(&shard_b.intent.all_pageservers());
-        schedule_context.push_attached(shard_b.intent.get_attached().unwrap());
-
-        let optimization_a = shard_a.optimize_attachment(&nodes, &schedule_context);
-
-        // Either shard should recognize that it has the option to switch to a secondary location where there
-        // would be no other shards from the same tenant, and request to do so.
-        assert_eq!(
-            optimization_a,
-            Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment {
-                old_attached_node_id: NodeId(1),
-                new_attached_node_id: NodeId(2)
-            }))
-        );
-
-        // Note that these optimizing two shards in the same tenant with the same ScheduleContext is
-        // mutually exclusive (the optimization of one invalidates the stats) -- it is the responsibility
-        // of [`Service::optimize_all`] to avoid trying
-        // to do optimizations for multiple shards in the same tenant at the same time.  Generating
-        // both optimizations is just done for test purposes
-        let optimization_b = shard_b.optimize_attachment(&nodes, &schedule_context);
-        assert_eq!(
-            optimization_b,
-            Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment {
-                old_attached_node_id: NodeId(1),
-                new_attached_node_id: NodeId(3)
-            }))
-        );
-
-        // Applying these optimizations should result in the end state proposed
-        shard_a.apply_optimization(&mut scheduler, optimization_a.unwrap());
-        assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(2)));
-        assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(1)]);
-        shard_b.apply_optimization(&mut scheduler, optimization_b.unwrap());
-        assert_eq!(shard_b.intent.get_attached(), &Some(NodeId(3)));
-        assert_eq!(shard_b.intent.get_secondary(), &vec![NodeId(1)]);
-
-        shard_a.intent.clear(&mut scheduler);
-        shard_b.intent.clear(&mut scheduler);
-
-        Ok(())
-    }
-
-    #[test]
-    fn optimize_secondary() -> anyhow::Result<()> {
-        let nodes = make_test_nodes(4);
-        let mut scheduler = Scheduler::new(nodes.values());
-
-        let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1));
-        let mut shard_b = make_test_tenant_shard(PlacementPolicy::Attached(1));
-
-        // Initially: both nodes attached on shard 1, and both have secondary locations
-        // on different nodes.
-        shard_a.intent.set_attached(&mut scheduler, Some(NodeId(1)));
-        shard_a.intent.push_secondary(&mut scheduler, NodeId(3));
-        shard_b.intent.set_attached(&mut scheduler, Some(NodeId(2)));
-        shard_b.intent.push_secondary(&mut scheduler, NodeId(3));
-
-        let mut schedule_context = ScheduleContext::default();
-        schedule_context.avoid(&shard_a.intent.all_pageservers());
-        schedule_context.push_attached(shard_a.intent.get_attached().unwrap());
-        schedule_context.avoid(&shard_b.intent.all_pageservers());
-        schedule_context.push_attached(shard_b.intent.get_attached().unwrap());
-
-        let optimization_a = shard_a.optimize_secondary(&scheduler, &schedule_context);
-
-        // Since there is a node with no locations available, the node with two locations for the
-        // same tenant should generate an optimization to move one away
-        assert_eq!(
-            optimization_a,
-            Some(ScheduleOptimization::ReplaceSecondary(ReplaceSecondary {
-                old_node_id: NodeId(3),
-                new_node_id: NodeId(4)
-            }))
-        );
-
-        shard_a.apply_optimization(&mut scheduler, optimization_a.unwrap());
-        assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(1)));
-        assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(4)]);
-
-        shard_a.intent.clear(&mut scheduler);
-        shard_b.intent.clear(&mut scheduler);
-
-        Ok(())
-    }
-
-    // Optimize til quiescent: this emulates what Service::optimize_all does, when
-    // called repeatedly in the background.
-    fn optimize_til_idle(
-        nodes: &HashMap<NodeId, Node>,
-        scheduler: &mut Scheduler,
-        shards: &mut [TenantShard],
-    ) {
-        let mut loop_n = 0;
-        loop {
-            let mut schedule_context = ScheduleContext::default();
-            let mut any_changed = false;
-
-            for shard in shards.iter() {
-                schedule_context.avoid(&shard.intent.all_pageservers());
-                if let Some(attached) = shard.intent.get_attached() {
-                    schedule_context.push_attached(*attached);
-                }
-            }
-
-            for shard in shards.iter_mut() {
-                let optimization = shard.optimize_attachment(nodes, &schedule_context);
-                if let Some(optimization) = optimization {
-                    shard.apply_optimization(scheduler, optimization);
-                    any_changed = true;
-                    break;
-                }
-
-                let optimization = shard.optimize_secondary(scheduler, &schedule_context);
-                if let Some(optimization) = optimization {
-                    shard.apply_optimization(scheduler, optimization);
-                    any_changed = true;
-                    break;
-                }
-            }
-
-            if !any_changed {
-                break;
-            }
-
-            // Assert no infinite loop
-            loop_n += 1;
-            assert!(loop_n < 1000);
-        }
-    }
-
-    /// Test the balancing behavior of shard scheduling: that it achieves a balance, and
-    /// that it converges.
-    #[test]
-    fn optimize_add_nodes() -> anyhow::Result<()> {
-        let nodes = make_test_nodes(4);
-
-        // Only show the scheduler a couple of nodes
-        let mut scheduler = Scheduler::new([].iter());
-        scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap());
-        scheduler.node_upsert(nodes.get(&NodeId(2)).unwrap());
-
-        let mut shards = make_test_tenant(PlacementPolicy::Attached(1), ShardCount::new(4));
-        let mut schedule_context = ScheduleContext::default();
-        for shard in &mut shards {
-            assert!(shard
-                .schedule(&mut scheduler, &mut schedule_context)
-                .is_ok());
-        }
-
-        // We should see equal number of locations on the two nodes.
-        assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 4);
-        assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 4);
-
-        // Add another two nodes: we should see the shards spread out when their optimize
-        // methods are called
-        scheduler.node_upsert(nodes.get(&NodeId(3)).unwrap());
-        scheduler.node_upsert(nodes.get(&NodeId(4)).unwrap());
-        optimize_til_idle(&nodes, &mut scheduler, &mut shards);
-
-        assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 2);
-        assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 2);
-        assert_eq!(scheduler.get_node_shard_count(NodeId(3)), 2);
-        assert_eq!(scheduler.get_node_shard_count(NodeId(4)), 2);
-
-        for shard in shards.iter_mut() {
-            shard.intent.clear(&mut scheduler);
-        }
+        scheduler.consistency_check(nodes.values(), [&tenant_state].into_iter())?;

+        tenant_state.intent.clear(&mut scheduler);
        Ok(())
    }
 }
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -294,7 +294,7 @@ where
    //      is in state 'taken' but the thread that would unlock it is
    //      not there.
    //   2. A rust object that represented some external resource in the
-    //      parent now got implicitly copied by the fork, even though
+    //      parent now got implicitly copied by the fork, even though
    //      the object's type is not `Copy`. The parent program may use
    //      non-copyability as way to enforce unique ownership of an
    //      external resource in the typesystem. The fork breaks that
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -14,7 +14,9 @@ use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
 use control_plane::safekeeper::SafekeeperNode;
 use control_plane::storage_controller::StorageController;
 use control_plane::{broker, local_env};
-use pageserver_api::controller_api::PlacementPolicy;
+use pageserver_api::controller_api::{
+    NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy, PlacementPolicy,
+};
 use pageserver_api::models::{
    ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
 };
@@ -435,7 +437,7 @@ async fn handle_tenant(

            let placement_policy = match create_match.get_one::<String>("placement-policy") {
                Some(s) if !s.is_empty() => serde_json::from_str::<PlacementPolicy>(s)?,
-                _ => PlacementPolicy::Attached(0),
+                _ => PlacementPolicy::Single,
            };

            let tenant_conf = PageServerNode::parse_config(tenant_conf)?;
@@ -521,6 +523,88 @@ async fn handle_tenant(
                .with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?;
            println!("tenant {tenant_id} successfully configured on the pageserver");
        }
+        Some(("migrate", matches)) => {
+            let tenant_shard_id = get_tenant_shard_id(matches, env)?;
+            let new_pageserver = get_pageserver(env, matches)?;
+            let new_pageserver_id = new_pageserver.conf.id;
+
+            let storage_controller = StorageController::from_env(env);
+            storage_controller
+                .tenant_migrate(tenant_shard_id, new_pageserver_id)
+                .await?;
+
+            println!("tenant {tenant_shard_id} migrated to {}", new_pageserver_id);
+        }
+        Some(("status", matches)) => {
+            let tenant_id = get_tenant_id(matches, env)?;
+
+            let mut shard_table = comfy_table::Table::new();
+            shard_table.set_header(["Shard", "Pageserver", "Physical Size"]);
+
+            let mut tenant_synthetic_size = None;
+
+            let storage_controller = StorageController::from_env(env);
+            for shard in storage_controller.tenant_locate(tenant_id).await?.shards {
+                let pageserver =
+                    PageServerNode::from_env(env, env.get_pageserver_conf(shard.node_id)?);
+
+                let size = pageserver
+                    .http_client
+                    .tenant_details(shard.shard_id)
+                    .await?
+                    .tenant_info
+                    .current_physical_size
+                    .unwrap();
+
+                shard_table.add_row([
+                    format!("{}", shard.shard_id.shard_slug()),
+                    format!("{}", shard.node_id.0),
+                    format!("{} MiB", size / (1024 * 1024)),
+                ]);
+
+                if shard.shard_id.is_zero() {
+                    tenant_synthetic_size =
+                        Some(pageserver.tenant_synthetic_size(shard.shard_id).await?);
+                }
+            }
+
+            let Some(synthetic_size) = tenant_synthetic_size else {
+                bail!("Shard 0 not found")
+            };
+
+            let mut tenant_table = comfy_table::Table::new();
+            tenant_table.add_row(["Tenant ID".to_string(), tenant_id.to_string()]);
+            tenant_table.add_row([
+                "Synthetic size".to_string(),
+                format!("{} MiB", synthetic_size.size.unwrap_or(0) / (1024 * 1024)),
+            ]);
+
+            println!("{tenant_table}");
+            println!("{shard_table}");
+        }
+        Some(("shard-split", matches)) => {
+            let tenant_id = get_tenant_id(matches, env)?;
+            let shard_count: u8 = matches.get_one::<u8>("shard-count").cloned().unwrap_or(0);
+            let shard_stripe_size: Option<ShardStripeSize> = matches
+                .get_one::<Option<ShardStripeSize>>("shard-stripe-size")
+                .cloned()
+                .unwrap();
+
+            let storage_controller = StorageController::from_env(env);
+            let result = storage_controller
+                .tenant_split(tenant_id, shard_count, shard_stripe_size)
+                .await?;
+            println!(
+                "Split tenant {} into shards {}",
+                tenant_id,
+                result
+                    .new_shards
+                    .iter()
+                    .map(|s| format!("{:?}", s))
+                    .collect::<Vec<_>>()
+                    .join(",")
+            );
+        }

        Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name),
        None => bail!("no tenant subcommand provided"),
@@ -1058,6 +1142,21 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
            }
        }

+        Some(("set-state", subcommand_args)) => {
+            let pageserver = get_pageserver(env, subcommand_args)?;
+            let scheduling = subcommand_args.get_one("scheduling");
+            let availability = subcommand_args.get_one("availability");
+
+            let storage_controller = StorageController::from_env(env);
+            storage_controller
+                .node_configure(NodeConfigureRequest {
+                    node_id: pageserver.conf.id,
+                    scheduling: scheduling.cloned(),
+                    availability: availability.cloned(),
+                })
+                .await?;
+        }
+
        Some(("status", subcommand_args)) => {
            match get_pageserver(env, subcommand_args)?.check_status().await {
                Ok(_) => println!("Page server is up and running"),
@@ -1479,6 +1578,19 @@ fn cli() -> Command {
            .subcommand(Command::new("config")
                .arg(tenant_id_arg.clone())
                .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)))
+            .subcommand(Command::new("migrate")
+                .about("Migrate a tenant from one pageserver to another")
+                .arg(tenant_id_arg.clone())
+                .arg(pageserver_id_arg.clone()))
+            .subcommand(Command::new("status")
+                .about("Human readable summary of the tenant's shards and attachment locations")
+                .arg(tenant_id_arg.clone()))
+            .subcommand(Command::new("shard-split")
+                .about("Increase the number of shards in the tenant")
+                .arg(tenant_id_arg.clone())
+                .arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)"))
+                .arg(Arg::new("shard-stripe-size").value_parser(value_parser!(u32)).long("shard-stripe-size").action(ArgAction::Set).help("Sharding stripe size in pages"))
+                )
        )
        .subcommand(
            Command::new("pageserver")
@@ -1498,6 +1610,12 @@ fn cli() -> Command {
                    .about("Restart local pageserver")
                    .arg(pageserver_config_args.clone())
                )
+                .subcommand(Command::new("set-state")
+                    .arg(Arg::new("availability").value_parser(value_parser!(NodeAvailability)).long("availability").action(ArgAction::Set).help("Availability state: offline,active"))
+                    .arg(Arg::new("scheduling").value_parser(value_parser!(NodeSchedulingPolicy)).long("scheduling").action(ArgAction::Set).help("Scheduling state: draining,pause,filling,active"))
+                    .about("Set scheduling or availability state of pageserver node")
+                    .arg(pageserver_config_args.clone())
+                )
        )
        .subcommand(
            Command::new("storage_controller")
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -12,7 +12,7 @@
 //!
 //! The endpoint is managed by the `compute_ctl` binary. When an endpoint is
 //! started, we launch `compute_ctl` It synchronizes the safekeepers, downloads
-//! the basebackup from the pageserver to initialize the data directory, and
+//! the basebackup from the pageserver to initialize the data directory, and
 //! finally launches the PostgreSQL process. It watches the PostgreSQL process
 //! until it exits.
 //!
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -114,7 +114,7 @@ impl NeonBroker {
 }

 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
-#[serde(default, deny_unknown_fields)]
+#[serde(default)]
 pub struct PageServerConf {
    // node id
    pub id: NodeId,
@@ -126,9 +126,6 @@ pub struct PageServerConf {
    // auth type used for the PG and HTTP ports
    pub pg_auth_type: AuthType,
    pub http_auth_type: AuthType,
-
-    pub(crate) virtual_file_io_engine: Option<String>,
-    pub(crate) get_vectored_impl: Option<String>,
 }

 impl Default for PageServerConf {
@@ -139,8 +136,6 @@ impl Default for PageServerConf {
            listen_http_addr: String::new(),
            pg_auth_type: AuthType::Trust,
            http_auth_type: AuthType::Trust,
-            virtual_file_io_engine: None,
-            get_vectored_impl: None,
        }
    }
 }
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -78,39 +78,18 @@ impl PageServerNode {
    ///
    /// These all end up on the command line of the `pageserver` binary.
    fn neon_local_overrides(&self, cli_overrides: &[&str]) -> Vec<String> {
+        let id = format!("id={}", self.conf.id);
        // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc.
        let pg_distrib_dir_param = format!(
            "pg_distrib_dir='{}'",
            self.env.pg_distrib_dir_raw().display()
        );

-        let PageServerConf {
-            id,
-            listen_pg_addr,
-            listen_http_addr,
-            pg_auth_type,
-            http_auth_type,
-            virtual_file_io_engine,
-            get_vectored_impl,
-        } = &self.conf;
+        let http_auth_type_param = format!("http_auth_type='{}'", self.conf.http_auth_type);
+        let listen_http_addr_param = format!("listen_http_addr='{}'", self.conf.listen_http_addr);

-        let id = format!("id={}", id);
-
-        let http_auth_type_param = format!("http_auth_type='{}'", http_auth_type);
-        let listen_http_addr_param = format!("listen_http_addr='{}'", listen_http_addr);
-
-        let pg_auth_type_param = format!("pg_auth_type='{}'", pg_auth_type);
-        let listen_pg_addr_param = format!("listen_pg_addr='{}'", listen_pg_addr);
-        let virtual_file_io_engine = if let Some(virtual_file_io_engine) = virtual_file_io_engine {
-            format!("virtual_file_io_engine='{virtual_file_io_engine}'")
-        } else {
-            String::new()
-        };
-        let get_vectored_impl = if let Some(get_vectored_impl) = get_vectored_impl {
-            format!("get_vectored_impl='{get_vectored_impl}'")
-        } else {
-            String::new()
-        };
+        let pg_auth_type_param = format!("pg_auth_type='{}'", self.conf.pg_auth_type);
+        let listen_pg_addr_param = format!("listen_pg_addr='{}'", self.conf.listen_pg_addr);

        let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());

@@ -122,8 +101,6 @@ impl PageServerNode {
            listen_http_addr_param,
            listen_pg_addr_param,
            broker_endpoint_param,
-            virtual_file_io_engine,
-            get_vectored_impl,
        ];

        if let Some(control_plane_api) = &self.env.control_plane_api {
@@ -134,7 +111,7 @@ impl PageServerNode {

            // Storage controller uses the same auth as pageserver: if JWT is enabled
            // for us, we will also need it to talk to them.
-            if matches!(http_auth_type, AuthType::NeonJWT) {
+            if matches!(self.conf.http_auth_type, AuthType::NeonJWT) {
                let jwt_token = self
                    .env
                    .generate_auth_token(&Claims::new(None, Scope::GenerationsApi))
@@ -152,7 +129,8 @@ impl PageServerNode {
            ));
        }

-        if *http_auth_type != AuthType::Trust || *pg_auth_type != AuthType::Trust {
+        if self.conf.http_auth_type != AuthType::Trust || self.conf.pg_auth_type != AuthType::Trust
+        {
            // Keys are generated in the toplevel repo dir, pageservers' workdirs
            // are one level below that, so refer to keys with ../
            overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned());
@@ -389,10 +367,6 @@ impl PageServerNode {
                .remove("image_creation_threshold")
                .map(|x| x.parse::<usize>())
                .transpose()?,
-            image_layer_creation_check_threshold: settings
-                .remove("image_layer_creation_check_threshold")
-                .map(|x| x.parse::<u8>())
-                .transpose()?,
            pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
            walreceiver_connect_timeout: settings
                .remove("walreceiver_connect_timeout")
@@ -410,6 +384,11 @@ impl PageServerNode {
                .map(|x| x.parse::<bool>())
                .transpose()
                .context("Failed to parse 'trace_read_requests' as bool")?,
+            image_layer_compression: settings
+                .remove("image_layer_compression")
+                .map(serde_json::from_str)
+                .transpose()
+                .context("Failed to parse 'image_layer_compression' json")?,
            eviction_policy: settings
                .remove("eviction_policy")
                .map(serde_json::from_str)
@@ -505,12 +484,6 @@ impl PageServerNode {
                    .map(|x| x.parse::<usize>())
                    .transpose()
                    .context("Failed to parse 'image_creation_threshold' as non zero integer")?,
-                image_layer_creation_check_threshold: settings
-                    .remove("image_layer_creation_check_threshold")
-                    .map(|x| x.parse::<u8>())
-                    .transpose()
-                    .context("Failed to parse 'image_creation_check_threshold' as integer")?,
-
                pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
                walreceiver_connect_timeout: settings
                    .remove("walreceiver_connect_timeout")
@@ -528,6 +501,11 @@ impl PageServerNode {
                    .map(|x| x.parse::<bool>())
                    .transpose()
                    .context("Failed to parse 'trace_read_requests' as bool")?,
+                image_layer_compression: settings
+                    .remove("image_layer_compression")
+                    .map(serde_json::from_str)
+                    .transpose()
+                    .context("Failed to parse 'image_layer_compression' json")?,
                eviction_policy: settings
                    .remove("eviction_policy")
                    .map(serde_json::from_str)
@@ -586,6 +564,13 @@ impl PageServerNode {
        Ok(self.http_client.list_timelines(*tenant_shard_id).await?)
    }

+    pub async fn tenant_secondary_download(&self, tenant_id: &TenantShardId) -> anyhow::Result<()> {
+        Ok(self
+            .http_client
+            .tenant_secondary_download(*tenant_id)
+            .await?)
+    }
+
    pub async fn timeline_create(
        &self,
        tenant_shard_id: TenantShardId,
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -38,9 +38,6 @@ const COMMAND: &str = "storage_controller";

 const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;

-// Use a shorter pageserver unavailability interval than the default to speed up tests.
-const NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10);
-
 #[derive(Serialize, Deserialize)]
 pub struct AttachHookRequest {
    pub tenant_shard_id: TenantShardId,
@@ -272,18 +269,13 @@ impl StorageController {
        // Run migrations on every startup, in case something changed.
        let database_url = self.setup_database().await?;

-        let max_unavailable: humantime::Duration = NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL.into();
-
        let mut args = vec![
            "-l",
            &self.listen,
            "-p",
            self.path.as_ref(),
-            "--dev",
            "--database-url",
            &database_url,
-            "--max-unavailable-interval",
-            &max_unavailable.to_string(),
        ]
        .into_iter()
        .map(|s| s.to_string())
@@ -476,7 +468,7 @@ impl StorageController {
    pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result<TenantLocateResponse> {
        self.dispatch::<(), _>(
            Method::GET,
-            format!("debug/v1/tenant/{tenant_id}/locate"),
+            format!("control/v1/tenant/{tenant_id}/locate"),
            None,
        )
        .await
--- a/control_plane/storcon_cli/Cargo.toml
+++ b/control_plane/storcon_cli/Cargo.toml
@@ -1,23 +0,0 @@
-[package]
-name = "storcon_cli"
-version = "0.1.0"
-edition.workspace = true
-license.workspace = true
-
-
-[dependencies]
-anyhow.workspace = true
-clap.workspace = true
-comfy-table.workspace = true
-hyper.workspace = true
-pageserver_api.workspace = true
-pageserver_client.workspace = true
-reqwest.workspace = true
-serde.workspace = true
-serde_json = { workspace = true, features = ["raw_value"] }
-thiserror.workspace = true
-tokio.workspace = true
-tracing.workspace = true
-utils.workspace = true
-workspace_hack.workspace = true
-
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -1,587 +0,0 @@
-use std::{collections::HashMap, str::FromStr};
-
-use clap::{Parser, Subcommand};
-use hyper::Method;
-use pageserver_api::{
-    controller_api::{
-        NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy,
-        TenantDescribeResponse, TenantPolicyRequest,
-    },
-    models::{
-        ShardParameters, TenantConfig, TenantConfigRequest, TenantCreateRequest,
-        TenantShardSplitRequest, TenantShardSplitResponse,
-    },
-    shard::{ShardStripeSize, TenantShardId},
-};
-use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt};
-use reqwest::Url;
-use serde::{de::DeserializeOwned, Serialize};
-use utils::id::{NodeId, TenantId};
-
-use pageserver_api::controller_api::{
-    NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
-    TenantLocateResponse, TenantShardMigrateRequest, TenantShardMigrateResponse,
-};
-
-#[derive(Subcommand, Debug)]
-enum Command {
-    /// Register a pageserver with the storage controller.  This shouldn't usually be necessary,
-    /// since pageservers auto-register when they start up
-    NodeRegister {
-        #[arg(long)]
-        node_id: NodeId,
-
-        #[arg(long)]
-        listen_pg_addr: String,
-        #[arg(long)]
-        listen_pg_port: u16,
-
-        #[arg(long)]
-        listen_http_addr: String,
-        #[arg(long)]
-        listen_http_port: u16,
-    },
-
-    /// Modify a node's configuration in the storage controller
-    NodeConfigure {
-        #[arg(long)]
-        node_id: NodeId,
-
-        /// Availability is usually auto-detected based on heartbeats.  Set 'offline' here to
-        /// manually mark a node offline
-        #[arg(long)]
-        availability: Option<NodeAvailabilityArg>,
-        /// Scheduling policy controls whether tenant shards may be scheduled onto this node.
-        #[arg(long)]
-        scheduling: Option<NodeSchedulingPolicy>,
-    },
-    /// Modify a tenant's policies in the storage controller
-    TenantPolicy {
-        #[arg(long)]
-        tenant_id: TenantId,
-        /// Placement policy controls whether a tenant is `detached`, has only a secondary location (`secondary`),
-        /// or is in the normal attached state with N secondary locations (`attached:N`)
-        #[arg(long)]
-        placement: Option<PlacementPolicyArg>,
-        /// Scheduling policy enables pausing the controller's scheduling activity involving this tenant.  `active` is normal,
-        /// `essential` disables optimization scheduling changes, `pause` disables all scheduling changes, and `stop` prevents
-        /// all reconciliation activity including for scheduling changes already made.  `pause` and `stop` can make a tenant
-        /// unavailable, and are only for use in emergencies.
-        #[arg(long)]
-        scheduling: Option<ShardSchedulingPolicyArg>,
-    },
-    /// List nodes known to the storage controller
-    Nodes {},
-    /// List tenants known to the storage controller
-    Tenants {},
-    /// Create a new tenant in the storage controller, and by extension on pageservers.
-    TenantCreate {
-        #[arg(long)]
-        tenant_id: TenantId,
-    },
-    /// Delete a tenant in the storage controller, and by extension on pageservers.
-    TenantDelete {
-        #[arg(long)]
-        tenant_id: TenantId,
-    },
-    /// Split an existing tenant into a higher number of shards than its current shard count.
-    TenantShardSplit {
-        #[arg(long)]
-        tenant_id: TenantId,
-        #[arg(long)]
-        shard_count: u8,
-        /// Optional, in 8kiB pages.  e.g. set 2048 for 16MB stripes.
-        #[arg(long)]
-        stripe_size: Option<u32>,
-    },
-    /// Migrate the attached location for a tenant shard to a specific pageserver.
-    TenantShardMigrate {
-        #[arg(long)]
-        tenant_shard_id: TenantShardId,
-        #[arg(long)]
-        node: NodeId,
-    },
-    /// Modify the pageserver tenant configuration of a tenant: this is the configuration structure
-    /// that is passed through to pageservers, and does not affect storage controller behavior.
-    TenantConfig {
-        #[arg(long)]
-        tenant_id: TenantId,
-        #[arg(long)]
-        config: String,
-    },
-    /// Attempt to balance the locations for a tenant across pageservers.  This is a client-side
-    /// alternative to the storage controller's scheduling optimization behavior.
-    TenantScatter {
-        #[arg(long)]
-        tenant_id: TenantId,
-    },
-    /// Print details about a particular tenant, including all its shards' states.
-    TenantDescribe {
-        #[arg(long)]
-        tenant_id: TenantId,
-    },
-}
-
-#[derive(Parser)]
-#[command(
-    author,
-    version,
-    about,
-    long_about = "CLI for Storage Controller Support/Debug"
-)]
-#[command(arg_required_else_help(true))]
-struct Cli {
-    #[arg(long)]
-    /// URL to storage controller.  e.g. http://127.0.0.1:1234 when using `neon_local`
-    api: Url,
-
-    #[arg(long)]
-    /// JWT token for authenticating with storage controller.  Depending on the API used, this
-    /// should have either `pageserverapi` or `admin` scopes: for convenience, you should mint
-    /// a token with both scopes to use with this tool.
-    jwt: Option<String>,
-
-    #[command(subcommand)]
-    command: Command,
-}
-
-#[derive(Debug, Clone)]
-struct PlacementPolicyArg(PlacementPolicy);
-
-impl FromStr for PlacementPolicyArg {
-    type Err = anyhow::Error;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        match s {
-            "detached" => Ok(Self(PlacementPolicy::Detached)),
-            "secondary" => Ok(Self(PlacementPolicy::Secondary)),
-            _ if s.starts_with("attached:") => {
-                let mut splitter = s.split(':');
-                let _prefix = splitter.next().unwrap();
-                match splitter.next().and_then(|s| s.parse::<usize>().ok()) {
-                    Some(n) => Ok(Self(PlacementPolicy::Attached(n))),
-                    None => Err(anyhow::anyhow!(
-                        "Invalid format '{s}', a valid example is 'attached:1'"
-                    )),
-                }
-            }
-            _ => Err(anyhow::anyhow!(
-                "Unknown placement policy '{s}', try detached,secondary,attached:<n>"
-            )),
-        }
-    }
-}
-
-#[derive(Debug, Clone)]
-struct ShardSchedulingPolicyArg(ShardSchedulingPolicy);
-
-impl FromStr for ShardSchedulingPolicyArg {
-    type Err = anyhow::Error;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        match s {
-            "active" => Ok(Self(ShardSchedulingPolicy::Active)),
-            "essential" => Ok(Self(ShardSchedulingPolicy::Essential)),
-            "pause" => Ok(Self(ShardSchedulingPolicy::Pause)),
-            "stop" => Ok(Self(ShardSchedulingPolicy::Stop)),
-            _ => Err(anyhow::anyhow!(
-                "Unknown scheduling policy '{s}', try active,essential,pause,stop"
-            )),
-        }
-    }
-}
-
-#[derive(Debug, Clone)]
-struct NodeAvailabilityArg(NodeAvailabilityWrapper);
-
-impl FromStr for NodeAvailabilityArg {
-    type Err = anyhow::Error;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        match s {
-            "active" => Ok(Self(NodeAvailabilityWrapper::Active)),
-            "offline" => Ok(Self(NodeAvailabilityWrapper::Offline)),
-            _ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
-        }
-    }
-}
-
-struct Client {
-    base_url: Url,
-    jwt_token: Option<String>,
-    client: reqwest::Client,
-}
-
-impl Client {
-    fn new(base_url: Url, jwt_token: Option<String>) -> Self {
-        Self {
-            base_url,
-            jwt_token,
-            client: reqwest::ClientBuilder::new()
-                .build()
-                .expect("Failed to construct http client"),
-        }
-    }
-
-    /// Simple HTTP request wrapper for calling into storage controller
-    async fn dispatch<RQ, RS>(
-        &self,
-        method: hyper::Method,
-        path: String,
-        body: Option<RQ>,
-    ) -> mgmt_api::Result<RS>
-    where
-        RQ: Serialize + Sized,
-        RS: DeserializeOwned + Sized,
-    {
-        // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
-        // for general purpose API access.
-        let url = Url::from_str(&format!(
-            "http://{}:{}/{path}",
-            self.base_url.host_str().unwrap(),
-            self.base_url.port().unwrap()
-        ))
-        .unwrap();
-
-        let mut builder = self.client.request(method, url);
-        if let Some(body) = body {
-            builder = builder.json(&body)
-        }
-        if let Some(jwt_token) = &self.jwt_token {
-            builder = builder.header(
-                reqwest::header::AUTHORIZATION,
-                format!("Bearer {jwt_token}"),
-            );
-        }
-
-        let response = builder.send().await.map_err(mgmt_api::Error::ReceiveBody)?;
-        let response = response.error_from_body().await?;
-
-        response
-            .json()
-            .await
-            .map_err(pageserver_client::mgmt_api::Error::ReceiveBody)
-    }
-}
-
-#[tokio::main]
-async fn main() -> anyhow::Result<()> {
-    let cli = Cli::parse();
-
-    let storcon_client = Client::new(cli.api.clone(), cli.jwt.clone());
-
-    let mut trimmed = cli.api.to_string();
-    trimmed.pop();
-    let vps_client = mgmt_api::Client::new(trimmed, cli.jwt.as_deref());
-
-    match cli.command {
-        Command::NodeRegister {
-            node_id,
-            listen_pg_addr,
-            listen_pg_port,
-            listen_http_addr,
-            listen_http_port,
-        } => {
-            storcon_client
-                .dispatch::<_, ()>(
-                    Method::POST,
-                    "control/v1/node".to_string(),
-                    Some(NodeRegisterRequest {
-                        node_id,
-                        listen_pg_addr,
-                        listen_pg_port,
-                        listen_http_addr,
-                        listen_http_port,
-                    }),
-                )
-                .await?;
-        }
-        Command::TenantCreate { tenant_id } => {
-            vps_client
-                .tenant_create(&TenantCreateRequest {
-                    new_tenant_id: TenantShardId::unsharded(tenant_id),
-                    generation: None,
-                    shard_parameters: ShardParameters::default(),
-                    placement_policy: Some(PlacementPolicy::Attached(1)),
-                    config: TenantConfig::default(),
-                })
-                .await?;
-        }
-        Command::TenantDelete { tenant_id } => {
-            let status = vps_client
-                .tenant_delete(TenantShardId::unsharded(tenant_id))
-                .await?;
-            tracing::info!("Delete status: {}", status);
-        }
-        Command::Nodes {} => {
-            let resp = storcon_client
-                .dispatch::<(), Vec<NodeDescribeResponse>>(
-                    Method::GET,
-                    "control/v1/node".to_string(),
-                    None,
-                )
-                .await?;
-            let mut table = comfy_table::Table::new();
-            table.set_header(["Id", "Hostname", "Scheduling", "Availability"]);
-            for node in resp {
-                table.add_row([
-                    format!("{}", node.id),
-                    node.listen_http_addr,
-                    format!("{:?}", node.scheduling),
-                    format!("{:?}", node.availability),
-                ]);
-            }
-            println!("{table}");
-        }
-        Command::NodeConfigure {
-            node_id,
-            availability,
-            scheduling,
-        } => {
-            let req = NodeConfigureRequest {
-                node_id,
-                availability: availability.map(|a| a.0),
-                scheduling,
-            };
-            storcon_client
-                .dispatch::<_, ()>(
-                    Method::PUT,
-                    format!("control/v1/node/{node_id}/config"),
-                    Some(req),
-                )
-                .await?;
-        }
-        Command::Tenants {} => {
-            let resp = storcon_client
-                .dispatch::<(), Vec<TenantDescribeResponse>>(
-                    Method::GET,
-                    "control/v1/tenant".to_string(),
-                    None,
-                )
-                .await?;
-            let mut table = comfy_table::Table::new();
-            table.set_header([
-                "TenantId",
-                "ShardCount",
-                "StripeSize",
-                "Placement",
-                "Scheduling",
-            ]);
-            for tenant in resp {
-                let shard_zero = tenant.shards.into_iter().next().unwrap();
-                table.add_row([
-                    format!("{}", tenant.tenant_id),
-                    format!("{}", shard_zero.tenant_shard_id.shard_count.literal()),
-                    format!("{:?}", tenant.stripe_size),
-                    format!("{:?}", tenant.policy),
-                    format!("{:?}", shard_zero.scheduling_policy),
-                ]);
-            }
-
-            println!("{table}");
-        }
-        Command::TenantPolicy {
-            tenant_id,
-            placement,
-            scheduling,
-        } => {
-            let req = TenantPolicyRequest {
-                scheduling: scheduling.map(|s| s.0),
-                placement: placement.map(|p| p.0),
-            };
-            storcon_client
-                .dispatch::<_, ()>(
-                    Method::PUT,
-                    format!("control/v1/tenant/{tenant_id}/policy"),
-                    Some(req),
-                )
-                .await?;
-        }
-        Command::TenantShardSplit {
-            tenant_id,
-            shard_count,
-            stripe_size,
-        } => {
-            let req = TenantShardSplitRequest {
-                new_shard_count: shard_count,
-                new_stripe_size: stripe_size.map(ShardStripeSize),
-            };
-
-            let response = storcon_client
-                .dispatch::<TenantShardSplitRequest, TenantShardSplitResponse>(
-                    Method::PUT,
-                    format!("control/v1/tenant/{tenant_id}/shard_split"),
-                    Some(req),
-                )
-                .await?;
-            println!(
-                "Split tenant {} into {} shards: {}",
-                tenant_id,
-                shard_count,
-                response
-                    .new_shards
-                    .iter()
-                    .map(|s| format!("{:?}", s))
-                    .collect::<Vec<_>>()
-                    .join(",")
-            );
-        }
-        Command::TenantShardMigrate {
-            tenant_shard_id,
-            node,
-        } => {
-            let req = TenantShardMigrateRequest {
-                tenant_shard_id,
-                node_id: node,
-            };
-
-            storcon_client
-                .dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
-                    Method::PUT,
-                    format!("control/v1/tenant/{tenant_shard_id}/migrate"),
-                    Some(req),
-                )
-                .await?;
-        }
-        Command::TenantConfig { tenant_id, config } => {
-            let tenant_conf = serde_json::from_str(&config)?;
-
-            vps_client
-                .tenant_config(&TenantConfigRequest {
-                    tenant_id,
-                    config: tenant_conf,
-                })
-                .await?;
-        }
-        Command::TenantScatter { tenant_id } => {
-            // Find the shards
-            let locate_response = storcon_client
-                .dispatch::<(), TenantLocateResponse>(
-                    Method::GET,
-                    format!("control/v1/tenant/{tenant_id}/locate"),
-                    None,
-                )
-                .await?;
-            let shards = locate_response.shards;
-
-            let mut node_to_shards: HashMap<NodeId, Vec<TenantShardId>> = HashMap::new();
-            let shard_count = shards.len();
-            for s in shards {
-                let entry = node_to_shards.entry(s.node_id).or_default();
-                entry.push(s.shard_id);
-            }
-
-            // Load list of available nodes
-            let nodes_resp = storcon_client
-                .dispatch::<(), Vec<NodeDescribeResponse>>(
-                    Method::GET,
-                    "control/v1/node".to_string(),
-                    None,
-                )
-                .await?;
-
-            for node in nodes_resp {
-                if matches!(node.availability, NodeAvailabilityWrapper::Active) {
-                    node_to_shards.entry(node.id).or_default();
-                }
-            }
-
-            let max_shard_per_node = shard_count / node_to_shards.len();
-
-            loop {
-                let mut migrate_shard = None;
-                for shards in node_to_shards.values_mut() {
-                    if shards.len() > max_shard_per_node {
-                        // Pick the emptiest
-                        migrate_shard = Some(shards.pop().unwrap());
-                    }
-                }
-                let Some(migrate_shard) = migrate_shard else {
-                    break;
-                };
-
-                // Pick the emptiest node to migrate to
-                let mut destinations = node_to_shards
-                    .iter()
-                    .map(|(k, v)| (k, v.len()))
-                    .collect::<Vec<_>>();
-                destinations.sort_by_key(|i| i.1);
-                let (destination_node, destination_count) = *destinations.first().unwrap();
-                if destination_count + 1 > max_shard_per_node {
-                    // Even the emptiest destination doesn't have space: we're done
-                    break;
-                }
-                let destination_node = *destination_node;
-
-                node_to_shards
-                    .get_mut(&destination_node)
-                    .unwrap()
-                    .push(migrate_shard);
-
-                println!("Migrate {} -> {} ...", migrate_shard, destination_node);
-
-                storcon_client
-                    .dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
-                        Method::PUT,
-                        format!("control/v1/tenant/{migrate_shard}/migrate"),
-                        Some(TenantShardMigrateRequest {
-                            tenant_shard_id: migrate_shard,
-                            node_id: destination_node,
-                        }),
-                    )
-                    .await?;
-                println!("Migrate {} -> {} OK", migrate_shard, destination_node);
-            }
-
-            // Spread the shards across the nodes
-        }
-        Command::TenantDescribe { tenant_id } => {
-            let describe_response = storcon_client
-                .dispatch::<(), TenantDescribeResponse>(
-                    Method::GET,
-                    format!("control/v1/tenant/{tenant_id}"),
-                    None,
-                )
-                .await?;
-            let shards = describe_response.shards;
-            let mut table = comfy_table::Table::new();
-            table.set_header(["Shard", "Attached", "Secondary", "Last error", "status"]);
-            for shard in shards {
-                let secondary = shard
-                    .node_secondary
-                    .iter()
-                    .map(|n| format!("{}", n))
-                    .collect::<Vec<_>>()
-                    .join(",");
-
-                let mut status_parts = Vec::new();
-                if shard.is_reconciling {
-                    status_parts.push("reconciling");
-                }
-
-                if shard.is_pending_compute_notification {
-                    status_parts.push("pending_compute");
-                }
-
-                if shard.is_splitting {
-                    status_parts.push("splitting");
-                }
-                let status = status_parts.join(",");
-
-                table.add_row([
-                    format!("{}", shard.tenant_shard_id),
-                    shard
-                        .node_attached
-                        .map(|n| format!("{}", n))
-                        .unwrap_or(String::new()),
-                    secondary,
-                    shard.last_error,
-                    status,
-                ]);
-            }
-            println!("{table}");
-        }
-    }
-
-    Ok(())
-}
--- a/diesel.toml
+++ b/diesel.toml
@@ -2,8 +2,8 @@
 # see https://diesel.rs/guides/configuring-diesel-cli

 [print_schema]
-file = "storage_controller/src/schema.rs"
+file = "control_plane/attachment_service/src/schema.rs"
 custom_type_derives = ["diesel::query_builder::QueryId"]

 [migrations_directory]
-dir = "storage_controller/migrations"
+dir = "control_plane/attachment_service/migrations"
--- a/docs/rfcs/031-sharding-static.md
+++ b/docs/rfcs/031-sharding-static.md
@@ -1,408 +0,0 @@
-# Sharding Phase 1: Static Key-space Sharding
-
-## Summary
-
-To enable databases with sizes approaching the capacity of a pageserver's disk,
-it is necessary to break up the storage for the database, or _shard_ it.
-
-Sharding in general is a complex area. This RFC aims to define an initial
-capability that will permit creating large-capacity databases using a static configuration
-defined at time of Tenant creation.
-
-## Motivation
-
-Currently, all data for a Tenant, including all its timelines, is stored on a single
-pageserver. The local storage required may be several times larger than the actual
-database size, due to LSM write inflation.
-
-If a database is larger than what one pageserver can hold, then it becomes impossible
-for the pageserver to hold it in local storage, as it must do to provide service to
-clients.
-
-### Prior art
-
-In Neon:
-
-- Layer File Spreading: https://www.notion.so/neondatabase/One-Pager-Layer-File-Spreading-Konstantin-21fd9b11b618475da5f39c61dd8ab7a4
-- Layer File Spreading: https://www.notion.so/neondatabase/One-Pager-Layer-File-Spreading-Christian-eb6b64182a214e11b3fceceee688d843
-- Key Space partitioning: https://www.notion.so/neondatabase/One-Pager-Key-Space-Partitioning-Stas-8e3a28a600a04a25a68523f42a170677
-
-Prior art in other distributed systems is too broad to capture here: pretty much
-any scale out storage system does something like this.
-
-## Requirements
-
-- Enable creating a large (for example, 16TiB) database without requiring dedicated
-  pageserver nodes.
-- Share read/write bandwidth costs for large databases across pageservers, as well
-  as storage capacity, in order to avoid large capacity databases acting as I/O hotspots
-  that disrupt service to other tenants.
-- Our data distribution scheme should handle sparse/nonuniform keys well, since postgres
-  does not write out a single contiguous range of page numbers.
-
-_Note: the definition of 'large database' is arbitrary, but the lower bound is to ensure that a database
-that a user might create on a current-gen enterprise SSD should also work well on
-Neon. The upper bound is whatever postgres can handle: i.e. we must make sure that the
-pageserver backend is not the limiting factor in the database size_.
-
-## Non Goals
-
-- Independently distributing timelines within the same tenant. If a tenant has many
-  timelines, then sharding may be a less efficient mechanism for distributing load than
-  sharing out timelines between pageservers.
-- Distributing work in the LSN dimension: this RFC focuses on the Key dimension only,
-  based on the idea that separate mechanisms will make sense for each dimension.
-
-## Impacted Components
-
-pageserver, control plane, postgres/smgr
-
-## Terminology
-
-**Key**: a postgres page number, qualified by relation. In the sense that the pageserver is a versioned key-value store,
-the page number is the key in that store. `Key` is a literal data type in existing code.
-
-**LSN dimension**: this just means the range of LSNs (history), when talking about the range
-of keys and LSNs as a two dimensional space.
-
-## Implementation
-
-### Key sharding vs. LSN sharding
-
-When we think of sharding across the two dimensional key/lsn space, this is an
-opportunity to think about how the two dimensions differ:
-
-- Sharding the key space distributes the _write_ workload of ingesting data
-  and compacting. This work must be carefully managed so that exactly one
-  node owns a given key.
-- Sharding the LSN space distributes the _historical read_ workload. This work
-  can be done by anyone without any special coordination, as long as they can
-  see the remote index and layers.
-
-The key sharding is the harder part, and also the more urgent one, to support larger
-capacity databases. Because distributing historical LSN read work is a relatively
-simpler problem that most users don't have, we defer it to future work. It is anticipated
-that some quite simple P2P offload model will enable distributing work for historical
-reads: a node which is low on space can call out to a peer to ask it to download and
-serve reads from a historical layer.
-
-### Key mapping scheme
-
-Having decided to focus on key sharding, we must next decide how we will map
-keys to shards. It is proposed to use a "wide striping" approach, to obtain a good compromise
-between data locality and avoiding entire large relations mapping to the same shard.
-
-We will define two spaces:
-
-- Key space: unsigned integer
-- Shard space: integer from 0 to N-1, where we have N shards.
-
-### Key -> Shard mapping
-
-Keys are currently defined in the pageserver's getpage@lsn interface as follows:
-
-```
-pub struct Key {
-    pub field1: u8,
-    pub field2: u32,
-    pub field3: u32,
-    pub field4: u32,
-    pub field5: u8,
-    pub field6: u32,
-}
-
-
-fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
-    Key {
-        field1: 0x00,
-        field2: rel.spcnode,
-        field3: rel.dbnode,
-        field4: rel.relnode,
-        field5: rel.forknum,
-        field6: blknum,
-    }
-}
-```
-
-_Note: keys for relation metadata are ignored here, as this data will be mirrored to all
-shards. For distribution purposes, we only care about user data keys_
-
-The properties we want from our Key->Shard mapping are:
-
-- Locality in `blknum`, such that adjacent `blknum` will usually map to
-  the same stripe and consequently land on the same shard, even though the overall
-  collection of blocks in a relation will be spread over many stripes and therefore
-  many shards.
-- Avoid the same blknum on different relations landing on the same stripe, so that
-  with many small relations we do not end up aliasing data to the same stripe/shard.
-- Avoid vulnerability to aliasing in the values of relation identity fields, such that
-  if there are patterns in the value of `relnode`, these do not manifest as patterns
-  in data placement.
-
-To accomplish this, the blknum is used to select a stripe, and stripes are
-assigned to shards in a pseudorandom order via a hash. The motivation for
-pseudo-random distribution (rather than sequential mapping of stripe to shard)
-is to avoid I/O hotspots when sequentially reading multiple relations: we don't want
-all relations' stripes to touch pageservers in the same order.
-
-To map a `Key` to a shard:
-
-- Hash the `Key` field 4 (relNode).
-- Divide field 6 (`blknum`) by the stripe size in pages, and combine the
-  hash of this with the hash from the previous step.
-- The total hash modulo the shard count gives the shard holding this key.
-
-Why don't we use the other fields in the Key?
-
-- We ignore `forknum` for key mapping, because it distinguishes different classes of data
-  in the same relation, and we would like to keep the data in a relation together.
-- We would like to use spcNode and dbNode, but cannot. Postgres database creation operations can refer to an existing database as a template, such that the created
-  database's blocks differ only by spcNode and dbNode from the original. To enable running
-  this type of creation without cross-pageserver communication, we must ensure that these
-  blocks map to the same shard -- we do this by excluding spcNode and dbNode from the hash.
-
-### Data placement examples
-
-For example, consider two extreme cases of postgres data layout for a large database in a system with 8 shards
-and a stripe size of 32k pages:
-
-- A single large relation: `blknum` division will break the data up into 4096
-  stripes, which will be scattered across the shards.
-- 4096 relations of 32k pages each: each relation will map to exactly one stripe,
-  and that stripe will be placed according to the hash of key field 4. The
-  data placement will be statistically uniform across shards.
-
-Data placement will be more uneven on smaller databases:
-
-- A tenant with 2 shards and 2 relations of one stripe size each: there is a 50% chance
-  that both relations land on the same shard and no data lands on the other shard.
-- A tenant with 8 shards and one relation of size 12 stripes: 4 shards will have double
-  the data of the other four shards.
-
-These uneven cases for small amounts of data do not matter, as long as the stripe size
-is an order of magnitude smaller than the amount of data we are comfortable holding
-in a single shard: if our system handles shard sizes up to 10-100GB, then it is not an issue if
-a tenant has some shards with 256MB size and some shards with 512MB size, even though
-the standard deviation of shard size within the tenant is very high. Our key mapping
-scheme provides a statistical guarantee that as the tenant's overall data size increases,
-uniformity of placement will improve.
-
-### Important Types
-
-#### `ShardIdentity`
-
-Provides the information needed to know whether a particular key belongs
-to a particular shard:
-
-- Layout version
-- Stripe size
-- Shard count
-- Shard index
-
-This structure's size is constant. Note that if we had used a different key
-mapping scheme such as consistent hashing with explicit hash ranges assigned
-to each shard, then the ShardIdentity's size would grow with the shard count: the simpler
-key mapping scheme used here enables a small fixed size ShardIdentity.
-
-### Pageserver changes
-
-#### Structural
-
-Everywhere the Pageserver currently deals with Tenants, it will move to dealing with
-`TenantShard`s, which are just a `Tenant` plus a `ShardIdentity` telling it which part
-of the keyspace it owns. An un-sharded tenant is just a `TenantShard` whose `ShardIdentity`
-covers the whole keyspace.
-
-When the pageserver writes layers and index_part.json to remote storage, it must
-include the shard index & count in the name, to avoid collisions (the count is
-necessary for future-proofing: the count will vary in time). These keys
-will also include a generation number: the [generation numbers](025-generation-numbers.md) system will work
-exactly the same for TenantShards as it does for Tenants today: each shard will have
-its own generation number.
-
-#### Storage Format: Keys
-
-For tenants with >1 shard, layer files implicitly become sparse: within the key
-range described in the layer name, the layer file for a shard will only hold the
-content relevant to stripes assigned to the shard.
-
-For this reason, the LayerFileName within a tenant is no longer unique: different shards
-may use the same LayerFileName to refer to different data. We may solve this simply
-by including the shard number in the keys used for layers.
-
-The shard number will be included as a prefix (as part of tenant ID), like this:
-
-`pageserver/v1/tenants/<tenant_id>-<shard_number><shard_count>/timelines/<timeline id>/<layer file name>-<generation>`
-
-`pageserver/v1/tenants/<tenant_id>-<shard_number><shard_count>/timelines/<timeline id>/index_part.json-<generation>`
-
-Reasons for this particular format:
-
-- Use of a prefix is convenient for implementation (no need to carry the shard ID everywhere
-  we construct a layer file name), and enables efficient listing of index_parts within
-  a particular shard-timeline prefix.
-- Including the shard _count_ as well as shard number means that in future when we implement
-  shard splitting, it will be possible for a parent shard and one of its children to write
-  the same layer file without a name collision. For example, a parent shard 0_1 might split
-  into two (0_2, 1_2), and in the process of splitting shard 0_2 could write a layer or index_part
-  that is distinct from what shard 0_1 would have written at the same place.
-
-In practice, we expect shard counts to be relatively small, so a `u8` will be sufficient,
-and therefore the shard part of the path can be a fixed-length hex string like `{:02X}{:02X}`,
-for example a single-shard tenant's prefix will be `0001`.
-
-For backward compatibility, we may define a special `ShardIdentity` that has shard_count==0,
-and use this as a cue to construct paths with no prefix at all.
-
-#### Storage Format: Indices
-
-In the phase 1 described in this RFC, shards only reference layers they write themselves. However,
-when we implement shard splitting in future, it will be useful to enable shards to reference layers
-written by other shards (specifically the parent shard during a split), so that shards don't
-have to exhaustively copy all data into their own shard-prefixed keys.
-
-To enable this, the `IndexPart` structure will be extended to store the (shard number, shard count)
-tuple on each layer, such that it can construct paths for layers written by other shards. This
-naturally raises the question of who "owns" such layers written by ancestral shards: this problem
-will be addressed in phase 2.
-
-For backward compatibility, any index entry without shard information will be assumed to be
-in the legacy `ShardIdentity`.
-
-#### WAL Ingest
-
-In Phase 1, all shards will subscribe to the safekeeper to download WAL content. They will filter
-it down to the pages relevant to their shard:
-
-- For ordinary user data writes, only retain a write if it matches the ShardIdentity
-- For metadata describing relations etc, all shards retain these writes.
-
-The pageservers must somehow give the safekeeper correct feedback on remote_consistent_lsn:
-one solution here is for the 0th shard to periodically peek at the IndexParts for all the other shards,
-and have only the 0th shard populate remote_consistent_lsn. However, this is relatively
-expensive: if the safekeeper can be made shard-aware then it could be taught to use
-the min() of all shards' remote_consistent_lsns to decide when to trim the WAL (WAL is only safe to trim once every shard has persisted it).
-
-#### Compaction/GC
-
-No changes needed.
-
-The pageserver doesn't have to do anything special during compaction
-or GC. It is implicitly operating on the subset of keys that map to its ShardIdentity.
-This will result in sparse layer files, containing keys only in the stripes that this
-shard owns. Where optimizations currently exist in compaction for spotting "gaps" in
-the key range, these should be updated to ignore gaps that are due to sharding, to
-avoid spuriously splitting up layers into stripe-sized pieces.
-
-### Compute Endpoints
-
-Compute endpoints will need to:
-
-- Accept a vector of connection strings as part of their configuration from the control plane
-- Route pageserver requests according to mapping the hash of key to the correct
-  entry in the vector of connection strings.
-
-Doing this in compute rather than routing requests via a single pageserver is
-necessary to enable sharding tenants without adding latency from extra hops.
-
-### Control Plane
-
-Tenants, or _Projects_ in the control plane, will each own a set of TenantShards (this will
-be 1 for small tenants). Logic for placement of tenant shards is just the same as the current logic for placing
-tenants.
-
-Tenant lifecycle operations like deletion will require fanning-out to all the shards
-in the tenant. The same goes for timeline creation and deletion: a timeline should
-not be considered created until it has been created in all shards.
-
-#### Selectively enabling sharding for large tenants
-
-Initially, we will explicitly enable sharding for large tenants only.
-
-In future, this hint mechanism will become optional when we implement automatic
-re-sharding of tenants.
-
-## Future Phases
-
-This section exists to indicate what will likely come next after this phase.
-
-Phases 2a and 2b are amenable to execution in parallel.
-
-### Phase 2a: WAL fan-out
-
-**Problem**: when all shards consume the whole WAL, the network bandwidth used
-for transmitting the WAL from safekeeper to pageservers is multiplied by a factor
-of the shard count.
-
-Network bandwidth is not our most pressing bottleneck, but it is likely to become
-a problem if we set a modest shard count (~8) on a significant number of tenants,
-especially as those larger tenants which we shard are also likely to have higher
-write bandwidth than average.
-
-### Phase 2b: Shard Splitting
-
-**Problem**: the number of shards in a tenant is defined at creation time and cannot
-be changed. This causes excessive sharding for most small tenants, and an upper
-bound on scale for very large tenants.
-
-To address this, a _splitting_ feature will later be added. One shard can split its
-data into a number of children by doing a special compaction operation to generate
-image layers broken up child-shard-wise, and then writing out an `index_part.json` for
-each child. This will then require external coordination (by the control plane) to
-safely attach these new child shards and then move them around to distribute work.
-The opposite _merging_ operation can also be imagined, but is unlikely to be implemented:
-once a Tenant has been sharded, the marginal efficiency benefit of merging is unlikely to justify
-the risk/complexity of implementing such a rarely-encountered scenario.
-
-### Phase N (future): distributed historical reads
-
-**Problem**: while sharding based on key is good for handling changes in overall
-database size, it is less suitable for spiky/unpredictable changes in the read
-workload to historical layers. Sudden increases in historical reads could result
-in sudden increases in local disk capacity required for a TenantShard.
-
-Example: the extreme case of this would be to run a tenant for a year, then create branches
-with ancestors at monthly intervals. This could lead to a sudden 12x inflation in
-the on-disk capacity footprint of a TenantShard, since it would be serving reads
-from all those disparate historical layers.
-
-If we can respond fast enough, then key-sharding a tenant more finely can help with
-this, but splitting may be a relatively expensive operation and the increased historical
-read load may be transient.
-
-A separate mechanism for handling heavy historical reads could be something like
-a gossip mechanism for pageservers to communicate
-about their workload, and then a getpageatlsn offload mechanism where one pageserver can
-ask another to go read the necessary layers from remote storage to serve the read. This
-requires relatively little coordination because it is read-only: any node can service any
-read. All reads to a particular shard would still flow through one node, but the
-disk capacity & I/O impact of servicing the read would be distributed.
-
-## FAQ/Alternatives
-
-### Why stripe the data, rather than using contiguous ranges of keyspace for each shard?
-
-When a database is growing under a write workload, writes may predominantly hit the
-end of the keyspace, creating a bandwidth hotspot on that shard. Similarly, if the user
-is intensively re-writing a particular relation, if that relation lived in a particular
-shard then it would not achieve our goal of distributing the write work across shards.
-
-### Why not proxy read requests through one pageserver, so that endpoints don't have to change?
-
-1. This would not achieve scale-out of network bandwidth: a busy tenant with a large
-   database would still cause a load hotspot on the pageserver routing its read requests.
-2. The additional hop through the "proxy" pageserver would add latency and overall
-   resource cost (CPU, network bandwidth)
-
-### Layer File Spreading: use one pageserver as the owner of a tenant, and have it spread out work on a per-layer basis to peers
-
-In this model, there would be no explicit sharding of work, but the pageserver to which
-a tenant is attached would not hold all layers on its disk: instead, it would call out
-to peers to have them store some layers, and call out to those peers to request reads
-in those layers.
-
-This mechanism will work well for distributing work in the LSN dimension, but in the key
-space dimension it has the major limitation of requiring one node to handle all
-incoming writes, and compactions. Even if the write workload for a large database
-fits in one pageserver, it will still be a hotspot and such tenants may still
-de-facto require their own pageserver.
--- a/docs/rfcs/032-shard-splitting.md
+++ b/docs/rfcs/032-shard-splitting.md
@@ -1,479 +0,0 @@
-# Shard splitting
-
-## Summary
-
-This RFC describes a new pageserver API for splitting an existing tenant shard into
-multiple shards, and describes how to use this API to safely increase the total
-shard count of a tenant.
-
-## Motivation
-
-In the [sharding RFC](031-sharding-static.md), a mechanism was introduced to scale
-tenants beyond the capacity of a single pageserver by breaking up the key space
-into stripes, and distributing these stripes across many pageservers. However,
-the shard count was defined once at tenant creation time and not varied thereafter.
-
-In practice, the expected size of a database is rarely known at creation time, and
-it is inefficient to enable sharding for very small tenants: we need to be
-able to create a tenant with a small number of shards (such as 1), and later expand
-when it becomes clear that the tenant has grown in size to a point where sharding
-is beneficial.
-
-### Prior art
-
-Many distributed systems have the problem of choosing how many shards to create for
-tenants that do not specify an expected size up-front. There are a couple of general
-approaches:
-
-- Write to a key space in order, and start a new shard when the highest key advances
-  past some point. This doesn't work well for Neon, because we write to our key space
-  in many different contiguous ranges (per relation), rather than in one contiguous
-  range. To adapt to this kind of model, we would need a sharding scheme where each
-  relation had its own range of shards, which would be inefficient for the common
-  case of databases with many small relations.
-- Monitor the system, and automatically re-shard at some size threshold. For
-  example in Ceph, the [pg_autoscaler](https://github.com/ceph/ceph/blob/49c27499af4ee9a90f69fcc6bf3597999d6efc7b/src/pybind/mgr/pg_autoscaler/module.py)
-  component monitors the size of each RADOS Pool, and adjusts the number of Placement
-  Groups (Ceph's shard equivalent).
-
-## Requirements
-
-- A configurable capacity limit per-shard is enforced.
-- Changes in shard count do not interrupt service beyond requiring postgres
-  to reconnect (i.e. milliseconds).
-- A human being does not have to choose the shard count.
-
-## Non Goals
-
-- Shard splitting is always a tenant-global operation: we will not enable splitting
-  one shard while leaving others intact.
-- The inverse operation (shard merging) is not described in this RFC. This is a lower
-  priority than splitting, because databases grow more often than they shrink, and
-  a database with many shards will still work properly if the stored data shrinks, just
-  with slightly more overhead (e.g. redundant WAL replication)
-- Shard splitting is only initiated based on capacity bounds, not load. Splitting
-  a tenant based on load will make sense for some medium-capacity, high-load workloads,
-  but is more complex to reason about and likely is not desirable until we have
-  shard merging to reduce the shard count again if the database becomes less busy.
-
-## Impacted Components
-
-pageserver, storage controller
-
-(the _storage controller_ is the evolution of what was called `attachment_service` in our test environment)
-
-## Terminology
-
-**Parent** shards are the shards that exist before a split. **Child** shards are
-the new shards created during a split.
-
-**Shard** is synonymous with _tenant shard_.
-
-**Shard Index** is the 2-tuple of shard number and shard count, written in
-paths as {:02x}{:02x}, e.g. `0001`.
-
-## Background
-
-In the implementation section, a couple of existing aspects of sharding are important
-to remember:
-
-- Shard identifiers contain the shard number and count, so that "shard 0 of 1" (`0001`) is
-  a distinct shard from "shard 0 of 2" (`0002`). This is the case in key paths, local
-  storage paths, and remote index metadata.
-- Remote layer file paths contain the shard index of the shard that created them, and
-  remote indices contain the same index to enable building the layer file path. A shard's
-  index may reference layers that were created by another shard.
-- Local tenant shard directories include the shard index. All layers downloaded by
-  a tenant shard are stored in this shard-prefixed path, even if those layers were
-  initially created by another shard: tenant shards do not read and write one another's
-  paths.
-- The `Tenant` pageserver type represents one tenant _shard_, not the whole tenant.
-  This is for historical reasons and will be cleaned up in future, but the existing
-  name is used here to help comprehension when reading code.
-
-## Implementation
-
-Note: this section focuses on the correctness of the core split process. This will
-be fairly inefficient in a naive implementation, and several important optimizations
-are described in a later section.
-
-There are broadly two parts to the implementation:
-
-1. The pageserver split API, which splits one shard on one pageserver
-2. The overall tenant split process which is coordinated by the storage controller,
-   and calls into the pageserver split API as needed.
-
-### Pageserver Split API
-
-The pageserver will expose a new API endpoint at `/v1/tenant/:tenant_shard_id/shard_split`
-that takes the new total shard count in the body.
-
-The pageserver split API operates on one tenant shard, on one pageserver. External
-coordination is required to use it safely, this is described in the later
-'Split procedure' section.
-
-#### Preparation
-
-First identify the shard indices for the new child shards. These are deterministic,
-calculated from the parent shard's index, and the number of children being created (this
-is an input to the API, and validated to be a power of two). In a trivial example, splitting
-0001 in two always results in 0002 and 0102.
-
-Child shard indices are chosen such that the childrens' parts of the keyspace will
-be subsets of the parent's parts of the keyspace.
-
-#### Step 1: write new remote indices
-
-In remote storage, splitting is very simple: we may just write new index_part.json
-objects for each child shard, containing exactly the same layers as the parent shard.
-
-The children will have more data than they need, but this avoids any exhaustive
-re-writing or copying of layer files.
-
-The index key path includes a generation number: the parent shard's current
-attached generation number will also be used for the child shards' indices. This
-makes the operation safely retryable: if everything crashes and restarts, we may
-call the split API again on the parent shard, and the result will be some new remote
-indices for the child shards, under a higher generation number.
-
-#### Step 2: start new `Tenant` objects
-
-A new `Tenant` object may be instantiated for each child shard, while the parent
-shard still exists. When calling the tenant_spawn function for this object,
-the remote index from step 1 will be read, and the child shard will start
-to ingest WAL to catch up from whatever was in the remote storage at step 1.
-
-We now wait for child shards' WAL ingestion to catch up with the parent shard,
-so that we can safely tear down the parent shard without risking an availability
-gap to clients reading recent LSNs.
-
-#### Step 3: tear down parent `Tenant` object
-
-Once child shards are running and have caught up with WAL ingest, we no longer
-need the parent shard. Note that clients may still be using it -- when we
-shut it down, any page_service handlers will also shut down, causing clients
-to disconnect. When the client reconnects, it will re-lookup the tenant,
-and hit the child shard instead of the parent (shard lookup from page_service
-should bias toward higher ShardCount shards).
-
-Note that at this stage the page service client has not yet been notified of
-any split. In the trivial single split example:
-
-- Shard 0001 is gone: Tenant object torn down
-- Shards 0002 and 0102 are running on the same pageserver where Shard 0001 used to live.
-- Clients will continue to connect to that server thinking that shard 0001 is there,
-  and all requests will work, because any key that was in shard 0001 is definitely
-  available in either shard 0002 or shard 0102.
-- Eventually, the storage controller (not the pageserver) will decide to migrate
-  some child shards away: at that point it will do a live migration, ensuring
-  that the client has an updated configuration before it detaches anything
-  from the original server.
-
-#### Complete
-
-When we send a 200 response to the split request, we are promising the caller:
-
-- That the child shards are persistent in remote storage
-- That the parent shard has been shut down
-
-This enables the caller to proceed with the overall shard split operation, which
-may involve other shards on other pageservers.
-
-### Storage Controller Split procedure
-
-Splitting a tenant requires calling the pageserver split API, and tracking
-enough state to ensure recovery + completion in the event of any component (pageserver
-or storage controller) crashing (or request timing out) during the split.
-
-1. call the split API on all existing shards. Ensure that the resulting
-   child shards are pinned to their pageservers until _all_ the split calls are done.
-   This pinning may be implemented as a "split bit" on the tenant shards, that
-   blocks any migrations, and also acts as a sign that if we restart, we must go
-   through some recovery steps to resume the split.
-2. Once all the split calls are done, we may unpin the child shards (clear
-   the split bit). The split is now complete: subsequent steps are just migrations,
-   not strictly part of the split.
-3. Try to schedule new pageserver locations for the child shards, using
-   a soft anti-affinity constraint to place shards from the same tenant onto different
-   pageservers.
-
-Updating computes about the new shard count is not necessary until we migrate
-any of the child shards away from the parent's location.
-
-### Recovering from failures
-
-#### Rolling back an incomplete split
-
-An incomplete shard split may be rolled back quite simply, by attaching the parent shards to pageservers,
-and detaching child shards. This will lose any WAL ingested into the children after the parents
-were detached earlier, but the parents will catch up.
-
-No special pageserver API is needed for this. From the storage controller's point of view, the
-procedure is:
-
-1. For all parent shards in the tenant, ensure they are attached
-2. For all child shards, ensure they are not attached
-3. Drop child shards from the storage controller's database, and clear the split bit on the parent shards.
-
-Any remote storage content for child shards is left behind. This is similar to other cases where
-we may leave garbage objects in S3 (e.g. when we upload a layer but crash before uploading an
-index that references it). Future online scrub/cleanup functionality can remove these objects, or
-they will be removed when the tenant is deleted, as tenant deletion lists all objects in the prefix,
-which would include any child shards that were rolled back.
-
-If any timelines had been created on child shards, they will be lost when rolling back. To mitigate
-this, we will **block timeline creation during splitting**, so that we can safely roll back until
-the split is complete, without risking losing timelines.
-
-Rolling back an incomplete split will happen automatically if a split fails due to some fatal
-reason, and will not be accessible via an API:
-
-- A pageserver fails to complete its split API request after too many retries
-- A pageserver returns a fatal unexpected error such as 400 or 500
-- The storage controller database returns a non-retryable error
-- Some internal invariant is violated in the storage controller split code
-
-#### Rolling back a complete split
-
-A complete shard split may be rolled back similarly to an incomplete split, with the following
-modifications:
-
-- The parent shards will no longer exist in the storage controller database, so these must
-  be re-synthesized somehow: the hard part of this is figuring the parent shards' generations. This
-  may be accomplished either by probing in S3, or by retaining some tombstone state for deleted
-  shards in the storage controller database.
-- Any timelines that were created after the split completed will disappear when rolling back
-  to the tenant shards. For this reason, rolling back after a complete split should only
-  be done due to serious issues where loss of recently created timelines is acceptable, or
-  in cases where we have confirmed that no timelines were created in the intervening period.
-- Parent shards' layers must not have been deleted: this property will come "for free" when
-  we first roll out sharding, by simply not implementing deletion of parent layers after
-  a split. When we do implement such deletion (see "Cleaning up parent-shard layers" in the
-  Optimizations section), it should apply a TTL to layers such that we have a
-  defined walltime window in which rollback will be possible.
-
-The storage controller will expose an API for rolling back a complete split, for use
-in the field if we encounter some critical bug with a post-split tenant.
-
-#### Retrying API calls during Pageserver Restart
-
-When a pageserver restarts during a split API call, it may witness on-disk content for both parent and
-child shards from an ongoing split. This does not intrinsically break anything, and the
-pageserver may include all these shards in its `/re-attach` request to the storage controller.
-
-In order to support such restarts, it is important that the storage controller stores
-persistent records of each child shard before it calls into a pageserver, as these child shards
-may require generation increments via a `/re-attach` request.
-
-The pageserver restart will also result in a failed API call from the storage controller's point
-of view. Recall that if _any_ pageserver fails to split, the overall split operation may not
-complete, and all shards must remain pinned to their current pageserver locations until the
-split is done.
-
-The pageserver API calls during splitting will retry on transient errors, so that
-short availability gaps do not result in a failure of the overall operation. The
-split in progress will be automatically rolled back if the threshold for API
-retries is reached (e.g. if a pageserver stays offline for longer than a typical
-restart).
-
-#### Rollback on Storage Controller Restart
-
-On startup, the storage controller will inspect the split bit for tenant shards that
-it loads from the database. If any splits are in progress:
-
-- Database content will be reverted to the parent shards
-- Child shards will be dropped from memory
-- The parent and child shards will be included in the general startup reconciliation that
-  the storage controller does: any child shards will be detached from pageservers because
-  they don't exist in the storage controller's expected set of shards, and parent shards
-  will be attached if they aren't already.
-
-#### Storage controller API request failures/retries
-
-The split request handler will implement idempotency: if the [`Tenant`] requested to split
-doesn't exist, we will check for the would-be child shards, and if they already exist,
-we consider the request complete.
-
-If a request is retried while the original request is still underway, then the split
-request handler will notice an InProgress marker in TenantManager, and return 503
-to encourage the client to backoff/retry. This is the same as the general pageserver
-API handling for calls that try to act on an InProgress shard.
-
-#### Compute start/restart during a split
-
-If a compute starts up during split, it will be configured with the old sharding
-configuration. This will work for reads irrespective of the progress of the split
-as long as no child shards have been migrated away from their original location, and
-this is guaranteed in the split procedure (see earlier section).
-
-#### Pageserver fails permanently during a split
-
-If a pageserver permanently fails (i.e. the storage controller availability state for it
-goes to Offline) while a split is in progress, the splitting operation will roll back, and
-during the roll back it will skip any API calls to the offline pageserver. If the offline
-pageserver becomes available again, any stale locations will be cleaned up via the normal reconciliation process (the `/re-attach` API).
-
-### Handling secondary locations
-
-For correctness, it is not necessary to split secondary locations. We can simply detach
-the secondary locations for parent shards, and then attach new secondary locations
-for child shards.
-
-Clearly this is not optimal, as it will result in re-downloads of layer files that
-were already present on disk. See "Splitting secondary locations"
-
-### Conditions to trigger a split
-
-The pageserver will expose a new API for reporting on shards that are candidates
-for split: this will return a top-N report of the largest tenant shards by
-physical size (remote size). This should exclude any tenants that are already
-at the maximum configured shard count.
-
-The API would look something like:
-`/v1/top_n_tenant?shard_count_lt=8&sort_by=resident_size`
-
-The storage controller will poll that API across all pageservers it manages at some appropriate interval (e.g. 60 seconds).
-
-A split operation will be started when the tenant exceeds some threshold. This threshold
-should be _less than_ how large we actually want shards to be, perhaps much less. That's to
-minimize the amount of work involved in splitting -- if we want 100GiB shards, we shouldn't
-wait for a tenant to exceed 100GiB before we split anything. Some data analysis of existing
-tenant size distribution may be useful here: if we can make a statement like "usually, if
-a tenant has exceeded 20GiB they're probably going to exceed 100GiB later", then we might
-make our policy to split a tenant at 20GiB.
-
-The finest split we can do is by factors of two, but we can do higher-cardinality splits
-too, and this will help to reduce the overhead of repeatedly re-splitting a tenant
-as it grows. An example of a very simple heuristic for early deployment of the splitting
-feature would be: "Split tenants into 8 shards when their physical size exceeds 64GiB": that
-would give us two kinds of tenant (1 shard and 8 shards), and the confidence that once we had
-split a tenant, it will not need re-splitting soon after.
-
-## Optimizations
-
-### Flush parent shard to remote storage during split
-
-Any data that is in WAL but not remote storage at time of split will need
-to be replayed by child shards when they start for the first time. To minimize
-this work, we may flush the parent shard to remote storage before writing the
-remote indices for child shards.
-
-It is important that this flush is subject to some time bounds: we may be splitting
-in response to a surge of write ingest, so it may be time-critical to split. A
-few seconds to flush latest data should be sufficient to optimize common cases without
-running the risk of holding up a split for a harmful length of time when a parent
-shard is being written heavily. If the flush doesn't complete in time, we may proceed
-to shut down the parent shard and carry on with the split.
-
-### Hard linking parent layers into child shard directories
-
-Before we start the Tenant objects for child shards, we may pre-populate their
-local storage directories with hard links to the layer files already present
-in the parent shard's local directory. When the child shard starts and downloads
-its remote index, it will find all those layer files already present on local disk.
-
-This avoids wasting download capacity and makes splitting faster, but more importantly
-it avoids taking up a factor of N more disk space when splitting 1 shard into N.
-
-This mechanism will work well in typical flows where shards are migrated away
-promptly after a split, but for the general case including what happens when
-layers are evicted and re-downloaded after a split, see the 'Proactive compaction'
-section below.
-
-### Filtering during compaction
-
-Compaction, especially image layer generation, should skip any keys that are
-present in a shard's layer files, but do not match the shard's ShardIdentity's
-is_key_local() check. This avoids carrying around data for longer than necessary
-in post-split compactions.
-
-This was already implemented in https://github.com/neondatabase/neon/pull/6246
-
-### Proactive compaction
-
-In remote storage, there is little reason to rewrite any data on a shard split:
-all the children can reference parent layers via the very cheap write of the child
-index_part.json.
-
-In local storage, things are more nuanced. During the initial split there is no
-capacity cost to duplicating parent layers, if we implement the hard linking
-optimization described above. However, as soon as any layers are evicted from
-local disk and re-downloaded, the downloaded layers will not be hard-links any more:
-they'll have real capacity footprint. That isn't a problem if we migrate child shards
-away from the parent node swiftly, but it risks a significant over-use of local disk
-space if we do not.
-
-For example, if we did an 8-way split of a shard, and then _didn't_ migrate 7 of
-the shards elsewhere, then churned all the layers in all the shards via eviction,
-then we would blow up the storage capacity used on the node by 8x. If we're splitting
-a 100GB shard, that could take the pageserver to the point of exhausting disk space.
-
-To avoid this scenario, we could implement a special compaction mode where we just
-read historic layers, drop unwanted keys, and write back the layer file. This
-is pretty expensive, but useful if we have split a large shard and are not going to
-migrate the child shards away.
-
-The heuristic conditions for triggering such a compaction are:
-
-- A) eviction plus time: if a child shard
-  has existed for more than a time threshold, and has been requested to perform at least one eviction, then it becomes urgent for this child shard to execute a proactive compaction to reduce its storage footprint, at the cost of I/O load.
-- B) resident size plus time: we may inspect the resident layers and calculate how
-  many of them include the overhead of storing pre-split keys. If, after some time
-  threshold (different to the one in case A), we still have such layers occupying
-  local disk space, then we should proactively compact them.
-
-### Cleaning up parent-shard layers
-
-It is functionally harmless to leave parent shard layers in remote storage indefinitely.
-They would be cleaned up in the event of the tenant's deletion.
-
-As an optimization to avoid leaking remote storage capacity (which costs money), we may
-lazily clean up parent shard layers once no child shards reference them.
-
-This may be done _very_ lazily: e.g. check every PITR interval. The cleanup procedure is:
-
-- list all the key prefixes beginning with the tenant ID, and select those shard prefixes
-  which do not belong to the most-recently-split set of shards (_ancestral shards_, i.e. `shard_count < max(shard_count)` over all shards), and those shard prefixes which do have the latest shard count (_current shards_)
-- If there are no _ancestral shard_ prefixes found, we have nothing to clean up and
-  may drop out now.
-- find the latest-generation index for each _current shard_, read all and accumulate the set of layers belonging to ancestral shards referenced by these indices.
-- for all ancestral shards, list objects in the prefix and delete any layer which was not
-  referenced by a current shard.
-
-If this cleanup is scheduled for 1-2 PITR periods after the split, there is a good chance that child shards will have written their own image layers covering the whole keyspace, such that all parent shard layers will be deletable.
-
-The cleanup may be done by the scrubber (external process), or we may choose to have
-the zeroth shard in the latest generation do the work -- there is no obstacle to one shard
-reading the other shard's indices at runtime, and we do not require visibility of the
-latest index writes.
-
-Cleanup should be artificially delayed by some period (for example 24 hours) to ensure
-that we retain the option to roll back a split in case of bugs.
-
-### Splitting secondary locations
-
-We may implement a pageserver API similar to the main splitting API, which does a simpler
-operation for secondary locations: it would not write anything to S3, instead it would simply
-create the child shard directory on local disk, hard link in directories from the parent,
-and set up the in memory (TenantSlot) state for the children.
-
-Similar to attached locations, a subset of secondary locations will probably need re-locating
-after the split is complete, to avoid leaving multiple child shards on the same pageservers,
-where they may use excessive space for the tenant.
-
-## FAQ/Alternatives
-
-### What should the thresholds be set to?
-
-Shard size limit: the pre-sharding default capacity quota for databases was 200GiB, so this could be a starting point for the per-shard size limit.
-
-Max shard count:
-
-- The safekeeper overhead to sharding is currently O(N) network bandwidth because
-  the un-filtered WAL is sent to all shards. To avoid this growing out of control,
-  a limit of 8 shards should be temporarily imposed until WAL filtering is implemented
-  on the safekeeper.
-- there is also little benefit to increasing the shard count beyond the number
-  of pageservers in a region.
-
-### Is it worth just rewriting all the data during a split to simplify reasoning about space?
--- a/docs/sourcetree.md
+++ b/docs/sourcetree.md
@@ -7,11 +7,6 @@ Below you will find a brief overview of each subdir in the source tree in alphab
 Neon storage broker, providing messaging between safekeepers and pageservers.
 [storage_broker.md](./storage_broker.md)

-`storage_controller`:
-
-Neon storage controller, manages a cluster of pageservers and exposes an API that enables
-managing a many-sharded tenant as a single entity.
-
 `/control_plane`:

 Local control plane.
--- a/libs/metrics/src/hll.rs
+++ b/libs/metrics/src/hll.rs
@@ -40,7 +40,7 @@ macro_rules! register_hll {
    }};

    ($N:literal, $NAME:expr, $HELP:expr $(,)?) => {{
-        $crate::register_hll!($N, $crate::opts!($NAME, $HELP))
+        $crate::register_hll!($N, $crate::opts!($NAME, $HELP), $LABELS_NAMES)
    }};
 }

--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -2,14 +2,11 @@ use std::str::FromStr;

 /// Request/response types for the storage controller
 /// API (`/control/v1` prefix).  Implemented by the server
-/// in [`storage_controller::http`]
+/// in [`attachment_service::http`]
 use serde::{Deserialize, Serialize};
-use utils::id::{NodeId, TenantId};
+use utils::id::NodeId;

-use crate::{
-    models::{ShardParameters, TenantConfig},
-    shard::{ShardStripeSize, TenantShardId},
-};
+use crate::{models::ShardParameters, shard::TenantShardId};

 #[derive(Serialize, Deserialize)]
 pub struct TenantCreateResponseShard {
@@ -38,16 +35,10 @@ pub struct NodeRegisterRequest {
 pub struct NodeConfigureRequest {
    pub node_id: NodeId,

-    pub availability: Option<NodeAvailabilityWrapper>,
+    pub availability: Option<NodeAvailability>,
    pub scheduling: Option<NodeSchedulingPolicy>,
 }

-#[derive(Serialize, Deserialize)]
-pub struct TenantPolicyRequest {
-    pub placement: Option<PlacementPolicy>,
-    pub scheduling: Option<ShardSchedulingPolicy>,
-}
-
 #[derive(Serialize, Deserialize, Debug)]
 pub struct TenantLocateResponseShard {
    pub shard_id: TenantShardId,
@@ -66,48 +57,6 @@ pub struct TenantLocateResponse {
    pub shard_params: ShardParameters,
 }

-#[derive(Serialize, Deserialize)]
-pub struct TenantDescribeResponse {
-    pub tenant_id: TenantId,
-    pub shards: Vec<TenantDescribeResponseShard>,
-    pub stripe_size: ShardStripeSize,
-    pub policy: PlacementPolicy,
-    pub config: TenantConfig,
-}
-
-#[derive(Serialize, Deserialize)]
-pub struct NodeDescribeResponse {
-    pub id: NodeId,
-
-    pub availability: NodeAvailabilityWrapper,
-    pub scheduling: NodeSchedulingPolicy,
-
-    pub listen_http_addr: String,
-    pub listen_http_port: u16,
-
-    pub listen_pg_addr: String,
-    pub listen_pg_port: u16,
-}
-
-#[derive(Serialize, Deserialize)]
-pub struct TenantDescribeResponseShard {
-    pub tenant_shard_id: TenantShardId,
-
-    pub node_attached: Option<NodeId>,
-    pub node_secondary: Vec<NodeId>,
-
-    pub last_error: String,
-
-    /// A task is currently running to reconcile this tenant's intent state with the state on pageservers
-    pub is_reconciling: bool,
-    /// This shard failed in sending a compute notification to the cloud control plane, and a retry is pending.
-    pub is_pending_compute_notification: bool,
-    /// A shard split is currently underway
-    pub is_splitting: bool,
-
-    pub scheduling_policy: ShardSchedulingPolicy,
-}
-
 /// Explicitly migrating a particular shard is a low level operation
 /// TODO: higher level "Reschedule tenant" operation where the request
 /// specifies some constraints, e.g. asking it to get off particular node(s)
@@ -117,94 +66,29 @@ pub struct TenantShardMigrateRequest {
    pub node_id: NodeId,
 }

-/// Utilisation score indicating how good a candidate a pageserver
-/// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`].
-/// Lower values are better.
-#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Debug)]
-pub struct UtilizationScore(pub u64);
-
-impl UtilizationScore {
-    pub fn worst() -> Self {
-        UtilizationScore(u64::MAX)
-    }
-}
-
-#[derive(Serialize, Deserialize, Clone, Copy, Debug)]
-#[serde(into = "NodeAvailabilityWrapper")]
+#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
 pub enum NodeAvailability {
    // Normal, happy state
-    Active(UtilizationScore),
+    Active,
    // Offline: Tenants shouldn't try to attach here, but they may assume that their
    // secondary locations on this node still exist.  Newly added nodes are in this
    // state until we successfully contact them.
    Offline,
 }

-impl PartialEq for NodeAvailability {
-    fn eq(&self, other: &Self) -> bool {
-        use NodeAvailability::*;
-        matches!((self, other), (Active(_), Active(_)) | (Offline, Offline))
-    }
-}
+impl FromStr for NodeAvailability {
+    type Err = anyhow::Error;

-impl Eq for NodeAvailability {}
-
-// This wrapper provides serde functionality and it should only be used to
-// communicate with external callers which don't know or care about the
-// utilisation score of the pageserver it is targeting.
-#[derive(Serialize, Deserialize, Clone, Copy, Debug)]
-pub enum NodeAvailabilityWrapper {
-    Active,
-    Offline,
-}
-
-impl From<NodeAvailabilityWrapper> for NodeAvailability {
-    fn from(val: NodeAvailabilityWrapper) -> Self {
-        match val {
-            // Assume the worst utilisation score to begin with. It will later be updated by
-            // the heartbeats.
-            NodeAvailabilityWrapper::Active => NodeAvailability::Active(UtilizationScore::worst()),
-            NodeAvailabilityWrapper::Offline => NodeAvailability::Offline,
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s {
+            "active" => Ok(Self::Active),
+            "offline" => Ok(Self::Offline),
+            _ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
        }
    }
 }

-impl From<NodeAvailability> for NodeAvailabilityWrapper {
-    fn from(val: NodeAvailability) -> Self {
-        match val {
-            NodeAvailability::Active(_) => NodeAvailabilityWrapper::Active,
-            NodeAvailability::Offline => NodeAvailabilityWrapper::Offline,
-        }
-    }
-}
-
-#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
-pub enum ShardSchedulingPolicy {
-    // Normal mode: the tenant's scheduled locations may be updated at will, including
-    // for non-essential optimization.
-    Active,
-
-    // Disable optimizations, but permit scheduling when necessary to fulfil the PlacementPolicy.
-    // For example, this still permits a node's attachment location to change to a secondary in
-    // response to a node failure, or to assign a new secondary if a node was removed.
-    Essential,
-
-    // No scheduling: leave the shard running wherever it currently is.  Even if the shard is
-    // unavailable, it will not be rescheduled to another node.
-    Pause,
-
-    // No reconciling: we will make no location_conf API calls to pageservers at all.  If the
-    // shard is unavailable, it stays that way.  If a node fails, this shard doesn't get failed over.
-    Stop,
-}
-
-impl Default for ShardSchedulingPolicy {
-    fn default() -> Self {
-        Self::Active
-    }
-}
-
-#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
+#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
 pub enum NodeSchedulingPolicy {
    Active,
    Filling,
@@ -243,8 +127,11 @@ impl From<NodeSchedulingPolicy> for String {
 /// to create secondary locations.
 #[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)]
 pub enum PlacementPolicy {
-    /// Normal live state: one attached pageserver and zero or more secondaries.
-    Attached(usize),
+    /// Cheapest way to attach a tenant: just one pageserver, no secondary
+    Single,
+    /// Production-ready way to attach a tenant: one attached pageserver and
+    /// some number of secondaries.
+    Double(usize),
    /// Create one secondary mode locations. This is useful when onboarding
    /// a tenant, or for an idle tenant that we might want to bring online quickly.
    Secondary,
@@ -266,14 +153,14 @@ mod test {
    /// Check stability of PlacementPolicy's serialization
    #[test]
    fn placement_policy_encoding() -> anyhow::Result<()> {
-        let v = PlacementPolicy::Attached(1);
+        let v = PlacementPolicy::Double(1);
        let encoded = serde_json::to_string(&v)?;
-        assert_eq!(encoded, "{\"Attached\":1}");
+        assert_eq!(encoded, "{\"Double\":1}");
        assert_eq!(serde_json::from_str::<PlacementPolicy>(&encoded)?, v);

-        let v = PlacementPolicy::Detached;
+        let v = PlacementPolicy::Single;
        let encoded = serde_json::to_string(&v)?;
-        assert_eq!(encoded, "\"Detached\"");
+        assert_eq!(encoded, "\"Single\"");
        assert_eq!(serde_json::from_str::<PlacementPolicy>(&encoded)?, v);
        Ok(())
    }
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -4,7 +4,6 @@ pub mod utilization;
 pub use utilization::PageserverUtilization;

 use std::{
-    borrow::Cow,
    collections::HashMap,
    io::{BufRead, Read},
    num::{NonZeroU64, NonZeroUsize},
@@ -295,13 +294,13 @@ pub struct TenantConfig {
    pub lagging_wal_timeout: Option<String>,
    pub max_lsn_wal_lag: Option<NonZeroU64>,
    pub trace_read_requests: Option<bool>,
+    pub image_layer_compression: Option<CompressionAlgorithm>,
    pub eviction_policy: Option<EvictionPolicy>,
    pub min_resident_size_override: Option<u64>,
    pub evictions_low_residence_duration_metric_threshold: Option<String>,
    pub heatmap_period: Option<String>,
    pub lazy_slru_download: Option<bool>,
    pub timeline_get_throttle: Option<ThrottleConfig>,
-    pub image_layer_creation_check_threshold: Option<u8>,
 }

 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
@@ -329,6 +328,23 @@ pub enum CompactionAlgorithm {
    Tiered,
 }

+#[derive(
+    Debug,
+    Clone,
+    Copy,
+    PartialEq,
+    Eq,
+    Serialize,
+    Deserialize,
+    strum_macros::FromRepr,
+    enum_map::Enum,
+)]
+#[repr(u8)]
+pub enum CompressionAlgorithm {
+    NoCompression,
+    LZ4,
+}
+
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
 pub struct EvictionPolicyLayerAccessThreshold {
    #[serde(with = "humantime_serde")]
@@ -579,7 +595,7 @@ pub struct TimelineInfo {
    pub walreceiver_status: String,
 }

-#[derive(Debug, Clone, Serialize, Deserialize)]
+#[derive(Debug, Clone, Serialize)]
 pub struct LayerMapInfo {
    pub in_memory_layers: Vec<InMemoryLayerInfo>,
    pub historic_layers: Vec<HistoricLayerInfo>,
@@ -597,7 +613,7 @@ pub enum LayerAccessKind {
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct LayerAccessStatFullDetails {
    pub when_millis_since_epoch: u64,
-    pub task_kind: Cow<'static, str>,
+    pub task_kind: &'static str,
    pub access_kind: LayerAccessKind,
 }

@@ -656,23 +672,23 @@ impl LayerResidenceEvent {
    }
 }

-#[derive(Debug, Clone, Serialize, Deserialize)]
+#[derive(Debug, Clone, Serialize)]
 pub struct LayerAccessStats {
    pub access_count_by_access_kind: HashMap<LayerAccessKind, u64>,
-    pub task_kind_access_flag: Vec<Cow<'static, str>>,
+    pub task_kind_access_flag: Vec<&'static str>,
    pub first: Option<LayerAccessStatFullDetails>,
    pub accesses_history: HistoryBufferWithDropCounter<LayerAccessStatFullDetails, 16>,
    pub residence_events_history: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>,
 }

-#[derive(Debug, Clone, Serialize, Deserialize)]
+#[derive(Debug, Clone, Serialize)]
 #[serde(tag = "kind")]
 pub enum InMemoryLayerInfo {
    Open { lsn_start: Lsn },
    Frozen { lsn_start: Lsn, lsn_end: Lsn },
 }

-#[derive(Debug, Clone, Serialize, Deserialize)]
+#[derive(Debug, Clone, Serialize)]
 #[serde(tag = "kind")]
 pub enum HistoricLayerInfo {
    Delta {
@@ -694,32 +710,6 @@ pub enum HistoricLayerInfo {
    },
 }

-impl HistoricLayerInfo {
-    pub fn layer_file_name(&self) -> &str {
-        match self {
-            HistoricLayerInfo::Delta {
-                layer_file_name, ..
-            } => layer_file_name,
-            HistoricLayerInfo::Image {
-                layer_file_name, ..
-            } => layer_file_name,
-        }
-    }
-    pub fn is_remote(&self) -> bool {
-        match self {
-            HistoricLayerInfo::Delta { remote, .. } => *remote,
-            HistoricLayerInfo::Image { remote, .. } => *remote,
-        }
-    }
-    pub fn set_remote(&mut self, value: bool) {
-        let field = match self {
-            HistoricLayerInfo::Delta { remote, .. } => remote,
-            HistoricLayerInfo::Image { remote, .. } => remote,
-        };
-        *field = value;
-    }
-}
-
 #[derive(Debug, Serialize, Deserialize)]
 pub struct DownloadRemoteLayersTaskSpawnRequest {
    pub max_concurrent_downloads: NonZeroUsize,
@@ -752,52 +742,6 @@ pub struct WalRedoManagerStatus {
    pub pid: Option<u32>,
 }

-/// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. initiating
-/// a download job, timing out while waiting for it to run, and then inspecting this status to understand
-/// what's happening.
-#[derive(Default, Debug, Serialize, Deserialize, Clone)]
-pub struct SecondaryProgress {
-    /// The remote storage LastModified time of the heatmap object we last downloaded.
-    #[serde(
-        serialize_with = "opt_ser_rfc3339_millis",
-        deserialize_with = "opt_deser_rfc3339_millis"
-    )]
-    pub heatmap_mtime: Option<SystemTime>,
-
-    /// The number of layers currently on-disk
-    pub layers_downloaded: usize,
-    /// The number of layers in the most recently seen heatmap
-    pub layers_total: usize,
-
-    /// The number of layer bytes currently on-disk
-    pub bytes_downloaded: u64,
-    /// The number of layer bytes in the most recently seen heatmap
-    pub bytes_total: u64,
-}
-
-fn opt_ser_rfc3339_millis<S: serde::Serializer>(
-    ts: &Option<SystemTime>,
-    serializer: S,
-) -> Result<S::Ok, S::Error> {
-    match ts {
-        Some(ts) => serializer.collect_str(&humantime::format_rfc3339_millis(*ts)),
-        None => serializer.serialize_none(),
-    }
-}
-
-fn opt_deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<Option<SystemTime>, D::Error>
-where
-    D: serde::de::Deserializer<'de>,
-{
-    let s: Option<String> = serde::de::Deserialize::deserialize(deserializer)?;
-    match s {
-        None => Ok(None),
-        Some(s) => humantime::parse_rfc3339(&s)
-            .map_err(serde::de::Error::custom)
-            .map(Some),
-    }
-}
-
 pub mod virtual_file {
    #[derive(
        Copy,
--- a/libs/pageserver_api/src/models/utilization.rs
+++ b/libs/pageserver_api/src/models/utilization.rs
@@ -7,7 +7,7 @@ use std::time::SystemTime;
 ///
 /// `format: int64` fields must use `ser_saturating_u63` because openapi generated clients might
 /// not handle full u64 values properly.
-#[derive(serde::Serialize, serde::Deserialize, Debug, Clone)]
+#[derive(serde::Serialize, Debug)]
 pub struct PageserverUtilization {
    /// Used disk space
    #[serde(serialize_with = "ser_saturating_u63")]
@@ -21,10 +21,7 @@ pub struct PageserverUtilization {
    /// When was this snapshot captured, pageserver local time.
    ///
    /// Use millis to give confidence that the value is regenerated often enough.
-    #[serde(
-        serialize_with = "ser_rfc3339_millis",
-        deserialize_with = "deser_rfc3339_millis"
-    )]
+    #[serde(serialize_with = "ser_rfc3339_millis")]
    pub captured_at: SystemTime,
 }

@@ -35,14 +32,6 @@ fn ser_rfc3339_millis<S: serde::Serializer>(
    serializer.collect_str(&humantime::format_rfc3339_millis(*ts))
 }

-fn deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<SystemTime, D::Error>
-where
-    D: serde::de::Deserializer<'de>,
-{
-    let s: String = serde::de::Deserialize::deserialize(deserializer)?;
-    humantime::parse_rfc3339(&s).map_err(serde::de::Error::custom)
-}
-
 /// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients.
 ///
 /// Instead of newtype, use this because a newtype would get require handling deserializing values
--- a/libs/pageserver_api/src/upcall_api.rs
+++ b/libs/pageserver_api/src/upcall_api.rs
@@ -6,9 +6,7 @@
 use serde::{Deserialize, Serialize};
 use utils::id::NodeId;

-use crate::{
-    controller_api::NodeRegisterRequest, models::LocationConfigMode, shard::TenantShardId,
-};
+use crate::{controller_api::NodeRegisterRequest, shard::TenantShardId};

 /// Upcall message sent by the pageserver to the configured `control_plane_api` on
 /// startup.
@@ -22,20 +20,12 @@ pub struct ReAttachRequest {
    pub register: Option<NodeRegisterRequest>,
 }

-fn default_mode() -> LocationConfigMode {
-    LocationConfigMode::AttachedSingle
-}
-
-#[derive(Serialize, Deserialize, Debug)]
+#[derive(Serialize, Deserialize)]
 pub struct ReAttachResponseTenant {
    pub id: TenantShardId,
-    /// Mandatory if LocationConfigMode is None or set to an Attached* mode
-    pub gen: Option<u32>,
-
-    /// Default value only for backward compat: this field should be set
-    #[serde(default = "default_mode")]
-    pub mode: LocationConfigMode,
+    pub gen: u32,
 }
+
 #[derive(Serialize, Deserialize)]
 pub struct ReAttachResponse {
    pub tenants: Vec<ReAttachResponseTenant>,
--- a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs
+++ b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs
@@ -1,6 +1,5 @@
 use anyhow::*;
 use clap::{value_parser, Arg, ArgMatches, Command};
-use postgres::Client;
 use std::{path::PathBuf, str::FromStr};
 use wal_craft::*;

@@ -9,8 +8,8 @@ fn main() -> Result<()> {
        .init();
    let arg_matches = cli().get_matches();

-    let wal_craft = |arg_matches: &ArgMatches, client: &mut Client| {
-        let intermediate_lsns = match arg_matches
+    let wal_craft = |arg_matches: &ArgMatches, client| {
+        let (intermediate_lsns, end_of_wal_lsn) = match arg_matches
            .get_one::<String>("type")
            .map(|s| s.as_str())
            .context("'type' is required")?
@@ -26,7 +25,6 @@ fn main() -> Result<()> {
            LastWalRecordCrossingSegment::NAME => LastWalRecordCrossingSegment::craft(client)?,
            a => panic!("Unknown --type argument: {a}"),
        };
-        let end_of_wal_lsn = client.pg_current_wal_insert_lsn()?;
        for lsn in intermediate_lsns {
            println!("intermediate_lsn = {lsn}");
        }
--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -5,6 +5,7 @@ use postgres::types::PgLsn;
 use postgres::Client;
 use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
 use postgres_ffi::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD};
+use std::cmp::Ordering;
 use std::path::{Path, PathBuf};
 use std::process::Command;
 use std::time::{Duration, Instant};
@@ -231,52 +232,59 @@ pub fn ensure_server_config(client: &mut impl postgres::GenericClient) -> anyhow
 pub trait Crafter {
    const NAME: &'static str;

-    /// Generates WAL using the client `client`. Returns a vector of some valid
-    /// "interesting" intermediate LSNs which one may start reading from.
-    /// test_end_of_wal uses this to check various starting points.
-    ///
-    /// Note that postgres is generally keen about writing some WAL. While we
-    /// try to disable it (autovacuum, big wal_writer_delay, etc) it is always
-    /// possible, e.g. xl_running_xacts are dumped each 15s. So checks about
-    /// stable WAL end would be flaky unless postgres is shut down. For this
-    /// reason returning potential end of WAL here is pointless. Most of the
-    /// time this doesn't happen though, so it is reasonable to create needed
-    /// WAL structure and immediately kill postgres like test_end_of_wal does.
-    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>>;
+    /// Generates WAL using the client `client`. Returns a pair of:
+    /// * A vector of some valid "interesting" intermediate LSNs which one may start reading from.
+    ///   May include or exclude Lsn(0) and the end-of-wal.
+    /// * The expected end-of-wal LSN.
+    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)>;
 }

-/// Wraps some WAL craft function, providing current LSN to it before the
-/// insertion and flushing WAL afterwards. Also pushes initial LSN to the
-/// result.
 fn craft_internal<C: postgres::GenericClient>(
    client: &mut C,
-    f: impl Fn(&mut C, PgLsn) -> anyhow::Result<Vec<PgLsn>>,
-) -> anyhow::Result<Vec<PgLsn>> {
+    f: impl Fn(&mut C, PgLsn) -> anyhow::Result<(Vec<PgLsn>, Option<PgLsn>)>,
+) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
    ensure_server_config(client)?;

    let initial_lsn = client.pg_current_wal_insert_lsn()?;
    info!("LSN initial = {}", initial_lsn);

-    let mut intermediate_lsns = f(client, initial_lsn)?;
+    let (mut intermediate_lsns, last_lsn) = f(client, initial_lsn)?;
+    let last_lsn = match last_lsn {
+        None => client.pg_current_wal_insert_lsn()?,
+        Some(last_lsn) => {
+            let insert_lsn = client.pg_current_wal_insert_lsn()?;
+            match last_lsn.cmp(&insert_lsn) {
+                Ordering::Less => bail!(
+                    "Some records were inserted after the crafted WAL: {} vs {}",
+                    last_lsn,
+                    insert_lsn
+                ),
+                Ordering::Equal => last_lsn,
+                Ordering::Greater => bail!("Reported LSN is greater than insert_lsn"),
+            }
+        }
+    };
    if !intermediate_lsns.starts_with(&[initial_lsn]) {
        intermediate_lsns.insert(0, initial_lsn);
    }

    // Some records may be not flushed, e.g. non-transactional logical messages.
-    //
-    // Note: this is broken if pg_current_wal_insert_lsn is at page boundary
-    // because pg_current_wal_insert_lsn skips page headers.
    client.execute("select neon_xlogflush(pg_current_wal_insert_lsn())", &[])?;
-    Ok(intermediate_lsns)
+    match last_lsn.cmp(&client.pg_current_wal_flush_lsn()?) {
+        Ordering::Less => bail!("Some records were flushed after the crafted WAL"),
+        Ordering::Equal => {}
+        Ordering::Greater => bail!("Reported LSN is greater than flush_lsn"),
+    }
+    Ok((intermediate_lsns, last_lsn))
 }

 pub struct Simple;
 impl Crafter for Simple {
    const NAME: &'static str = "simple";
-    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
+    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
        craft_internal(client, |client, _| {
            client.execute("CREATE table t(x int)", &[])?;
-            Ok(Vec::new())
+            Ok((Vec::new(), None))
        })
    }
 }
@@ -284,36 +292,29 @@ impl Crafter for Simple {
 pub struct LastWalRecordXlogSwitch;
 impl Crafter for LastWalRecordXlogSwitch {
    const NAME: &'static str = "last_wal_record_xlog_switch";
-    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
-        // Do not use craft_internal because here we end up with flush_lsn exactly on
+    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
+        // Do not use craft_internal because here we end up with flush_lsn exactly on
        // the segment boundary and insert_lsn after the initial page header, which is unusual.
        ensure_server_config(client)?;

        client.execute("CREATE table t(x int)", &[])?;
        let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
-        // pg_switch_wal returns end of last record of the switched segment,
-        // i.e. end of SWITCH itself.
-        let xlog_switch_record_end: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
-        let before_xlog_switch_u64 = u64::from(before_xlog_switch);
-        let next_segment = PgLsn::from(
-            before_xlog_switch_u64 - (before_xlog_switch_u64 % WAL_SEGMENT_SIZE as u64)
-                + WAL_SEGMENT_SIZE as u64,
-        );
+        let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
+        let next_segment = PgLsn::from(0x0200_0000);
        ensure!(
-            xlog_switch_record_end <= next_segment,
-            "XLOG_SWITCH record ended after the expected segment boundary: {} > {}",
-            xlog_switch_record_end,
+            after_xlog_switch <= next_segment,
+            "XLOG_SWITCH message ended after the expected segment boundary: {} > {}",
+            after_xlog_switch,
            next_segment
        );
-        Ok(vec![before_xlog_switch, xlog_switch_record_end])
+        Ok((vec![before_xlog_switch, after_xlog_switch], next_segment))
    }
 }

 pub struct LastWalRecordXlogSwitchEndsOnPageBoundary;
-/// Craft xlog SWITCH record ending at page boundary.
 impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
    const NAME: &'static str = "last_wal_record_xlog_switch_ends_on_page_boundary";
-    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
+    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
        // Do not use generate_internal because here we end up with flush_lsn exactly on
        // the segment boundary and insert_lsn after the initial page header, which is unusual.
        ensure_server_config(client)?;
@@ -360,29 +361,28 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {

        // Emit the XLOG_SWITCH
        let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
-        let xlog_switch_record_end: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
+        let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
        let next_segment = PgLsn::from(0x0200_0000);
        ensure!(
-            xlog_switch_record_end < next_segment,
-            "XLOG_SWITCH record ended on or after the expected segment boundary: {} > {}",
-            xlog_switch_record_end,
+            after_xlog_switch < next_segment,
+            "XLOG_SWITCH message ended on or after the expected segment boundary: {} > {}",
+            after_xlog_switch,
            next_segment
        );
        ensure!(
-            u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD,
+            u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD,
            "XLOG_SWITCH message ended not on page boundary: {}, offset = {}",
-            xlog_switch_record_end,
-            u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ
+            after_xlog_switch,
+            u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ
        );
-        Ok(vec![before_xlog_switch, xlog_switch_record_end])
+        Ok((vec![before_xlog_switch, after_xlog_switch], next_segment))
    }
 }

-/// Write ~16MB logical message; it should cross WAL segment.
-fn craft_seg_size_logical_message(
+fn craft_single_logical_message(
    client: &mut impl postgres::GenericClient,
    transactional: bool,
-) -> anyhow::Result<Vec<PgLsn>> {
+) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
    craft_internal(client, |client, initial_lsn| {
        ensure!(
            initial_lsn < PgLsn::from(0x0200_0000 - 1024 * 1024),
@@ -405,24 +405,34 @@ fn craft_seg_size_logical_message(
            "Logical message crossed two segments"
        );

-        Ok(vec![message_lsn])
+        if transactional {
+            // Transactional logical messages are part of a transaction, so the one above is
+            // followed by a small COMMIT record.
+
+            let after_message_lsn = client.pg_current_wal_insert_lsn()?;
+            ensure!(
+                message_lsn < after_message_lsn,
+                "No record found after the emitted message"
+            );
+            Ok((vec![message_lsn], Some(after_message_lsn)))
+        } else {
+            Ok((Vec::new(), Some(message_lsn)))
+        }
    })
 }

 pub struct WalRecordCrossingSegmentFollowedBySmallOne;
 impl Crafter for WalRecordCrossingSegmentFollowedBySmallOne {
    const NAME: &'static str = "wal_record_crossing_segment_followed_by_small_one";
-    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
-        // Transactional message crossing WAL segment will be followed by small
-        // commit record.
-        craft_seg_size_logical_message(client, true)
+    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
+        craft_single_logical_message(client, true)
    }
 }

 pub struct LastWalRecordCrossingSegment;
 impl Crafter for LastWalRecordCrossingSegment {
    const NAME: &'static str = "last_wal_record_crossing_segment";
-    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
-        craft_seg_size_logical_message(client, false)
+    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
+        craft_single_logical_message(client, false)
    }
 }
--- a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs
+++ b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs
@@ -11,15 +11,13 @@ use utils::const_assert;
 use utils::lsn::Lsn;

 fn init_logging() {
-    let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or(format!(
-        "crate=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace"
-    )))
+    let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or(
+        format!("crate=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace"),
+    ))
    .is_test(true)
    .try_init();
 }

-/// Test that find_end_of_wal returns the same results as pg_dump on various
-/// WALs created by Crafter.
 fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
    use crate::*;

@@ -40,13 +38,13 @@ fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
    }
    cfg.initdb().unwrap();
    let srv = cfg.start_server().unwrap();
-    let intermediate_lsns = C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap();
+    let (intermediate_lsns, expected_end_of_wal_partial) =
+        C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap();
    let intermediate_lsns: Vec<Lsn> = intermediate_lsns
        .iter()
        .map(|&lsn| u64::from(lsn).into())
        .collect();
-    // Kill postgres. Note that it might have inserted to WAL something after
-    // 'craft' did its job.
+    let expected_end_of_wal: Lsn = u64::from(expected_end_of_wal_partial).into();
    srv.kill();

    // Check find_end_of_wal on the initial WAL
@@ -58,7 +56,7 @@ fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
        .filter(|fname| IsXLogFileName(fname))
        .max()
        .unwrap();
-    let expected_end_of_wal = find_pg_waldump_end_of_wal(&cfg, &last_segment);
+    check_pg_waldump_end_of_wal(&cfg, &last_segment, expected_end_of_wal);
    for start_lsn in intermediate_lsns
        .iter()
        .chain(std::iter::once(&expected_end_of_wal))
@@ -93,7 +91,11 @@ fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
    }
 }

-fn find_pg_waldump_end_of_wal(cfg: &crate::Conf, last_segment: &str) -> Lsn {
+fn check_pg_waldump_end_of_wal(
+    cfg: &crate::Conf,
+    last_segment: &str,
+    expected_end_of_wal: Lsn,
+) {
    // Get the actual end of WAL by pg_waldump
    let waldump_output = cfg
        .pg_waldump("000000010000000000000001", last_segment)
@@ -111,8 +113,11 @@ fn find_pg_waldump_end_of_wal(cfg: &crate::Conf, last_segment: &str) -> Lsn {
        }
    };
    let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap();
-    info!("waldump erred on {}", waldump_wal_end);
-    waldump_wal_end
+    info!(
+        "waldump erred on {}, expected wal end at {}",
+        waldump_wal_end, expected_end_of_wal
+    );
+    assert_eq!(waldump_wal_end, expected_end_of_wal);
 }

 fn check_end_of_wal(
@@ -205,9 +210,9 @@ pub fn test_update_next_xid() {
 #[test]
 pub fn test_encode_logical_message() {
    let expected = [
-        64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255, 38,
-        0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114, 101, 102,
-        105, 120, 0, 109, 101, 115, 115, 97, 103, 101,
+        64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255,
+        38, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114,
+        101, 102, 105, 120, 0, 109, 101, 115, 115, 97, 103, 101,
    ];
    let actual = encode_logical_message("prefix", "message");
    assert_eq!(expected, actual[..]);
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -18,7 +18,6 @@ camino.workspace = true
 humantime.workspace = true
 hyper = { workspace = true, features = ["stream"] }
 futures.workspace = true
-rand.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 tokio = { workspace = true, features = ["sync", "fs", "io-util"] }
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -157,8 +157,9 @@ impl AzureBlobStorage {
            let mut bufs = Vec::new();
            while let Some(part) = response.next().await {
                let part = part?;
+                let etag_str: &str = part.blob.properties.etag.as_ref();
                if etag.is_none() {
-                    etag = Some(part.blob.properties.etag);
+                    etag = Some(etag.unwrap_or_else(|| etag_str.to_owned()));
                }
                if last_modified.is_none() {
                    last_modified = Some(part.blob.properties.last_modified.into());
@@ -173,16 +174,6 @@ impl AzureBlobStorage {
                    .map_err(|e| DownloadError::Other(e.into()))?;
                bufs.push(data);
            }
-
-            if bufs.is_empty() {
-                return Err(DownloadError::Other(anyhow::anyhow!(
-                    "Azure GET response contained no buffers"
-                )));
-            }
-            // unwrap safety: if these were None, bufs would be empty and we would have returned an error already
-            let etag = etag.unwrap();
-            let last_modified = last_modified.unwrap();
-
            Ok(Download {
                download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
                etag,
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -42,9 +42,6 @@ pub use self::{
 };
 use s3_bucket::RequestKind;

-/// Azure SDK's ETag type is a simple String wrapper: we use this internally instead of repeating it here.
-pub use azure_core::Etag;
-
 pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel};

 /// Currently, sync happens with AWS S3, that has two limits on requests per second:
@@ -294,9 +291,9 @@ pub type DownloadStream =
 pub struct Download {
    pub download_stream: DownloadStream,
    /// The last time the file was modified (`last-modified` HTTP header)
-    pub last_modified: SystemTime,
+    pub last_modified: Option<SystemTime>,
    /// A way to identify this specific version of the resource (`etag` HTTP header)
-    pub etag: Etag,
+    pub etag: Option<String>,
    /// Extra key-value data, associated with the current remote file.
    pub metadata: Option<StorageMetadata>,
 }
@@ -565,16 +562,6 @@ impl GenericRemoteStorage {
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct StorageMetadata(HashMap<String, String>);

-impl<const N: usize> From<[(&str, &str); N]> for StorageMetadata {
-    fn from(arr: [(&str, &str); N]) -> Self {
-        let map: HashMap<String, String> = arr
-            .iter()
-            .map(|(k, v)| (k.to_string(), v.to_string()))
-            .collect();
-        Self(map)
-    }
-}
-
 /// External backup storage configuration, enough for creating a client for that storage.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct RemoteStorageConfig {
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -10,7 +10,7 @@ use std::{
    io::ErrorKind,
    num::NonZeroU32,
    pin::Pin,
-    time::{Duration, SystemTime, UNIX_EPOCH},
+    time::{Duration, SystemTime},
 };

 use anyhow::{bail, ensure, Context};
@@ -30,7 +30,6 @@ use crate::{
 };

 use super::{RemoteStorage, StorageMetadata};
-use crate::Etag;

 const LOCAL_FS_TEMP_FILE_SUFFIX: &str = "___temp";

@@ -198,7 +197,6 @@ impl LocalFs {
            fs::OpenOptions::new()
                .write(true)
                .create(true)
-                .truncate(true)
                .open(&temp_file_path)
                .await
                .with_context(|| {
@@ -408,37 +406,35 @@ impl RemoteStorage for LocalFs {
        cancel: &CancellationToken,
    ) -> Result<Download, DownloadError> {
        let target_path = from.with_base(&self.storage_root);
+        if file_exists(&target_path).map_err(DownloadError::BadInput)? {
+            let source = ReaderStream::new(
+                fs::OpenOptions::new()
+                    .read(true)
+                    .open(&target_path)
+                    .await
+                    .with_context(|| {
+                        format!("Failed to open source file {target_path:?} to use in the download")
+                    })
+                    .map_err(DownloadError::Other)?,
+            );

-        let file_metadata = file_metadata(&target_path).await?;
-
-        let source = ReaderStream::new(
-            fs::OpenOptions::new()
-                .read(true)
-                .open(&target_path)
+            let metadata = self
+                .read_storage_metadata(&target_path)
                .await
-                .with_context(|| {
-                    format!("Failed to open source file {target_path:?} to use in the download")
-                })
-                .map_err(DownloadError::Other)?,
-        );
+                .map_err(DownloadError::Other)?;

-        let metadata = self
-            .read_storage_metadata(&target_path)
-            .await
-            .map_err(DownloadError::Other)?;
+            let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
+            let source = crate::support::DownloadStream::new(cancel_or_timeout, source);

-        let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
-        let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
-
-        let etag = mock_etag(&file_metadata);
-        Ok(Download {
-            metadata,
-            last_modified: file_metadata
-                .modified()
-                .map_err(|e| DownloadError::Other(anyhow::anyhow!(e).context("Reading mtime")))?,
-            etag,
-            download_stream: Box::pin(source),
-        })
+            Ok(Download {
+                metadata,
+                last_modified: None,
+                etag: None,
+                download_stream: Box::pin(source),
+            })
+        } else {
+            Err(DownloadError::NotFound)
+        }
    }

    async fn download_byte_range(
@@ -456,51 +452,50 @@ impl RemoteStorage for LocalFs {
                return Err(DownloadError::Other(anyhow::anyhow!("Invalid range, start ({start_inclusive}) and end_exclusive ({end_exclusive:?}) difference is zero bytes")));
            }
        }
-
        let target_path = from.with_base(&self.storage_root);
-        let file_metadata = file_metadata(&target_path).await?;
-        let mut source = tokio::fs::OpenOptions::new()
-            .read(true)
-            .open(&target_path)
-            .await
-            .with_context(|| {
-                format!("Failed to open source file {target_path:?} to use in the download")
+        if file_exists(&target_path).map_err(DownloadError::BadInput)? {
+            let mut source = tokio::fs::OpenOptions::new()
+                .read(true)
+                .open(&target_path)
+                .await
+                .with_context(|| {
+                    format!("Failed to open source file {target_path:?} to use in the download")
+                })
+                .map_err(DownloadError::Other)?;
+
+            let len = source
+                .metadata()
+                .await
+                .context("query file length")
+                .map_err(DownloadError::Other)?
+                .len();
+
+            source
+                .seek(io::SeekFrom::Start(start_inclusive))
+                .await
+                .context("Failed to seek to the range start in a local storage file")
+                .map_err(DownloadError::Other)?;
+
+            let metadata = self
+                .read_storage_metadata(&target_path)
+                .await
+                .map_err(DownloadError::Other)?;
+
+            let source = source.take(end_exclusive.unwrap_or(len) - start_inclusive);
+            let source = ReaderStream::new(source);
+
+            let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
+            let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
+
+            Ok(Download {
+                metadata,
+                last_modified: None,
+                etag: None,
+                download_stream: Box::pin(source),
            })
-            .map_err(DownloadError::Other)?;
-
-        let len = source
-            .metadata()
-            .await
-            .context("query file length")
-            .map_err(DownloadError::Other)?
-            .len();
-
-        source
-            .seek(io::SeekFrom::Start(start_inclusive))
-            .await
-            .context("Failed to seek to the range start in a local storage file")
-            .map_err(DownloadError::Other)?;
-
-        let metadata = self
-            .read_storage_metadata(&target_path)
-            .await
-            .map_err(DownloadError::Other)?;
-
-        let source = source.take(end_exclusive.unwrap_or(len) - start_inclusive);
-        let source = ReaderStream::new(source);
-
-        let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
-        let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
-
-        let etag = mock_etag(&file_metadata);
-        Ok(Download {
-            metadata,
-            last_modified: file_metadata
-                .modified()
-                .map_err(|e| DownloadError::Other(anyhow::anyhow!(e).context("Reading mtime")))?,
-            etag,
-            download_stream: Box::pin(source),
-        })
+        } else {
+            Err(DownloadError::NotFound)
+        }
    }

    async fn delete(&self, path: &RemotePath, _cancel: &CancellationToken) -> anyhow::Result<()> {
@@ -615,22 +610,13 @@ async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result<
    Ok(())
 }

-async fn file_metadata(file_path: &Utf8Path) -> Result<std::fs::Metadata, DownloadError> {
-    tokio::fs::metadata(&file_path).await.map_err(|e| {
-        if e.kind() == ErrorKind::NotFound {
-            DownloadError::NotFound
-        } else {
-            DownloadError::BadInput(e.into())
-        }
-    })
-}
-
-// Use mtime as stand-in for ETag.  We could calculate a meaningful one by md5'ing the contents of files we
-// read, but that's expensive and the local_fs test helper's whole reason for existence is to run small tests
-// quickly, with less overhead than using a mock S3 server.
-fn mock_etag(meta: &std::fs::Metadata) -> Etag {
-    let mtime = meta.modified().expect("Filesystem mtime missing");
-    format!("{}", mtime.duration_since(UNIX_EPOCH).unwrap().as_millis()).into()
+fn file_exists(file_path: &Utf8Path) -> anyhow::Result<bool> {
+    if file_path.exists() {
+        ensure!(file_path.is_file(), "file path '{file_path}' is not a file");
+        Ok(true)
+    } else {
+        Ok(false)
+    }
 }

 #[cfg(test)]
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -35,8 +35,8 @@ use aws_sdk_s3::{
 };
 use aws_smithy_async::rt::sleep::TokioSleep;

+use aws_smithy_types::byte_stream::ByteStream;
 use aws_smithy_types::{body::SdkBody, DateTime};
-use aws_smithy_types::{byte_stream::ByteStream, date_time::ConversionError};
 use bytes::Bytes;
 use futures::stream::Stream;
 use hyper::Body;
@@ -287,17 +287,8 @@ impl S3Bucket {
        let remaining = self.timeout.saturating_sub(started_at.elapsed());

        let metadata = object_output.metadata().cloned().map(StorageMetadata);
-        let etag = object_output
-            .e_tag
-            .ok_or(DownloadError::Other(anyhow::anyhow!("Missing ETag header")))?
-            .into();
-        let last_modified = object_output
-            .last_modified
-            .ok_or(DownloadError::Other(anyhow::anyhow!(
-                "Missing LastModified header"
-            )))?
-            .try_into()
-            .map_err(|e: ConversionError| DownloadError::Other(e.into()))?;
+        let etag = object_output.e_tag;
+        let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok());

        let body = object_output.body;
        let body = ByteStreamAsStream::from(body);
--- a/libs/remote_storage/tests/test_real_azure.rs
+++ b/libs/remote_storage/tests/test_real_azure.rs
@@ -57,6 +57,7 @@ enum MaybeEnabledStorage {
    Disabled,
 }

+#[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledStorage {
    async fn setup() -> Self {
        ensure_logging_ready();
@@ -85,6 +86,7 @@ struct AzureWithTestBlobs {
    remote_blobs: HashSet<RemotePath>,
 }

+#[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
    async fn setup() -> Self {
        ensure_logging_ready();
@@ -146,6 +148,7 @@ struct AzureWithSimpleTestBlobs {
    remote_blobs: HashSet<RemotePath>,
 }

+#[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
    async fn setup() -> Self {
        ensure_logging_ready();
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -118,7 +118,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
    // A little check to ensure that our clock is not too far off from the S3 clock
    {
        let dl = retry(|| ctx.client.download(&path2, &cancel)).await?;
-        let last_modified = dl.last_modified;
+        let last_modified = dl.last_modified.unwrap();
        let half_wt = WAIT_TIME.mul_f32(0.5);
        let t0_hwt = t0 + half_wt;
        let t1_hwt = t1 - half_wt;
@@ -219,6 +219,7 @@ enum MaybeEnabledStorage {
    Disabled,
 }

+#[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledStorage {
    async fn setup() -> Self {
        ensure_logging_ready();
@@ -247,6 +248,7 @@ struct S3WithTestBlobs {
    remote_blobs: HashSet<RemotePath>,
 }

+#[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
    async fn setup() -> Self {
        ensure_logging_ready();
@@ -308,6 +310,7 @@ struct S3WithSimpleTestBlobs {
    remote_blobs: HashSet<RemotePath>,
 }

+#[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
    async fn setup() -> Self {
        ensure_logging_ready();
--- a/libs/tenant_size_model/tests/tests.rs
+++ b/libs/tenant_size_model/tests/tests.rs
@@ -247,7 +247,7 @@ fn scenario_4() {
    //
    // This is in total 5000 + 1000 + 5000 + 1000 = 12000
    //
-    // (If we used the method from the previous scenario, and
+    // (If we used the method from the previous scenario, and
    // kept only snapshot at the branch point, we'd need to keep
    // all the WAL between 10000-18000 on the main branch, so
    // the total size would be 5000 + 1000 + 8000 = 14000. The
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -13,7 +13,6 @@ testing = ["fail/failpoints"]
 [dependencies]
 arc-swap.workspace = true
 sentry.workspace = true
-async-compression.workspace = true
 async-trait.workspace = true
 anyhow.workspace = true
 bincode.workspace = true
@@ -37,7 +36,6 @@ serde_json.workspace = true
 signal-hook.workspace = true
 thiserror.workspace = true
 tokio.workspace = true
-tokio-tar.workspace = true
 tokio-util.workspace = true
 tracing.workspace = true
 tracing-error.workspace = true
@@ -48,7 +46,6 @@ strum.workspace = true
 strum_macros.workspace = true
 url.workspace = true
 uuid.workspace = true
-walkdir.workspace = true

 pq_proto.workspace = true
 postgres_connection.workspace = true
--- a/libs/utils/src/history_buffer.rs
+++ b/libs/utils/src/history_buffer.rs
@@ -47,10 +47,9 @@ impl<T, const L: usize> ops::Deref for HistoryBufferWithDropCounter<T, L> {
    }
 }

-#[derive(serde::Serialize, serde::Deserialize)]
+#[derive(serde::Serialize)]
 struct SerdeRepr<T> {
    buffer: Vec<T>,
-    buffer_size: usize,
    drop_count: u64,
 }

@@ -62,7 +61,6 @@ where
        let HistoryBufferWithDropCounter { buffer, drop_count } = value;
        SerdeRepr {
            buffer: buffer.iter().cloned().collect(),
-            buffer_size: L,
            drop_count: *drop_count,
        }
    }
@@ -80,52 +78,19 @@ where
    }
 }

-impl<'de, T, const L: usize> serde::de::Deserialize<'de> for HistoryBufferWithDropCounter<T, L>
-where
-    T: Clone + serde::Deserialize<'de>,
-{
-    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where
-        D: serde::Deserializer<'de>,
-    {
-        let SerdeRepr {
-            buffer: des_buffer,
-            drop_count,
-            buffer_size,
-        } = SerdeRepr::<T>::deserialize(deserializer)?;
-        if buffer_size != L {
-            use serde::de::Error;
-            return Err(D::Error::custom(format!(
-                "invalid buffer_size, expecting {L} got {buffer_size}"
-            )));
-        }
-        let mut buffer = HistoryBuffer::new();
-        buffer.extend(des_buffer);
-        Ok(HistoryBufferWithDropCounter { buffer, drop_count })
-    }
-}
-
 #[cfg(test)]
 mod test {
    use super::HistoryBufferWithDropCounter;

    #[test]
    fn test_basics() {
-        let mut b = HistoryBufferWithDropCounter::<usize, 2>::default();
+        let mut b = HistoryBufferWithDropCounter::<_, 2>::default();
        b.write(1);
        b.write(2);
        b.write(3);
        assert!(b.iter().any(|e| *e == 2));
        assert!(b.iter().any(|e| *e == 3));
        assert!(!b.iter().any(|e| *e == 1));
-
-        // round-trip serde
-        let round_tripped: HistoryBufferWithDropCounter<usize, 2> =
-            serde_json::from_str(&serde_json::to_string(&b).unwrap()).unwrap();
-        assert_eq!(
-            round_tripped.iter().cloned().collect::<Vec<_>>(),
-            b.iter().cloned().collect::<Vec<_>>()
-        );
    }

    #[test]
--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -245,7 +245,7 @@ impl std::io::Write for ChannelWriter {
    }
 }

-pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
    SERVE_METRICS_COUNT.inc();

    let started_at = std::time::Instant::now();
@@ -367,6 +367,7 @@ pub fn make_router() -> RouterBuilder<hyper::Body, ApiError> {
        .middleware(Middleware::post_with_info(
            add_request_id_header_to_response,
        ))
+        .get("/metrics", |r| request_span(r, prometheus_metrics_handler))
        .err_handler(route_error_handler)
 }

--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -87,8 +87,6 @@ pub mod failpoint_support;

 pub mod yielding_loop;

-pub mod zstd;
-
 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
 ///
 /// we have several cases:
--- a/libs/utils/src/lock_file.rs
+++ b/libs/utils/src/lock_file.rs
@@ -63,7 +63,6 @@ impl UnwrittenLockFile {
 pub fn create_exclusive(lock_file_path: &Utf8Path) -> anyhow::Result<UnwrittenLockFile> {
    let lock_file = fs::OpenOptions::new()
        .create(true) // O_CREAT
-        .truncate(true)
        .write(true)
        .open(lock_file_path)
        .context("open lock file")?;
--- a/libs/utils/src/pageserver_feedback.rs
+++ b/libs/utils/src/pageserver_feedback.rs
@@ -29,10 +29,12 @@ pub struct PageserverFeedback {
    // Serialize with RFC3339 format.
    #[serde(with = "serde_systemtime")]
    pub replytime: SystemTime,
-    /// Used to track feedbacks from different shards. Always zero for unsharded tenants.
-    pub shard_number: u32,
 }

+// NOTE: Do not forget to increment this number when adding new fields to PageserverFeedback.
+// Do not remove previously available fields because this might be backwards incompatible.
+pub const PAGESERVER_FEEDBACK_FIELDS_NUMBER: u8 = 5;
+
 impl PageserverFeedback {
    pub fn empty() -> PageserverFeedback {
        PageserverFeedback {
@@ -41,7 +43,6 @@ impl PageserverFeedback {
            remote_consistent_lsn: Lsn::INVALID,
            disk_consistent_lsn: Lsn::INVALID,
            replytime: *PG_EPOCH,
-            shard_number: 0,
        }
    }

@@ -58,26 +59,17 @@ impl PageserverFeedback {
    //
    // TODO: change serialized fields names once all computes migrate to rename.
    pub fn serialize(&self, buf: &mut BytesMut) {
-        let buf_ptr = buf.len();
-        buf.put_u8(0); // # of keys, will be filled later
-        let mut nkeys = 0;
-
-        nkeys += 1;
+        buf.put_u8(PAGESERVER_FEEDBACK_FIELDS_NUMBER); // # of keys
        buf.put_slice(b"current_timeline_size\0");
        buf.put_i32(8);
        buf.put_u64(self.current_timeline_size);

-        nkeys += 1;
        buf.put_slice(b"ps_writelsn\0");
        buf.put_i32(8);
        buf.put_u64(self.last_received_lsn.0);
-
-        nkeys += 1;
        buf.put_slice(b"ps_flushlsn\0");
        buf.put_i32(8);
        buf.put_u64(self.disk_consistent_lsn.0);
-
-        nkeys += 1;
        buf.put_slice(b"ps_applylsn\0");
        buf.put_i32(8);
        buf.put_u64(self.remote_consistent_lsn.0);
@@ -88,19 +80,9 @@ impl PageserverFeedback {
            .expect("failed to serialize pg_replytime earlier than PG_EPOCH")
            .as_micros() as i64;

-        nkeys += 1;
        buf.put_slice(b"ps_replytime\0");
        buf.put_i32(8);
        buf.put_i64(timestamp);
-
-        if self.shard_number > 0 {
-            nkeys += 1;
-            buf.put_slice(b"shard_number\0");
-            buf.put_i32(4);
-            buf.put_u32(self.shard_number);
-        }
-
-        buf[buf_ptr] = nkeys;
    }

    // Deserialize PageserverFeedback message
@@ -143,8 +125,9 @@ impl PageserverFeedback {
                }
                b"shard_number" => {
                    let len = buf.get_i32();
-                    assert_eq!(len, 4);
-                    rf.shard_number = buf.get_u32();
+                    // TODO: this will be implemented in the next update,
+                    //  for now, we just skip the value.
+                    buf.advance(len as usize);
                }
                _ => {
                    let len = buf.get_i32();
@@ -217,7 +200,10 @@ mod tests {
        rf.serialize(&mut data);

        // Add an extra field to the buffer and adjust number of keys
-        data[0] += 1;
+        if let Some(first) = data.first_mut() {
+            *first = PAGESERVER_FEEDBACK_FIELDS_NUMBER + 1;
+        }
+
        data.put_slice(b"new_field_one\0");
        data.put_i32(8);
        data.put_u64(42);
--- a/libs/utils/src/seqwait.rs
+++ b/libs/utils/src/seqwait.rs
@@ -182,18 +182,6 @@ where
        }
    }

-    /// Check if [`Self::wait_for`] or [`Self::wait_for_timeout`] would wait if called with `num`.
-    pub fn would_wait_for(&self, num: V) -> Result<(), V> {
-        let internal = self.internal.lock().unwrap();
-        let cnt = internal.current.cnt_value();
-        drop(internal);
-        if cnt >= num {
-            Ok(())
-        } else {
-            Err(cnt)
-        }
-    }
-
    /// Register and return a channel that will be notified when a number arrives,
    /// or None, if it has already arrived.
    fn queue_for_wait(&self, num: V) -> Result<Option<Receiver<()>>, SeqWaitError> {
--- a/libs/utils/src/sync/heavier_once_cell.rs
+++ b/libs/utils/src/sync/heavier_once_cell.rs
@@ -110,49 +110,6 @@ impl<T> OnceCell<T> {
        }
    }

-    /// Returns a guard to an existing initialized value, or returns an unique initialization
-    /// permit which can be used to initialize this `OnceCell` using `OnceCell::set`.
-    pub async fn get_or_init_detached(&self) -> Result<Guard<'_, T>, InitPermit> {
-        // It looks like OnceCell::get_or_init could be implemented using this method instead of
-        // duplication. However, that makes the future be !Send due to possibly holding on to the
-        // MutexGuard over an await point.
-        loop {
-            let sem = {
-                let guard = self.inner.lock().unwrap();
-                if guard.value.is_some() {
-                    return Ok(Guard(guard));
-                }
-                guard.init_semaphore.clone()
-            };
-
-            {
-                let permit = {
-                    // increment the count for the duration of queued
-                    let _guard = CountWaitingInitializers::start(self);
-                    sem.acquire().await
-                };
-
-                let Ok(permit) = permit else {
-                    let guard = self.inner.lock().unwrap();
-                    if !Arc::ptr_eq(&sem, &guard.init_semaphore) {
-                        // there was a take_and_deinit in between
-                        continue;
-                    }
-                    assert!(
-                        guard.value.is_some(),
-                        "semaphore got closed, must be initialized"
-                    );
-                    return Ok(Guard(guard));
-                };
-
-                permit.forget();
-            }
-
-            let permit = InitPermit(sem);
-            return Err(permit);
-        }
-    }
-
    /// Assuming a permit is held after previous call to [`Guard::take_and_deinit`], it can be used
    /// to complete initializing the inner value.
    ///
@@ -245,7 +202,7 @@ impl<'a, T> Guard<'a, T> {
    ///
    /// The permit will be on a semaphore part of the new internal value, and any following
    /// [`OnceCell::get_or_init`] will wait on it to complete.
-    pub fn take_and_deinit(mut self) -> (T, InitPermit) {
+    pub fn take_and_deinit(&mut self) -> (T, InitPermit) {
        let mut swapped = Inner::default();
        let sem = swapped.init_semaphore.clone();
        // acquire and forget right away, moving the control over to InitPermit
@@ -524,39 +481,4 @@ mod tests {

        assert_eq!("t1", *cell.get().unwrap());
    }
-
-    #[tokio::test(start_paused = true)]
-    async fn detached_init_smoke() {
-        let target = OnceCell::default();
-
-        let Err(permit) = target.get_or_init_detached().await else {
-            unreachable!("it is not initialized")
-        };
-
-        tokio::time::timeout(
-            std::time::Duration::from_secs(3600 * 24 * 7 * 365),
-            target.get_or_init(|permit2| async { Ok::<_, Infallible>((11, permit2)) }),
-        )
-        .await
-        .expect_err("should timeout since we are already holding the permit");
-
-        target.set(42, permit);
-
-        let (_answer, permit) = {
-            let guard = target
-                .get_or_init(|permit| async { Ok::<_, Infallible>((11, permit)) })
-                .await
-                .unwrap();
-
-            assert_eq!(*guard, 42);
-
-            guard.take_and_deinit()
-        };
-
-        assert!(target.get().is_none());
-
-        target.set(11, permit);
-
-        assert_eq!(*target.get().unwrap(), 11);
-    }
 }
--- a/libs/utils/src/vec_map.rs
+++ b/libs/utils/src/vec_map.rs
@@ -1,60 +1,27 @@
 use std::{alloc::Layout, cmp::Ordering, ops::RangeBounds};

-#[derive(Clone, Copy, Debug, PartialEq, Eq)]
-pub enum VecMapOrdering {
-    Greater,
-    GreaterOrEqual,
-}
-
 /// Ordered map datastructure implemented in a Vec.
 /// Append only - can only add keys that are larger than the
 /// current max key.
-/// Ordering can be adjusted using [`VecMapOrdering`]
-/// during `VecMap` construction.
 #[derive(Clone, Debug)]
-pub struct VecMap<K, V> {
-    data: Vec<(K, V)>,
-    ordering: VecMapOrdering,
-}
+pub struct VecMap<K, V>(Vec<(K, V)>);

 impl<K, V> Default for VecMap<K, V> {
    fn default() -> Self {
-        VecMap {
-            data: Default::default(),
-            ordering: VecMapOrdering::Greater,
-        }
+        VecMap(Default::default())
    }
 }

-#[derive(thiserror::Error, Debug)]
-pub enum VecMapError {
-    #[error("Key violates ordering constraint")]
-    InvalidKey,
-    #[error("Mismatched ordering constraints")]
-    ExtendOrderingError,
-}
+#[derive(Debug)]
+pub struct InvalidKey;

 impl<K: Ord, V> VecMap<K, V> {
-    pub fn new(ordering: VecMapOrdering) -> Self {
-        Self {
-            data: Vec::new(),
-            ordering,
-        }
-    }
-
-    pub fn with_capacity(capacity: usize, ordering: VecMapOrdering) -> Self {
-        Self {
-            data: Vec::with_capacity(capacity),
-            ordering,
-        }
-    }
-
    pub fn is_empty(&self) -> bool {
-        self.data.is_empty()
+        self.0.is_empty()
    }

    pub fn as_slice(&self) -> &[(K, V)] {
-        self.data.as_slice()
+        self.0.as_slice()
    }

    /// This function may panic if given a range where the lower bound is
@@ -62,7 +29,7 @@ impl<K: Ord, V> VecMap<K, V> {
    pub fn slice_range<R: RangeBounds<K>>(&self, range: R) -> &[(K, V)] {
        use std::ops::Bound::*;

-        let binary_search = |k: &K| self.data.binary_search_by_key(&k, extract_key);
+        let binary_search = |k: &K| self.0.binary_search_by_key(&k, extract_key);

        let start_idx = match range.start_bound() {
            Unbounded => 0,
@@ -74,7 +41,7 @@ impl<K: Ord, V> VecMap<K, V> {
        };

        let end_idx = match range.end_bound() {
-            Unbounded => self.data.len(),
+            Unbounded => self.0.len(),
            Included(k) => match binary_search(k) {
                Ok(idx) => idx + 1,
                Err(idx) => idx,
@@ -82,30 +49,34 @@ impl<K: Ord, V> VecMap<K, V> {
            Excluded(k) => binary_search(k).unwrap_or_else(std::convert::identity),
        };

-        &self.data[start_idx..end_idx]
+        &self.0[start_idx..end_idx]
    }

    /// Add a key value pair to the map.
-    /// If `key` is not respective of the `self` ordering the
-    /// pair will not be added and `InvalidKey` error will be returned.
-    pub fn append(&mut self, key: K, value: V) -> Result<usize, VecMapError> {
-        self.validate_key_order(&key)?;
+    /// If `key` is less than or equal to the current maximum key
+    /// the pair will not be added and InvalidKey error will be returned.
+    pub fn append(&mut self, key: K, value: V) -> Result<usize, InvalidKey> {
+        if let Some((last_key, _last_value)) = self.0.last() {
+            if &key <= last_key {
+                return Err(InvalidKey);
+            }
+        }

        let delta_size = self.instrument_vec_op(|vec| vec.push((key, value)));
        Ok(delta_size)
    }

    /// Update the maximum key value pair or add a new key value pair to the map.
-    /// If `key` is not respective of the `self` ordering no updates or additions
-    /// will occur and `InvalidKey` error will be returned.
+    /// If `key` is less than the current maximum key no updates or additions
+    /// will occur and InvalidKey error will be returned.
    pub fn append_or_update_last(
        &mut self,
        key: K,
        mut value: V,
-    ) -> Result<(Option<V>, usize), VecMapError> {
-        if let Some((last_key, last_value)) = self.data.last_mut() {
+    ) -> Result<(Option<V>, usize), InvalidKey> {
+        if let Some((last_key, last_value)) = self.0.last_mut() {
            match key.cmp(last_key) {
-                Ordering::Less => return Err(VecMapError::InvalidKey),
+                Ordering::Less => return Err(InvalidKey),
                Ordering::Equal => {
                    std::mem::swap(last_value, &mut value);
                    const DELTA_SIZE: usize = 0;
@@ -129,67 +100,40 @@ impl<K: Ord, V> VecMap<K, V> {
        V: Clone,
    {
        let split_idx = self
-            .data
+            .0
            .binary_search_by_key(&cutoff, extract_key)
            .unwrap_or_else(std::convert::identity);

        (
-            VecMap {
-                data: self.data[..split_idx].to_vec(),
-                ordering: self.ordering,
-            },
-            VecMap {
-                data: self.data[split_idx..].to_vec(),
-                ordering: self.ordering,
-            },
+            VecMap(self.0[..split_idx].to_vec()),
+            VecMap(self.0[split_idx..].to_vec()),
        )
    }

    /// Move items from `other` to the end of `self`, leaving `other` empty.
-    /// If the `other` ordering is different from `self` ordering
-    /// `ExtendOrderingError` error will be returned.
-    /// If any keys in `other` is not respective of the ordering defined in
-    /// `self`, `InvalidKey` error will be returned and no mutation will occur.
-    pub fn extend(&mut self, other: &mut Self) -> Result<usize, VecMapError> {
-        if self.ordering != other.ordering {
-            return Err(VecMapError::ExtendOrderingError);
-        }
+    /// If any key in `other` is less than or equal to any key in `self`,
+    /// `InvalidKey` error will be returned and no mutation will occur.
+    pub fn extend(&mut self, other: &mut Self) -> Result<usize, InvalidKey> {
+        let self_last_opt = self.0.last().map(extract_key);
+        let other_first_opt = other.0.first().map(extract_key);

-        let other_first_opt = other.data.last().map(extract_key);
-        if let Some(other_first) = other_first_opt {
-            self.validate_key_order(other_first)?;
-        }
-
-        let delta_size = self.instrument_vec_op(|vec| vec.append(&mut other.data));
-        Ok(delta_size)
-    }
-
-    /// Validate the current last key in `self` and key being
-    /// inserted against the order defined in `self`.
-    fn validate_key_order(&self, key: &K) -> Result<(), VecMapError> {
-        if let Some(last_key) = self.data.last().map(extract_key) {
-            match (&self.ordering, &key.cmp(last_key)) {
-                (VecMapOrdering::Greater, Ordering::Less | Ordering::Equal) => {
-                    return Err(VecMapError::InvalidKey);
-                }
-                (VecMapOrdering::Greater, Ordering::Greater) => {}
-                (VecMapOrdering::GreaterOrEqual, Ordering::Less) => {
-                    return Err(VecMapError::InvalidKey);
-                }
-                (VecMapOrdering::GreaterOrEqual, Ordering::Equal | Ordering::Greater) => {}
+        if let (Some(self_last), Some(other_first)) = (self_last_opt, other_first_opt) {
+            if self_last >= other_first {
+                return Err(InvalidKey);
            }
        }

-        Ok(())
+        let delta_size = self.instrument_vec_op(|vec| vec.append(&mut other.0));
+        Ok(delta_size)
    }

    /// Instrument an operation on the underlying [`Vec`].
    /// Will panic if the operation decreases capacity.
    /// Returns the increase in memory usage caused by the op.
    fn instrument_vec_op(&mut self, op: impl FnOnce(&mut Vec<(K, V)>)) -> usize {
-        let old_cap = self.data.capacity();
-        op(&mut self.data);
-        let new_cap = self.data.capacity();
+        let old_cap = self.0.capacity();
+        op(&mut self.0);
+        let new_cap = self.0.capacity();

        match old_cap.cmp(&new_cap) {
            Ordering::Less => {
@@ -201,36 +145,6 @@ impl<K: Ord, V> VecMap<K, V> {
            Ordering::Greater => panic!("VecMap capacity shouldn't ever decrease"),
        }
    }
-
-    /// Similar to `from_iter` defined in `FromIter` trait except
-    /// that it accepts an [`VecMapOrdering`]
-    pub fn from_iter<I: IntoIterator<Item = (K, V)>>(iter: I, ordering: VecMapOrdering) -> Self {
-        let iter = iter.into_iter();
-        let initial_capacity = {
-            match iter.size_hint() {
-                (lower_bound, None) => lower_bound,
-                (_, Some(upper_bound)) => upper_bound,
-            }
-        };
-
-        let mut vec_map = VecMap::with_capacity(initial_capacity, ordering);
-        for (key, value) in iter {
-            vec_map
-                .append(key, value)
-                .expect("The passed collection needs to be sorted!");
-        }
-
-        vec_map
-    }
-}
-
-impl<K: Ord, V> IntoIterator for VecMap<K, V> {
-    type Item = (K, V);
-    type IntoIter = std::vec::IntoIter<(K, V)>;
-
-    fn into_iter(self) -> Self::IntoIter {
-        self.data.into_iter()
-    }
 }

 fn extract_key<K, V>(entry: &(K, V)) -> &K {
@@ -241,7 +155,7 @@ fn extract_key<K, V>(entry: &(K, V)) -> &K {
 mod tests {
    use std::{collections::BTreeMap, ops::Bound};

-    use super::{VecMap, VecMapOrdering};
+    use super::VecMap;

    #[test]
    fn unbounded_range() {
@@ -396,59 +310,5 @@ mod tests {
        left.extend(&mut one_map).unwrap_err();
        assert_eq!(left.as_slice(), &[(0, ()), (1, ())]);
        assert_eq!(one_map.as_slice(), &[(1, ())]);
-
-        let mut map_greater_or_equal = VecMap::new(VecMapOrdering::GreaterOrEqual);
-        map_greater_or_equal.append(2, ()).unwrap();
-        map_greater_or_equal.append(2, ()).unwrap();
-
-        left.extend(&mut map_greater_or_equal).unwrap_err();
-        assert_eq!(left.as_slice(), &[(0, ()), (1, ())]);
-        assert_eq!(map_greater_or_equal.as_slice(), &[(2, ()), (2, ())]);
-    }
-
-    #[test]
-    fn extend_with_ordering() {
-        let mut left = VecMap::new(VecMapOrdering::GreaterOrEqual);
-        left.append(0, ()).unwrap();
-        assert_eq!(left.as_slice(), &[(0, ())]);
-
-        let mut greater_right = VecMap::new(VecMapOrdering::Greater);
-        greater_right.append(0, ()).unwrap();
-        left.extend(&mut greater_right).unwrap_err();
-        assert_eq!(left.as_slice(), &[(0, ())]);
-
-        let mut greater_or_equal_right = VecMap::new(VecMapOrdering::GreaterOrEqual);
-        greater_or_equal_right.append(2, ()).unwrap();
-        greater_or_equal_right.append(2, ()).unwrap();
-        left.extend(&mut greater_or_equal_right).unwrap();
-        assert_eq!(left.as_slice(), &[(0, ()), (2, ()), (2, ())]);
-    }
-
-    #[test]
-    fn vec_map_from_sorted() {
-        let vec = vec![(1, ()), (2, ()), (3, ()), (6, ())];
-        let vec_map = VecMap::from_iter(vec, VecMapOrdering::Greater);
-        assert_eq!(vec_map.as_slice(), &[(1, ()), (2, ()), (3, ()), (6, ())]);
-
-        let vec = vec![(1, ()), (2, ()), (3, ()), (3, ()), (6, ()), (6, ())];
-        let vec_map = VecMap::from_iter(vec, VecMapOrdering::GreaterOrEqual);
-        assert_eq!(
-            vec_map.as_slice(),
-            &[(1, ()), (2, ()), (3, ()), (3, ()), (6, ()), (6, ())]
-        );
-    }
-
-    #[test]
-    #[should_panic]
-    fn vec_map_from_unsorted_greater() {
-        let vec = vec![(1, ()), (2, ()), (2, ()), (3, ()), (6, ())];
-        let _ = VecMap::from_iter(vec, VecMapOrdering::Greater);
-    }
-
-    #[test]
-    #[should_panic]
-    fn vec_map_from_unsorted_greater_or_equal() {
-        let vec = vec![(1, ()), (2, ()), (3, ()), (6, ()), (5, ())];
-        let _ = VecMap::from_iter(vec, VecMapOrdering::GreaterOrEqual);
    }
 }
--- a/libs/utils/src/zstd.rs
+++ b/libs/utils/src/zstd.rs
@@ -1,78 +0,0 @@
-use std::io::SeekFrom;
-
-use anyhow::{Context, Result};
-use async_compression::{
-    tokio::{bufread::ZstdDecoder, write::ZstdEncoder},
-    zstd::CParameter,
-    Level,
-};
-use camino::Utf8Path;
-use nix::NixPath;
-use tokio::{
-    fs::{File, OpenOptions},
-    io::AsyncBufRead,
-    io::AsyncSeekExt,
-    io::AsyncWriteExt,
-};
-use tokio_tar::{Archive, Builder, HeaderMode};
-use walkdir::WalkDir;
-
-/// Creates a Zstandard tarball.
-pub async fn create_zst_tarball(path: &Utf8Path, tarball: &Utf8Path) -> Result<(File, u64)> {
-    let file = OpenOptions::new()
-        .create(true)
-        .truncate(true)
-        .read(true)
-        .write(true)
-        .open(&tarball)
-        .await
-        .with_context(|| format!("tempfile creation {tarball}"))?;
-
-    let mut paths = Vec::new();
-    for entry in WalkDir::new(path) {
-        let entry = entry?;
-        let metadata = entry.metadata().expect("error getting dir entry metadata");
-        // Also allow directories so that we also get empty directories
-        if !(metadata.is_file() || metadata.is_dir()) {
-            continue;
-        }
-        let path = entry.into_path();
-        paths.push(path);
-    }
-    // Do a sort to get a more consistent listing
-    paths.sort_unstable();
-    let zstd = ZstdEncoder::with_quality_and_params(
-        file,
-        Level::Default,
-        &[CParameter::enable_long_distance_matching(true)],
-    );
-    let mut builder = Builder::new(zstd);
-    // Use reproducible header mode
-    builder.mode(HeaderMode::Deterministic);
-    for p in paths {
-        let rel_path = p.strip_prefix(path)?;
-        if rel_path.is_empty() {
-            // The top directory should not be compressed,
-            // the tar crate doesn't like that
-            continue;
-        }
-        builder.append_path_with_name(&p, rel_path).await?;
-    }
-    let mut zstd = builder.into_inner().await?;
-    zstd.shutdown().await?;
-    let mut compressed = zstd.into_inner();
-    let compressed_len = compressed.metadata().await?.len();
-    compressed.seek(SeekFrom::Start(0)).await?;
-    Ok((compressed, compressed_len))
-}
-
-/// Creates a Zstandard tarball.
-pub async fn extract_zst_tarball(
-    path: &Utf8Path,
-    tarball: impl AsyncBufRead + Unpin,
-) -> Result<()> {
-    let decoder = Box::pin(ZstdDecoder::new(tarball));
-    let mut archive = Archive::new(decoder);
-    archive.unpack(path).await?;
-    Ok(())
-}
--- a/libs/vm_monitor/src/runner.rs
+++ b/libs/vm_monitor/src/runner.rs
@@ -69,7 +69,7 @@ pub struct Config {
    /// should be removed once we have a better solution there.
    sys_buffer_bytes: u64,

-    /// Minimum fraction of total system memory reserved *before* the cgroup threshold; in
+    /// Minimum fraction of total system memory reserved *before* the cgroup threshold; in
    /// other words, providing a ceiling for the highest value of the threshold by enforcing that
    /// there's at least `cgroup_min_overhead_fraction` of the total memory remaining beyond the
    /// threshold.
--- a/libs/walproposer/src/api_bindings.rs
+++ b/libs/walproposer/src/api_bindings.rs
@@ -324,11 +324,11 @@ extern "C" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) {
    }
 }

-extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, sk: *mut Safekeeper) {
+extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer) {
    unsafe {
        let callback_data = (*(*wp).config).callback_data;
        let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).process_safekeeper_feedback(&mut (*wp), &mut (*sk));
+        (*api).process_safekeeper_feedback(&mut (*wp))
    }
 }

--- a/libs/walproposer/src/walproposer.rs
+++ b/libs/walproposer/src/walproposer.rs
@@ -142,7 +142,7 @@ pub trait ApiImpl {
        todo!()
    }

-    fn process_safekeeper_feedback(&mut self, _wp: &mut WalProposer, _sk: &mut Safekeeper) {
+    fn process_safekeeper_feedback(&mut self, _wp: &mut WalProposer) {
        todo!()
    }

--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -37,6 +37,7 @@ humantime-serde.workspace = true
 hyper.workspace = true
 itertools.workspace = true
 leaky-bucket.workspace = true
+lz4_flex.workspace = true
 md5.workspace = true
 nix.workspace = true
 # hack to get the number of worker threads tokio uses
@@ -59,7 +60,6 @@ signal-hook.workspace = true
 smallvec = { workspace = true, features = ["write"] }
 svg_fmt.workspace = true
 sync_wrapper.workspace = true
-sysinfo.workspace = true
 tokio-tar.workspace = true
 thiserror.workspace = true
 tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
@@ -90,9 +90,6 @@ enumset = { workspace = true, features = ["serde"]}
 strum.workspace = true
 strum_macros.workspace = true

-[target.'cfg(target_os = "linux")'.dependencies]
-procfs.workspace = true
-
 [dev-dependencies]
 criterion.workspace = true
 hex-literal.workspace = true
--- a/pageserver/benches/bench_walredo.rs
+++ b/pageserver/benches/bench_walredo.rs
@@ -1,156 +1,160 @@
-//! Quantify a single walredo manager's throughput under N concurrent callers.
+//! Simple benchmarking around walredo.
 //!
-//! The benchmark implementation ([`bench_impl`]) is parametrized by
-//! - `redo_work` => [`Request::short_request`] or [`Request::medium_request`]
-//! - `n_redos` => number of times the benchmark shell execute the `redo_work`
-//! - `nclients` => number of clients (more on this shortly).
+//! Right now they hope to just set a baseline. Later we can try to expand into latency and
+//! throughput after figuring out the coordinated omission problems below.
 //!
-//! The benchmark impl sets up a multi-threaded tokio runtime with default parameters.
-//! It spawns `nclients` times [`client`] tokio tasks.
-//! Each task executes the `redo_work` `n_redos/nclients` times.
+//! There are two sets of inputs; `short` and `medium`. They were collected on postgres v14 by
+//! logging what happens when a sequential scan is requested on a small table, then picking out two
+//! suitable inputs from the logs.
 //!
-//! We exercise the following combinations:
-//! - `redo_work = short / medium``
-//! - `nclients = [1, 2, 4, 8, 16, 32, 64, 128]`
 //!
-//! We let `criterion` determine the `n_redos` using `iter_custom`.
-//! The idea is that for each `(redo_work, nclients)` combination,
-//! criterion will run the `bench_impl` multiple times with different `n_redos`.
-//! The `bench_impl` reports the aggregate wall clock time from the clients' perspective.
-//! Criterion will divide that by `n_redos` to compute the "time per iteration".
-//! In our case, "time per iteration" means "time per redo_work execution".
-//!
-//! NB: the way by which `iter_custom` determines the "number of iterations"
-//! is called sampling. Apparently the idea here is to detect outliers.
-//! We're not sure whether the current choice of sampling method makes sense.
-//! See https://bheisler.github.io/criterion.rs/book/user_guide/command_line_output.html#collecting-samples
-//!
-//! # Reference Numbers
-//!
-//! 2024-04-04 on i3en.3xlarge
-//!
-//! ```text
-//! short/1                 time:   [25.925 µs 26.060 µs 26.209 µs]
-//! short/2                 time:   [31.277 µs 31.483 µs 31.722 µs]
-//! short/4                 time:   [45.496 µs 45.831 µs 46.182 µs]
-//! short/8                 time:   [84.298 µs 84.920 µs 85.566 µs]
-//! short/16                time:   [185.04 µs 186.41 µs 187.88 µs]
-//! short/32                time:   [385.01 µs 386.77 µs 388.70 µs]
-//! short/64                time:   [770.24 µs 773.04 µs 776.04 µs]
-//! short/128               time:   [1.5017 ms 1.5064 ms 1.5113 ms]
-//! medium/1                time:   [106.65 µs 107.20 µs 107.85 µs]
-//! medium/2                time:   [153.28 µs 154.24 µs 155.56 µs]
-//! medium/4                time:   [325.67 µs 327.01 µs 328.71 µs]
-//! medium/8                time:   [646.82 µs 650.17 µs 653.91 µs]
-//! medium/16               time:   [1.2645 ms 1.2701 ms 1.2762 ms]
-//! medium/32               time:   [2.4409 ms 2.4550 ms 2.4692 ms]
-//! medium/64               time:   [4.6814 ms 4.7114 ms 4.7408 ms]
-//! medium/128              time:   [8.7790 ms 8.9037 ms 9.0282 ms]
-//! ```
+//! Reference data (git blame to see commit) on an i3en.3xlarge
+//! ```text
+//! short/short/1           time:   [39.175 µs 39.348 µs 39.536 µs]
+//! short/short/2           time:   [51.227 µs 51.487 µs 51.755 µs]
+//! short/short/4           time:   [76.048 µs 76.362 µs 76.674 µs]
+//! short/short/8           time:   [128.94 µs 129.82 µs 130.74 µs]
+//! short/short/16          time:   [227.84 µs 229.00 µs 230.28 µs]
+//! short/short/32          time:   [455.97 µs 457.81 µs 459.90 µs]
+//! short/short/64          time:   [902.46 µs 904.84 µs 907.32 µs]
+//! short/short/128         time:   [1.7416 ms 1.7487 ms 1.7561 ms]
+//! ```
+
+use std::sync::Arc;

 use bytes::{Buf, Bytes};
-use criterion::{BenchmarkId, Criterion};
-use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};
-use pageserver_api::{key::Key, shard::TenantShardId};
-use std::{
-    sync::Arc,
-    time::{Duration, Instant},
+use pageserver::{
+    config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager,
 };
-use tokio::{sync::Barrier, task::JoinSet};
+use pageserver_api::shard::TenantShardId;
+use tokio::task::JoinSet;
 use utils::{id::TenantId, lsn::Lsn};

-fn bench(c: &mut Criterion) {
-    {
-        let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
-        for nclients in nclients {
-            let mut group = c.benchmark_group("short");
-            group.bench_with_input(
-                BenchmarkId::from_parameter(nclients),
-                &nclients,
-                |b, nclients| {
-                    let redo_work = Arc::new(Request::short_input());
-                    b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
-                },
-            );
-        }
-    }
+use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};

-    {
-        let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
-        for nclients in nclients {
-            let mut group = c.benchmark_group("medium");
-            group.bench_with_input(
-                BenchmarkId::from_parameter(nclients),
-                &nclients,
-                |b, nclients| {
-                    let redo_work = Arc::new(Request::medium_input());
-                    b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
-                },
-            );
-        }
-    }
-}
-criterion::criterion_group!(benches, bench);
-criterion::criterion_main!(benches);
+fn redo_scenarios(c: &mut Criterion) {
+    // logging should be enabled when adding more inputs, since walredo will only report malformed
+    // input to the stderr.
+    // utils::logging::init(utils::logging::LogFormat::Plain).unwrap();

-// Returns the sum of each client's wall-clock time spent executing their share of the n_redos.
-fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration {
    let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap();

    let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
    let conf = Box::leak(Box::new(conf));
    let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());

+    let manager = PostgresRedoManager::new(conf, tenant_shard_id);
+
+    let manager = Arc::new(manager);
+
+    {
+        let rt = tokio::runtime::Builder::new_current_thread()
+            .enable_all()
+            .build()
+            .unwrap();
+        tracing::info!("executing first");
+        rt.block_on(short().execute(&manager)).unwrap();
+        tracing::info!("first executed");
+    }
+
+    let thread_counts = [1, 2, 4, 8, 16, 32, 64, 128];
+
+    let mut group = c.benchmark_group("short");
+    group.sampling_mode(criterion::SamplingMode::Flat);
+
+    for thread_count in thread_counts {
+        group.bench_with_input(
+            BenchmarkId::new("short", thread_count),
+            &thread_count,
+            |b, thread_count| {
+                add_multithreaded_walredo_requesters(b, *thread_count, &manager, short);
+            },
+        );
+    }
+    drop(group);
+
+    let mut group = c.benchmark_group("medium");
+    group.sampling_mode(criterion::SamplingMode::Flat);
+
+    for thread_count in thread_counts {
+        group.bench_with_input(
+            BenchmarkId::new("medium", thread_count),
+            &thread_count,
+            |b, thread_count| {
+                add_multithreaded_walredo_requesters(b, *thread_count, &manager, medium);
+            },
+        );
+    }
+    drop(group);
+}
+
+/// Sets up a multi-threaded tokio runtime with default worker thread count,
+/// then, spawn `requesters` tasks that repeatedly:
+/// - get input from `input_factory()`
+/// - call `manager.request_redo()` with their input
+///
+/// This stress-tests the scalability of a single walredo manager at high tokio-level concurrency.
+///
+/// Using tokio's default worker thread count means the results will differ on machines
+/// with different core counts. We don't care about that, the performance will always
+/// be different on different hardware. To compare performance of different software versions,
+/// use the same hardware.
+fn add_multithreaded_walredo_requesters(
+    b: &mut criterion::Bencher,
+    nrequesters: usize,
+    manager: &Arc<PostgresRedoManager>,
+    input_factory: fn() -> Request,
+) {
+    assert_ne!(nrequesters, 0);
+
    let rt = tokio::runtime::Builder::new_multi_thread()
        .enable_all()
        .build()
        .unwrap();

-    let start = Arc::new(Barrier::new(nclients as usize));
+    let barrier = Arc::new(tokio::sync::Barrier::new(nrequesters + 1));

-    let mut tasks = JoinSet::new();
-
-    let manager = PostgresRedoManager::new(conf, tenant_shard_id);
-    let manager = Arc::new(manager);
-
-    for _ in 0..nclients {
-        rt.block_on(async {
-            tasks.spawn(client(
-                Arc::clone(&manager),
-                Arc::clone(&start),
-                Arc::clone(&redo_work),
-                // divide the amount of work equally among the clients
-                n_redos / nclients,
-            ))
+    let mut requesters = JoinSet::new();
+    for _ in 0..nrequesters {
+        let _entered = rt.enter();
+        let manager = manager.clone();
+        let barrier = barrier.clone();
+        requesters.spawn(async move {
+            loop {
+                let input = input_factory();
+                barrier.wait().await;
+                let page = input.execute(&manager).await.unwrap();
+                assert_eq!(page.remaining(), 8192);
+                barrier.wait().await;
+            }
        });
    }

-    rt.block_on(async move {
-        let mut total_wallclock_time = std::time::Duration::from_millis(0);
-        while let Some(res) = tasks.join_next().await {
-            total_wallclock_time += res.unwrap();
-        }
-        total_wallclock_time
-    })
+    let do_one_iteration = || {
+        rt.block_on(async {
+            barrier.wait().await;
+            // wait for work to complete
+            barrier.wait().await;
+        })
+    };
+
+    b.iter_batched(
+        || {
+            // warmup
+            do_one_iteration();
+        },
+        |()| {
+            // work loop
+            do_one_iteration();
+        },
+        criterion::BatchSize::PerIteration,
+    );
+
+    rt.block_on(requesters.shutdown());
 }

-async fn client(
-    mgr: Arc<PostgresRedoManager>,
-    start: Arc<Barrier>,
-    redo_work: Arc<Request>,
-    n_redos: u64,
-) -> Duration {
-    start.wait().await;
-    let start = Instant::now();
-    for _ in 0..n_redos {
-        let page = redo_work.execute(&mgr).await.unwrap();
-        assert_eq!(page.remaining(), 8192);
-        // The real pageserver will rarely if ever do 2 walredos in a row without
-        // yielding to the executor.
-        tokio::task::yield_now().await;
-    }
-    start.elapsed()
-}
+criterion_group!(benches, redo_scenarios);
+criterion_main!(benches);

 macro_rules! lsn {
    ($input:expr) => {{
@@ -162,46 +166,12 @@ macro_rules! lsn {
    }};
 }

-/// Simple wrapper around `WalRedoManager::request_redo`.
-///
-/// In benchmarks this is cloned around.
-#[derive(Clone)]
-struct Request {
-    key: Key,
-    lsn: Lsn,
-    base_img: Option<(Lsn, Bytes)>,
-    records: Vec<(Lsn, NeonWalRecord)>,
-    pg_version: u32,
-}
-
-impl Request {
-    async fn execute(&self, manager: &PostgresRedoManager) -> anyhow::Result<Bytes> {
-        let Request {
-            key,
-            lsn,
-            base_img,
-            records,
-            pg_version,
-        } = self;
-
-        // TODO: avoid these clones
-        manager
-            .request_redo(*key, *lsn, base_img.clone(), records.clone(), *pg_version)
-            .await
-    }
-
-    fn pg_record(will_init: bool, bytes: &'static [u8]) -> NeonWalRecord {
-        let rec = Bytes::from_static(bytes);
-        NeonWalRecord::Postgres { will_init, rec }
-    }
-
-    /// Short payload, 1132 bytes.
-    // pg_records are copypasted from log, where they are put with Debug impl of Bytes, which uses \0
-    // for null bytes.
-    #[allow(clippy::octal_escapes)]
-    pub fn short_input() -> Request {
-        let pg_record = Self::pg_record;
-        Request {
+/// Short payload, 1132 bytes.
+// pg_records are copypasted from log, where they are put with Debug impl of Bytes, which uses \0
+// for null bytes.
+#[allow(clippy::octal_escapes)]
+fn short() -> Request {
+    Request {
        key: Key {
            field1: 0,
            field2: 1663,
@@ -224,14 +194,13 @@ impl Request {
        ],
        pg_version: 14,
    }
-    }
+}

-    /// Medium sized payload, serializes as 26393 bytes.
-    // see [`short`]
-    #[allow(clippy::octal_escapes)]
-    pub fn medium_input() -> Request {
-        let pg_record = Self::pg_record;
-        Request {
+/// Medium sized payload, serializes as 26393 bytes.
+// see [`short`]
+#[allow(clippy::octal_escapes)]
+fn medium() -> Request {
+    Request {
        key: Key {
            field1: 0,
            field2: 1663,
@@ -473,5 +442,37 @@ impl Request {
        ],
        pg_version: 14,
    }
+}
+
+fn pg_record(will_init: bool, bytes: &'static [u8]) -> NeonWalRecord {
+    let rec = Bytes::from_static(bytes);
+    NeonWalRecord::Postgres { will_init, rec }
+}
+
+/// Simple wrapper around `WalRedoManager::request_redo`.
+///
+/// In benchmarks this is cloned around.
+#[derive(Clone)]
+struct Request {
+    key: Key,
+    lsn: Lsn,
+    base_img: Option<(Lsn, Bytes)>,
+    records: Vec<(Lsn, NeonWalRecord)>,
+    pg_version: u32,
+}
+
+impl Request {
+    async fn execute(self, manager: &PostgresRedoManager) -> anyhow::Result<Bytes> {
+        let Request {
+            key,
+            lsn,
+            base_img,
+            records,
+            pg_version,
+        } = self;
+
+        manager
+            .request_redo(key, lsn, base_img, records, pg_version)
+            .await
    }
 }
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -128,12 +128,12 @@ impl Client {

    pub async fn timeline_info(
        &self,
-        tenant_shard_id: TenantShardId,
+        tenant_id: TenantId,
        timeline_id: TimelineId,
        force_await_logical_size: ForceAwaitLogicalSize,
    ) -> Result<pageserver_api::models::TimelineInfo> {
        let uri = format!(
-            "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}",
+            "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
            self.mgmt_api_endpoint
        );

@@ -151,11 +151,11 @@ impl Client {

    pub async fn keyspace(
        &self,
-        tenant_shard_id: TenantShardId,
+        tenant_id: TenantId,
        timeline_id: TimelineId,
    ) -> Result<pageserver_api::models::partitioning::Partitioning> {
        let uri = format!(
-            "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/keyspace",
+            "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/keyspace",
            self.mgmt_api_endpoint
        );
        self.get(&uri)
@@ -169,7 +169,7 @@ impl Client {
        self.request(Method::GET, uri, ()).await
    }

-    async fn request_noerror<B: serde::Serialize, U: reqwest::IntoUrl>(
+    async fn request<B: serde::Serialize, U: reqwest::IntoUrl>(
        &self,
        method: Method,
        uri: U,
@@ -181,16 +181,7 @@ impl Client {
        } else {
            req
        };
-        req.json(&body).send().await.map_err(Error::ReceiveBody)
-    }
-
-    async fn request<B: serde::Serialize, U: reqwest::IntoUrl>(
-        &self,
-        method: Method,
-        uri: U,
-        body: B,
-    ) -> Result<reqwest::Response> {
-        let res = self.request_noerror(method, uri, body).await?;
+        let res = req.json(&body).send().await.map_err(Error::ReceiveBody)?;
        let response = res.error_from_body().await?;
        Ok(response)
    }
@@ -249,26 +240,13 @@ impl Client {
        Ok(())
    }

-    pub async fn tenant_secondary_download(
-        &self,
-        tenant_id: TenantShardId,
-        wait: Option<std::time::Duration>,
-    ) -> Result<(StatusCode, SecondaryProgress)> {
-        let mut path = reqwest::Url::parse(&format!(
+    pub async fn tenant_secondary_download(&self, tenant_id: TenantShardId) -> Result<()> {
+        let uri = format!(
            "{}/v1/tenant/{}/secondary/download",
            self.mgmt_api_endpoint, tenant_id
-        ))
-        .expect("Cannot build URL");
-
-        if let Some(wait) = wait {
-            path.query_pairs_mut()
-                .append_pair("wait_ms", &format!("{}", wait.as_millis()));
-        }
-
-        let response = self.request(Method::POST, path, ()).await?;
-        let status = response.status();
-        let progress: SecondaryProgress = response.json().await.map_err(Error::ReceiveBody)?;
-        Ok((status, progress))
+        );
+        self.request(Method::POST, &uri, ()).await?;
+        Ok(())
    }

    pub async fn location_config(
@@ -438,77 +416,4 @@ impl Client {
            .await
            .map_err(Error::ReceiveBody)
    }
-
-    pub async fn get_utilization(&self) -> Result<PageserverUtilization> {
-        let uri = format!("{}/v1/utilization", self.mgmt_api_endpoint);
-        self.get(uri)
-            .await?
-            .json()
-            .await
-            .map_err(Error::ReceiveBody)
-    }
-
-    pub async fn layer_map_info(
-        &self,
-        tenant_shard_id: TenantShardId,
-        timeline_id: TimelineId,
-    ) -> Result<LayerMapInfo> {
-        let uri = format!(
-            "{}/v1/tenant/{}/timeline/{}/layer",
-            self.mgmt_api_endpoint, tenant_shard_id, timeline_id,
-        );
-        self.get(&uri)
-            .await?
-            .json()
-            .await
-            .map_err(Error::ReceiveBody)
-    }
-
-    pub async fn layer_evict(
-        &self,
-        tenant_shard_id: TenantShardId,
-        timeline_id: TimelineId,
-        layer_file_name: &str,
-    ) -> Result<bool> {
-        let uri = format!(
-            "{}/v1/tenant/{}/timeline/{}/layer/{}",
-            self.mgmt_api_endpoint, tenant_shard_id, timeline_id, layer_file_name
-        );
-        let resp = self.request_noerror(Method::DELETE, &uri, ()).await?;
-        match resp.status() {
-            StatusCode::OK => Ok(true),
-            StatusCode::NOT_MODIFIED => Ok(false),
-            // TODO: dedupe this pattern / introduce separate error variant?
-            status => Err(match resp.json::<HttpErrorBody>().await {
-                Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg),
-                Err(_) => {
-                    Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri))
-                }
-            }),
-        }
-    }
-
-    pub async fn layer_ondemand_download(
-        &self,
-        tenant_shard_id: TenantShardId,
-        timeline_id: TimelineId,
-        layer_file_name: &str,
-    ) -> Result<bool> {
-        let uri = format!(
-            "{}/v1/tenant/{}/timeline/{}/layer/{}",
-            self.mgmt_api_endpoint, tenant_shard_id, timeline_id, layer_file_name
-        );
-        let resp = self.request_noerror(Method::GET, &uri, ()).await?;
-        match resp.status() {
-            StatusCode::OK => Ok(true),
-            StatusCode::NOT_MODIFIED => Ok(false),
-            // TODO: dedupe this pattern / introduce separate error variant?
-            status => Err(match resp.json::<HttpErrorBody>().await {
-                Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg),
-                Err(_) => {
-                    Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri))
-                }
-            }),
-        }
-    }
 }
--- a/pageserver/compaction/src/compact_tiered.rs
+++ b/pageserver/compaction/src/compact_tiered.rs
@@ -43,8 +43,7 @@ pub async fn compact_tiered<E: CompactionJobExecutor>(
    fanout: u64,
    ctx: &E::RequestContext,
 ) -> anyhow::Result<()> {
-    assert!(fanout >= 1, "fanout needs to be at least 1 but is {fanout}");
-    let exp_base = fanout.max(2);
+    assert!(fanout >= 2);
    // Start at L0
    let mut current_level_no = 0;
    let mut current_level_target_height = target_file_size;
@@ -107,7 +106,7 @@ pub async fn compact_tiered<E: CompactionJobExecutor>(
            break;
        }
        current_level_no += 1;
-        current_level_target_height = current_level_target_height.saturating_mul(exp_base);
+        current_level_target_height = current_level_target_height.saturating_mul(fanout);
    }
    Ok(())
 }
--- a/pageserver/pagebench/src/cmd/basebackup.rs
+++ b/pageserver/pagebench/src/cmd/basebackup.rs
@@ -1,5 +1,4 @@
 use anyhow::Context;
-use pageserver_api::shard::TenantShardId;
 use pageserver_client::mgmt_api::ForceAwaitLogicalSize;
 use pageserver_client::page_service::BasebackupRequest;

@@ -96,7 +95,7 @@ async fn main_impl(
            let timeline = *timeline;
            let info = mgmt_api_client
                .timeline_info(
-                    TenantShardId::unsharded(timeline.tenant_id),
+                    timeline.tenant_id,
                    timeline.timeline_id,
                    ForceAwaitLogicalSize::No,
                )
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -4,7 +4,6 @@ use pageserver_api::key::{is_rel_block_key, key_to_rel_block, Key};
 use pageserver_api::keyspace::KeySpaceAccum;
 use pageserver_api::models::PagestreamGetPageRequest;

-use pageserver_api::shard::TenantShardId;
 use tokio_util::sync::CancellationToken;
 use utils::id::TenantTimelineId;
 use utils::lsn::Lsn;
@@ -174,10 +173,7 @@ async fn main_impl(
                let timeline = *timeline;
                async move {
                    let partitioning = mgmt_api_client
-                        .keyspace(
-                            TenantShardId::unsharded(timeline.tenant_id),
-                            timeline.timeline_id,
-                        )
+                        .keyspace(timeline.tenant_id, timeline.timeline_id)
                        .await?;
                    let lsn = partitioning.at_lsn;
                    let start = Instant::now();
--- a/pageserver/pagebench/src/cmd/ondemand_download_churn.rs
+++ b/pageserver/pagebench/src/cmd/ondemand_download_churn.rs
@@ -1,272 +0,0 @@
-use pageserver_api::{models::HistoricLayerInfo, shard::TenantShardId};
-
-use pageserver_client::mgmt_api;
-use rand::seq::SliceRandom;
-use tracing::{debug, info};
-use utils::id::{TenantTimelineId, TimelineId};
-
-use tokio::{
-    sync::{mpsc, OwnedSemaphorePermit},
-    task::JoinSet,
-};
-
-use std::{
-    num::NonZeroUsize,
-    sync::{
-        atomic::{AtomicU64, Ordering},
-        Arc,
-    },
-    time::{Duration, Instant},
-};
-
-/// Evict & on-demand download random layers.
-#[derive(clap::Parser)]
-pub(crate) struct Args {
-    #[clap(long, default_value = "http://localhost:9898")]
-    mgmt_api_endpoint: String,
-    #[clap(long)]
-    pageserver_jwt: Option<String>,
-    #[clap(long)]
-    runtime: Option<humantime::Duration>,
-    #[clap(long, default_value = "1")]
-    tasks_per_target: NonZeroUsize,
-    #[clap(long, default_value = "1")]
-    concurrency_per_target: NonZeroUsize,
-    /// Probability for sending `latest=true` in the request (uniform distribution).
-    #[clap(long)]
-    limit_to_first_n_targets: Option<usize>,
-    /// Before starting the benchmark, live-reconfigure the pageserver to use the given
-    /// [`pageserver_api::models::virtual_file::IoEngineKind`].
-    #[clap(long)]
-    set_io_engine: Option<pageserver_api::models::virtual_file::IoEngineKind>,
-    targets: Option<Vec<TenantTimelineId>>,
-}
-
-pub(crate) fn main(args: Args) -> anyhow::Result<()> {
-    let rt = tokio::runtime::Builder::new_multi_thread()
-        .enable_all()
-        .build()?;
-    let task = rt.spawn(main_impl(args));
-    rt.block_on(task).unwrap().unwrap();
-    Ok(())
-}
-
-#[derive(Debug, Default)]
-struct LiveStats {
-    evictions: AtomicU64,
-    downloads: AtomicU64,
-    timeline_restarts: AtomicU64,
-}
-
-impl LiveStats {
-    fn eviction_done(&self) {
-        self.evictions.fetch_add(1, Ordering::Relaxed);
-    }
-    fn download_done(&self) {
-        self.downloads.fetch_add(1, Ordering::Relaxed);
-    }
-    fn timeline_restart_done(&self) {
-        self.timeline_restarts.fetch_add(1, Ordering::Relaxed);
-    }
-}
-
-async fn main_impl(args: Args) -> anyhow::Result<()> {
-    let args: &'static Args = Box::leak(Box::new(args));
-
-    let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
-        args.mgmt_api_endpoint.clone(),
-        args.pageserver_jwt.as_deref(),
-    ));
-
-    if let Some(engine_str) = &args.set_io_engine {
-        mgmt_api_client.put_io_engine(engine_str).await?;
-    }
-
-    // discover targets
-    let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
-        &mgmt_api_client,
-        crate::util::cli::targets::Spec {
-            limit_to_first_n_targets: args.limit_to_first_n_targets,
-            targets: args.targets.clone(),
-        },
-    )
-    .await?;
-
-    let mut tasks = JoinSet::new();
-
-    let live_stats = Arc::new(LiveStats::default());
-    tasks.spawn({
-        let live_stats = Arc::clone(&live_stats);
-        async move {
-            let mut last_at = Instant::now();
-            loop {
-                tokio::time::sleep_until((last_at + Duration::from_secs(1)).into()).await;
-                let now = Instant::now();
-                let delta: Duration = now - last_at;
-                last_at = now;
-
-                let LiveStats {
-                    evictions,
-                    downloads,
-                    timeline_restarts,
-                } = &*live_stats;
-                let evictions = evictions.swap(0, Ordering::Relaxed) as f64 / delta.as_secs_f64();
-                let downloads = downloads.swap(0, Ordering::Relaxed) as f64 / delta.as_secs_f64();
-                let timeline_restarts = timeline_restarts.swap(0, Ordering::Relaxed);
-                info!("evictions={evictions:.2}/s downloads={downloads:.2}/s timeline_restarts={timeline_restarts}");
-            }
-        }
-    });
-
-    for tl in timelines {
-        for _ in 0..args.tasks_per_target.get() {
-            tasks.spawn(timeline_actor(
-                args,
-                Arc::clone(&mgmt_api_client),
-                tl,
-                Arc::clone(&live_stats),
-            ));
-        }
-    }
-
-    while let Some(res) = tasks.join_next().await {
-        res.unwrap();
-    }
-    Ok(())
-}
-
-async fn timeline_actor(
-    args: &'static Args,
-    mgmt_api_client: Arc<pageserver_client::mgmt_api::Client>,
-    timeline: TenantTimelineId,
-    live_stats: Arc<LiveStats>,
-) {
-    // TODO: support sharding
-    let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id);
-
-    struct Timeline {
-        joinset: JoinSet<()>,
-        layers: Vec<mpsc::Sender<OwnedSemaphorePermit>>,
-        concurrency: Arc<tokio::sync::Semaphore>,
-    }
-    loop {
-        debug!("restarting timeline");
-        let layer_map_info = mgmt_api_client
-            .layer_map_info(tenant_shard_id, timeline.timeline_id)
-            .await
-            .unwrap();
-        let concurrency = Arc::new(tokio::sync::Semaphore::new(
-            args.concurrency_per_target.get(),
-        ));
-
-        let mut joinset = JoinSet::new();
-        let layers = layer_map_info
-            .historic_layers
-            .into_iter()
-            .map(|historic_layer| {
-                let (tx, rx) = mpsc::channel(1);
-                joinset.spawn(layer_actor(
-                    tenant_shard_id,
-                    timeline.timeline_id,
-                    historic_layer,
-                    rx,
-                    Arc::clone(&mgmt_api_client),
-                    Arc::clone(&live_stats),
-                ));
-                tx
-            })
-            .collect::<Vec<_>>();
-
-        let mut timeline = Timeline {
-            joinset,
-            layers,
-            concurrency,
-        };
-
-        live_stats.timeline_restart_done();
-
-        loop {
-            assert!(!timeline.joinset.is_empty());
-            if let Some(res) = timeline.joinset.try_join_next() {
-                debug!(?res, "a layer actor exited, should not happen");
-                timeline.joinset.shutdown().await;
-                break;
-            }
-
-            let mut permit = Some(
-                Arc::clone(&timeline.concurrency)
-                    .acquire_owned()
-                    .await
-                    .unwrap(),
-            );
-
-            loop {
-                let layer_tx = {
-                    let mut rng = rand::thread_rng();
-                    timeline.layers.choose_mut(&mut rng).expect("no layers")
-                };
-                match layer_tx.try_send(permit.take().unwrap()) {
-                    Ok(_) => break,
-                    Err(e) => match e {
-                        mpsc::error::TrySendError::Full(back) => {
-                            // TODO: retrying introduces bias away from slow downloaders
-                            permit.replace(back);
-                        }
-                        mpsc::error::TrySendError::Closed(_) => panic!(),
-                    },
-                }
-            }
-        }
-    }
-}
-
-async fn layer_actor(
-    tenant_shard_id: TenantShardId,
-    timeline_id: TimelineId,
-    mut layer: HistoricLayerInfo,
-    mut rx: mpsc::Receiver<tokio::sync::OwnedSemaphorePermit>,
-    mgmt_api_client: Arc<mgmt_api::Client>,
-    live_stats: Arc<LiveStats>,
-) {
-    #[derive(Clone, Copy)]
-    enum Action {
-        Evict,
-        OnDemandDownload,
-    }
-
-    while let Some(_permit) = rx.recv().await {
-        let action = if layer.is_remote() {
-            Action::OnDemandDownload
-        } else {
-            Action::Evict
-        };
-
-        let did_it = match action {
-            Action::Evict => {
-                let did_it = mgmt_api_client
-                    .layer_evict(tenant_shard_id, timeline_id, layer.layer_file_name())
-                    .await
-                    .unwrap();
-                live_stats.eviction_done();
-                did_it
-            }
-            Action::OnDemandDownload => {
-                let did_it = mgmt_api_client
-                    .layer_ondemand_download(tenant_shard_id, timeline_id, layer.layer_file_name())
-                    .await
-                    .unwrap();
-                live_stats.download_done();
-                did_it
-            }
-        };
-        if !did_it {
-            debug!("local copy of layer map appears out of sync, re-downloading");
-            return;
-        }
-        debug!("did it");
-        layer.set_remote(match action {
-            Action::Evict => true,
-            Action::OnDemandDownload => false,
-        });
-    }
-}
--- a/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs
+++ b/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs
@@ -1,7 +1,6 @@
 use std::sync::Arc;

 use humantime::Duration;
-use pageserver_api::shard::TenantShardId;
 use tokio::task::JoinSet;
 use utils::id::TenantTimelineId;

@@ -60,11 +59,7 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
        let mgmt_api_client = Arc::clone(&mgmt_api_client);
        js.spawn(async move {
            let info = mgmt_api_client
-                .timeline_info(
-                    TenantShardId::unsharded(tl.tenant_id),
-                    tl.timeline_id,
-                    ForceAwaitLogicalSize::Yes,
-                )
+                .timeline_info(tl.tenant_id, tl.timeline_id, ForceAwaitLogicalSize::Yes)
                .await
                .unwrap();

@@ -79,11 +74,7 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
                while !info.current_logical_size_is_accurate {
                    ticker.tick().await;
                    info = mgmt_api_client
-                        .timeline_info(
-                            TenantShardId::unsharded(tl.tenant_id),
-                            tl.timeline_id,
-                            ForceAwaitLogicalSize::Yes,
-                        )
+                        .timeline_info(tl.tenant_id, tl.timeline_id, ForceAwaitLogicalSize::Yes)
                        .await
                        .unwrap();
                }
--- a/pageserver/pagebench/src/main.rs
+++ b/pageserver/pagebench/src/main.rs
@@ -16,7 +16,6 @@ mod util {
 mod cmd {
    pub(super) mod basebackup;
    pub(super) mod getpage_latest_lsn;
-    pub(super) mod ondemand_download_churn;
    pub(super) mod trigger_initial_size_calculation;
 }

@@ -26,7 +25,6 @@ enum Args {
    Basebackup(cmd::basebackup::Args),
    GetPageLatestLsn(cmd::getpage_latest_lsn::Args),
    TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args),
-    OndemandDownloadChurn(cmd::ondemand_download_churn::Args),
 }

 fn main() {
@@ -45,7 +43,6 @@ fn main() {
        Args::TriggerInitialSizeCalculation(args) => {
            cmd::trigger_initial_size_calculation::main(args)
        }
-        Args::OndemandDownloadChurn(args) => cmd::ondemand_download_churn::main(args),
    }
    .unwrap()
 }
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -18,7 +18,6 @@ use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
 use pageserver::task_mgr::WALRECEIVER_RUNTIME;
 use pageserver::tenant::{secondary, TenantSharedResources};
 use remote_storage::GenericRemoteStorage;
-use tokio::signal::unix::SignalKind;
 use tokio::time::Instant;
 use tracing::*;

@@ -121,9 +120,6 @@ fn main() -> anyhow::Result<()> {
        &[("node_id", &conf.id.to_string())],
    );

-    // after setting up logging, log the effective IO engine choice
-    info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
-
    let tenants_path = conf.tenants_path();
    if !tenants_path.exists() {
        utils::crashsafe::create_dir_all(conf.tenants_path())
@@ -318,7 +314,6 @@ fn start_pageserver(
    let http_listener = tcp_listener::bind(http_addr)?;

    let pg_addr = &conf.listen_pg_addr;
-
    info!("Starting pageserver pg protocol handler on {pg_addr}");
    let pageserver_listener = tcp_listener::bind(pg_addr)?;

@@ -551,7 +546,7 @@ fn start_pageserver(
        let router_state = Arc::new(
            http::routes::State::new(
                conf,
-                tenant_manager.clone(),
+                tenant_manager,
                http_auth.clone(),
                remote_storage.clone(),
                broker_client.clone(),
@@ -601,37 +596,32 @@ fn start_pageserver(
            None,
            "consumption metrics collection",
            true,
-            {
-                let tenant_manager = tenant_manager.clone();
-                async move {
-                    // first wait until background jobs are cleared to launch.
-                    //
-                    // this is because we only process active tenants and timelines, and the
-                    // Timeline::get_current_logical_size will spawn the logical size calculation,
-                    // which will not be rate-limited.
-                    let cancel = task_mgr::shutdown_token();
+            async move {
+                // first wait until background jobs are cleared to launch.
+                //
+                // this is because we only process active tenants and timelines, and the
+                // Timeline::get_current_logical_size will spawn the logical size calculation,
+                // which will not be rate-limited.
+                let cancel = task_mgr::shutdown_token();

-                    tokio::select! {
-                        _ = cancel.cancelled() => { return Ok(()); },
-                        _ = background_jobs_barrier.wait() => {}
-                    };
+                tokio::select! {
+                    _ = cancel.cancelled() => { return Ok(()); },
+                    _ = background_jobs_barrier.wait() => {}
+                };

-                    pageserver::consumption_metrics::collect_metrics(
-                        tenant_manager,
-                        metric_collection_endpoint,
-                        &conf.metric_collection_bucket,
-                        conf.metric_collection_interval,
-                        conf.cached_metric_collection_interval,
-                        conf.synthetic_size_calculation_interval,
-                        conf.id,
-                        local_disk_storage,
-                        cancel,
-                        metrics_ctx,
-                    )
-                    .instrument(info_span!("metrics_collection"))
-                    .await?;
-                    Ok(())
-                }
+                pageserver::consumption_metrics::collect_metrics(
+                    metric_collection_endpoint,
+                    conf.metric_collection_interval,
+                    conf.cached_metric_collection_interval,
+                    conf.synthetic_size_calculation_interval,
+                    conf.id,
+                    local_disk_storage,
+                    cancel,
+                    metrics_ctx,
+                )
+                .instrument(info_span!("metrics_collection"))
+                .await?;
+                Ok(())
            },
        );
    }
@@ -672,37 +662,41 @@ fn start_pageserver(
    let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());

    // All started up! Now just sit and wait for shutdown signal.
-
    {
-        BACKGROUND_RUNTIME.block_on(async move {
-            let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt()).unwrap();
-            let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate()).unwrap();
-            let mut sigquit = tokio::signal::unix::signal(SignalKind::quit()).unwrap();
-            let signal = tokio::select! {
-                _ = sigquit.recv() => {
-                    info!("Got signal SIGQUIT. Terminating in immediate shutdown mode",);
-                    std::process::exit(111);
-                }
-                _ = sigint.recv() => { "SIGINT" },
-                _ = sigterm.recv() => { "SIGTERM" },
-            };
+        use signal_hook::consts::*;
+        let signal_handler = BACKGROUND_RUNTIME.spawn_blocking(move || {
+            let mut signals =
+                signal_hook::iterator::Signals::new([SIGINT, SIGTERM, SIGQUIT]).unwrap();
+            return signals
+                .forever()
+                .next()
+                .expect("forever() never returns None unless explicitly closed");
+        });
+        let signal = BACKGROUND_RUNTIME
+            .block_on(signal_handler)
+            .expect("join error");
+        match signal {
+            SIGQUIT => {
+                info!("Got signal {signal}. Terminating in immediate shutdown mode",);
+                std::process::exit(111);
+            }
+            SIGINT | SIGTERM => {
+                info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",);

-            info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",);
-
-            // This cancels the `shutdown_pageserver` cancellation tree.
-            // Right now that tree doesn't reach very far, and `task_mgr` is used instead.
-            // The plan is to change that over time.
-            shutdown_pageserver.take();
-            let bg_remote_storage = remote_storage.clone();
-            let bg_deletion_queue = deletion_queue.clone();
-            pageserver::shutdown_pageserver(
-                &tenant_manager,
-                bg_remote_storage.map(|_| bg_deletion_queue),
-                0,
-            )
-            .await;
-            unreachable!()
-        })
+                // This cancels the `shutdown_pageserver` cancellation tree.
+                // Right now that tree doesn't reach very far, and `task_mgr` is used instead.
+                // The plan is to change that over time.
+                shutdown_pageserver.take();
+                let bg_remote_storage = remote_storage.clone();
+                let bg_deletion_queue = deletion_queue.clone();
+                BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(
+                    bg_remote_storage.map(|_| bg_deletion_queue),
+                    0,
+                ));
+                unreachable!()
+            }
+            _ => unreachable!(),
+        }
    }
 }

--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -30,17 +30,18 @@ use utils::{
    logging::LogFormat,
 };

+use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
+use crate::tenant::config::TenantConf;
 use crate::tenant::config::TenantConfOpt;
 use crate::tenant::timeline::GetVectoredImpl;
 use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
 use crate::tenant::{
    TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
 };
-use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine};
-use crate::{tenant::config::TenantConf, virtual_file};
+use crate::virtual_file;
 use crate::{
    IGNORED_TENANT_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME,
-    TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX,
+    TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
 };

 use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP;
@@ -95,8 +96,6 @@ pub mod defaults {

    pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true;

-    pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
-
    ///
    /// Default built-in configuration file.
    ///
@@ -158,8 +157,6 @@ pub mod defaults {
 #heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY}
 #secondary_download_concurrency = {DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY}

-#ephemeral_bytes_per_memory_kb = {DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB}
-
 [remote_storage]

 "#
@@ -238,7 +235,6 @@ pub struct PageServerConf {
    // How often to send unchanged cached metrics to the metrics endpoint.
    pub cached_metric_collection_interval: Duration,
    pub metric_collection_endpoint: Option<Url>,
-    pub metric_collection_bucket: Option<RemoteStorageConfig>,
    pub synthetic_size_calculation_interval: Duration,

    pub disk_usage_based_eviction: Option<DiskUsageEvictionTaskConfig>,
@@ -283,13 +279,6 @@ pub struct PageServerConf {
    pub max_vectored_read_bytes: MaxVectoredReadBytes,

    pub validate_vectored_get: bool,
-
-    /// How many bytes of ephemeral layer content will we allow per kilobyte of RAM.  When this
-    /// is exceeded, we start proactively closing ephemeral layers to limit the total amount
-    /// of ephemeral data.
-    ///
-    /// Setting this to zero disables limits on total ephemeral layer size.
-    pub ephemeral_bytes_per_memory_kb: usize,
 }

 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -302,23 +291,16 @@ pub static SAFEKEEPER_AUTH_TOKEN: OnceCell<Arc<String>> = OnceCell::new();

 // use dedicated enum for builder to better indicate the intention
 // and avoid possible confusion with nested options
-#[derive(Clone, Default)]
 pub enum BuilderValue<T> {
    Set(T),
-    #[default]
    NotSet,
 }

-impl<T: Clone> BuilderValue<T> {
-    pub fn ok_or(&self, field_name: &'static str, default: BuilderValue<T>) -> anyhow::Result<T> {
+impl<T> BuilderValue<T> {
+    pub fn ok_or<E>(self, err: E) -> Result<T, E> {
        match self {
-            Self::Set(v) => Ok(v.clone()),
-            Self::NotSet => match default {
-                BuilderValue::Set(v) => Ok(v.clone()),
-                BuilderValue::NotSet => {
-                    anyhow::bail!("missing config value {field_name:?}")
-                }
-            },
+            Self::Set(v) => Ok(v),
+            Self::NotSet => Err(err),
        }
    }
 }
@@ -344,7 +326,6 @@ pub(crate) struct NodeMetadata {
 }

 // needed to simplify config construction
-#[derive(Default)]
 struct PageServerConfigBuilder {
    listen_pg_addr: BuilderValue<String>,

@@ -385,7 +366,6 @@ struct PageServerConfigBuilder {
    cached_metric_collection_interval: BuilderValue<Duration>,
    metric_collection_endpoint: BuilderValue<Option<Url>>,
    synthetic_size_calculation_interval: BuilderValue<Duration>,
-    metric_collection_bucket: BuilderValue<Option<RemoteStorageConfig>>,

    disk_usage_based_eviction: BuilderValue<Option<DiskUsageEvictionTaskConfig>>,

@@ -411,13 +391,10 @@ struct PageServerConfigBuilder {
    max_vectored_read_bytes: BuilderValue<MaxVectoredReadBytes>,

    validate_vectored_get: BuilderValue<bool>,
-
-    ephemeral_bytes_per_memory_kb: BuilderValue<usize>,
 }

-impl PageServerConfigBuilder {
-    #[inline(always)]
-    fn default_values() -> Self {
+impl Default for PageServerConfigBuilder {
+    fn default() -> Self {
        use self::BuilderValue::*;
        use defaults::*;
        Self {
@@ -470,8 +447,6 @@ impl PageServerConfigBuilder {
            .expect("cannot parse default synthetic size calculation interval")),
            metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT),

-            metric_collection_bucket: Set(None),
-
            disk_usage_based_eviction: Set(None),

            test_remote_failures: Set(0),
@@ -499,7 +474,6 @@ impl PageServerConfigBuilder {
                NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
            )),
            validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
-            ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
        }
    }
 }
@@ -604,13 +578,6 @@ impl PageServerConfigBuilder {
        self.metric_collection_endpoint = BuilderValue::Set(metric_collection_endpoint)
    }

-    pub fn metric_collection_bucket(
-        &mut self,
-        metric_collection_bucket: Option<RemoteStorageConfig>,
-    ) {
-        self.metric_collection_bucket = BuilderValue::Set(metric_collection_bucket)
-    }
-
    pub fn synthetic_size_calculation_interval(
        &mut self,
        synthetic_size_calculation_interval: Duration,
@@ -679,103 +646,126 @@ impl PageServerConfigBuilder {
        self.validate_vectored_get = BuilderValue::Set(value);
    }

-    pub fn get_ephemeral_bytes_per_memory_kb(&mut self, value: usize) {
-        self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value);
-    }
-
    pub fn build(self) -> anyhow::Result<PageServerConf> {
-        let default = Self::default_values();
-
-        macro_rules! conf {
-            (USING DEFAULT { $($field:ident,)* } CUSTOM LOGIC { $($custom_field:ident : $custom_value:expr,)* } ) => {
-                PageServerConf {
-                    $(
-                        $field: self.$field.ok_or(stringify!($field), default.$field)?,
-                    )*
-                    $(
-                        $custom_field: $custom_value,
-                    )*
-                }
-            };
-        }
-
-        Ok(conf!(
-            USING DEFAULT
-            {
-                listen_pg_addr,
-                listen_http_addr,
-                availability_zone,
-                wait_lsn_timeout,
-                wal_redo_timeout,
-                superuser,
-                page_cache_size,
-                max_file_descriptors,
-                workdir,
-                pg_distrib_dir,
-                http_auth_type,
-                pg_auth_type,
-                auth_validation_public_key_path,
-                remote_storage_config,
-                id,
-                broker_endpoint,
-                broker_keepalive_interval,
-                log_format,
-                metric_collection_interval,
-                cached_metric_collection_interval,
-                metric_collection_endpoint,
-                metric_collection_bucket,
-                synthetic_size_calculation_interval,
-                disk_usage_based_eviction,
-                test_remote_failures,
-                ondemand_download_behavior_treat_error_as_warn,
-                background_task_maximum_delay,
-                control_plane_api,
-                control_plane_api_token,
-                control_plane_emergency_mode,
-                heatmap_upload_concurrency,
-                secondary_download_concurrency,
-                ingest_batch_size,
-                get_vectored_impl,
-                max_vectored_read_bytes,
-                validate_vectored_get,
-                ephemeral_bytes_per_memory_kb,
-            }
-            CUSTOM LOGIC
-            {
-                // TenantConf is handled separately
-                default_tenant_conf: TenantConf::default(),
-                concurrent_tenant_warmup: ConfigurableSemaphore::new({
-                    self
-                        .concurrent_tenant_warmup
-                        .ok_or("concurrent_tenant_warmpup",
-                               default.concurrent_tenant_warmup)?
-                }),
-                concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new(
-                    self
-                        .concurrent_tenant_size_logical_size_queries
-                        .ok_or("concurrent_tenant_size_logical_size_queries",
-                               default.concurrent_tenant_size_logical_size_queries.clone())?
-                ),
-                eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::new(
-                    // re-use `concurrent_tenant_size_logical_size_queries`
-                    self
-                        .concurrent_tenant_size_logical_size_queries
-                        .ok_or("eviction_task_immitated_concurrent_logical_size_queries",
-                               default.concurrent_tenant_size_logical_size_queries.clone())?,
-                ),
-                virtual_file_io_engine: match self.virtual_file_io_engine {
-                    BuilderValue::Set(v) => v,
-                    BuilderValue::NotSet => match crate::virtual_file::io_engine_feature_test().context("auto-detect virtual_file_io_engine")? {
-                        io_engine::FeatureTestResult::PlatformPreferred(v) => v, // make no noise
-                        io_engine::FeatureTestResult::Worse { engine, remark } => {
-                            // TODO: bubble this up to the caller so we can tracing::warn! it.
-                            eprintln!("auto-detected IO engine is not platform-preferred: engine={engine:?} remark={remark:?}");
-                            engine
-                        }
-                    },
-                },
-            }
-        ))
+        let concurrent_tenant_warmup = self
+            .concurrent_tenant_warmup
+            .ok_or(anyhow!("missing concurrent_tenant_warmup"))?;
+        let concurrent_tenant_size_logical_size_queries = self
+            .concurrent_tenant_size_logical_size_queries
+            .ok_or(anyhow!(
+                "missing concurrent_tenant_size_logical_size_queries"
+            ))?;
+        Ok(PageServerConf {
+            listen_pg_addr: self
+                .listen_pg_addr
+                .ok_or(anyhow!("missing listen_pg_addr"))?,
+            listen_http_addr: self
+                .listen_http_addr
+                .ok_or(anyhow!("missing listen_http_addr"))?,
+            availability_zone: self
+                .availability_zone
+                .ok_or(anyhow!("missing availability_zone"))?,
+            wait_lsn_timeout: self
+                .wait_lsn_timeout
+                .ok_or(anyhow!("missing wait_lsn_timeout"))?,
+            wal_redo_timeout: self
+                .wal_redo_timeout
+                .ok_or(anyhow!("missing wal_redo_timeout"))?,
+            superuser: self.superuser.ok_or(anyhow!("missing superuser"))?,
+            page_cache_size: self
+                .page_cache_size
+                .ok_or(anyhow!("missing page_cache_size"))?,
+            max_file_descriptors: self
+                .max_file_descriptors
+                .ok_or(anyhow!("missing max_file_descriptors"))?,
+            workdir: self.workdir.ok_or(anyhow!("missing workdir"))?,
+            pg_distrib_dir: self
+                .pg_distrib_dir
+                .ok_or(anyhow!("missing pg_distrib_dir"))?,
+            http_auth_type: self
+                .http_auth_type
+                .ok_or(anyhow!("missing http_auth_type"))?,
+            pg_auth_type: self.pg_auth_type.ok_or(anyhow!("missing pg_auth_type"))?,
+            auth_validation_public_key_path: self
+                .auth_validation_public_key_path
+                .ok_or(anyhow!("missing auth_validation_public_key_path"))?,
+            remote_storage_config: self
+                .remote_storage_config
+                .ok_or(anyhow!("missing remote_storage_config"))?,
+            id: self.id.ok_or(anyhow!("missing id"))?,
+            // TenantConf is handled separately
+            default_tenant_conf: TenantConf::default(),
+            broker_endpoint: self
+                .broker_endpoint
+                .ok_or(anyhow!("No broker endpoints provided"))?,
+            broker_keepalive_interval: self
+                .broker_keepalive_interval
+                .ok_or(anyhow!("No broker keepalive interval provided"))?,
+            log_format: self.log_format.ok_or(anyhow!("missing log_format"))?,
+            concurrent_tenant_warmup: ConfigurableSemaphore::new(concurrent_tenant_warmup),
+            concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new(
+                concurrent_tenant_size_logical_size_queries,
+            ),
+            eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::new(
+                concurrent_tenant_size_logical_size_queries,
+            ),
+            metric_collection_interval: self
+                .metric_collection_interval
+                .ok_or(anyhow!("missing metric_collection_interval"))?,
+            cached_metric_collection_interval: self
+                .cached_metric_collection_interval
+                .ok_or(anyhow!("missing cached_metric_collection_interval"))?,
+            metric_collection_endpoint: self
+                .metric_collection_endpoint
+                .ok_or(anyhow!("missing metric_collection_endpoint"))?,
+            synthetic_size_calculation_interval: self
+                .synthetic_size_calculation_interval
+                .ok_or(anyhow!("missing synthetic_size_calculation_interval"))?,
+            disk_usage_based_eviction: self
+                .disk_usage_based_eviction
+                .ok_or(anyhow!("missing disk_usage_based_eviction"))?,
+            test_remote_failures: self
+                .test_remote_failures
+                .ok_or(anyhow!("missing test_remote_failuers"))?,
+            ondemand_download_behavior_treat_error_as_warn: self
+                .ondemand_download_behavior_treat_error_as_warn
+                .ok_or(anyhow!(
+                    "missing ondemand_download_behavior_treat_error_as_warn"
+                ))?,
+            background_task_maximum_delay: self
+                .background_task_maximum_delay
+                .ok_or(anyhow!("missing background_task_maximum_delay"))?,
+            control_plane_api: self
+                .control_plane_api
+                .ok_or(anyhow!("missing control_plane_api"))?,
+            control_plane_api_token: self
+                .control_plane_api_token
+                .ok_or(anyhow!("missing control_plane_api_token"))?,
+            control_plane_emergency_mode: self
+                .control_plane_emergency_mode
+                .ok_or(anyhow!("missing control_plane_emergency_mode"))?,
+            heatmap_upload_concurrency: self
+                .heatmap_upload_concurrency
+                .ok_or(anyhow!("missing heatmap_upload_concurrency"))?,
+            secondary_download_concurrency: self
+                .secondary_download_concurrency
+                .ok_or(anyhow!("missing secondary_download_concurrency"))?,
+            ingest_batch_size: self
+                .ingest_batch_size
+                .ok_or(anyhow!("missing ingest_batch_size"))?,
+            virtual_file_io_engine: self
+                .virtual_file_io_engine
+                .ok_or(anyhow!("missing virtual_file_io_engine"))?,
+            get_vectored_impl: self
+                .get_vectored_impl
+                .ok_or(anyhow!("missing get_vectored_impl"))?,
+            max_vectored_read_bytes: self
+                .max_vectored_read_bytes
+                .ok_or(anyhow!("missing max_vectored_read_bytes"))?,
+            validate_vectored_get: self
+                .validate_vectored_get
+                .ok_or(anyhow!("missing validate_vectored_get"))?,
+        })
    }
 }

@@ -855,7 +845,18 @@ impl PageServerConf {
            .join(timeline_id.to_string())
    }

-    pub(crate) fn timeline_delete_mark_file_path(
+    pub fn timeline_uninit_mark_file_path(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+    ) -> Utf8PathBuf {
+        path_with_suffix_extension(
+            self.timeline_path(&tenant_shard_id, &timeline_id),
+            TIMELINE_UNINIT_MARK_SUFFIX,
+        )
+    }
+
+    pub fn timeline_delete_mark_file_path(
        &self,
        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
@@ -866,10 +867,7 @@ impl PageServerConf {
        )
    }

-    pub(crate) fn tenant_deleted_mark_file_path(
-        &self,
-        tenant_shard_id: &TenantShardId,
-    ) -> Utf8PathBuf {
+    pub fn tenant_deleted_mark_file_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
        self.tenant_path(tenant_shard_id)
            .join(TENANT_DELETED_MARKER_FILE_NAME)
    }
@@ -973,9 +971,6 @@ impl PageServerConf {
                    let endpoint = parse_toml_string(key, item)?.parse().context("failed to parse metric_collection_endpoint")?;
                    builder.metric_collection_endpoint(Some(endpoint));
                },
-                "metric_collection_bucket" => {
-                    builder.metric_collection_bucket(RemoteStorageConfig::from_toml(item)?)
-                }
                "synthetic_size_calculation_interval" =>
                    builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?),
                "test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?),
@@ -1029,9 +1024,6 @@ impl PageServerConf {
                "validate_vectored_get" => {
                    builder.get_validate_vectored_get(parse_toml_bool("validate_vectored_get", item)?)
                }
-                "ephemeral_bytes_per_memory_kb" => {
-                    builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize)
-                }
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
        }
@@ -1094,7 +1086,6 @@ impl PageServerConf {
            metric_collection_interval: Duration::from_secs(60),
            cached_metric_collection_interval: Duration::from_secs(60 * 60),
            metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
-            metric_collection_bucket: None,
            synthetic_size_calculation_interval: Duration::from_secs(60),
            disk_usage_based_eviction: None,
            test_remote_failures: 0,
@@ -1113,7 +1104,6 @@ impl PageServerConf {
                    .expect("Invalid default constant"),
            ),
            validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
-            ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
        }
    }
 }
@@ -1328,7 +1318,6 @@ background_task_maximum_delay = '334 s'
                    defaults::DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL
                )?,
                metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
-                metric_collection_bucket: None,
                synthetic_size_calculation_interval: humantime::parse_duration(
                    defaults::DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL
                )?,
@@ -1351,7 +1340,6 @@ background_task_maximum_delay = '334 s'
                        .expect("Invalid default constant")
                ),
                validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
-                ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB
            },
            "Correct defaults should be used when no config values are provided"
        );
@@ -1404,7 +1392,6 @@ background_task_maximum_delay = '334 s'
                metric_collection_interval: Duration::from_secs(222),
                cached_metric_collection_interval: Duration::from_secs(22200),
                metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?),
-                metric_collection_bucket: None,
                synthetic_size_calculation_interval: Duration::from_secs(333),
                disk_usage_based_eviction: None,
                test_remote_failures: 0,
@@ -1423,7 +1410,6 @@ background_task_maximum_delay = '334 s'
                        .expect("Invalid default constant")
                ),
                validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
-                ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB
            },
            "Should be able to parse all basic config values correctly"
        );
@@ -1552,6 +1538,7 @@ broker_endpoint = '{broker_endpoint}'

        let broker_endpoint = "http://127.0.0.1:7777";
        let trace_read_requests = true;
+        let image_layer_compression = pageserver_api::models::CompressionAlgorithm::LZ4;

        let config_string = format!(
            r#"{ALL_BASE_VALUES_TOML}
@@ -1559,7 +1546,8 @@ pg_distrib_dir='{pg_distrib_dir}'
 broker_endpoint = '{broker_endpoint}'

 [tenant_config]
-trace_read_requests = {trace_read_requests}"#,
+trace_read_requests = {trace_read_requests}
+image_layer_compression = 'LZ4'"#,
        );

        let toml = config_string.parse()?;
@@ -1569,6 +1557,10 @@ trace_read_requests = {trace_read_requests}"#,
            conf.default_tenant_conf.trace_read_requests, trace_read_requests,
            "Tenant config from pageserver config file should be parsed and udpated values used as defaults for all tenants",
        );
+        assert_eq!(
+            conf.default_tenant_conf.image_layer_compression, image_layer_compression,
+            "Tenant config from pageserver config file should be parsed and udpated values used as defaults for all tenants",
+        );

        Ok(())
    }
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -3,13 +3,10 @@
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
 use crate::tenant::tasks::BackgroundLoopKind;
-use crate::tenant::{
-    mgr::TenantManager, LogicalSizeCalculationCause, PageReconstructError, Tenant,
-};
+use crate::tenant::{mgr, LogicalSizeCalculationCause, PageReconstructError, Tenant};
 use camino::Utf8PathBuf;
 use consumption_metrics::EventType;
 use pageserver_api::models::TenantState;
-use remote_storage::{GenericRemoteStorage, RemoteStorageConfig};
 use reqwest::Url;
 use std::collections::HashMap;
 use std::sync::Arc;
@@ -43,9 +40,7 @@ type Cache = HashMap<MetricsKey, (EventType, u64)>;
 /// Main thread that serves metrics collection
 #[allow(clippy::too_many_arguments)]
 pub async fn collect_metrics(
-    tenant_manager: Arc<TenantManager>,
    metric_collection_endpoint: &Url,
-    metric_collection_bucket: &Option<RemoteStorageConfig>,
    metric_collection_interval: Duration,
    _cached_metric_collection_interval: Duration,
    synthetic_size_calculation_interval: Duration,
@@ -70,19 +65,15 @@ pub async fn collect_metrics(
        None,
        "synthetic size calculation",
        false,
-        {
-            let tenant_manager = tenant_manager.clone();
-            async move {
-                calculate_synthetic_size_worker(
-                    tenant_manager,
-                    synthetic_size_calculation_interval,
-                    &cancel,
-                    &worker_ctx,
-                )
-                .instrument(info_span!("synthetic_size_worker"))
-                .await?;
-                Ok(())
-            }
+        async move {
+            calculate_synthetic_size_worker(
+                synthetic_size_calculation_interval,
+                &cancel,
+                &worker_ctx,
+            )
+            .instrument(info_span!("synthetic_size_worker"))
+            .await?;
+            Ok(())
        },
    );

@@ -103,27 +94,13 @@ pub async fn collect_metrics(
        .build()
        .expect("Failed to create http client with timeout");

-    let bucket_client = if let Some(bucket_config) = metric_collection_bucket {
-        match GenericRemoteStorage::from_config(bucket_config) {
-            Ok(client) => Some(client),
-            Err(e) => {
-                // Non-fatal error: if we were given an invalid config, we will proceed
-                // with sending metrics over the network, but not to S3.
-                tracing::warn!("Invalid configuration for metric_collection_bucket: {e}");
-                None
-            }
-        }
-    } else {
-        None
-    };
-
    let node_id = node_id.to_string();

    loop {
        let started_at = Instant::now();

        // these are point in time, with variable "now"
-        let metrics = metrics::collect_all_metrics(&tenant_manager, &cached_metrics, &ctx).await;
+        let metrics = metrics::collect_all_metrics(&cached_metrics, &ctx).await;

        let metrics = Arc::new(metrics);

@@ -141,18 +118,10 @@ pub async fn collect_metrics(
                    tracing::error!("failed to persist metrics to {path:?}: {e:#}");
                }
            }
-
-            if let Some(bucket_client) = &bucket_client {
-                let res =
-                    upload::upload_metrics_bucket(bucket_client, &cancel, &node_id, &metrics).await;
-                if let Err(e) = res {
-                    tracing::error!("failed to upload to S3: {e:#}");
-                }
-            }
        };

        let upload = async {
-            let res = upload::upload_metrics_http(
+            let res = upload::upload_metrics(
                &client,
                metric_collection_endpoint,
                &cancel,
@@ -163,7 +132,7 @@ pub async fn collect_metrics(
            .await;
            if let Err(e) = res {
                // serialization error which should never happen
-                tracing::error!("failed to upload via HTTP due to {e:#}");
+                tracing::error!("failed to upload due to {e:#}");
            }
        };

@@ -278,7 +247,6 @@ async fn reschedule(

 /// Caclculate synthetic size for each active tenant
 async fn calculate_synthetic_size_worker(
-    tenant_manager: Arc<TenantManager>,
    synthetic_size_calculation_interval: Duration,
    cancel: &CancellationToken,
    ctx: &RequestContext,
@@ -291,7 +259,7 @@ async fn calculate_synthetic_size_worker(
    loop {
        let started_at = Instant::now();

-        let tenants = match tenant_manager.list_tenants() {
+        let tenants = match mgr::list_tenants().await {
            Ok(tenants) => tenants,
            Err(e) => {
                warn!("cannot get tenant list: {e:#}");
@@ -310,14 +278,10 @@ async fn calculate_synthetic_size_worker(
                continue;
            }

-            let Ok(tenant) = tenant_manager.get_attached_tenant_shard(tenant_shard_id) else {
+            let Ok(tenant) = mgr::get_tenant(tenant_shard_id, true) else {
                continue;
            };

-            if !tenant.is_active() {
-                continue;
-            }
-
            // there is never any reason to exit calculate_synthetic_size_worker following any
            // return value -- we don't need to care about shutdown because no tenant is found when
            // pageserver is shut down.
@@ -355,7 +319,9 @@ async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &Re
    };

    // this error can be returned if timeline is shutting down, but it does not
-    // mean the synthetic size worker should terminate.
+    // mean the synthetic size worker should terminate. we do not need any checks
+    // in this function because `mgr::get_tenant` will error out after shutdown has
+    // progressed to shutting down tenants.
    let shutting_down = matches!(
        e.downcast_ref::<PageReconstructError>(),
        Some(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_))
--- a/pageserver/src/consumption_metrics/metrics.rs
+++ b/pageserver/src/consumption_metrics/metrics.rs
@@ -1,4 +1,3 @@
-use crate::tenant::mgr::TenantManager;
 use crate::{context::RequestContext, tenant::timeline::logical_size::CurrentLogicalSize};
 use chrono::{DateTime, Utc};
 use consumption_metrics::EventType;
@@ -182,7 +181,6 @@ impl MetricsKey {
 }

 pub(super) async fn collect_all_metrics(
-    tenant_manager: &Arc<TenantManager>,
    cached_metrics: &Cache,
    ctx: &RequestContext,
 ) -> Vec<RawMetric> {
@@ -190,7 +188,7 @@ pub(super) async fn collect_all_metrics(

    let started_at = std::time::Instant::now();

-    let tenants = match tenant_manager.list_tenants() {
+    let tenants = match crate::tenant::mgr::list_tenants().await {
        Ok(tenants) => tenants,
        Err(err) => {
            tracing::error!("failed to list tenants: {:?}", err);
@@ -202,8 +200,7 @@ pub(super) async fn collect_all_metrics(
        if state != TenantState::Active || !id.is_zero() {
            None
        } else {
-            tenant_manager
-                .get_attached_tenant_shard(id)
+            crate::tenant::mgr::get_tenant(id, true)
                .ok()
                .map(|tenant| (id.tenant_id, tenant))
        }
--- a/pageserver/src/consumption_metrics/upload.rs
+++ b/pageserver/src/consumption_metrics/upload.rs
@@ -1,9 +1,4 @@
-use std::time::SystemTime;
-
-use chrono::{DateTime, Utc};
 use consumption_metrics::{Event, EventChunk, IdempotencyKey, CHUNK_SIZE};
-use remote_storage::{GenericRemoteStorage, RemotePath};
-use tokio::io::AsyncWriteExt;
 use tokio_util::sync::CancellationToken;
 use tracing::Instrument;

@@ -18,9 +13,8 @@ struct Ids {
    pub(super) timeline_id: Option<TimelineId>,
 }

-/// Serialize and write metrics to an HTTP endpoint
 #[tracing::instrument(skip_all, fields(metrics_total = %metrics.len()))]
-pub(super) async fn upload_metrics_http(
+pub(super) async fn upload_metrics(
    client: &reqwest::Client,
    metric_collection_endpoint: &reqwest::Url,
    cancel: &CancellationToken,
@@ -80,60 +74,6 @@ pub(super) async fn upload_metrics_http(
    Ok(())
 }

-/// Serialize and write metrics to a remote storage object
-#[tracing::instrument(skip_all, fields(metrics_total = %metrics.len()))]
-pub(super) async fn upload_metrics_bucket(
-    client: &GenericRemoteStorage,
-    cancel: &CancellationToken,
-    node_id: &str,
-    metrics: &[RawMetric],
-) -> anyhow::Result<()> {
-    if metrics.is_empty() {
-        // Skip uploads if we have no metrics, so that readers don't have to handle the edge case
-        // of an empty object.
-        return Ok(());
-    }
-
-    // Compose object path
-    let datetime: DateTime<Utc> = SystemTime::now().into();
-    let ts_prefix = datetime.format("year=%Y/month=%m/day=%d/%H:%M:%SZ");
-    let path = RemotePath::from_string(&format!("{ts_prefix}_{node_id}.ndjson.gz"))?;
-
-    // Set up a gzip writer into a buffer
-    let mut compressed_bytes: Vec<u8> = Vec::new();
-    let compressed_writer = std::io::Cursor::new(&mut compressed_bytes);
-    let mut gzip_writer = async_compression::tokio::write::GzipEncoder::new(compressed_writer);
-
-    // Serialize and write into compressed buffer
-    let started_at = std::time::Instant::now();
-    for res in serialize_in_chunks(CHUNK_SIZE, metrics, node_id) {
-        let (_chunk, body) = res?;
-        gzip_writer.write_all(&body).await?;
-    }
-    gzip_writer.flush().await?;
-    gzip_writer.shutdown().await?;
-    let compressed_length = compressed_bytes.len();
-
-    // Write to remote storage
-    client
-        .upload_storage_object(
-            futures::stream::once(futures::future::ready(Ok(compressed_bytes.into()))),
-            compressed_length,
-            &path,
-            cancel,
-        )
-        .await?;
-    let elapsed = started_at.elapsed();
-
-    tracing::info!(
-        compressed_length,
-        elapsed_ms = elapsed.as_millis(),
-        "write metrics bucket at {path}",
-    );
-
-    Ok(())
-}
-
 // The return type is quite ugly, but we gain testability in isolation
 fn serialize_in_chunks<'a, F>(
    chunk_size: usize,
--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -5,14 +5,13 @@ use pageserver_api::{
    controller_api::NodeRegisterRequest,
    shard::TenantShardId,
    upcall_api::{
-        ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest,
-        ValidateRequestTenant, ValidateResponse,
+        ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
    },
 };
 use serde::{de::DeserializeOwned, Serialize};
 use tokio_util::sync::CancellationToken;
 use url::Url;
-use utils::{backoff, failpoint_support, generation::Generation, id::NodeId};
+use utils::{backoff, generation::Generation, id::NodeId};

 use crate::{
    config::{NodeMetadata, PageServerConf},
@@ -38,9 +37,7 @@ pub trait ControlPlaneGenerationsApi {
    fn re_attach(
        &self,
        conf: &PageServerConf,
-    ) -> impl Future<
-        Output = Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError>,
-    > + Send;
+    ) -> impl Future<Output = Result<HashMap<TenantShardId, Generation>, RetryForeverError>> + Send;
    fn validate(
        &self,
        tenants: Vec<(TenantShardId, Generation)>,
@@ -121,7 +118,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
    async fn re_attach(
        &self,
        conf: &PageServerConf,
-    ) -> Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError> {
+    ) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
        let re_attach_path = self
            .base_url
            .join("re-attach")
@@ -184,7 +181,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
        Ok(response
            .tenants
            .into_iter()
-            .map(|rart| (rart.id, rart))
+            .map(|t| (t.id, Generation::new(t.gen)))
            .collect::<HashMap<_, _>>())
    }

@@ -210,10 +207,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
                .collect(),
        };

-        failpoint_support::sleep_millis_async!("control-plane-client-validate-sleep", &self.cancel);
-        if self.cancel.is_cancelled() {
-            return Err(RetryForeverError::ShuttingDown);
-        }
+        fail::fail_point!("control-plane-client-validate");

        let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?;

--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -724,8 +724,8 @@ impl DeletionQueue {
 mod test {
    use camino::Utf8Path;
    use hex_literal::hex;
-    use pageserver_api::{shard::ShardIndex, upcall_api::ReAttachResponseTenant};
-    use std::{io::ErrorKind, time::Duration};
+    use pageserver_api::shard::ShardIndex;
+    use std::io::ErrorKind;
    use tracing::info;

    use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
@@ -834,10 +834,9 @@ mod test {
        async fn re_attach(
            &self,
            _conf: &PageServerConf,
-        ) -> Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError> {
+        ) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
            unimplemented!()
        }
-
        async fn validate(
            &self,
            tenants: Vec<(TenantShardId, Generation)>,
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -61,6 +61,7 @@ use crate::{
    metrics::disk_usage_based_eviction::METRICS,
    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
    tenant::{
+        self,
        mgr::TenantManager,
        remote_timeline_client::LayerFileMetadata,
        secondary::SecondaryTenant,
@@ -813,8 +814,8 @@ async fn collect_eviction_candidates(
    const LOG_DURATION_THRESHOLD: std::time::Duration = std::time::Duration::from_secs(10);

    // get a snapshot of the list of tenants
-    let tenants = tenant_manager
-        .list_tenants()
+    let tenants = tenant::mgr::list_tenants()
+        .await
        .context("get list of tenants")?;

    // TODO: avoid listing every layer in every tenant: this loop can block the executor,
@@ -826,12 +827,8 @@ async fn collect_eviction_candidates(
        if cancel.is_cancelled() {
            return Ok(EvictionCandidates::Cancelled);
        }
-        let tenant = match tenant_manager.get_attached_tenant_shard(tenant_id) {
-            Ok(tenant) if tenant.is_active() => tenant,
-            Ok(_) => {
-                debug!(tenant_id=%tenant_id.tenant_id, shard_id=%tenant_id.shard_slug(), "Tenant shard is not active");
-                continue;
-            }
+        let tenant = match tenant::mgr::get_tenant(tenant_id, true) {
+            Ok(tenant) => tenant,
            Err(e) => {
                // this can happen if tenant has lifecycle transition after we fetched it
                debug!("failed to get tenant: {e:#}");
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -965,28 +965,12 @@ paths:
        required: true
        schema:
          type: string
-      - name: wait_ms
-        description: If set, we will wait this long for download to complete, and if it isn't complete then return 202
-        in: query
-        required: false
-        schema:
-          type: integer
    post:
      description: |
        If the location is in secondary mode, download latest heatmap and layers
      responses:
        "200":
          description: Success
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/SecondaryProgress"
-        "202":
-          description: Download has started but not yet finished
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/SecondaryProgress"
        "500":
          description: Generic operation error
          content:
@@ -1038,7 +1022,7 @@ paths:
                  format: hex
      responses:
        "201":
-          description: Timeline was created, or already existed with matching parameters
+          description: TimelineInfo
          content:
            application/json:
              schema:
@@ -1068,17 +1052,11 @@ paths:
              schema:
                $ref: "#/components/schemas/Error"
        "409":
-          description: Timeline already exists, with different parameters.  Creation cannot proceed.
+          description: Timeline already exists, creation skipped
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ConflictError"
-        "429":
-          description: A creation request was sent for the same Timeline Id while a creation was already in progress.  Back off and retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
        "500":
          description: Generic operation error
          content:
@@ -1466,6 +1444,8 @@ components:
          type: integer
        trace_read_requests:
          type: boolean
+        image_layer_compression:
+          type: string
        heatmap_period:
          type: string
    TenantConfigResponse:
@@ -1629,7 +1609,7 @@ components:
          type: integer
          format: int64
          minimum: 0
-          description: The amount of disk space currently used.
+          description: The amount of disk space currently utilized by layer files.
        free_space_bytes:
          type: integer
          format: int64
@@ -1645,37 +1625,6 @@ components:
            Lower is better score for how good this pageserver would be for the next tenant.
            The default or maximum value can be returned in situations when a proper score cannot (yet) be calculated.

-    SecondaryProgress:
-      type: object
-      required:
-        - heatmap_mtime
-        - layers_downloaded
-        - layers_total
-        - bytes_downloaded
-        - bytes_total
-      properties:
-        heatmap_mtime:
-          type: string
-          format: date-time
-          description: Modification time of the most recently downloaded layer heatmap (RFC 3339 format)
-        layers_downloaded:
-          type: integer
-          format: int64
-          description: How many layers from the latest layer heatmap are present on disk
-        bytes_downloaded:
-          type: integer
-          format: int64
-          description: How many bytes of layer content from the latest layer heatmap are present on disk
-        layers_total:
-          type: integer
-          format: int64
-          description: How many layers were in the latest layer heatmap
-        bytes_total:
-          type: integer
-          format: int64
-          description: How many bytes of layer content were in the latest layer heatmap
-
-
    Error:
      type: object
      required:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -36,7 +36,6 @@ use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::auth::JwtAuth;
 use utils::failpoint_support::failpoints_handler;
-use utils::http::endpoint::prometheus_metrics_handler;
 use utils::http::endpoint::request_span;
 use utils::http::json::json_request_or_empty_body;
 use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
@@ -49,8 +48,8 @@ use crate::task_mgr::TaskKind;
 use crate::tenant::config::{LocationConf, TenantConfOpt};
 use crate::tenant::mgr::GetActiveTenantError;
 use crate::tenant::mgr::{
-    GetTenantError, TenantManager, TenantMapError, TenantMapInsertError, TenantSlotError,
-    TenantSlotUpsertError, TenantStateError,
+    GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError,
+    TenantSlotError, TenantSlotUpsertError, TenantStateError,
 };
 use crate::tenant::mgr::{TenantSlot, UpsertLocationError};
 use crate::tenant::remote_timeline_client;
@@ -249,11 +248,16 @@ impl From<GetTenantError> for ApiError {
    fn from(tse: GetTenantError) -> ApiError {
        match tse {
            GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
+            GetTenantError::Broken(reason) => {
+                ApiError::InternalServerError(anyhow!("tenant is broken: {}", reason))
+            }
            GetTenantError::NotActive(_) => {
                // Why is this not `ApiError::NotFound`?
                // Because we must be careful to never return 404 for a tenant if it does
                // in fact exist locally. If we did, the caller could draw the conclusion
                // that it can attach the tenant to another PS and we'd be in split-brain.
+                //
+                // (We can produce this variant only in `mgr::get_tenant(..., active=true)` calls).
                ApiError::ResourceUnavailable("Tenant not yet active".into())
            }
            GetTenantError::MapState(e) => ApiError::ResourceUnavailable(format!("{e}").into()),
@@ -264,9 +268,6 @@ impl From<GetTenantError> for ApiError {
 impl From<GetActiveTenantError> for ApiError {
    fn from(e: GetActiveTenantError) -> ApiError {
        match e {
-            GetActiveTenantError::Broken(reason) => {
-                ApiError::InternalServerError(anyhow!("tenant is broken: {}", reason))
-            }
            GetActiveTenantError::WillNotBecomeActive(_) => ApiError::Conflict(format!("{}", e)),
            GetActiveTenantError::Cancelled => ApiError::ShuttingDown,
            GetActiveTenantError::NotFound(gte) => gte.into(),
@@ -277,6 +278,19 @@ impl From<GetActiveTenantError> for ApiError {
    }
 }

+impl From<SetNewTenantConfigError> for ApiError {
+    /// Maps a tenant-config update failure onto the error the HTTP API reports:
+    /// `GetTenant` => `NotFound`, `Persist` / `Other` => `InternalServerError`.
+    fn from(e: SetNewTenantConfigError) -> ApiError {
+        use SetNewTenantConfigError::{GetTenant, Other, Persist};
+        match e {
+            // NOTE(review): every `GetTenant` payload becomes `NotFound` -- confirm that is intended.
+            GetTenant(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
+            err @ (Persist(_) | Other(_)) => ApiError::InternalServerError(anyhow::Error::new(err)),
+        }
+    }
+}
+
 impl From<crate::tenant::DeleteTimelineError> for ApiError {
    fn from(value: crate::tenant::DeleteTimelineError) -> Self {
        use crate::tenant::DeleteTimelineError::*;
@@ -480,7 +494,7 @@ async fn timeline_create_handler(
    async {
        let tenant = state
            .tenant_manager
-            .get_attached_tenant_shard(tenant_shard_id)?;
+            .get_attached_tenant_shard(tenant_shard_id, false)?;

        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;

@@ -520,13 +534,10 @@ async fn timeline_create_handler(
                    HttpErrorBody::from_msg("Tenant shutting down".to_string()),
                )
            }
-            Err(e @ tenant::CreateTimelineError::Conflict) => {
-                json_response(StatusCode::CONFLICT, HttpErrorBody::from_msg(e.to_string()))
-            }
-            Err(e @ tenant::CreateTimelineError::AlreadyCreating) => json_response(
-                StatusCode::TOO_MANY_REQUESTS,
-                HttpErrorBody::from_msg(e.to_string()),
-            ),
+            Err(
+                tenant::CreateTimelineError::Conflict
+                | tenant::CreateTimelineError::AlreadyCreating,
+            ) => json_response(StatusCode::CONFLICT, ()),
            Err(tenant::CreateTimelineError::AncestorLsn(err)) => json_response(
                StatusCode::NOT_ACCEPTABLE,
                HttpErrorBody::from_msg(format!("{err:#}")),
@@ -569,7 +580,7 @@ async fn timeline_list_handler(
    let response_data = async {
        let tenant = state
            .tenant_manager
-            .get_attached_tenant_shard(tenant_shard_id)?;
+            .get_attached_tenant_shard(tenant_shard_id, false)?;

        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;

@@ -607,7 +618,6 @@ async fn timeline_preserve_initdb_handler(
    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
-    let state = get_state(&request);

    // Part of the process for disaster recovery from safekeeper-stored WAL:
    // If we don't recover into a new timeline but want to keep the timeline ID,
@@ -615,9 +625,7 @@ async fn timeline_preserve_initdb_handler(
    // location where timeline recreation cand find it.

    async {
-        let tenant = state
-            .tenant_manager
-            .get_attached_tenant_shard(tenant_shard_id)?;
+        let tenant = mgr::get_tenant(tenant_shard_id, false)?;

        let timeline = tenant
            .get_timeline(timeline_id, false)
@@ -659,7 +667,7 @@ async fn timeline_detail_handler(
    let timeline_info = async {
        let tenant = state
            .tenant_manager
-            .get_attached_tenant_shard(tenant_shard_id)?;
+            .get_attached_tenant_shard(tenant_shard_id, false)?;

        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;

@@ -846,7 +854,7 @@ async fn timeline_delete_handler(

    let tenant = state
        .tenant_manager
-        .get_attached_tenant_shard(tenant_shard_id)
+        .get_attached_tenant_shard(tenant_shard_id, false)
        .map_err(|e| {
            match e {
                // GetTenantError has a built-in conversion to ApiError, but in this context we don't
@@ -877,16 +885,14 @@ async fn tenant_detach_handler(

    let state = get_state(&request);
    let conf = state.conf;
-    state
-        .tenant_manager
-        .detach_tenant(
-            conf,
-            tenant_shard_id,
-            detach_ignored.unwrap_or(false),
-            &state.deletion_queue_client,
-        )
-        .instrument(info_span!("tenant_detach", %tenant_id, shard_id=%tenant_shard_id.shard_slug()))
-        .await?;
+    mgr::detach_tenant(
+        conf,
+        tenant_shard_id,
+        detach_ignored.unwrap_or(false),
+        &state.deletion_queue_client,
+    )
+    .instrument(info_span!("tenant_detach", %tenant_id, shard_id=%tenant_shard_id.shard_slug()))
+    .await?;

    json_response(StatusCode::OK, ())
 }
@@ -964,11 +970,10 @@ async fn tenant_list_handler(
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
    check_permission(&request, None)?;
-    let state = get_state(&request);

-    let response_data = state
-        .tenant_manager
-        .list_tenants()
+    let response_data = mgr::list_tenants()
+        .instrument(info_span!("tenant_list"))
+        .await
        .map_err(|_| {
            ApiError::ResourceUnavailable("Tenant map is initializing or shutting down".into())
        })?
@@ -991,27 +996,9 @@ async fn tenant_status(
 ) -> Result<Response<Body>, ApiError> {
    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
-    let state = get_state(&request);
-
-    // In tests, sometimes we want to query the state of a tenant without auto-activating it if it's currently waiting.
-    let activate = true;
-    #[cfg(feature = "testing")]
-    let activate = parse_query_param(&request, "activate")?.unwrap_or(activate);

    let tenant_info = async {
-        let tenant = state
-            .tenant_manager
-            .get_attached_tenant_shard(tenant_shard_id)?;
-
-        if activate {
-            // This is advisory: we prefer to let the tenant activate on-demand when this function is
-            // called, but it is still valid to return 200 and describe the current state of the tenant
-            // if it doesn't make it into an active state.
-            tenant
-                .wait_to_become_active(ACTIVE_TENANT_TIMEOUT)
-                .await
-                .ok();
-        }
+        let tenant = mgr::get_tenant(tenant_shard_id, false)?;

        // Calculate total physical size of all timelines
        let mut current_physical_size = 0;
@@ -1084,7 +1071,9 @@ async fn tenant_size_handler(
    let inputs_only: Option<bool> = parse_query_param(&request, "inputs_only")?;
    let retention_period: Option<u64> = parse_query_param(&request, "retention_period")?;
    let headers = request.headers();
-    let state = get_state(&request);
+
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
+    let tenant = mgr::get_tenant(tenant_shard_id, true)?;

    if !tenant_shard_id.is_zero() {
        return Err(ApiError::BadRequest(anyhow!(
@@ -1092,12 +1081,6 @@ async fn tenant_size_handler(
        )));
    }

-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    let tenant = state
-        .tenant_manager
-        .get_attached_tenant_shard(tenant_shard_id)?;
-    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
-
    // this can be long operation
    let inputs = tenant
        .gather_size_inputs(
@@ -1166,15 +1149,10 @@ async fn tenant_shard_split_handler(
    let state = get_state(&request);
    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);

-    let tenant = state
-        .tenant_manager
-        .get_attached_tenant_shard(tenant_shard_id)?;
-    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
-
    let new_shards = state
        .tenant_manager
        .shard_split(
-            tenant,
+            tenant_shard_id,
            ShardCount::new(req.new_shard_count),
            req.new_stripe_size,
            &ctx,
@@ -1392,11 +1370,8 @@ async fn get_tenant_config_handler(
 ) -> Result<Response<Body>, ApiError> {
    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
-    let state = get_state(&request);

-    let tenant = state
-        .tenant_manager
-        .get_attached_tenant_shard(tenant_shard_id)?;
+    let tenant = mgr::get_tenant(tenant_shard_id, false)?;

    let response = HashMap::from([
        (
@@ -1424,31 +1399,13 @@ async fn update_tenant_config_handler(
    let tenant_id = request_data.tenant_id;
    check_permission(&request, Some(tenant_id))?;

-    let new_tenant_conf =
+    let tenant_conf =
        TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?;

    let state = get_state(&request);
-
-    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
-
-    let tenant = state
-        .tenant_manager
-        .get_attached_tenant_shard(tenant_shard_id)?;
-    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
-
-    // This is a legacy API that only operates on attached tenants: the preferred
-    // API to use is the location_config/ endpoint, which lets the caller provide
-    // the full LocationConf.
-    let location_conf = LocationConf::attached_single(
-        new_tenant_conf.clone(),
-        tenant.get_generation(),
-        &ShardParameters::default(),
-    );
-
-    crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
-        .await
-        .map_err(ApiError::InternalServerError)?;
-    tenant.set_new_tenant_config(new_tenant_conf);
+    mgr::set_new_tenant_config(state.conf, tenant_conf, tenant_id)
+        .instrument(info_span!("tenant_config", %tenant_id))
+        .await?;

    json_response(StatusCode::OK, ())
 }
@@ -1471,14 +1428,13 @@ async fn put_tenant_location_config_handler(
    // The `Detached` state is special, it doesn't upsert a tenant, it removes
    // its local disk content and drops it from memory.
    if let LocationConfigMode::Detached = request_data.config.mode {
-        if let Err(e) = state
-            .tenant_manager
-            .detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client)
-            .instrument(info_span!("tenant_detach",
-                tenant_id = %tenant_shard_id.tenant_id,
-                shard_id = %tenant_shard_id.shard_slug()
-            ))
-            .await
+        if let Err(e) =
+            mgr::detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client)
+                .instrument(info_span!("tenant_detach",
+                    tenant_id = %tenant_shard_id.tenant_id,
+                    shard_id = %tenant_shard_id.shard_slug()
+                ))
+                .await
        {
            match e {
                TenantStateError::SlotError(TenantSlotError::NotFound(_)) => {
@@ -1672,12 +1628,10 @@ async fn handle_tenant_break(
 ) -> Result<Response<Body>, ApiError> {
    let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?;

-    let state = get_state(&r);
-    state
-        .tenant_manager
-        .get_attached_tenant_shard(tenant_shard_id)?
-        .set_broken("broken from test".to_owned())
-        .await;
+    let tenant = crate::tenant::mgr::get_tenant(tenant_shard_id, true)
+        .map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?;
+
+    tenant.set_broken("broken from test".to_owned()).await;

    json_response(StatusCode::OK, ())
 }
@@ -1694,7 +1648,8 @@ async fn timeline_gc_handler(
    let gc_req: TimelineGcRequest = json_request(&mut request).await?;

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    let wait_task_done = mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx)?;
+    let wait_task_done =
+        mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx).await?;
    let gc_result = wait_task_done
        .await
        .context("wait for gc task")
@@ -1921,7 +1876,7 @@ async fn active_timeline_of_active_tenant(
    tenant_shard_id: TenantShardId,
    timeline_id: TimelineId,
 ) -> Result<Arc<Timeline>, ApiError> {
-    let tenant = tenant_manager.get_attached_tenant_shard(tenant_shard_id)?;
+    let tenant = tenant_manager.get_attached_tenant_shard(tenant_shard_id, false)?;

    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;

@@ -2032,42 +1987,13 @@ async fn secondary_download_handler(
 ) -> Result<Response<Body>, ApiError> {
    let state = get_state(&request);
    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
-    let wait = parse_query_param(&request, "wait_ms")?.map(Duration::from_millis);
+    state
+        .secondary_controller
+        .download_tenant(tenant_shard_id)
+        .await
+        .map_err(ApiError::InternalServerError)?;

-    // We don't need this to issue the download request, but:
-    // - it enables us to cleanly return 404 if we get a request for an absent shard
-    // - we will use this to provide status feedback in the response
-    let Some(secondary_tenant) = state
-        .tenant_manager
-        .get_secondary_tenant_shard(tenant_shard_id)
-    else {
-        return Err(ApiError::NotFound(
-            anyhow::anyhow!("Shard {} not found", tenant_shard_id).into(),
-        ));
-    };
-
-    let timeout = wait.unwrap_or(Duration::MAX);
-
-    let status = match tokio::time::timeout(
-        timeout,
-        state.secondary_controller.download_tenant(tenant_shard_id),
-    )
-    .await
-    {
-        // Download job ran to completion.
-        Ok(Ok(())) => StatusCode::OK,
-        // Edge case: downloads aren't usually fallible: things like a missing heatmap are considered
-        // okay.  We could get an error here in the unlikely edge case that the tenant
-        // was detached between our check above and executing the download job.
-        Ok(Err(e)) => return Err(ApiError::InternalServerError(e)),
-        // A timeout is not an error: we have started the download, we're just not done
-        // yet.  The caller will get a response body indicating status.
-        Err(_) => StatusCode::ACCEPTED,
-    };
-
-    let progress = secondary_tenant.progress.lock().unwrap().clone();
-
-    json_response(status, progress)
+    json_response(StatusCode::OK, ())
 }

 async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -2127,10 +2053,6 @@ async fn get_utilization(
    r: Request<Body>,
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    fail::fail_point!("get-utilization-http-handler", |_| {
-        Err(ApiError::ResourceUnavailable("failpoint".into()))
-    });
-
    // this probably could be completely public, but lets make that change later.
    check_permission(&r, None)?;

@@ -2186,16 +2108,6 @@ where
    R: std::future::Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
    H: FnOnce(Request<Body>, CancellationToken) -> R + Send + Sync + 'static,
 {
-    if request.uri() != &"/v1/failpoints".parse::<Uri>().unwrap() {
-        fail::fail_point!("api-503", |_| Err(ApiError::ResourceUnavailable(
-            "failpoint".into()
-        )));
-
-        fail::fail_point!("api-500", |_| Err(ApiError::InternalServerError(
-            anyhow::anyhow!("failpoint")
-        )));
-    }
-
    // Spawn a new task to handle the request, to protect the handler from unexpected
    // async cancellations. Most pageserver functions are not async cancellation safe.
    // We arm a drop-guard, so that if Hyper drops the Future, we signal the task
@@ -2307,7 +2219,6 @@ pub fn make_router(

    Ok(router
        .data(state)
-        .get("/metrics", |r| request_span(r, prometheus_metrics_handler))
        .get("/v1/status", |r| api_handler(r, status_handler))
        .put("/v1/failpoints", |r| {
            testing_api_handler("manage failpoints", r, failpoints_handler)
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Konstantin Knizhnik	70d1086e0f	Prepare for first stage of deployment: do not bump format version and do not write data in new format but recognize new format	2024-03-15 10:02:51 +02:00
Konstantin Knizhnik	5a8e8baf9f	Make ruff happy	2024-03-14 18:05:30 +02:00
Konstantin Knizhnik	57a4119a7b	Add test for compression	2024-03-14 16:45:45 +02:00
Konstantin Knizhnik	aaef3789b0	Ignore format version when comparing summary for delta_layer	2024-03-14 14:21:35 +02:00
Konstantin Knizhnik	0b57e0b8f2	Fix image layer format version matching	2024-03-14 08:33:37 +02:00
Konstantin Knizhnik	485ecbaf8f	Fix test_attach_tenant_config.py test	2024-03-14 08:33:37 +02:00
Konstantin Knizhnik	0bcbce197a	Fix test_attach_tenent_config.py test	2024-03-14 08:33:37 +02:00
Konstantin Knizhnik	19d59e58d2	Use CompressionAlgorithm enum	2024-03-14 08:33:37 +02:00
Konstantin Knizhnik	ce65d13dbd	Add compress_image_layer to openapi spec	2024-03-14 08:33:37 +02:00
Konstantin Knizhnik	18fefff026	Fix compressed blob writer	2024-03-14 08:33:37 +02:00
Konstantin Knizhnik	2a69861896	Fix parse_tenant_config test	2024-03-14 08:33:37 +02:00
Konstantin Knizhnik	98375b3896	Support vectored compressed blob reads	2024-03-14 08:33:37 +02:00
Konstantin Knizhnik	8c60359ae5	Enable image layer compression by default	2024-03-14 08:33:37 +02:00
Konstantin Knizhnik	8c7136b057	Add compress_image_layer property to TenantConfig	2024-03-14 08:33:37 +02:00
Konstantin Knizhnik	0df6c41eaa	Compress image layer	2024-03-14 08:33:37 +02:00