Compare commits


275 Commits

Anna Khanova
2a88889f44 Merge pull request #7254 from neondatabase/rc/proxy/2024-03-27
Proxy release 2024-03-27
2024-03-27 11:44:09 +01:00
Conrad Ludgate
5bad8126dc Merge pull request #7173 from neondatabase/rc/proxy/2024-03-19
Proxy release 2024-03-19
2024-03-19 12:11:42 +00:00
Anna Khanova
27bc242085 Merge pull request #7119 from neondatabase/rc/proxy/2024-03-14
Proxy release 2024-03-14
2024-03-14 14:57:05 +05:00
Anna Khanova
192b49cc6d Merge branch 'release-proxy' into rc/proxy/2024-03-14 2024-03-14 14:16:36 +05:00
Conrad Ludgate
e1b60f3693 Merge pull request #7041 from neondatabase/rc/proxy/2024-03-07
Proxy release 2024-03-07
2024-03-08 08:19:16 +00:00
Anna Khanova
2804f5323b Merge pull request #6997 from neondatabase/rc/proxy/2024-03-04
Proxy release 2024-03-04
2024-03-04 17:36:11 +04:00
Anna Khanova
676adc6b32 Merge branch 'release-proxy' into rc/proxy/2024-03-04 2024-03-04 16:41:46 +04:00
Nikita Kalyanov
96a4e8de66 Add /terminate API (#6745) (#6853)
This is to speed up suspends; see
https://github.com/neondatabase/cloud/issues/10284


Cherry-pick to release branch to build new compute images
2024-02-22 11:51:19 +02:00
Arseny Sher
01180666b0 Merge pull request #6803 from neondatabase/releases/2024-02-19
Release 2024-02-19
2024-02-19 16:38:35 +04:00
Conrad Ludgate
6c94269c32 Merge pull request #6758 from neondatabase/release-proxy-2024-02-14
2024-02-14 Proxy Release
2024-02-15 09:45:08 +00:00
Anna Khanova
edc691647d Proxy: remove fail fast logic to connect to compute (#6759)
## Problem

Flaky tests

## Summary of changes

Remove the fail-fast logic.
2024-02-15 07:42:12 +00:00
Conrad Ludgate
855d7b4781 hold cancel session (#6750)
## Problem

In a recent refactor, we accidentally dropped the cancel session early

## Summary of changes

Hold the cancel session during proxy passthrough
2024-02-14 14:57:22 +00:00
Anna Khanova
c49c9707ce Proxy: send cancel notifications to all instances (#6719)
## Problem

If a cancel request ends up on the wrong proxy instance, it doesn't take
effect.

## Summary of changes

Send redis notifications to all proxy pods about the cancel request.

Related issue: https://github.com/neondatabase/neon/issues/5839,
https://github.com/neondatabase/cloud/issues/10262
2024-02-14 14:57:22 +00:00
Anna Khanova
2227540a0d Proxy refactor auth+connect (#6708)
## Problem

Not really a problem, just refactoring.

## Summary of changes

Separate authenticate from wake compute.

Do not call wake compute a second time if we managed to connect to
postgres, or if we got the compute address from somewhere other than the cache.
2024-02-14 14:57:22 +00:00
Conrad Ludgate
f1347f2417 proxy: add more http logging (#6726)
## Problem

Hard to see where time is spent during the HTTP flow.

## Summary of changes

Add a lot more logging for query state. Add a conn_id field to the
sql-over-http span.
2024-02-14 14:57:22 +00:00
Conrad Ludgate
30b295b017 proxy: some more parquet data (#6711)
## Summary of changes

add auth_method and database to the parquet logs
2024-02-14 14:57:22 +00:00
Anna Khanova
1cef395266 Proxy: copy bidirectional fork (#6720)
## Problem

`tokio::io::copy_bidirectional` doesn't close the connection once one of
the sides closes it. It's not really suitable for the postgres protocol.

## Summary of changes

Fork `copy_bidirectional` and initiate a shutdown for both connections.

---------

Co-authored-by: Conrad Ludgate <conradludgate@gmail.com>
2024-02-14 14:57:22 +00:00
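
A minimal sketch of the shape of this fix, assuming plain tokio streams (this is not the actual fork, which also preserves `copy_bidirectional`'s buffering): copy both directions, and as soon as either direction finishes, shut down both sides.

```
use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};

// Copy in both directions; when either direction finishes, initiate a
// shutdown of both connections so neither side is left half-open.
async fn copy_until_either_closes<A, B>(a: &mut A, b: &mut B) -> std::io::Result<()>
where
    A: AsyncRead + AsyncWrite + Unpin,
    B: AsyncRead + AsyncWrite + Unpin,
{
    let (mut a_read, mut a_write) = tokio::io::split(a);
    let (mut b_read, mut b_write) = tokio::io::split(b);

    tokio::select! {
        res = tokio::io::copy(&mut a_read, &mut b_write) => { res?; }
        res = tokio::io::copy(&mut b_read, &mut a_write) => { res?; }
    }
    // One side is done: shut down both write halves.
    a_write.shutdown().await.ok();
    b_write.shutdown().await.ok();
    Ok(())
}
```
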
John Spray
78d160f76d Merge pull request #6721 from neondatabase/releases/2024-02-12
Release 2024-02-12
2024-02-12 09:35:30 +00:00
Vlad Lazar
b9238059d6 Merge pull request #6617 from neondatabase/releases/2024-02-05
Release 2024-02-05
2024-02-05 12:50:38 +00:00
Arpad Müller
d0cb4b88c8 Don't preserve temp files on creation errors of delta layers (#6612)
There is currently no cleanup done after a delta layer creation error,
so delta layers can accumulate. The problem gets worse as the operation
gets retried and delta layers accumulate on the disk. Therefore, delete
them from disk (if something has been written to disk).
2024-02-05 09:58:18 +00:00
John Spray
1ec3e39d4e Merge pull request #6504 from neondatabase/releases/2024-01-29
Release 2024-01-29
2024-01-29 10:05:01 +00:00
John Spray
a1a74eef2c Merge pull request #6420 from neondatabase/releases/2024-01-22
Release 2024-01-22
2024-01-22 17:24:11 +00:00
John Spray
90e689adda pageserver: mark tenant broken when cancelling attach (#6430)
## Problem

When a tenant is in Attaching state, and waiting for the
`concurrent_tenant_warmup` semaphore, it also listens for the tenant
cancellation token. When that token fires, Tenant::attach drops out.
Meanwhile, Tenant::set_stopping waits forever for the tenant to exit
Attaching state.

Fixes: https://github.com/neondatabase/neon/issues/6423

## Summary of changes

- In the absence of a valid state for the tenant, it is set to Broken in
this path. A more elegant solution will require more refactoring, beyond
this minimal fix.

(cherry picked from commit 93572a3e99)
2024-01-22 16:20:57 +00:00
Christian Schwarz
f0b2d4b053 fixup(#6037): actually fix the issue, #6388 failed to do so (#6429)
Before this patch, the select! still returned immediately if `futs` was
empty. Must have tested a stale build in my manual testing of #6388.

(cherry picked from commit 15c0df4de7)
2024-01-22 15:23:12 +00:00
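
For context, a sketch of the pitfall and the guard, assuming the `futs` in question is a `FuturesUnordered`: polling `next()` on an empty set completes immediately with `None`, so an arm like `_ = futs.next()` fires at once and the surrounding loop spins.

```
use futures::stream::{FuturesUnordered, StreamExt};
use std::time::Duration;

async fn wait_for_any(futs: &mut FuturesUnordered<tokio::task::JoinHandle<()>>) {
    tokio::select! {
        // Matching on Some(..) disables this arm when `futs` is empty,
        // instead of completing immediately with None.
        Some(_res) = futs.next() => {
            // handle a completed task
        }
        _ = tokio::time::sleep(Duration::from_secs(1)) => {
            // stands in for the other select! arms
        }
    }
}
```
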
Anna Khanova
299d9474c9 Proxy: fix gc (#6426)
## Problem

Gc currently doesn't work properly.

## Summary of changes

Change the statement used to run GC.
2024-01-22 14:39:09 +01:00
Conrad Ludgate
7234208b36 bump shlex (#6421)
## Problem

https://rustsec.org/advisories/RUSTSEC-2024-0006

## Summary of changes

`cargo update -p shlex`

(cherry picked from commit 5559b16953)
2024-01-22 09:49:33 +00:00
Christian Schwarz
93450f11f5 Merge pull request #6354 from neondatabase/releases/2024-01-15
Release 2024-01-15

NB: the previous release PR https://github.com/neondatabase/neon/pull/6286 was accidentally merged by squash instead of by merge commit.
See https://github.com/neondatabase/neon/pull/6354#issuecomment-1891706321 for more context.
2024-01-15 14:30:25 +01:00
Christian Schwarz
2f0f9edf33 Merge remote-tracking branch 'origin/release' into releases/2024-01-15 2024-01-15 09:36:42 +00:00
Christian Schwarz
d424f2b7c8 empty commit so we can produce a merge commit 2024-01-15 09:36:22 +00:00
Christian Schwarz
21315e80bc Merge branch 'releases/2024-01-08--not-squashed' into releases/2024-01-15 2024-01-15 09:31:07 +00:00
vipvap
483b66d383 Merge branch 'release' into releases/2024-01-08 (not-squashed merge of #6286)
Release PR https://github.com/neondatabase/neon/pull/6286 got
accidentally merged by squash instead of by merge commit.

This commit shows how things would have looked if #6286 had been
merged by merge commit.

```
git reset --hard 9f1327772
git merge --no-ff 5c0264b591
```

Co-authored-by: Christian Schwarz <christian@neon.tech>
2024-01-15 09:28:08 +00:00
vipvap
aa72a22661 Release 2024-01-08 (#6286)
Release 2024-01-08
2024-01-08 09:26:27 +00:00
Shany Pozin
5c0264b591 Merge branch 'release' into releases/2024-01-08 2024-01-08 09:34:06 +02:00
Arseny Sher
9f13277729 Merge pull request #6242 from neondatabase/releases/2024-01-02
Release 2024-01-02
2024-01-02 12:04:43 +04:00
Arseny Sher
54aa319805 Don't split WAL record across two XLogData's when sending from safekeepers.
As the protocol demands. Not following this makes the standby complain
about corrupted WAL in various ways.

https://neondb.slack.com/archives/C05L7D1JAUS/p1703774799114719
closes https://github.com/neondatabase/cloud/issues/9057
2024-01-02 10:54:00 +04:00
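
A sketch of the framing rule the fix enforces; `last_record_boundary` is a hypothetical helper, not the safekeeper's actual code:

```
const MAX_SEND_SIZE: usize = 128 * 1024; // illustrative value

// Cut the XLogData payload at the last complete WAL record boundary
// instead of at an arbitrary byte offset, so no record is torn across
// two messages.
fn next_xlogdata_chunk(buf: &[u8]) -> &[u8] {
    if buf.len() <= MAX_SEND_SIZE {
        return buf;
    }
    let cut = last_record_boundary(&buf[..MAX_SEND_SIZE]);
    &buf[..cut]
}

fn last_record_boundary(window: &[u8]) -> usize {
    // Hypothetical: scan record headers for the end of the last WAL
    // record that fits entirely within `window`.
    window.len()
}
```
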
Arseny Sher
4a227484bf Add large insertion and slow WAL sending to test_hot_standby.
To exercise MAX_SEND_SIZE sending from safekeeper; we've had a bug with WAL
records torn across several XLogData messages. Add failpoint to safekeeper to
slow down sending. Also check for corrupted WAL complaints in the standby log.

Make the test a bit simpler in passing, e.g. we don't need explicit commits as
autocommit is enabled by default.

https://neondb.slack.com/archives/C05L7D1JAUS/p1703774799114719
https://github.com/neondatabase/cloud/issues/9057
2024-01-02 10:54:00 +04:00
Arseny Sher
2f83f85291 Add failpoint support to safekeeper.
Just a copy paste from pageserver.
2024-01-02 10:54:00 +04:00
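
A sketch of the failpoint pattern (the `fail` crate), assuming the safekeeper wires it up the same way the pageserver does; the failpoint name here is made up:

```
// A named hook in the hot path; a no-op unless a test configures it.
fn send_wal_chunk() {
    fail::fail_point!("sk-send-wal");
    // ... actual send logic ...
}

fn configure_for_test() {
    // Make every hit of the failpoint sleep 100ms, slowing down sending
    // the way the test above needs.
    fail::cfg("sk-send-wal", "sleep(100)").unwrap();
}
```
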
Arseny Sher
d6cfcb0d93 Move failpoint support code to utils.
To enable them in safekeeper as well.
2024-01-02 10:54:00 +04:00
Arseny Sher
392843ad2a Fix safekeeper START_REPLICATION (term=n).
It was giving WAL only up to commit_lsn instead of flush_lsn, so recovery of
uncommitted WAL since cdb08f03 hung. Add a test for this.
2024-01-02 10:54:00 +04:00
Arseny Sher
bd4dae8f4a compute_ctl: kill postgres and sync-safekeepers on exit.
Otherwise they are left orphaned when compute_ctl is terminated with a
signal. It was invisible most of the time because normally neon_local or k8s
kills postgres directly and then compute_ctl finishes gracefully. However, in
some tests compute_ctl gets stuck waiting for sync-safekeepers which
intentionally never ends because safekeepers are offline, and we want to stop
compute_ctl without leaving orphans behind.

This is quite a rough approach which doesn't wait for child termination. A
better way would be to convert compute_ctl to async which would make waiting
easy.
2024-01-02 10:54:00 +04:00
Shany Pozin
b05fe53cfd Merge pull request #6240 from neondatabase/releases/2024-01-01
Release 2024-01-01
2024-01-01 11:07:30 +02:00
Christian Schwarz
c13a2f0df1 Merge pull request #6192 from neondatabase/releases/2023-12-19
Release 2023-12-19

We need to do a config change that requires restarting the pageservers.
Slip in two metrics-related commits that didn't make this week's regular release.
2023-12-19 14:52:47 +01:00
Christian Schwarz
39be366fc5 higher resolution histograms for getpage@lsn (#6177)
part of https://github.com/neondatabase/cloud/issues/7811
2023-12-19 13:46:59 +00:00
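
A sketch of how finer-grained histograms are declared with the prometheus crate; the metric name and bucket edges below are illustrative, not the ones from the PR:

```
use prometheus::{register_histogram, Histogram};

fn getpage_histogram() -> Histogram {
    register_histogram!(
        "pageserver_getpage_seconds", // hypothetical name
        "Time spent serving getpage@lsn requests",
        // explicit, finer bucket edges give the higher resolution
        vec![0.0001, 0.00025, 0.0005, 0.001, 0.0025, 0.005, 0.01, 0.025, 0.05, 0.1]
    )
    .unwrap()
}
```
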
Christian Schwarz
6eda0a3158 [PRE-MERGE] fix metric pageserver_initial_logical_size_start_calculation
(This is a pre-merge cherry-pick of https://github.com/neondatabase/neon/pull/6191)

It wasn't being incremented.

Fixup of

    commit 1c88824ed0
    Author: Christian Schwarz <christian@neon.tech>
    Date:   Fri Dec 1 12:52:59 2023 +0100

        initial logical size calculation: add a bunch of metrics (#5995)
2023-12-19 13:46:55 +00:00
Shany Pozin
306c7a1813 Merge pull request #6173 from neondatabase/sasha_release_bypassrls_replication
Grant BYPASSRLS and REPLICATION explicitly to neon_superuser roles
2023-12-18 22:16:36 +02:00
Sasha Krassovsky
80be423a58 Grant BYPASSRLS and REPLICATION explicitly to neon_superuser roles 2023-12-18 10:22:36 -08:00
Shany Pozin
5dcfef82f2 Merge pull request #6163 from neondatabase/releases/2023-12-18
Release 2023-12-18-2
2023-12-18 15:34:17 +02:00
Christian Schwarz
e67b8f69c0 [PRE-MERGE] pageserver: Reduce tracing overhead in timeline::get #6115
Pre-merge `git merge --squash` of
https://github.com/neondatabase/neon/pull/6115

Lowering the tracing level in get_value_reconstruct_data and
get_or_maybe_download from info to debug reduces the overhead
of span creation in non-debug environments.
2023-12-18 13:39:48 +01:00
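
A sketch of the change, under the assumption that production collects INFO and above: span creation is near-free when the span's level is disabled, so demoting the hot-path spans to DEBUG removes the overhead.

```
use tracing::debug_span;

fn get_value_reconstruct_data() {
    // was: info_span!("get_value_reconstruct_data").entered()
    let _guard = debug_span!("get_value_reconstruct_data").entered();
    // ... hot path ...
}
```
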
Shany Pozin
e546872ab4 Merge pull request #6158 from neondatabase/releases/2023-12-18
Release 2023-12-18
2023-12-18 14:24:34 +02:00
John Spray
322ea1cf7c pageserver: on-demand activation cleanups (#6157)
## Problem

#6112 added some logs and metrics: clean these up a bit:
- Avoid counting startup completions for tenants launched after startup
- exclude no-op cases from timing histograms 
- remove a rogue log message
2023-12-18 11:14:19 +00:00
Vadim Kharitonov
3633742de9 Merge pull request #6121 from neondatabase/releases/2023-12-13
Release 2023-12-13
2023-12-13 12:39:43 +01:00
Joonas Koivunen
079d3a37ba Merge remote-tracking branch 'origin/release' into releases/2023-12-13
This handles the conflict introduced by the hotfix.
2023-12-13 10:07:19 +00:00
Vadim Kharitonov
a46e77b476 Merge pull request #6090 from neondatabase/releases/2023-12-11
Release 2023-12-11
2023-12-12 12:10:35 +01:00
Tristan Partin
a92702b01e Add submodule paths as safe directories as a precaution
The check-codestyle-rust-arm job requires this for some reason, so let's
just add them everywhere we do this workaround.
2023-12-11 22:00:35 +00:00
Tristan Partin
8ff3253f20 Fix git ownership issue in check-codestyle-rust-arm
We have this workaround for other jobs. Looks like this one was
forgotten about.
2023-12-11 22:00:35 +00:00
Joonas Koivunen
04b82c92a7 fix: accidential return Ok (#6106)
An error indicating request cancellation OR timeline shutdown was deemed
a reason to exit the background worker that calculates synthetic size.
Fix it so the check is only used to avoid logging such errors.

This conflicted on tenant_shard_id having already replaced tenant_id on
`main`.
2023-12-11 21:41:36 +00:00
Vadim Kharitonov
e5bf423e68 Merge branch 'release' into releases/2023-12-11 2023-12-11 11:55:48 +01:00
Vadim Kharitonov
60af392e45 Merge pull request #6057 from neondatabase/vk/patch_timescale_for_production
Revert timescaledb for pg14 and pg15 (#6056)
2023-12-06 16:21:16 +01:00
Vadim Kharitonov
661fc41e71 Revert timescaledb for pg14 and pg15 (#6056)
```
could not start the compute node: compute is in state "failed": db error: ERROR: could not access file "$libdir/timescaledb-2.10.1": No such file or directory Caused by: ERROR: could not access file "$libdir/timescaledb-2.10.1": No such file or directory
```
2023-12-06 16:14:07 +01:00
Shany Pozin
702c488f32 Merge pull request #6022 from neondatabase/releases/2023-12-04
Release 2023-12-04
2023-12-05 17:03:28 +02:00
Sasha Krassovsky
45c5122754 Remove trusted from wal2json 2023-12-04 12:36:19 -08:00
Shany Pozin
558394f710 fix merge 2023-12-04 11:41:27 +02:00
Shany Pozin
73b0898608 Merge branch 'release' into releases/2023-12-04 2023-12-04 11:36:26 +02:00
Joonas Koivunen
e65be4c2dc Merge pull request #6013 from neondatabase/releases/2023-12-01-hotfix
fix: use create_new instead of create for mutex file
2023-12-01 15:35:56 +02:00
Joonas Koivunen
40087b8164 fix: use create_new instead of create for mutex file 2023-12-01 12:54:49 +00:00
Shany Pozin
c762b59483 Merge pull request #5986 from neondatabase/Release-11-30-hotfix
Notify safekeeper readiness with systemd.
2023-11-30 10:01:05 +02:00
Arseny Sher
5d71601ca9 Notify safekeeper readiness with systemd.
To avoid downtime during deploy, as in busy regions initial load can currently
take ~30s.
2023-11-30 08:23:31 +03:00
Shany Pozin
a113c3e433 Merge pull request #5945 from neondatabase/release-2023-11-28-hotfix
Release 2023 11 28 hotfix
2023-11-28 08:14:59 +02:00
Anastasia Lubennikova
e81fc598f4 Update neon extension relocatable for existing installations (#5943) 2023-11-28 00:12:39 +00:00
Anastasia Lubennikova
48b845fa76 Make neon extension relocatable to allow SET SCHEMA (#5942) 2023-11-28 00:12:32 +00:00
Shany Pozin
27096858dc Merge pull request #5922 from neondatabase/releases/2023-11-27
Release 2023-11-27
2023-11-27 09:58:51 +02:00
Shany Pozin
4430d0ae7d Merge pull request #5876 from neondatabase/releases/2023-11-17
Release 2023-11-17
2023-11-20 09:11:58 +02:00
Joonas Koivunen
6e183aa0de Merge branch 'main' into releases/2023-11-17 2023-11-19 15:25:47 +00:00
Vadim Kharitonov
fd6d0b7635 Merge branch 'release' into releases/2023-11-17 2023-11-17 10:51:45 +01:00
Vadim Kharitonov
3710c32aae Merge pull request #5778 from neondatabase/releases/2023-11-03
Release 2023-11-03
2023-11-03 16:06:58 +01:00
Vadim Kharitonov
be83bee49d Merge branch 'release' into releases/2023-11-03 2023-11-03 11:18:15 +01:00
Alexander Bayandin
cf28e5922a Merge pull request #5685 from neondatabase/releases/2023-10-26
Release 2023-10-26
2023-10-27 10:42:12 +01:00
Em Sharnoff
7d384d6953 Bump vm-builder v0.18.2 -> v0.18.4 (#5666)
Only applicable change was neondatabase/autoscaling#584, setting
pgbouncer auth_dbname=postgres in order to stop superuser connections
from preventing database drops.
2023-10-26 20:15:45 +01:00
Em Sharnoff
4b3b37b912 Bump vm-builder v0.18.1 -> v0.18.2 (#5646)
Only applicable change was neondatabase/autoscaling#571, removing the
postgres_exporter flags `--auto-discover-databases` and
`--exclude-databases=...`
2023-10-26 20:15:29 +01:00
Shany Pozin
1d8d200f4d Merge pull request #5668 from neondatabase/sp/aux_files_cherry_pick
Cherry pick: Ignore missed AUX_FILES_KEY when generating image layer (#5660)
2023-10-26 10:08:16 +03:00
Konstantin Knizhnik
0d80d6ce18 Ignore missed AUX_FILES_KEY when generating image layer (#5660)
## Problem

Logical replication requires the new AUX_FILES_KEY, which is definitely
absent in existing databases.
We do not have a function to check whether a key exists in our KV storage,
so I have to handle the error in the `list_aux_files` method.
But this key is also included in the key space range and accessed by the
`create_image_layer` method.

## Summary of changes

Check if AUX_FILES_KEY exists before including it in the keyspace.

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
Co-authored-by: Shany Pozin <shany@neon.tech>
Co-authored-by: Arpad Müller <arpad-m@users.noreply.github.com>
2023-10-26 09:30:28 +03:00
Shany Pozin
f653ee039f Merge pull request #5638 from neondatabase/releases/2023-10-24
Release 2023-10-24
2023-10-24 12:10:52 +03:00
Em Sharnoff
e614a95853 Merge pull request #5610 from neondatabase/sharnoff/rc-2023-10-20-vm-monitor-fixes
Release 2023-10-20: vm-monitor memory.high throttling fixes
2023-10-20 00:11:06 -07:00
Em Sharnoff
850db4cc13 vm-monitor: Deny not fail downscale if no memory stats yet (#5606)
Fixes an issue we observed on staging that happens when the
autoscaler-agent attempts to immediately downscale the VM after binding,
which is typical for pooled computes.

The issue was occurring because the autoscaler-agent was requesting
downscaling before the vm-monitor had gathered sufficient cgroup memory
stats to be confident in approving it. When the vm-monitor returned an
internal error instead of denying downscaling, the autoscaler-agent
retried the connection and immediately hit the same issue (in part
because cgroup stats are collected per-connection, rather than
globally).
2023-10-19 21:56:55 -07:00
Em Sharnoff
8a316b1277 vm-monitor: Log full error on message handling failure (#5604)
There's currently an issue with the vm-monitor on staging that's not
really feasible to debug because the current display impl gives no
context to the errors (just says "failed to downscale").

Logging the full error should help.

For communications with the autoscaler-agent, it's ok to only provide
the outermost cause, because we can cross-reference with the VM logs.
At some point in the future, we may want to change that.
2023-10-19 21:56:50 -07:00
Em Sharnoff
4d13bae449 vm-monitor: Switch from memory.high to polling memory.stat (#5524)
tl;dr it's really hard to avoid throttling from memory.high, and it
counts tmpfs & page cache usage, so it's also hard to make sense of.

In the interest of fixing things quickly with something that should be
*good enough*, this PR switches to instead periodically fetch memory
statistics from the cgroup's memory.stat and use that data to determine
if and when we should upscale.

This PR fixes #5444, which has a lot more detail on the difficulties
we've hit with memory.high. This PR also supersedes #5488.
2023-10-19 21:56:36 -07:00
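
A sketch of the polling approach, assuming cgroup v2 (memory.stat is a flat "key value" file); the cgroup path is illustrative:

```
use std::collections::HashMap;

// Read the cgroup's memory statistics periodically instead of reacting
// to memory.high throttling events.
fn read_memory_stat() -> std::io::Result<HashMap<String, u64>> {
    let raw = std::fs::read_to_string("/sys/fs/cgroup/neon-postgres/memory.stat")?;
    Ok(raw
        .lines()
        .filter_map(|line| {
            let (key, value) = line.split_once(' ')?;
            Some((key.to_owned(), value.parse().ok()?))
        })
        .collect())
}
```
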
Vadim Kharitonov
49377abd98 Merge pull request #5577 from neondatabase/releases/2023-10-17
Release 2023-10-17
2023-10-17 12:21:20 +02:00
Christian Schwarz
a6b2f4e54e limit imitate accesses concurrency, using same semaphore as compactions (#5578)
Before this PR, when we restarted pageserver, we'd see a rush of
`$number_of_tenants` concurrent eviction tasks starting to do imitate
accesses building up in the period of `[init_order allows activations,
$random_access_delay + EvictionPolicyLayerAccessThreshold::period]`.

We simply cannot handle that degree of concurrent IO.

We already solved the problem for compactions by adding a semaphore.
So, this PR shares that semaphore for use by evictions.

Part of https://github.com/neondatabase/neon/issues/5479

Which is again part of https://github.com/neondatabase/neon/issues/4743

Risks / Changes In System Behavior
==================================

* we don't do evictions as timely as we currently do
* we log a bunch of warnings about eviction taking too long
* imitate accesses and compactions compete for the same concurrency
limit, so they'll slow each other down through this shared semaphore

Changes
=======

- Move the `CONCURRENT_COMPACTIONS` semaphore into `tasks.rs`
- Rename it to `CONCURRENT_BACKGROUND_TASKS`
- Use it also for the eviction imitate accesses:
    - Imitate accesses are both per-TIMELINE and per-TENANT
    - The per-TENANT is done through coalescing all the per-TIMELINE
      tasks via a tokio mutex `eviction_task_tenant_state`.
    - We acquire the CONCURRENT_BACKGROUND_TASKS permit early, at the
      beginning of the eviction iteration, much before the imitate
      accesses start (and they may not even start at all in the given
      iteration, as they happen only every $threshold).
    - Acquiring early is **sub-optimal** because when the per-timeline
      tasks coalesce on the `eviction_task_tenant_state` mutex,
      they are already holding a CONCURRENT_BACKGROUND_TASKS permit.
    - It's also unfair because tenants with many timelines win
      the CONCURRENT_BACKGROUND_TASKS more often.
    - I don't think there's another way though, without refactoring
      more of the imitate accesses logic, e.g, making it all per-tenant.
- Add metrics for queue depth behind the semaphore.
I found these very useful to understand what work is queued in the
system.

    - The metrics are tagged by the new `BackgroundLoopKind`.
    - On a green slate, I would have used `TaskKind`, but we already had
      pre-existing labels whose names didn't map exactly to task kind.
      Also the task kind is kind of a lower-level detail, so, I think
it's fine to have a separate enum to identify background work kinds.

Future Work
===========

I guess I could move the eviction tasks from a ticker to "sleep for
$period".
The benefit would be that the semaphore automatically "smears" the
eviction task scheduling over time, so, we only have the rush on restart
but a smeared-out rush afterward.

The downside is that this perverts the meaning of "$period", as we'd
actually not run the eviction at a fixed period. It also means that the
"took too long" warning & metric become meaningless.

Then again, that is already the case for the compaction and gc tasks,
which do sleep for `$period` instead of using a ticker.

(cherry picked from commit 9256788273)
2023-10-17 12:16:26 +02:00
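
A minimal sketch of the shared-limit pattern this commit describes; the semaphore name follows the commit, the limit and wiring are illustrative:

```
use once_cell::sync::Lazy;
use tokio::sync::Semaphore;

static CONCURRENT_BACKGROUND_TASKS: Lazy<Semaphore> = Lazy::new(|| Semaphore::new(4));

async fn run_background_task(work: impl std::future::Future<Output = ()>) {
    // Time spent waiting here is the queue depth the new metrics expose.
    let _permit = CONCURRENT_BACKGROUND_TASKS
        .acquire()
        .await
        .expect("semaphore is never closed");
    work.await; // compaction iteration, imitate-access pass, ...
}
```
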
Shany Pozin
face60d50b Merge pull request #5526 from neondatabase/releases/2023-10-11
Release 2023-10-11
2023-10-11 11:16:39 +03:00
Shany Pozin
9768aa27f2 Merge pull request #5516 from neondatabase/releases/2023-10-10
Release 2023-10-10
2023-10-10 14:16:47 +03:00
Shany Pozin
96b2e575e1 Merge pull request #5445 from neondatabase/releases/2023-10-03
Release 2023-10-03
2023-10-04 13:53:37 +03:00
Alexander Bayandin
7222777784 Update checksums for pg_jsonschema & pg_graphql (#5455)
## Problem

Folks have re-tagged releases for `pg_jsonschema` and `pg_graphql` (to
increase timeouts on their CI). For us, these are no-op changes,
but unfortunately this will cause our builds to fail due to checksum
mismatches (this might not strike right away because of the build cache).
- 8ba7c7be9d
- aa7509370a

## Summary of changes
- `pg_jsonschema` update checksum
- `pg_graphql` update checksum
2023-10-03 18:44:30 +01:00
Em Sharnoff
5469fdede0 Merge pull request #5422 from neondatabase/sharnoff/rc-2023-09-28-fix-restart-on-postmaster-SIGKILL
Release 2023-09-28: Fix (lack of) restart on neonvm postmaster SIGKILL
2023-09-28 10:48:51 -07:00
MMeent
72aa6b9fdd Fix neon_zeroextend's WAL logging (#5387)
When you log more than a few blocks, you need to reserve the space in
advance. We didn't do that, so we got errors. Now we do that, and
shouldn't get errors.
2023-09-28 09:37:28 -07:00
Em Sharnoff
ae0634b7be Bump vm-builder v0.17.11 -> v0.17.12 (#5407)
Only relevant change is neondatabase/autoscaling#534 - refer there for
more details.
2023-09-28 09:28:04 -07:00
Shany Pozin
70711f32fa Merge pull request #5375 from neondatabase/releases/2023-09-26
Release 2023-09-26
2023-09-26 15:19:45 +03:00
Vadim Kharitonov
52a88af0aa Merge pull request #5336 from neondatabase/releases/2023-09-19
Release 2023-09-19
2023-09-19 11:16:43 +02:00
Alexander Bayandin
b7a43bf817 Merge branch 'release' into releases/2023-09-19 2023-09-19 09:07:20 +01:00
Alexander Bayandin
dce91b33a4 Merge pull request #5318 from neondatabase/releases/2023-09-15-1
Postgres 14/15: Use previous extensions versions
2023-09-15 16:30:44 +01:00
Alexander Bayandin
23ee4f3050 Revert plv8 only 2023-09-15 15:45:23 +01:00
Alexander Bayandin
46857e8282 Postgres 14/15: Use previous extensions versions 2023-09-15 15:27:00 +01:00
Alexander Bayandin
368ab0ce54 Merge pull request #5313 from neondatabase/releases/2023-09-15
Release 2023-09-15
2023-09-15 10:39:56 +01:00
Konstantin Knizhnik
a5987eebfd References to old and new blocks were mixed in xlog_heap_update handler (#5312)
## Problem

See https://neondb.slack.com/archives/C05L7D1JAUS/p1694614585955029

https://www.notion.so/neondatabase/Duplicate-key-issue-651627ce843c45188fbdcb2d30fd2178

## Summary of changes

Swap old/new block references

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2023-09-15 10:11:41 +01:00
Alexander Bayandin
6686ede30f Update checksum for pg_hint_plan (#5309)
## Problem

The checksum for `pg_hint_plan` doesn't match:
```
sha256sum: WARNING: 1 computed checksum did NOT match
```

Ref
https://github.com/neondatabase/neon/actions/runs/6185715461/job/16793609251?pr=5307

It seems that the release was retagged yesterday:
https://github.com/ossc-db/pg_hint_plan/releases/tag/REL16_1_6_0

I don't see any malicious changes from 15_1.5.1:
https://github.com/ossc-db/pg_hint_plan/compare/REL15_1_5_1...REL16_1_6_0,
so it should be ok to update.

## Summary of changes
- Update checksum for `pg_hint_plan` 16_1.6.0
2023-09-15 09:54:42 +01:00
Em Sharnoff
373c7057cc vm-monitor: Fix cgroup throttling (#5303)
I believe this (not actual IO problems) is the cause of the "disk speed
issue" that we've had for VMs recently. See e.g.:

1. https://neondb.slack.com/archives/C03H1K0PGKH/p1694287808046179?thread_ts=1694271790.580099&cid=C03H1K0PGKH
2. https://neondb.slack.com/archives/C03H1K0PGKH/p1694511932560659

The vm-informant (and now, the vm-monitor, its replacement) is supposed
to gradually increase the `neon-postgres` cgroup's memory.high value,
because otherwise the kernel will throttle all the processes in the
cgroup.

This PR fixes a bug with the vm-monitor's implementation of this
behavior.

---

Other references, for the vm-informant's implementation:

- Original issue: neondatabase/autoscaling#44
- Original PR: neondatabase/autoscaling#223
2023-09-15 09:54:42 +01:00
Shany Pozin
7d6ec16166 Merge pull request #5296 from neondatabase/releases/2023-09-13
Release 2023-09-13
2023-09-13 13:49:14 +03:00
Shany Pozin
0e6fdc8a58 Merge pull request #5283 from neondatabase/releases/2023-09-12
Release 2023-09-12
2023-09-12 14:56:47 +03:00
Christian Schwarz
521438a5c6 fix deadlock around TENANTS (#5285)
The sequence that can lead to a deadlock:

1. DELETE request gets all the way to `tenant.shutdown(progress,
false).await.is_err() ` , while holding TENANTS.read()
2. POST request for tenant creation comes in, calls `tenant_map_insert`,
it does `let mut guard = TENANTS.write().await;`
3. Something that `tenant.shutdown()` needs to wait for needs a
`TENANTS.read().await`.
The only case identified in exhaustive manual scanning of the code base
is this one:
Imitate size access does `get_tenant().await`, which does
`TENANTS.read().await` under the hood.

In the above case (1) waits for (3), (3)'s read-lock request is queued
behind (2)'s write-lock, and (2) waits for (1).
Deadlock.

I made a reproducer/proof-that-above-hypothesis-holds in
https://github.com/neondatabase/neon/pull/5281, but it's not ready for
merge yet and we want the fix _now_.

fixes https://github.com/neondatabase/neon/issues/5284
2023-09-12 14:13:13 +03:00
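
A sketch of the deadlock shape with tokio's fair RwLock (not the actual pageserver code): a reader that re-acquires the lock while a writer is queued waits behind that writer, and the writer waits for the first read guard.

```
use std::sync::Arc;
use tokio::sync::RwLock;

async fn deadlock_demo(tenants: Arc<RwLock<()>>) {
    let _guard = tenants.read().await;  // (1) DELETE path holds a read lock
    let t = tenants.clone();
    tokio::spawn(async move {
        let _w = t.write().await;       // (2) creation path queues a write
    });
    tokio::task::yield_now().await;     // let the writer get in line
    let _again = tenants.read().await;  // (3) queues behind (2): deadlock
}
```
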
Vadim Kharitonov
07d7874bc8 Merge pull request #5202 from neondatabase/releases/2023-09-05
Release 2023-09-05
2023-09-05 12:16:06 +02:00
Anastasia Lubennikova
1804111a02 Merge pull request #5161 from neondatabase/rc-2023-08-31
Release 2023-08-31
2023-08-31 16:53:17 +03:00
Arthur Petukhovsky
cd0178efed Merge pull request #5150 from neondatabase/release-sk-fix-active-timeline
Release 2023-08-30
2023-08-30 11:43:39 +02:00
Shany Pozin
333574be57 Merge pull request #5133 from neondatabase/releases/2023-08-29
Release 2023-08-29
2023-08-29 14:02:58 +03:00
Alexander Bayandin
79a799a143 Merge branch 'release' into releases/2023-08-29 2023-08-29 11:17:57 +01:00
Conrad Ludgate
9da06af6c9 Merge pull request #5113 from neondatabase/release-http-connection-fix
Release 2023-08-25
2023-08-25 17:21:35 +01:00
Conrad Ludgate
ce1753d036 proxy: dont return connection pending (#5107)
## Problem

We were returning Pending when a connection had a notice/notification
(introduced recently in #5020). When returning pending, the runtime
assumes you will call `cx.waker().wake()` in order to continue
processing.

We weren't doing that, so the connection task would get stuck.

## Summary of changes

Don't return pending. Loop instead
2023-08-25 16:42:30 +01:00
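
A sketch of the bug class, with stand-in types rather than the proxy's own: returning `Poll::Pending` without arranging a wake-up means the runtime never polls the future again; looping and polling the inner source again (which registers the waker) is the fix.

```
use std::future::Future;
use std::pin::Pin;
use std::task::{Context, Poll};

struct ConnectionDriver;

impl Future for ConnectionDriver {
    type Output = ();

    fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<()> {
        loop {
            match poll_message(cx) {
                // A notice/notification: handle it and keep looping,
                // rather than returning Pending with no registered waker.
                Poll::Ready(Some(_notice)) => continue,
                Poll::Ready(None) => return Poll::Ready(()),
                // The inner poll registered cx.waker(), so this is safe.
                Poll::Pending => return Poll::Pending,
            }
        }
    }
}

fn poll_message(_cx: &mut Context<'_>) -> Poll<Option<()>> {
    Poll::Pending // stands in for polling the underlying socket
}
```
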
Alek Westover
67db8432b4 Fix cargo deny errors (#5068)
## Problem
cargo deny lint broken

Links to the CVEs:

[rustsec.org/advisories/RUSTSEC-2023-0052](https://rustsec.org/advisories/RUSTSEC-2023-0052)

[rustsec.org/advisories/RUSTSEC-2023-0053](https://rustsec.org/advisories/RUSTSEC-2023-0053)
One is fixed; the other one isn't, so we allow it (for now) to unbreak
CI. Then later we'll try to get rid of webpki in favour of the rustls
fork.

## Summary of changes
```
+ignore = ["RUSTSEC-2023-0052"]
```
2023-08-25 16:42:30 +01:00
Vadim Kharitonov
4e2e44e524 Enable neon-pool-opt-in (#5062) 2023-08-22 09:06:14 +01:00
Vadim Kharitonov
ed786104f3 Merge pull request #5060 from neondatabase/releases/2023-08-22
Release 2023-08-22
2023-08-22 09:41:02 +02:00
Stas Kelvich
84b74f2bd1 Merge pull request #4997 from neondatabase/sk/proxy-release-23-07-15
Fix lint
2023-08-15 18:54:20 +03:00
Arthur Petukhovsky
fec2ad6283 Fix lint 2023-08-15 18:49:02 +03:00
Stas Kelvich
98eebd4682 Merge pull request #4996 from neondatabase/sk/proxy_release
Disable neon-pool-opt-in
2023-08-15 18:37:50 +03:00
Arthur Petukhovsky
2f74287c9b Disable neon-pool-opt-in 2023-08-15 18:34:17 +03:00
Shany Pozin
aee1bf95e3 Merge pull request #4990 from neondatabase/releases/2023-08-15
Release 2023-08-15
2023-08-15 15:34:38 +03:00
Shany Pozin
b9de9d75ff Merge branch 'release' into releases/2023-08-15 2023-08-15 14:35:00 +03:00
Stas Kelvich
7943b709e6 Merge pull request #4940 from neondatabase/sk/release-23-05-25-proxy-fixup
Release: proxy retry fixup
2023-08-09 13:53:19 +03:00
Conrad Ludgate
d7d066d493 proxy: delay auth on retry (#4929)
## Problem

When an endpoint is shutting down, it can take a few seconds. Currently
when starting a new compute, this causes an "endpoint is in transition"
error. We need to add delays before retrying to ensure that we allow
time for the endpoint to shut down properly.

## Summary of changes

Adds a delay before retrying in auth. connect_to_compute already has
this delay
2023-08-09 12:54:24 +03:00
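
A sketch of the retry shape described above; the delay and attempt count are illustrative, not proxy's actual values:

```
use std::time::Duration;

async fn auth_with_retry() -> Result<(), &'static str> {
    const ATTEMPTS: u32 = 3;
    for attempt in 1..=ATTEMPTS {
        if try_auth().await {
            return Ok(());
        }
        if attempt < ATTEMPTS {
            // The endpoint may still be shutting down; give it a moment
            // before the next attempt instead of failing immediately.
            tokio::time::sleep(Duration::from_millis(500)).await;
        }
    }
    Err("endpoint is in transition")
}

async fn try_auth() -> bool {
    false // stands in for the real authentication round-trip
}
```
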
Felix Prasanna
e78ac22107 release fix: revert vm builder bump from 0.13.1 -> 0.15.0-alpha1 (#4932)
This reverts commit 682dfb3a31.

hotfix for a CLI arg issue in the monitor
2023-08-08 21:08:46 +03:00
Vadim Kharitonov
76a8f2bb44 Merge pull request #4923 from neondatabase/releases/2023-08-08
Release 2023-08-08
2023-08-08 11:44:38 +02:00
Vadim Kharitonov
8d59a8581f Merge branch 'release' into releases/2023-08-08 2023-08-08 10:54:34 +02:00
Vadim Kharitonov
b1ddd01289 Define NEON_SMGR to make it possible for extensions to use Neon SMG API (#4889)
Co-authored-by: Konstantin Knizhnik <knizhnik@garret.ru>
Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
2023-08-03 16:28:31 +03:00
Alexander Bayandin
6eae4fc9aa Release 2023-08-02: update pg_embedding (#4877)
Cherry-picking ca4d71a954 from `main` into
the `release`

Co-authored-by: Vadim Kharitonov <vadim2404@users.noreply.github.com>
2023-08-03 08:48:09 +02:00
Christian Schwarz
765455bca2 Merge pull request #4861 from neondatabase/releases/2023-08-01--2-fix-pipeline
ci: fix upload-postgres-extensions-to-s3 job
2023-08-01 13:22:07 +02:00
Christian Schwarz
4204960942 ci: fix upload-postgres-extensions-to-s3 job
commit

	commit 5f8fd640bf
	Author: Alek Westover <alek.westover@gmail.com>
	Date:   Wed Jul 26 08:24:03 2023 -0400

	    Upload Test Remote Extensions (#4792)

switched to using the release tag instead of `latest`, but the
`promote-images` job only uploads `latest` to the prod ECR.

The switch to using the release tag was good in principle, but we're
reverting that part to make the release pipeline work.

Note that a proper fix should abandon use of `:latest` tag
at all: currently, if a `main` pipeline runs concurrently
with a `release` pipeline, the `release` pipeline may end
up using the `main` pipeline's images.
2023-08-01 12:01:45 +02:00
Christian Schwarz
67345d66ea Merge pull request #4858 from neondatabase/releases/2023-08-01
Release 2023-08-01
2023-08-01 10:44:01 +02:00
Shany Pozin
2266ee5971 Merge pull request #4803 from neondatabase/releases/2023-07-25
Release 2023-07-25
2023-07-25 14:21:07 +03:00
Shany Pozin
b58445d855 Merge pull request #4746 from neondatabase/releases/2023-07-18
Release 2023-07-18
2023-07-18 14:45:39 +03:00
Conrad Ludgate
36050e7f3d Merge branch 'release' into releases/2023-07-18 2023-07-18 12:00:09 +01:00
Alexander Bayandin
33360ed96d Merge pull request #4705 from neondatabase/release-2023-07-12
Release 2023-07-12 (only proxy)
2023-07-12 19:44:36 +01:00
Conrad Ludgate
39a28d1108 proxy wake_compute loop (#4675)
## Problem

If we fail to wake up the compute node, a subsequent connect attempt
will definitely fail. However, kubernetes won't fail the connection
immediately; instead it hangs until we time out (10s).

## Summary of changes

Refactor the loop to allow fast retries of compute_wake and to skip a
connect attempt.
2023-07-12 18:40:11 +01:00
Conrad Ludgate
efa6aa134f allow repeated IO errors from compute node (#4624)
## Problem

#4598 compute nodes are not accessible some time after wake up due to
kubernetes DNS not being fully propagated.

## Summary of changes

Update connect retry mechanism to support handling IO errors and
sleeping for 100ms

2023-07-12 18:40:06 +01:00
Alexander Bayandin
2c724e56e2 Merge pull request #4646 from neondatabase/releases/2023-07-06-hotfix
Release 2023-07-06 (add pg_embedding extension only)
2023-07-06 12:19:52 +01:00
Alexander Bayandin
feff887c6f Compile pg_embedding extension (#4634)
```
CREATE EXTENSION embedding;
CREATE TABLE t (val real[]);
INSERT INTO t (val) VALUES ('{0,0,0}'), ('{1,2,3}'), ('{1,1,1}'), (NULL);
CREATE INDEX ON t USING hnsw (val) WITH (maxelements = 10, dims=3, m=3);
INSERT INTO t (val) VALUES (array[1,2,4]);

SELECT * FROM t ORDER BY val <-> array[3,3,3];
   val   
---------
 {1,2,3}
 {1,2,4}
 {1,1,1}
 {0,0,0}
 
(5 rows)
```
2023-07-06 09:39:41 +01:00
Vadim Kharitonov
353d915fcf Merge pull request #4633 from neondatabase/releases/2023-07-05
Release 2023-07-05
2023-07-05 15:10:47 +02:00
Vadim Kharitonov
2e38098cbc Merge branch 'release' into releases/2023-07-05 2023-07-05 12:41:48 +02:00
Vadim Kharitonov
a6fe5ea1ac Merge pull request #4571 from neondatabase/releases/2023-06-27
Release 2023-06-27
2023-06-27 12:55:33 +02:00
Vadim Kharitonov
05b0aed0c1 Merge branch 'release' into releases/2023-06-27 2023-06-27 12:22:12 +02:00
Alex Chi Z
cd1705357d Merge pull request #4561 from neondatabase/releases/2023-06-23-hotfix
Release 2023-06-23 (pageserver-only)
2023-06-23 15:38:50 -04:00
Christian Schwarz
6bc7561290 don't use MGMT_REQUEST_RUNTIME for consumption metrics synthetic size worker
The consumption metrics synthetic size worker does logical size calculation.
Logical size calculation currently does synchronous disk IO.
This blocks the MGMT_REQUEST_RUNTIME's executor threads, starving other futures.

While there's work on the way to move the synchronous disk IO into spawn_blocking,
the quickfix here is to use the BACKGROUND_RUNTIME instead of MGMT_REQUEST_RUNTIME.

Actually it's not just a quickfix. We simply shouldn't be blocking MGMT_REQUEST_RUNTIME
executor threads on CPU or sync disk IO.
That work isn't done yet, as many of the mgmt tasks still _do_ disk IO.
But it's not as intensive as the logical size calculations that we're fixing here.

While we're at it, fix disk-usage-based eviction in a similar way.
It wasn't the culprit here, according to prod logs, but it can theoretically be
a little CPU-intensive.

More context, including graphs from Prod:
https://neondb.slack.com/archives/C03F5SM1N02/p1687541681336949

(cherry picked from commit d6e35222ea)
2023-06-23 20:54:07 +02:00
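
A sketch of the runtime split; the static's name mirrors the commit, the setup is illustrative:

```
use once_cell::sync::Lazy;
use tokio::runtime::Runtime;

static BACKGROUND_RUNTIME: Lazy<Runtime> =
    Lazy::new(|| Runtime::new().expect("create background runtime"));

fn spawn_synthetic_size_worker() {
    // IO/CPU-heavy periodic work goes on the background runtime so it
    // cannot starve the executor threads serving management requests.
    BACKGROUND_RUNTIME.spawn(async {
        // logical size calculation, which still does synchronous disk IO
    });
}
```
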
Christian Schwarz
fbd3ac14b5 Merge pull request #4544 from neondatabase/releases/2023-06-21-hotfix
Release 2023-06-21 (fixup for post-merge failed 2023-06-20)
2023-06-21 16:54:34 +03:00
Christian Schwarz
e437787c8f cargo update -p openssl (#4542)
To unblock release
https://github.com/neondatabase/neon/pull/4536#issuecomment-1600678054

Context: https://rustsec.org/advisories/RUSTSEC-2023-0044
2023-06-21 15:52:56 +03:00
Christian Schwarz
3460dbf90b Merge pull request #4536 from neondatabase/releases/2023-06-20
Release 2023-06-20 (actually 2023-06-21)
2023-06-21 14:19:14 +03:00
Vadim Kharitonov
6b89d99677 Merge pull request #4521 from neondatabase/release_2023-06-15
Release 2023 06 15
2023-06-15 17:40:01 +02:00
Vadim Kharitonov
6cc8ea86e4 Merge branch 'main' into release_2023-06-15 2023-06-15 16:50:44 +02:00
Shany Pozin
e62a492d6f Merge pull request #4486 from neondatabase/releases/2023-06-13
Release 2023-06-13
2023-06-13 15:21:35 +03:00
Alexey Kondratov
a475cdf642 [compute_ctl] Fix logging if catalog updates are skipped (#4480)
Otherwise, it wasn't clear from the log when Postgres started up
completely if catalog updates were skipped.

Follow-up for 4936ab6
2023-06-13 13:37:24 +02:00
Stas Kelvich
7002c79a47 Merge pull request #4447 from neondatabase/release_proxy_08-06-2023
Release proxy 08 06 2023
2023-06-08 21:02:54 +03:00
Vadim Kharitonov
ee6cf357b4 Merge pull request #4427 from neondatabase/releases/2023-06-06
Release 2023-06-06
2023-06-06 14:42:21 +02:00
Vadim Kharitonov
e5c2086b5f Merge branch 'release' into releases/2023-06-06 2023-06-06 12:33:56 +02:00
Shany Pozin
5f1208296a Merge pull request #4395 from neondatabase/releases/2023-06-01
Release 2023-06-01
2023-06-01 10:58:00 +03:00
Stas Kelvich
88e8e473cd Merge pull request #4345 from neondatabase/release-23-05-25-proxy
Release 23-05-25, take 3
2023-05-25 19:40:43 +03:00
Stas Kelvich
b0a77844f6 Add SQL-over-HTTP endpoint to Proxy
This commit introduces an SQL-over-HTTP endpoint in the proxy, with a JSON
response structure resembling that of the node-postgres driver. This method,
using HTTP POST, achieves smaller amortized latencies in edge setups due to
fewer round trips and an enhanced open connection reuse by the v8 engine.

This update involves several intricacies:
1. SQL injection protection: We employed the extended query protocol, modifying
   the rust-postgres driver to send queries in one roundtrip using a text
   protocol rather than binary, bypassing potential issues like those identified
   in https://github.com/sfackler/rust-postgres/issues/1030.

2. Postgres type compatibility: As not all postgres types have binary
   representations (e.g., acl's in pg_class), we adjusted rust-postgres to
   respond with text protocol, simplifying serialization and fixing queries with
   text-only types in response.

3. Data type conversion: Considering JSON supports fewer data types than
   Postgres, we perform conversions where possible, passing all other types as
   strings. Key conversions include:
   - postgres int2, int4, float4, float8 -> json number (NaN and Inf remain
     text)
   - postgres bool, null, text -> json bool, null, string
   - postgres array -> json array
   - postgres json and jsonb -> json object

4. Alignment with node-postgres: To facilitate integration with js libraries,
   we've matched the response structure of node-postgres, returning command tags
   and column oids. Command tag capturing was added to the rust-postgres
   functionality as part of this change.
2023-05-25 17:59:17 +03:00
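
A hedged sketch of the numeric corner of the mapping in point 3: finite floats become JSON numbers, while NaN and Inf remain text.

```
use serde_json::{json, Value};

fn pg_float8_to_json(v: f64) -> Value {
    if v.is_finite() {
        json!(v)
    } else {
        // serde_json has no representation for NaN/Infinity, so they
        // are passed through as strings.
        json!(v.to_string())
    }
}
```
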
Vadim Kharitonov
1baf464307 Merge pull request #4309 from neondatabase/releases/2023-05-23
Release 2023-05-23
2023-05-24 11:56:54 +02:00
Alexander Bayandin
e9b8e81cea Merge branch 'release' into releases/2023-05-23 2023-05-23 12:54:08 +01:00
Alexander Bayandin
85d6194aa4 Fix regress-tests job for Postgres 15 on release branch (#4254)
## Problem

Compatibility tests don't support Postgres 15 yet, but we're still
trying to upload the compatibility snapshot (which we do not collect).

Ref
https://github.com/neondatabase/neon/actions/runs/4991394158/jobs/8940369368#step:4:38129

## Summary of changes

Add `pg_version` parameter to `run-python-test-set` actions and do not
upload compatibility snapshot for Postgres 15
2023-05-16 17:19:12 +01:00
Vadim Kharitonov
333a7a68ef Merge pull request #4245 from neondatabase/releases/2023-05-16
Release 2023-05-16
2023-05-16 13:38:40 +02:00
Vadim Kharitonov
6aa4e41bee Merge branch 'release' into releases/2023-05-16 2023-05-16 12:48:23 +02:00
Joonas Koivunen
840183e51f try: higher page_service timeouts to isolate an issue 2023-05-11 16:24:53 +03:00
Shany Pozin
cbccc94b03 Merge pull request #4184 from neondatabase/releases/2023-05-09
Release 2023-05-09
2023-05-09 15:30:36 +03:00
Stas Kelvich
fce227df22 Merge pull request #4163 from neondatabase/main
Release 23-05-05
2023-05-05 15:56:23 +03:00
Stas Kelvich
bd787e800f Merge pull request #4133 from neondatabase/main
Release 23-04-01
2023-05-01 18:52:46 +03:00
Shany Pozin
4a7704b4a3 Merge pull request #4131 from neondatabase/sp/hotfix_adding_sks_us_west
Hotfix: Adding 4 new pageservers and two sets of safekeepers to us west 2
2023-05-01 15:17:38 +03:00
Shany Pozin
ff1119da66 Add 2 new sets of safekeepers to us-west2 2023-05-01 14:35:31 +03:00
Shany Pozin
4c3ba1627b Add 4 new Pageservers for retool launch 2023-05-01 14:34:38 +03:00
Vadim Kharitonov
1407174fb2 Merge pull request #4110 from neondatabase/vk/release_2023-04-28
Release 2023 04 28
2023-04-28 17:43:16 +02:00
Vadim Kharitonov
ec9dcb1889 Merge branch 'release' into vk/release_2023-04-28 2023-04-28 16:32:26 +02:00
Joonas Koivunen
d11d781afc revert: "Add check for duplicates of generated image layers" (#4104)
This reverts commit 732acc5.

Reverted PR: #3869

As noted in PR #4094, we do in fact try to insert duplicates to the
layer map, if L0->L1 compaction is interrupted. We do not have a proper
fix for that right now, and we are in a hurry to make a release to
production, so revert the changes related to this to the state that we
have in production currently. We know that we have a bug here, but
better to live with the bug that we've had in production for a long
time, than rush a fix to production without testing it in staging first.

Cc: #4094, #4088
2023-04-28 16:31:35 +02:00
Anastasia Lubennikova
4e44565b71 Merge pull request #4000 from neondatabase/releases/2023-04-11
Release 2023-04-11
2023-04-11 17:47:41 +03:00
Stas Kelvich
4ed51ad33b Add more proxy cnames 2023-04-11 15:59:35 +03:00
Arseny Sher
1c1ebe5537 Merge pull request #3946 from neondatabase/releases/2023-04-04
Release 2023-04-04
2023-04-04 14:38:40 +04:00
Christian Schwarz
c19cb7f386 Merge pull request #3935 from neondatabase/releases/2023-04-03
Release 2023-04-03
2023-04-03 16:19:49 +02:00
Vadim Kharitonov
4b97d31b16 Merge pull request #3896 from neondatabase/releases/2023-03-28
Release 2023-03-28
2023-03-28 17:58:06 +04:00
Shany Pozin
923ade3dd7 Merge pull request #3855 from neondatabase/releases/2023-03-21
Release 2023-03-21
2023-03-21 13:12:32 +02:00
Arseny Sher
b04e711975 Merge pull request #3825 from neondatabase/release-2023-03-15
Release 2023.03.15
2023-03-15 15:38:00 +03:00
Arseny Sher
afd0a6b39a Forward framed read buf contents to compute before proxy pass.
Otherwise they get lost. Normally the buffer is empty before proxy pass, but this is
not the case with the pipeline mode of our npm driver; fixes the connection hangup
introduced by b80fe41af3 for it.

fixes https://github.com/neondatabase/neon/issues/3822
2023-03-15 15:36:06 +04:00
Lassi Pölönen
99752286d8 Use RollingUpdate strategy also for legacy proxy (#3814)
## Describe your changes
We have previously changed the neon-proxy to use RollingUpdate. This
should be enabled in the legacy proxy too, in order to avoid breaking
connections for clients and to allow, for example, backups to run even
during deployment. (https://github.com/neondatabase/neon/pull/3683)

## Issue ticket number and link
https://github.com/neondatabase/neon/issues/3333
2023-03-15 15:35:51 +04:00
Arseny Sher
15df93363c Merge pull request #3804 from neondatabase/release-2023-03-13
Release 2023.03.13
2023-03-13 20:25:40 +03:00
Vadim Kharitonov
bc0ab741af Merge pull request #3758 from neondatabase/releases/2023-03-07
Release 2023-03-07
2023-03-07 12:38:47 +01:00
Christian Schwarz
51d9dfeaa3 Merge pull request #3743 from neondatabase/releases/2023-03-03
Release 2023-03-03
2023-03-03 19:20:21 +01:00
Shany Pozin
f63cb18155 Merge pull request #3713 from neondatabase/releases/2023-02-28
Release 2023-02-28
2023-02-28 12:52:24 +02:00
Arseny Sher
0de603d88e Merge pull request #3707 from neondatabase/release-2023-02-24
Release 2023-02-24

Hotfix for UNLOGGED tables. Contains #3706
Also contains rebase on 14.7 and 15.2 #3581
2023-02-25 00:32:11 +04:00
Heikki Linnakangas
240913912a Fix UNLOGGED tables.
Instead of trying to create missing files on the way, send init fork contents as
main fork from pageserver during basebackup. Add test for that. Call
put_rel_drop for init forks; previously they weren't removed. Bump
vendor/postgres to revert previous approach on Postgres side.

Co-authored-by: Arseny Sher <sher-ars@yandex.ru>

ref https://github.com/neondatabase/postgres/pull/264
ref https://github.com/neondatabase/postgres/pull/259
ref https://github.com/neondatabase/neon/issues/1222
2023-02-24 23:54:53 +04:00
MMeent
91a4ea0de2 Update vendored PostgreSQL versions to 14.7 and 15.2 (#3581)
## Describe your changes
Rebase vendored PostgreSQL onto 14.7 and 15.2

## Issue ticket number and link

#3579

Release note:
```
The version of PostgreSQL that we use is updated to 14.7 for PostgreSQL
14 and 15.2 for PostgreSQL 15.
```
2023-02-24 23:54:42 +04:00
Arseny Sher
8608704f49 Merge pull request #3691 from neondatabase/release-2023-02-23
Release 2023-02-23

Hotfix for the unlogged tables with indexes issue.

neondatabase/postgres#259
neondatabase/postgres#262
2023-02-23 13:39:33 +04:00
Arseny Sher
efef68ce99 Bump vendor/postgres to include hotfix for unlogged tables with indexes.
https://github.com/neondatabase/postgres/pull/259
https://github.com/neondatabase/postgres/pull/262
2023-02-23 08:49:43 +04:00
Joonas Koivunen
8daefd24da Merge pull request #3679 from neondatabase/releases/2023-02-22
Releases/2023-02-22
2023-02-22 15:56:55 +02:00
Arthur Petukhovsky
46cc8b7982 Remove safekeeper-1.ap-southeast-1.aws.neon.tech (#3671)
We migrated all timelines to
`safekeeper-3.ap-southeast-1.aws.neon.tech`, now old instance can be
removed.
2023-02-22 15:07:57 +02:00
Sergey Melnikov
38cd90dd0c Add -v to ansible invocations (#3670)
To get more debug output on failures
2023-02-22 15:07:57 +02:00
Joonas Koivunen
a51b269f15 fix: hold permit until GetObject eof (#3663)
Previously we applied the rate limiting only up to receiving the headers
from S3, or somewhere near it. This commit adds an adapter which carries
the permit until the AsyncRead has been disposed of.

fixes #3662.
2023-02-22 15:07:57 +02:00
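
A sketch of the adapter described above: bundle the rate-limiting permit with the download stream, so the permit is released only when the reader is dropped (after EOF or abandonment), not once the headers arrive.

```
use std::pin::Pin;
use std::task::{Context, Poll};
use tokio::io::{AsyncRead, ReadBuf};
use tokio::sync::OwnedSemaphorePermit;

struct PermitCarrying<R> {
    inner: R,
    _permit: OwnedSemaphorePermit, // dropped together with the stream
}

impl<R: AsyncRead + Unpin> AsyncRead for PermitCarrying<R> {
    fn poll_read(
        mut self: Pin<&mut Self>,
        cx: &mut Context<'_>,
        buf: &mut ReadBuf<'_>,
    ) -> Poll<std::io::Result<()>> {
        Pin::new(&mut self.inner).poll_read(cx, buf)
    }
}
```
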
Joonas Koivunen
43bf6d0a0f calculate_logical_size: no longer use spawn_blocking (#3664)
Calculation of logical size is now async because of layer downloads, so
we shouldn't use spawn_blocking for it. Use of `spawn_blocking`
exhausted resources which are needed by `tokio::io::copy` when copying
from a stream to a file, which led to a deadlock.

Fixes: #3657
2023-02-22 15:07:57 +02:00
Joonas Koivunen
15273a9b66 chore: ignore all compaction inactive tenant errors (#3665)
These are happening in tests because of #3655, but they sure took some
time to appear.

Makes the `Compaction failed, retrying in 2s: Cannot run compaction
iteration on inactive tenant` error a globally allowed one, because it
has been seen failing on different test cases.
2023-02-22 15:07:57 +02:00
Joonas Koivunen
78aca668d0 fix: log download failed error (#3661)
Fixes #3659
2023-02-22 15:07:57 +02:00
Vadim Kharitonov
acbf4148ea Merge pull request #3656 from neondatabase/releases/2023-02-21
Release 2023-02-21
2023-02-21 16:03:48 +01:00
Vadim Kharitonov
6508540561 Merge branch 'release' into releases/2023-02-21 2023-02-21 15:31:16 +01:00
Arthur Petukhovsky
a41b5244a8 Add new safekeeper to ap-southeast-1 prod (#3645) (#3646)
To trigger deployment of #3645 to production.
2023-02-20 15:22:49 +00:00
Shany Pozin
2b3189be95 Merge pull request #3600 from neondatabase/releases/2023-02-14
Release 2023-02-14
2023-02-15 13:31:30 +02:00
Vadim Kharitonov
248563c595 Merge pull request #3553 from neondatabase/releases/2023-02-07
Release 2023-02-07
2023-02-07 14:07:44 +01:00
Vadim Kharitonov
14cd6ca933 Merge branch 'release' into releases/2023-02-07 2023-02-07 12:11:56 +01:00
Vadim Kharitonov
eb36403e71 Release 2023 01 31 (#3497)
Co-authored-by: Kirill Bulatov <kirill@neon.tech>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech>
Co-authored-by: bojanserafimov <bojan.serafimov7@gmail.com>
Co-authored-by: Christian Schwarz <christian@neon.tech>
Co-authored-by: Alexey Kondratov <kondratov.aleksey@gmail.com>
Co-authored-by: Joonas Koivunen <joonas@neon.tech>
Co-authored-by: Konstantin Knizhnik <knizhnik@garret.ru>
Co-authored-by: Shany Pozin <shany@neon.tech>
Co-authored-by: Sergey Melnikov <sergey@neon.tech>
Co-authored-by: Dmitry Rodionov <dmitry@neon.tech>
Co-authored-by: Rory de Zoete <33318916+zoete@users.noreply.github.com>
Co-authored-by: Rory de Zoete <rdezoete@Rorys-Mac-Studio.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>
Co-authored-by: Lassi Pölönen <lassi.polonen@iki.fi>
2023-01-31 15:06:35 +02:00
Anastasia Lubennikova
3c6f779698 Merge pull request #3411 from neondatabase/release_2023_01_23
Fix Release 2023 01 23
2023-01-23 20:10:03 +02:00
Joonas Koivunen
f67f0c1c11 More tenant size fixes (#3410)
Small changes, but hopefully this will help with the panic detected in
staging, for which we cannot get the debugging information right now
(end-of-branch before branch-point).
2023-01-23 17:46:13 +02:00
Shany Pozin
edb02d3299 Adding pageserver3 to staging (#3403) 2023-01-23 17:46:13 +02:00
Konstantin Knizhnik
664a69e65b Fix slru_segment_key_range function: segno was assigned to incorrect Key field (#3354) 2023-01-23 17:46:13 +02:00
Anastasia Lubennikova
478322ebf9 Fix tenant size orphans (#3377)
Before, only the timelines which had passed the `gc_horizon` were
processed, which failed with orphans at the tree_sort phase. Example
input is in the added `test_branched_empty_timeline_size` test case.

The PR changes iteration to happen through all timelines, and in
addition to that, any learned branch points will be calculated as they
would have been in the original implementation if the ancestor branch had
been over the `gc_horizon`.

This also changes how tenants where all timelines are below `gc_horizon`
are handled. Previously tenant_size 0 was returned, but now they will
have approximately `initdb_lsn` worth of tenant_size.

The PR also adds several new tenant size tests that describe various corner
cases of branching structure and `gc_horizon` setting.
They are currently disabled to not consume time during CI.

Co-authored-by: Joonas Koivunen <joonas@neon.tech>
Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech>
2023-01-23 17:46:13 +02:00
Joonas Koivunen
802f174072 fix: dont stop pageserver if we fail to calculate synthetic size 2023-01-23 17:46:13 +02:00
Alexey Kondratov
47f9890bae [compute_ctl] Make role deletion spec processing idempotent (#3380)
Previously, we were trying to re-assign owned objects of the already
deleted role. This was causing a crash loop in the case when compute
was restarted with a spec that includes a delta operation for role
deletion. To avoid such cases, check that the role is still present before
calling `reassign_owned_objects`.

Resolves neondatabase/cloud#3553
2023-01-23 17:46:13 +02:00
Christian Schwarz
262265daad Revert "Use actual temporary dir for pageserver unit tests"
This reverts commit 826e89b9ce.

The problem with that commit was that it deletes the TempDir while
there are still EphemeralFile instances open.

At first I thought this could be fixed by simply adding

  Handle::current().block_on(task_mgr::shutdown(None, Some(tenant_id), None))

to TenantHarness::drop, but it turned out to be insufficient.

So, reverting the commit until we find a proper solution.

refs https://github.com/neondatabase/neon/issues/3385
2023-01-23 17:46:13 +02:00
bojanserafimov
300da5b872 Improve layer map docstrings (#3382) 2023-01-23 17:46:13 +02:00
Heikki Linnakangas
7b22b5c433 Switch to 'tracing' for logging, restructure code to make use of spans.
Refactors Compute::prepare_and_run. It's split into subroutines
differently, to make it easier to attach tracing spans to the
different stages. The high-level logic for waiting for Postgres to
exit is moved to the caller.

Replace 'env_logger' with 'tracing', and add `#instrument` directives
to different stages of the startup process. This is a fairly
mechanical change, except for the changes in 'spec.rs'. 'spec.rs'
contained some complicated formatting, where parts of log messages
were printed directly to stdout with `print`s. That was a bit messed
up because the log normally goes to stderr, but those lines were
printed to stdout. In our docker images, stderr and stdout both go to
the same place so you wouldn't notice, but I don't think it was
intentional.

This changes the log format to the default
'tracing_subscriber::format' format. It's different from the Postgres
log format, however, and because both compute_tools and Postgres print
to the same log, it's now a mix of two different formats.  I'm not
sure how the Grafana log parsing pipeline can handle that. If it's a
problem, we can build custom formatter to change the compute_tools log
format to be the same as Postgres's, like it was before this commit,
or we can change the Postgres log format to match tracing_formatter's,
or we can start printing compute_tool's log output to a different
destination than Postgres
2023-01-23 17:46:12 +02:00
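
A sketch of the pattern adopted here, assuming default `tracing_subscriber` setup: install the fmt subscriber once at startup, then mark each startup stage with `#[instrument]` so its log lines carry the stage's span.

```
use tracing::instrument;

fn init_logging() {
    tracing_subscriber::fmt().init();
}

#[instrument(skip_all)]
fn apply_spec() {
    tracing::info!("applying compute spec");
}
```
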
Kirill Bulatov
ffca97bc1e Enable logs in unit tests 2023-01-23 17:46:12 +02:00
Kirill Bulatov
cb356f3259 Use actual temporary dir for pageserver unit tests 2023-01-23 17:46:12 +02:00
Vadim Kharitonov
c85374295f Change SENTRY_ENVIRONMENT from "development" to "staging" 2023-01-23 17:46:12 +02:00
Anastasia Lubennikova
4992160677 Fix metric_collection_endpoint for prod.
It was incorrectly set to the staging URL.
2023-01-23 17:46:12 +02:00
Heikki Linnakangas
bd535b3371 If an error happens while checking for core dumps, don't panic.
If we panic, we skip the 30s wait in 'main' and don't give the
console a chance to observe the error, which is not nice.

Spotted by @ololobus at
https://github.com/neondatabase/neon/pull/3352#discussion_r1072806981
2023-01-23 17:46:12 +02:00
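A minimal sketch of the pattern (illustrative, not the actual compute code):

```
use std::{thread, time::Duration};

fn check_core_dumps() -> anyhow::Result<()> {
    // ... scan the data directory for core files ...
    Ok(())
}

fn before_exit() {
    if let Err(e) = check_core_dumps() {
        // Previously a panic here skipped the 30s wait below, so the
        // console never saw the error.
        eprintln!("error while checking for core dumps: {e:#}");
    }
    thread::sleep(Duration::from_secs(30));
}
```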
Kirill Bulatov
d90c5a03af Add more io::Error context when fail to operate on a path (#3254)
I have a test failure that shows 

```
Caused by:
    0: Failed to reconstruct a page image:
    1: Directory not empty (os error 39)
```

but does not really show where exactly that happens.

https://neon-github-public-dev.s3.amazonaws.com/reports/pr-3227/release/3823785365/index.html#categories/c0057473fc9ec8fb70876fd29a171ce8/7088dab272f2c7b7/?attachment=60fe6ed2add4d82d

The PR aims to add more context in debugging that issue.
2023-01-23 17:46:12 +02:00
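The pattern is the usual anyhow one: attach the path to the io::Error so the failure names the directory involved. A minimal sketch (the function is illustrative, not the actual pageserver code):

```
use std::{fs, path::Path};
use anyhow::{Context, Result};

fn remove_timeline_dir(path: &Path) -> Result<()> {
    // Without the context, all we'd see is "Directory not empty (os error 39)".
    fs::remove_dir(path)
        .with_context(|| format!("Failed to remove directory {path:?}"))?;
    Ok(())
}
```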
Anastasia Lubennikova
2d02cc9079 Merge pull request #3365 from neondatabase/main
Release 2023-01-17
2023-01-17 16:41:34 +02:00
Christian Schwarz
49ad94b99f Merge pull request #3301 from neondatabase/release-2023-01-10
Release 2023-01-10
2023-01-10 16:42:26 +01:00
Christian Schwarz
948a217398 Merge commit '95bf19b85a06b27a7fc3118dee03d48648efab15' into release-2023-01-10
Conflicts:
        .github/helm-values/neon-stress.proxy-scram.yaml
        .github/helm-values/neon-stress.proxy.yaml
        .github/helm-values/staging.proxy-scram.yaml
        .github/helm-values/staging.proxy.yaml
        All of the above were deleted in `main` after we hotfixed them
        in `release`. Deleting them here.
        storage_broker/src/bin/storage_broker.rs
        The hotfix toned down logging, but `main` has since implemented
        a proper fix. Taken `main`'s side, see
        https://neondb.slack.com/archives/C033RQ5SPDH/p1673354385387479?thread_ts=1673354306.474729&cid=C033RQ5SPDH

closes https://github.com/neondatabase/neon/issues/3287
2023-01-10 15:40:14 +01:00
Dmitry Rodionov
125381eae7 Merge pull request #3236 from neondatabase/dkr/retrofit-sk4-sk4-change
Move zenith-1-sk-3 to zenith-1-sk-4 (#3164)
2022-12-30 14:13:50 +03:00
Arthur Petukhovsky
cd01bbc715 Move zenith-1-sk-3 to zenith-1-sk-4 (#3164) 2022-12-30 12:32:52 +02:00
Dmitry Rodionov
d8b5e3b88d Merge pull request #3229 from neondatabase/dkr/add-pageserver-for-release
add pageserver to new region see https://github.com/neondatabase/aws/pull/116

decrease log volume for pageserver
2022-12-30 12:34:04 +03:00
Dmitry Rodionov
06d25f2186 switch to debug from info to produce less noise 2022-12-29 17:48:47 +02:00
Dmitry Rodionov
f759b561f3 add pageserver to new region see https://github.com/neondatabase/aws/pull/116 2022-12-29 17:17:35 +02:00
Sergey Melnikov
ece0555600 Push proxy metrics to Victoria Metrics (#3106) 2022-12-16 14:44:49 +02:00
Joonas Koivunen
73ea0a0b01 fix(remote_storage): use cached credentials (#3128)
IMDSv2 has limits, and if we query it on every s3 interaction we are
going to go over those limits. Changes the s3_bucket client
configuration to use:
- ChainCredentialsProvider to handle env variables or imds usage
- LazyCachingCredentialsProvider to actually cache any credentials

Related: https://github.com/awslabs/aws-sdk-rust/issues/629
Possibly related: https://github.com/neondatabase/neon/issues/3118
2022-12-16 14:44:49 +02:00
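The idea, stripped of the aws-config types (a generic Rust sketch; this is not the aws-sdk API, which uses the providers named above):

```
use std::time::{Duration, Instant};

// Fetch credentials once and reuse them until the TTL passes, so IMDS
// is only queried on a cache miss instead of on every S3 call.
struct CachedCreds {
    cached: Option<(Instant, String)>,
    ttl: Duration,
}

impl CachedCreds {
    fn get(&mut self, fetch: impl FnOnce() -> String) -> String {
        match &self.cached {
            Some((at, creds)) if at.elapsed() < self.ttl => creds.clone(),
            _ => {
                let creds = fetch(); // env vars or IMDS, only on a miss
                self.cached = Some((Instant::now(), creds.clone()));
                creds
            }
        }
    }
}
```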
Arseny Sher
d8f6d6fd6f Merge pull request #3126 from neondatabase/broker-lb-release
Deploy broker with L4 LB in new env.
2022-12-16 01:25:28 +03:00
Arseny Sher
d24de169a7 Deploy broker with L4 LB in new env.
Seems to fix an issue with missing keepalives. 2022-12-16 01:45:32 +04:00
2022-12-16 01:45:32 +04:00
Arseny Sher
0816168296 Hotfix: terminate subscription if channel is full.
Might help as a hotfix, but we need to understand the root cause better.
2022-12-15 12:23:56 +03:00
Dmitry Rodionov
277b44d57a Merge pull request #3102 from neondatabase/main
Hotfix. See commits for details
2022-12-14 19:38:43 +03:00
MMeent
68c2c3880e Merge pull request #3038 from neondatabase/main
Release 22-12-14
2022-12-14 14:35:47 +01:00
Arthur Petukhovsky
49da498f65 Merge pull request #2833 from neondatabase/main
Release 2022-11-16
2022-11-17 08:44:10 +01:00
Stas Kelvich
2c76ba3dd7 Merge pull request #2718 from neondatabase/main-rc-22-10-28
Release 22-10-28
2022-10-28 20:33:56 +03:00
Arseny Sher
dbe3dc69ad Merge branch 'main' into main-rc-22-10-28
Release 22-10-28.
2022-10-28 19:10:11 +04:00
Arseny Sher
8e5bb3ed49 Enable etcd compaction in neon_local. 2022-10-27 12:53:20 +03:00
Stas Kelvich
ab0be7b8da Avoid debian-testing packages in compute Dockerfiles
plv8 can only be built with a fairly new gold linker version. We used to install
it via binutils packages from testing, but that also updates libc, which causes
trouble in the resulting image as different extensions were built against
different libc versions. We could either use libc from debian-testing everywhere
or refrain from using testing packages and install the necessary programs manually.
This patch uses the latter approach: gold for plv8 and cmake for h3 are
installed manually.

In passing, declare h3_postgis as a safe extension (previously omitted).
2022-10-27 12:53:20 +03:00
bojanserafimov
b4c55f5d24 Move pagestream api to libs/pageserver_api (#2698) 2022-10-27 12:53:20 +03:00
mikecaat
ede70d833c Add a docker-compose example file (#1943) (#2666)
Co-authored-by: Masahiro Ikeda <masahiro.ikeda.us@hco.ntt.co.jp>
2022-10-27 12:53:20 +03:00
Sergey Melnikov
70c3d18bb0 Do not release to new staging proxies on release (#2685) 2022-10-27 12:53:20 +03:00
bojanserafimov
7a491f52c4 Add draw_timeline binary (#2688) 2022-10-27 12:53:20 +03:00
Alexander Bayandin
323c4ecb4f Add data format backward compatibility tests (#2626) 2022-10-27 12:53:20 +03:00
Anastasia Lubennikova
3d2466607e Merge pull request #2692 from neondatabase/main-rc
Release 2022-10-25
2022-10-25 18:18:58 +03:00
Anastasia Lubennikova
ed478b39f4 Merge branch 'release' into main-rc 2022-10-25 17:06:33 +03:00
Stas Kelvich
91585a558d Merge pull request #2678 from neondatabase/stas/hotfix_schema
Hotfix to disable grant create on public schema
2022-10-22 02:54:31 +03:00
Stas Kelvich
93467eae1f Hotfix to disable grant create on public schema
`GRANT CREATE ON SCHEMA public` fails if there is no schema `public`.
Disable it in release for now and make a better fix later (it is
needed for v15 support).
2022-10-22 02:26:28 +03:00
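One shape the "better fix" could take is a guarded grant (a hypothetical Rust sketch; the actual hotfix simply disabled the statement, and `web_access` as the grantee is an assumption):

```
use anyhow::Result;
use postgres::Client;

fn grant_public_create(client: &mut Client) -> Result<()> {
    let has_public = client
        .query_opt("SELECT 1 FROM pg_namespace WHERE nspname = 'public'", &[])?
        .is_some();
    if has_public {
        // The schema exists, so the GRANT cannot fail on a missing schema.
        client.batch_execute("GRANT CREATE ON SCHEMA public TO web_access")?;
    }
    Ok(())
}
```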
Stas Kelvich
f3aac81d19 Merge pull request #2668 from neondatabase/main
Release 2022-10-21
2022-10-21 15:21:42 +03:00
Stas Kelvich
979ad60c19 Merge pull request #2581 from neondatabase/main
Release 2022-10-07
2022-10-07 16:50:55 +03:00
Stas Kelvich
9316cb1b1f Merge pull request #2573 from neondatabase/main
Release 2022-10-06
2022-10-07 11:07:06 +03:00
Anastasia Lubennikova
e7939a527a Merge pull request #2377 from neondatabase/main
Release 2022-09-01
2022-09-01 20:20:44 +03:00
Arthur Petukhovsky
36d26665e1 Merge pull request #2299 from neondatabase/main
* Check for entire range during sasl validation (#2281)

* Gen2 GH runner (#2128)

* Re-add rustup override

* Try s3 bucket

* Set git version

* Use v4 cache key to prevent problems

* Switch to v5 for key

* Add second rustup fix

* Rebase

* Add kaniko steps

* Fix typo and set compress level

* Disable global run default

* Specify shell for step

* Change approach with kaniko

* Try less verbose shell spec

* Add submodule pull

* Add promote step

* Adjust dependency chain

* Try default swap again

* Use env

* Don't override aws key

* Make kaniko build conditional

* Specify runs on

* Try without dependency link

* Try soft fail

* Use image with git

* Try passing to next step

* Fix duplicate

* Try other approach

* Try other approach

* Fix typo

* Try other syntax

* Set env

* Adjust setup

* Try step 1

* Add link

* Try global env

* Fix mistake

* Debug

* Try other syntax

* Try other approach

* Change order

* Move output one step down

* Put output up one level

* Try other syntax

* Skip build

* Try output

* Re-enable build

* Try other syntax

* Skip middle step

* Update check

* Try first step of dockerhub push

* Update needs dependency

* Try explicit dir

* Add missing package

* Try other approach

* Try other approach

* Specify region

* Use with

* Try other approach

* Add debug

* Try other approach

* Set region

* Follow AWS example

* Try github approach

* Skip Qemu

* Try stdin

* Missing steps

* Add missing close

* Add echo debug

* Try v2 endpoint

* Use v1 endpoint

* Try without quotes

* Revert

* Try crane

* Add debug

* Split steps

* Fix duplicate

* Add shell step

* Conform to options

* Add verbose flag

* Try single step

* Try workaround

* First request fails hunch

* Try bullseye image

* Try other approach

* Adjust verbose level

* Try previous step

* Add more debug

* Remove debug step

* Remove rogue indent

* Try with larger image

* Add build tag step

* Update workflow for testing

* Add tag step for test

* Remove unused

* Update dependency chain

* Add ownership fix

* Use matrix for promote

* Force update

* Force build

* Remove unused

* Add new image

* Add missing argument

* Update dockerfile copy

* Update Dockerfile

* Update clone

* Update dockerfile

* Go to correct folder

* Use correct format

* Update dockerfile

* Remove cd

* Debug find where we are

* Add debug on first step

* Changedir to postgres

* Set workdir

* Use v1 approach

* Use other dependency

* Try other approach

* Try other approach

* Update dockerfile

* Update approach

* Update dockerfile

* Update approach

* Update dockerfile

* Update dockerfile

* Add workspace hack

* Update Dockerfile

* Update Dockerfile

* Update Dockerfile

* Change last step

* Cleanup pull in prep for review

* Force build images

* Add condition for latest tagging

* Use pinned version

* Try without name value

* Remove more names

* Shorten names

* Add kaniko comments

* Pin kaniko

* Pin crane and ecr helper

* Up one level

* Switch to pinned tag for rust image

* Force update for test

Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@b04468bf-cdf4-41eb-9c94-aff4ca55e4bf.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@Rorys-Mac-Studio.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@4795e9ee-4f32-401f-85f3-f316263b62b8.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@2f8bc4e5-4ec2-4ea2-adb1-65d863c4a558.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@27565b2b-72d5-4742-9898-a26c9033e6f9.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@ecc96c26-c6c4-4664-be6e-34f7c3f89a3c.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@7caff3a5-bf03-4202-bd0e-f1a93c86bdae.fritz.box>

* Add missing step output, revert one deploy step (#2285)

* Add missing step output, revert one deploy step

* Conform to syntax

* Update approach

* Add missing value

* Add missing needs

Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>

* Error for fatal not git repo (#2286)

Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>

* Use main, not branch for ref check (#2288)

* Use main, not branch for ref check

* Add more debug

* Count main, not head

* Try new approach

* Conform to syntax

* Update approach

* Get full history

* Skip checkout

* Cleanup debug

* Remove more debug

Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>

* Fix docker zombie process issue (#2289)

* Fix docker zombie process issue

* Init everywhere

Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>

* Fix 1.63 clippy lints (#2282)

* split out timeline metrics, track layer map loading and size calculation

* reset rust cache for clippy run to avoid an ICE

additionally remove trailing whitespaces

* Rename pg_control_ffi.h to bindgen_deps.h, for clarity.

The pg_control_ffi.h name implies that it only includes stuff related to
pg_control.h. That's mostly true currently, but really the point of the
file is to include everything that we need to generate Rust definitions
from.

* Make local mypy behave like CI mypy (#2291)

* Fix flaky pageserver restarts in tests (#2261)

* Remove extra type aliases (#2280)

* Update cachepot endpoint (#2290)

* Update cachepot endpoint

* Update dockerfile & remove env

* Update image building process

* Cannot use metadata endpoint for this

* Update workflow

* Conform to kaniko syntax

* Update syntax

* Update approach

* Update dockerfiles

* Force update

* Update dockerfiles

* Update dockerfile

* Cleanup dockerfiles

* Update s3 test location

* Revert s3 experiment

* Add more debug

* Specify aws region

* Remove debug, add prefix

* Remove one more debug

Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>

* workflows/benchmarking: increase timeout (#2294)

* Rework `init` in pageserver CLI  (#2272)

* Do not create initial tenant and timeline (adjust Python tests for that)
* Rework config handling during init, add --update-config to manage local config updates

* Fix: Always build images (#2296)

* Always build images

* Remove unused

Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>

* Move auto-generated 'bindings' to a separate inner module.

Re-export only things that are used by other modules.

In the future, I'm imagining that we run bindgen twice, for Postgres
v14 and v15. The two sets of bindings would go into separate
'bindings_v14' and 'bindings_v15' modules.

Rearrange postgres_ffi modules.

Move function, to avoid Postgres version dependency in timelines.rs
Move function to generate a logical-message WAL record to postgres_ffi.

* fix cargo test

* Fix walreceiver and safekeeper bugs (#2295)

- There was an issue with zero commit_lsn `reason: LaggingWal { current_commit_lsn: 0/0, new_commit_lsn: 1/6FD90D38, threshold: 10485760 } }`. The problem was in `send_wal.rs`, where we initialized `end_pos = Lsn(0)` and in some cases sent it to the pageserver.
- IDENTIFY_SYSTEM previously returned `flush_lsn` as the physical end of WAL. Now it returns `flush_lsn` (as it was) to walproposer and `commit_lsn` to everyone else, including the pageserver.
- There was an issue with backoff where the connection was cancelled right after initialization: `connected!` -> `safekeeper_handle_db: Connection cancelled` -> `Backoff: waiting 3 seconds`. The problem was in sleeping before establishing the connection. This is fixed by reworking the retry logic.
- There was an issue with getting the `NoKeepAlives` reason in a loop. The issue is probably the same as the previous one.
- There was an issue with filtering safekeepers based on retry attempts, which could filter some safekeepers out indefinitely. This is fixed by using a retry cooldown duration instead of retry attempts.
- Some `send_wal.rs` connections failed with errors without context. This is fixed by adding the timeline to safekeeper errors.

New retry logic works like this (see the sketch after this merge entry):
- Every candidate has a `next_retry_at` timestamp and is not considered for connection until that moment
- When walreceiver connection is closed, we update `next_retry_at` using exponential backoff, increasing the cooldown on every disconnect.
- When `last_record_lsn` was advanced using the WAL from the safekeeper, we reset the retry cooldown and exponential backoff, allowing walreceiver to reconnect to the same safekeeper instantly.

* on safekeeper registration pass availability zone param (#2292)

Co-authored-by: Kirill Bulatov <kirill@neon.tech>
Co-authored-by: Rory de Zoete <33318916+zoete@users.noreply.github.com>
Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@b04468bf-cdf4-41eb-9c94-aff4ca55e4bf.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@Rorys-Mac-Studio.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@4795e9ee-4f32-401f-85f3-f316263b62b8.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@2f8bc4e5-4ec2-4ea2-adb1-65d863c4a558.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@27565b2b-72d5-4742-9898-a26c9033e6f9.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@ecc96c26-c6c4-4664-be6e-34f7c3f89a3c.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@7caff3a5-bf03-4202-bd0e-f1a93c86bdae.fritz.box>
Co-authored-by: Dmitry Rodionov <dmitry@neon.tech>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
Co-authored-by: bojanserafimov <bojan.serafimov7@gmail.com>
Co-authored-by: Alexander Bayandin <alexander@neon.tech>
Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech>
Co-authored-by: Anton Galitsyn <agalitsyn@users.noreply.github.com>
2022-08-18 15:32:33 +03:00
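The retry rule from "Fix walreceiver and safekeeper bugs (#2295)" above, as a minimal Rust sketch (the type and constants are illustrative, not the pageserver's own):

```
use std::time::{Duration, Instant};

const BASE: Duration = Duration::from_secs(1);
const MAX_COOLDOWN: Duration = Duration::from_secs(60);

// Illustrative sketch of the cooldown logic; not the actual walreceiver code.
struct Candidate {
    next_retry_at: Option<Instant>,
    retries: u32,
}

impl Candidate {
    // A safekeeper is considered for connection only after its cooldown.
    fn eligible(&self, now: Instant) -> bool {
        self.next_retry_at.map_or(true, |t| now >= t)
    }

    // On disconnect, back off exponentially (capped), instead of
    // filtering the safekeeper out by attempt count.
    fn on_disconnect(&mut self, now: Instant) {
        let cooldown = (BASE * 2u32.pow(self.retries.min(6))).min(MAX_COOLDOWN);
        self.next_retry_at = Some(now + cooldown);
        self.retries = self.retries.saturating_add(1);
    }

    // When last_record_lsn advances using this safekeeper's WAL, reset
    // the backoff so we may reconnect to it instantly.
    fn on_progress(&mut self) {
        self.next_retry_at = None;
        self.retries = 0;
    }
}
```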
Arthur Petukhovsky
873347f977 Merge pull request #2275 from neondatabase/main
* github/workflows: Fix git dubious ownership (#2223)

* Move relation size cache from WalIngest to DatadirTimeline (#2094)

* Move relation size cache to layered timeline

* Fix obtaining current LSN for relation size cache

* Resolve merge conflicts

* Resolve merge conflicts

* Restore 'lsn' field in DatadirModification

* adjust DatadirModification lsn in ingest_record

* Fix formatting

* Pass lsn to get_relsize

* Fix merge conflict

* Update pageserver/src/pgdatadir_mapping.rs

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>

* Update pageserver/src/pgdatadir_mapping.rs

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>

* refactor: replace lazy-static with once-cell (#2195)

- Replacing all the occurrences of lazy-static with `once-cell::sync::Lazy`
- fixes #1147

Signed-off-by: Ankur Srivastava <best.ankur@gmail.com>

* Add more buckets to pageserver latency metrics (#2225)

* ignore record property warning to fix benchmarks

* increase statement timeout

* use event so it fires only if workload thread successfully finished

* remove debug log

* increase timeout to pass test with real s3

* avoid duplicate parameter, increase timeout

* Major migration script (#2073)

This script can be used to migrate a tenant across breaking storage versions or (in the future) to upgrade postgres versions. See the comment at the top for an overview.

Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech>

* Fix etcd typos

* Fix links to safekeeper protocol docs. (#2188)

safekeeper/README_PROTO.md was moved to docs/safekeeper-protocol.md in
commit 0b14fdb078, as part of reorganizing the docs into 'mdbook' format.

Fixes issue #1475. Thanks to @banks for spotting the outdated references.

In addition to fixing the above issue, this patch also fixes other broken links as a result of 0b14fdb078. See https://github.com/neondatabase/neon/pull/2188#pullrequestreview-1055918480.

Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
Co-authored-by: Thang Pham <thang@neon.tech>

* Update CONTRIBUTING.md

* Update CONTRIBUTING.md

* support node id and remote storage params in docker_entrypoint.sh

* Safe truncate (#2218)

* Move relation size cache to layered timeline

* Fix obtaining current LSN for relation size cache

* Resolve merge conflicts

* Resolve merge conflicts

* Restore 'lsn' field in DatadirModification

* adjust DatadirModification lsn in ingest_record

* Fix formatting

* Pass lsn to get_relsize

* Fix merge conflict

* Update pageserver/src/pgdatadir_mapping.rs

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>

* Update pageserver/src/pgdatadir_mapping.rs

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>

* Check if relation exists before trying to truncate it

refs #1932

* Add test reproducing FSM truncate problem

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>

* Fix exponential backoff values

* Update `vendor/postgres` back; it was changed accidentally. (#2251)

Commit 4227cfc96e accidentally reverted vendor/postgres to an older
version. Update it back.

* Add pageserver checkpoint_timeout option.

To flush the in-memory layer eventually when no new data arrives, which helps
safekeepers to suspend activity (stop pushing to the broker). The default of
10m should be ok.

* Share exponential backoff code and fix logic for delete task failure (#2252)

* Fix bug when import large (>1GB) relations (#2172)

Resolves #2097 

- use timeline modification's `lsn` and timeline's `last_record_lsn` to determine the corresponding LSN to query data in `DatadirModification::get`
- update `test_import_from_pageserver`. Split the test into 2 variants: `small` and `multisegment`. 
  + `small` is the old test
  + `multisegment` is to simulate #2097 by using a larger number of inserted rows to create multiple segment files of a relation. `multisegment` is configured to only run with a `release` build

* Fix timeline physical size flaky tests (#2244)

Resolves #2212.

- use `wait_for_last_flush_lsn` in `test_timeline_physical_size_*` tests

## Context
Need to wait for the pageserver to catch up with the compute's last flush LSN because during the timeline physical size API call, it's possible that there are running `LayerFlushThread` threads. These threads flush new layers to disk and hence update the physical size. This results in a mismatch between the physical size reported by the API and the actual physical size on disk.

### Note
The `LayerFlushThread` threads are processed **concurrently**, so it's possible that the above error still persists even with this patch. However, making the tests wait to finish processing all the WALs (not flushing) before calculating the physical size should help reduce the "flakiness" significantly

* postgres_ffi/waldecoder: validate more header fields

* postgres_ffi/waldecoder: remove unused startlsn

* postgres_ffi/waldecoder: introduce explicit `enum State`

Previously the state was emulated with a combination of nullable fields.
This change should make the logic more readable (see the sketch after
this merge entry).

* disable `test_import_from_pageserver_multisegment` (#2258)

This test is failing consistently on `main` now. It's better to temporarily disable it to avoid blocking others' PRs while investigating the root cause of the test failure.

See: #2255, #2256

* get_binaries uses DOCKER_TAG taken from docker image build step (#2260)

* [proxy] Rework wire format of the password hack and some errors (#2236)

The new format has a few benefits: it's shorter, simpler, and
human-readable as well. We don't use base64 anymore, since
URL encoding has us covered.

We also show a better error in case we couldn't parse the
payload; the users should know it's all about passing the
correct project name.

* test_runner/pg_clients: collect docker logs (#2259)

* get_binaries script fix (#2263)

* get_binaries uses DOCKER_TAG taken from docker image build step

* remove docker tag discovery at all and fix get_binaries for version variable

* Better storage sync logs (#2268)

* Find end of WAL on safekeepers using WalStreamDecoder.

We could have done it inside wal_storage.rs, but taking into account that
 - wal_storage.rs reading is async
 - we don't need s3 here
 - error handling is different; error during decoding is normal
I decided to put it separately.

Test
cargo test test_find_end_of_wal_last_crossing_segment
prepared earlier by @yeputons passes now.

Fixes https://github.com/neondatabase/neon/issues/544
      https://github.com/neondatabase/cloud/issues/2004
Supersedes https://github.com/neondatabase/neon/pull/2066

* Improve walreceiver logic (#2253)

This patch makes walreceiver logic more complicated, but it should work better in most cases. Added `test_wal_lagging` to test scenarios where alive safekeepers can lag behind other alive safekeepers.

- There was a bug where the check `etcd_info.timeline.commit_lsn > Some(self.local_timeline.get_last_record_lsn())` filtered out all safekeepers in some strange cases. I removed this filter; it should probably help with #2237
- Now walreceiver_connection reports status, including commit_lsn. This allows keeping safekeeper connection even when etcd is down.
- Safekeeper connection now fails if pageserver doesn't receive safekeeper messages for some time. Usually safekeeper sends messages at least once per second.
- `LaggingWal` check now uses `commit_lsn` directly from safekeeper. This fixes the issue with often reconnects, when compute generates WAL really fast.
- `NoWalTimeout` is rewritten to trigger only when we know about new WAL and the connected safekeeper doesn't stream any WAL. This allows setting a small `lagging_wal_timeout` because it will trigger only when we observe that the connected safekeeper is stuck.

* increase timeout in wait_for_upload to avoid spurious failures when testing with real s3

* Bump vendor/postgres to include XLP_FIRST_IS_CONTRECORD fix. (#2274)

* Set up a workflow to run pgbench against captest (#2077)

Signed-off-by: Ankur Srivastava <best.ankur@gmail.com>
Co-authored-by: Alexander Bayandin <alexander@neon.tech>
Co-authored-by: Konstantin Knizhnik <knizhnik@garret.ru>
Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>
Co-authored-by: Ankur Srivastava <ansrivas@users.noreply.github.com>
Co-authored-by: bojanserafimov <bojan.serafimov7@gmail.com>
Co-authored-by: Dmitry Rodionov <dmitry@neon.tech>
Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech>
Co-authored-by: Kirill Bulatov <kirill@neon.tech>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
Co-authored-by: Thang Pham <thang@neon.tech>
Co-authored-by: Stas Kelvich <stas.kelvich@gmail.com>
Co-authored-by: Arseny Sher <sher-ars@yandex.ru>
Co-authored-by: Egor Suvorov <egor@neon.tech>
Co-authored-by: Andrey Taranik <andrey@cicd.team>
Co-authored-by: Dmitry Ivanov <ivadmi5@gmail.com>
2022-08-15 21:30:45 +03:00
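The waldecoder refactor mentioned above ("introduce explicit `enum State`"), sketched in Rust (illustrative names, not the actual postgres_ffi types):

```
// Two nullable fields that together encoded "where are we in the record"
// collapse into one enum, making invalid combinations unrepresentable.
enum State {
    // Waiting for enough bytes to parse the next record header.
    WaitingForRecord,
    // Header parsed: accumulating `total_len` bytes of record body,
    // which may span several WAL pages.
    ReassemblingRecord { total_len: u32, collected: u32 },
}
```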
Arthur Petukhovsky
e814ac16f9 Merge pull request #2219 from neondatabase/main
Release 2022-08-04
2022-08-04 20:06:34 +03:00
Heikki Linnakangas
ad3055d386 Merge pull request #2203 from neondatabase/release-uuid-ossp
Deploy new storage and compute version to production

Release 2022-08-02
2022-08-02 15:08:14 +03:00
Heikki Linnakangas
94e03eb452 Merge remote-tracking branch 'origin/main' into 'release'
Release 2022-08-01
2022-08-02 12:43:49 +03:00
Sergey Melnikov
380f26ef79 Merge pull request #2170 from neondatabase/main (Release 2022-07-28)
Release 2022-07-28
2022-07-28 14:16:52 +03:00
Arthur Petukhovsky
3c5b7f59d7 Merge pull request #2119 from neondatabase/main
Release 2022-07-19
2022-07-19 11:58:48 +03:00
Arthur Petukhovsky
fee89f80b5 Merge pull request #2115 from neondatabase/main-2022-07-18
Release 2022-07-18
2022-07-18 19:21:11 +03:00
Arthur Petukhovsky
41cce8eaf1 Merge remote-tracking branch 'origin/release' into main-2022-07-18 2022-07-18 18:21:20 +03:00
Alexey Kondratov
f88fe0218d Merge pull request #1842 from neondatabase/release-deploy-hotfix
[HOTFIX] Release deploy fix

This PR uses the branch neondatabase/postgres#171 and several required commits from main to use only locally built compute-tools. This should allow us to roll out the safekeepers sync issue fix in prod.
2022-06-01 11:04:30 +03:00
Alexey Kondratov
cc856eca85 Install missing openssl packages in the Github Actions workflow 2022-05-31 21:31:31 +02:00
Alexey Kondratov
cf350c6002 Use :local compute-tools tag to build compute-node image 2022-05-31 21:31:16 +02:00
Arseny Sher
0ce6b6a0a3 Merge pull request #1836 from neondatabase/release-hotfix-basebackup-lsn-page-boundary
Bump vendor/postgres to hotfix basebackup LSN comparison.
2022-05-31 16:54:03 +04:00
Arseny Sher
73f247d537 Bump vendor/postgres to hotfix basebackup LSN comparison. 2022-05-31 16:00:50 +04:00
Andrey Taranik
960be82183 Merge pull request #1792 from neondatabase/main
Release 2022-05-25 (second)
2022-05-25 16:37:57 +03:00
Andrey Taranik
806e5a6c19 Merge pull request #1787 from neondatabase/main
Release 2022-05-25
2022-05-25 13:34:11 +03:00
Alexey Kondratov
8d5df07cce Merge pull request #1385 from zenithdb/main
Release main 2022-03-22
2022-03-22 05:04:34 -05:00
Andrey Taranik
df7a9d1407 release fix 2022-03-16 (#1375) 2022-03-17 00:43:28 +03:00
195 changed files with 2623 additions and 7371 deletions

View File

@@ -22,7 +22,6 @@
!s3_scrubber/
!safekeeper/
!storage_broker/
!storage_controller/
!trace/
!vendor/postgres-*/
!workspace_hack/

View File

@@ -10,7 +10,7 @@ inputs:
required: true
api_host:
desctiption: 'Neon API host'
default: console-stage.neon.build
default: console.stage.neon.tech
outputs:
dsn:
description: 'Created Branch DSN (for main database)'

View File

@@ -13,7 +13,7 @@ inputs:
required: true
api_host:
desctiption: 'Neon API host'
default: console-stage.neon.build
default: console.stage.neon.tech
runs:
using: "composite"

View File

@@ -13,7 +13,7 @@ inputs:
default: 15
api_host:
desctiption: 'Neon API host'
default: console-stage.neon.build
default: console.stage.neon.tech
provisioner:
desctiption: 'k8s-pod or k8s-neonvm'
default: 'k8s-pod'

View File

@@ -10,7 +10,7 @@ inputs:
required: true
api_host:
desctiption: 'Neon API host'
default: console-stage.neon.build
default: console.stage.neon.tech
runs:
using: "composite"

View File

@@ -147,16 +147,15 @@ jobs:
"neonvm-captest-new"
],
"db_size": [ "10gb" ],
"include": [{ "platform": "neon-captest-freetier", "db_size": "3gb" },
{ "platform": "neon-captest-new", "db_size": "50gb" },
{ "platform": "neonvm-captest-freetier", "db_size": "3gb" },
{ "platform": "neonvm-captest-new", "db_size": "50gb" },
{ "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
"include": [{ "platform": "neon-captest-freetier", "db_size": "3gb" },
{ "platform": "neon-captest-new", "db_size": "50gb" },
{ "platform": "neonvm-captest-freetier", "db_size": "3gb" },
{ "platform": "neonvm-captest-new", "db_size": "50gb" }]
}'
if [ "$(date +%A)" = "Saturday" ]; then
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
{ "platform": "rds-aurora", "db_size": "50gb"}]')
{ "platform": "rds-aurora", "db_size": "50gb"}]')
fi
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -172,7 +171,7 @@ jobs:
if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" },
{ "platform": "rds-aurora" }]')
{ "platform": "rds-aurora" }]')
fi
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -191,7 +190,7 @@ jobs:
if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" },
{ "platform": "rds-aurora", "scale": "10" }]')
{ "platform": "rds-aurora", "scale": "10" }]')
fi
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -254,9 +253,6 @@ jobs:
neon-captest-reuse)
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
;;
neonvm-captest-sharding-reuse)
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }}
;;
neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier)
CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
;;
@@ -274,15 +270,11 @@ jobs:
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
QUERIES=("SELECT version()")
QUERY="SELECT version();"
if [[ "${PLATFORM}" = "neon"* ]]; then
QUERIES+=("SHOW neon.tenant_id")
QUERIES+=("SHOW neon.timeline_id")
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
fi
for q in "${QUERIES[@]}"; do
psql ${CONNSTR} -c "${q}"
done
psql ${CONNSTR} -c "${QUERY}"
- name: Benchmark init
uses: ./.github/actions/run-python-test-set
@@ -409,15 +401,11 @@ jobs:
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
QUERIES=("SELECT version()")
QUERY="SELECT version();"
if [[ "${PLATFORM}" = "neon"* ]]; then
QUERIES+=("SHOW neon.tenant_id")
QUERIES+=("SHOW neon.timeline_id")
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
fi
for q in "${QUERIES[@]}"; do
psql ${CONNSTR} -c "${q}"
done
psql ${CONNSTR} -c "${QUERY}"
- name: ClickBench benchmark
uses: ./.github/actions/run-python-test-set
@@ -519,15 +507,11 @@ jobs:
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
QUERIES=("SELECT version()")
QUERY="SELECT version();"
if [[ "${PLATFORM}" = "neon"* ]]; then
QUERIES+=("SHOW neon.tenant_id")
QUERIES+=("SHOW neon.timeline_id")
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
fi
for q in "${QUERIES[@]}"; do
psql ${CONNSTR} -c "${q}"
done
psql ${CONNSTR} -c "${QUERY}"
- name: Run TPC-H benchmark
uses: ./.github/actions/run-python-test-set
@@ -613,15 +597,11 @@ jobs:
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
QUERIES=("SELECT version()")
QUERY="SELECT version();"
if [[ "${PLATFORM}" = "neon"* ]]; then
QUERIES+=("SHOW neon.tenant_id")
QUERIES+=("SHOW neon.timeline_id")
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
fi
for q in "${QUERIES[@]}"; do
psql ${CONNSTR} -c "${q}"
done
psql ${CONNSTR} -c "${QUERY}"
- name: Run user examples
uses: ./.github/actions/run-python-test-set

View File

@@ -1127,7 +1127,6 @@ jobs:
-f deployProxy=false \
-f deployStorage=true \
-f deployStorageBroker=true \
-f deployStorageController=true \
-f branch=main \
-f dockerTag=${{needs.tag.outputs.build-tag}} \
-f deployPreprodRegion=true
@@ -1137,7 +1136,6 @@ jobs:
-f deployProxy=false \
-f deployStorage=true \
-f deployStorageBroker=true \
-f deployStorageController=true \
-f branch=main \
-f dockerTag=${{needs.tag.outputs.build-tag}}
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
@@ -1146,7 +1144,6 @@ jobs:
-f deployProxy=true \
-f deployStorage=false \
-f deployStorageBroker=false \
-f deployStorageController=false \
-f branch=main \
-f dockerTag=${{needs.tag.outputs.build-tag}} \
-f deployPreprodRegion=true

View File

@@ -62,14 +62,14 @@ jobs:
trigger-e2e-tests:
needs: [ tag ]
runs-on: ubuntu-latest
runs-on: [ self-hosted, gen3, small ]
env:
TAG: ${{ needs.tag.outputs.build-tag }}
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
options: --init
steps:
- name: check if ecr image are present
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
run: |
for REPO in neon compute-tools compute-node-v14 vm-compute-node-v14 compute-node-v15 vm-compute-node-v15 compute-node-v16 vm-compute-node-v16; do
OUTPUT=$(aws ecr describe-images --repository-name ${REPO} --region eu-central-1 --query "imageDetails[?imageTags[?contains(@, '${TAG}')]]" --output text)
@@ -79,55 +79,41 @@ jobs:
fi
done
- name: Set e2e-platforms
id: e2e-platforms
env:
PR_NUMBER: ${{ github.event.pull_request.number }}
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
# Default set of platforms to run e2e tests on
platforms='["docker", "k8s"]'
# If the PR changes vendor/, pgxn/ or libs/vm_monitor/ directories, or Dockerfile.compute-node, add k8s-neonvm to the list of platforms.
# If the workflow run is not a pull request, add k8s-neonvm to the list.
if [ "$GITHUB_EVENT_NAME" == "pull_request" ]; then
for f in $(gh api "/repos/${GITHUB_REPOSITORY}/pulls/${PR_NUMBER}/files" --paginate --jq '.[].filename'); do
case "$f" in
vendor/*|pgxn/*|libs/vm_monitor/*|Dockerfile.compute-node)
platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
;;
*)
# no-op
;;
esac
done
else
platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
fi
echo "e2e-platforms=${platforms}" | tee -a $GITHUB_OUTPUT
- name: Set PR's status to pending and request a remote CI test
env:
E2E_PLATFORMS: ${{ steps.e2e-platforms.outputs.e2e-platforms }}
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
run: |
REMOTE_REPO="${GITHUB_REPOSITORY_OWNER}/cloud"
# For pull requests, GH Actions set "github.sha" variable to point at a fake merge commit
# but we need to use a real sha of a latest commit in the PR's branch for the e2e job,
# to place a job run status update later.
COMMIT_SHA=${{ github.event.pull_request.head.sha }}
# For non-PR kinds of runs, the above will produce an empty variable, pick the original sha value for those
COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
gh api "/repos/${GITHUB_REPOSITORY}/statuses/${COMMIT_SHA}" \
--method POST \
--raw-field "state=pending" \
--raw-field "description=[$REMOTE_REPO] Remote CI job is about to start" \
--raw-field "context=neon-cloud-e2e"
REMOTE_REPO="${{ github.repository_owner }}/cloud"
gh workflow --repo ${REMOTE_REPO} \
run testing.yml \
--ref "main" \
--raw-field "ci_job_name=neon-cloud-e2e" \
--raw-field "commit_hash=$COMMIT_SHA" \
--raw-field "remote_repo=${GITHUB_REPOSITORY}" \
--raw-field "storage_image_tag=${TAG}" \
--raw-field "compute_image_tag=${TAG}" \
--raw-field "concurrency_group=${E2E_CONCURRENCY_GROUP}" \
--raw-field "e2e-platforms=${E2E_PLATFORMS}"
curl -f -X POST \
https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
-H "Accept: application/vnd.github.v3+json" \
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
--data \
"{
\"state\": \"pending\",
\"context\": \"neon-cloud-e2e\",
\"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
}"
curl -f -X POST \
https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \
-H "Accept: application/vnd.github.v3+json" \
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
--data \
"{
\"ref\": \"main\",
\"inputs\": {
\"ci_job_name\": \"neon-cloud-e2e\",
\"commit_hash\": \"$COMMIT_SHA\",
\"remote_repo\": \"${{ github.repository }}\",
\"storage_image_tag\": \"${TAG}\",
\"compute_image_tag\": \"${TAG}\",
\"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\"
}
}"

View File

@@ -1,5 +1,5 @@
/compute_tools/ @neondatabase/control-plane @neondatabase/compute
/storage_controller @neondatabase/storage
/control_plane/attachment_service @neondatabase/storage
/libs/pageserver_api/ @neondatabase/storage
/libs/postgres_ffi/ @neondatabase/compute @neondatabase/safekeepers
/libs/remote_storage/ @neondatabase/storage

Cargo.lock (511 changes, generated)
View File

@@ -271,10 +271,42 @@ dependencies = [
]
[[package]]
name = "atomic-take"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8ab6b55fe97976e46f91ddbed8d147d966475dc29b2032757ba47e02376fbc3"
name = "attachment_service"
version = "0.1.0"
dependencies = [
"anyhow",
"aws-config",
"bytes",
"camino",
"clap",
"control_plane",
"diesel",
"diesel_migrations",
"fail",
"futures",
"git-version",
"hex",
"humantime",
"hyper",
"lasso",
"measured",
"metrics",
"once_cell",
"pageserver_api",
"pageserver_client",
"postgres_connection",
"r2d2",
"reqwest",
"routerify",
"serde",
"serde_json",
"thiserror",
"tokio",
"tokio-util",
"tracing",
"utils",
"workspace_hack",
]
[[package]]
name = "autocfg"
@@ -304,7 +336,7 @@ dependencies = [
"fastrand 2.0.0",
"hex",
"http 0.2.9",
"hyper 0.14.26",
"hyper",
"ring 0.17.6",
"time",
"tokio",
@@ -341,7 +373,7 @@ dependencies = [
"bytes",
"fastrand 2.0.0",
"http 0.2.9",
"http-body 0.4.5",
"http-body",
"percent-encoding",
"pin-project-lite",
"tracing",
@@ -392,7 +424,7 @@ dependencies = [
"aws-types",
"bytes",
"http 0.2.9",
"http-body 0.4.5",
"http-body",
"once_cell",
"percent-encoding",
"regex-lite",
@@ -520,7 +552,7 @@ dependencies = [
"crc32fast",
"hex",
"http 0.2.9",
"http-body 0.4.5",
"http-body",
"md-5",
"pin-project-lite",
"sha1",
@@ -552,7 +584,7 @@ dependencies = [
"bytes-utils",
"futures-core",
"http 0.2.9",
"http-body 0.4.5",
"http-body",
"once_cell",
"percent-encoding",
"pin-project-lite",
@@ -591,10 +623,10 @@ dependencies = [
"aws-smithy-types",
"bytes",
"fastrand 2.0.0",
"h2 0.3.26",
"h2",
"http 0.2.9",
"http-body 0.4.5",
"hyper 0.14.26",
"http-body",
"hyper",
"hyper-rustls",
"once_cell",
"pin-project-lite",
@@ -632,7 +664,7 @@ dependencies = [
"bytes-utils",
"futures-core",
"http 0.2.9",
"http-body 0.4.5",
"http-body",
"itoa",
"num-integer",
"pin-project-lite",
@@ -681,8 +713,8 @@ dependencies = [
"bytes",
"futures-util",
"http 0.2.9",
"http-body 0.4.5",
"hyper 0.14.26",
"http-body",
"hyper",
"itoa",
"matchit",
"memchr",
@@ -697,7 +729,7 @@ dependencies = [
"sha1",
"sync_wrapper",
"tokio",
"tokio-tungstenite 0.20.0",
"tokio-tungstenite",
"tower",
"tower-layer",
"tower-service",
@@ -713,7 +745,7 @@ dependencies = [
"bytes",
"futures-util",
"http 0.2.9",
"http-body 0.4.5",
"http-body",
"mime",
"rustversion",
"tower-layer",
@@ -1130,7 +1162,7 @@ version = "4.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "191d9573962933b4027f932c600cd252ce27a8ad5979418fe78e43c07996f27b"
dependencies = [
"heck 0.4.1",
"heck",
"proc-macro2",
"quote",
"syn 2.0.52",
@@ -1202,7 +1234,7 @@ dependencies = [
"compute_api",
"flate2",
"futures",
"hyper 0.14.26",
"hyper",
"nix 0.27.1",
"notify",
"num_cpus",
@@ -1319,7 +1351,7 @@ dependencies = [
"git-version",
"hex",
"humantime",
"hyper 0.14.26",
"hyper",
"nix 0.27.1",
"once_cell",
"pageserver_api",
@@ -1468,9 +1500,12 @@ dependencies = [
[[package]]
name = "crossbeam-utils"
version = "0.8.19"
version = "0.8.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345"
checksum = "3c063cd8cc95f5c377ed0d4b49a4b21f632396ff690e8470c29b3359b346984b"
dependencies = [
"cfg-if",
]
[[package]]
name = "crossterm"
@@ -1843,12 +1878,23 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
[[package]]
name = "errno"
version = "0.3.8"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245"
checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a"
dependencies = [
"errno-dragonfly",
"libc",
"windows-sys 0.48.0",
]
[[package]]
name = "errno-dragonfly"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf"
dependencies = [
"cc",
"libc",
"windows-sys 0.52.0",
]
[[package]]
@@ -2188,9 +2234,9 @@ dependencies = [
[[package]]
name = "h2"
version = "0.3.26"
version = "0.3.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "81fe527a889e1532da5c525686d96d4c2e74cdd345badf8dfef9f6b39dd5f5e8"
checksum = "bb2c4422095b67ee78da96fbb51a4cc413b3b25883c7717ff7ca1ab31022c9c9"
dependencies = [
"bytes",
"fnv",
@@ -2205,25 +2251,6 @@ dependencies = [
"tracing",
]
[[package]]
name = "h2"
version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "816ec7294445779408f36fe57bc5b7fc1cf59664059096c65f905c1c61f58069"
dependencies = [
"bytes",
"fnv",
"futures-core",
"futures-sink",
"futures-util",
"http 1.1.0",
"indexmap 2.0.1",
"slab",
"tokio",
"tokio-util",
"tracing",
]
[[package]]
name = "half"
version = "1.8.2"
@@ -2305,12 +2332,6 @@ version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
[[package]]
name = "heck"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
[[package]]
name = "hermit-abi"
version = "0.3.3"
@@ -2395,29 +2416,6 @@ dependencies = [
"pin-project-lite",
]
[[package]]
name = "http-body"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1cac85db508abc24a2e48553ba12a996e87244a0395ce011e62b37158745d643"
dependencies = [
"bytes",
"http 1.1.0",
]
[[package]]
name = "http-body-util"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41cb79eb393015dadd30fc252023adb0b2400a0caee0fa2a077e6e21a551e840"
dependencies = [
"bytes",
"futures-util",
"http 1.1.0",
"http-body 1.0.0",
"pin-project-lite",
]
[[package]]
name = "http-types"
version = "2.12.0"
@@ -2476,9 +2474,9 @@ dependencies = [
"futures-channel",
"futures-core",
"futures-util",
"h2 0.3.26",
"h2",
"http 0.2.9",
"http-body 0.4.5",
"http-body",
"httparse",
"httpdate",
"itoa",
@@ -2490,26 +2488,6 @@ dependencies = [
"want",
]
[[package]]
name = "hyper"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "186548d73ac615b32a73aafe38fb4f56c0d340e110e5a200bcadbaf2e199263a"
dependencies = [
"bytes",
"futures-channel",
"futures-util",
"h2 0.4.4",
"http 1.1.0",
"http-body 1.0.0",
"httparse",
"httpdate",
"itoa",
"pin-project-lite",
"smallvec",
"tokio",
]
[[package]]
name = "hyper-rustls"
version = "0.24.0"
@@ -2517,7 +2495,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0646026eb1b3eea4cd9ba47912ea5ce9cc07713d105b1a14698f4e6433d348b7"
dependencies = [
"http 0.2.9",
"hyper 0.14.26",
"hyper",
"log",
"rustls 0.21.9",
"rustls-native-certs 0.6.2",
@@ -2531,7 +2509,7 @@ version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1"
dependencies = [
"hyper 0.14.26",
"hyper",
"pin-project-lite",
"tokio",
"tokio-io-timeout",
@@ -2544,7 +2522,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905"
dependencies = [
"bytes",
"hyper 0.14.26",
"hyper",
"native-tls",
"tokio",
"tokio-native-tls",
@@ -2552,33 +2530,15 @@ dependencies = [
[[package]]
name = "hyper-tungstenite"
version = "0.13.0"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a343d17fe7885302ed7252767dc7bb83609a874b6ff581142241ec4b73957ad"
checksum = "7cc7dcb1ab67cd336f468a12491765672e61a3b6b148634dbfe2fe8acd3fe7d9"
dependencies = [
"http-body-util",
"hyper 1.2.0",
"hyper-util",
"hyper",
"pin-project-lite",
"tokio",
"tokio-tungstenite 0.21.0",
"tungstenite 0.21.0",
]
[[package]]
name = "hyper-util"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ca38ef113da30126bbff9cd1705f9273e15d45498615d138b0c20279ac7a76aa"
dependencies = [
"bytes",
"futures-util",
"http 1.1.0",
"http-body 1.0.0",
"hyper 1.2.0",
"pin-project-lite",
"socket2 0.5.5",
"tokio",
"tokio-tungstenite",
"tungstenite",
]
[[package]]
@@ -2872,12 +2832,6 @@ version = "0.3.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519"
[[package]]
name = "linux-raw-sys"
version = "0.4.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c"
[[package]]
name = "lock_api"
version = "0.4.10"
@@ -2932,12 +2886,11 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
[[package]]
name = "measured"
version = "0.0.20"
version = "0.0.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3cbf033874bea03565f2449572c8640ca37ec26300455faf36001f24755da452"
checksum = "f246648d027839a34b420e27c7de1165ace96e19ef894985d0a6ff89a7840a9f"
dependencies = [
"bytes",
"crossbeam-utils",
"hashbrown 0.14.0",
"itoa",
"lasso",
@@ -2950,27 +2903,16 @@ dependencies = [
[[package]]
name = "measured-derive"
version = "0.0.20"
version = "0.0.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "be9e29b682b38f8af2a89f960455054ab1a9f5a06822f6f3500637ad9fa57def"
checksum = "edaa5cc22d99d5d6d7d99c3b5b5f7e7f8034c22f1b5d62a1adecd2ed005d9b80"
dependencies = [
"heck 0.5.0",
"heck",
"proc-macro2",
"quote",
"syn 2.0.52",
]
[[package]]
name = "measured-process"
version = "0.0.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a20849acdd04c5d6a88f565559044546904648a1842a2937cfff0b48b4ca7ef2"
dependencies = [
"libc",
"measured",
"procfs 0.16.0",
]
[[package]]
name = "memchr"
version = "2.6.4"
@@ -3010,10 +2952,8 @@ version = "0.1.0"
dependencies = [
"chrono",
"libc",
"measured",
"measured-process",
"once_cell",
"procfs 0.14.2",
"procfs",
"prometheus",
"rand 0.8.5",
"rand_distr",
@@ -3495,9 +3435,9 @@ dependencies = [
[[package]]
name = "ordered-multimap"
version = "0.7.3"
version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49203cdcae0030493bad186b28da2fa25645fa276a51b6fec8010d281e02ef79"
checksum = "a4d6a8c22fc714f0c2373e6091bf6f5e9b37b1bc0b1184874b7e0a4e303d318f"
dependencies = [
"dlv-list",
"hashbrown 0.14.0",
@@ -3563,17 +3503,12 @@ dependencies = [
"camino",
"clap",
"git-version",
"humantime",
"pageserver",
"pageserver_api",
"postgres_ffi",
"remote_storage",
"serde",
"serde_json",
"svg_fmt",
"tokio",
"tokio-util",
"toml_edit",
"utils",
"workspace_hack",
]
@@ -3609,7 +3544,7 @@ dependencies = [
"hex-literal",
"humantime",
"humantime-serde",
"hyper 0.14.26",
"hyper",
"itertools",
"leaky-bucket",
"md5",
@@ -3628,7 +3563,7 @@ dependencies = [
"postgres_connection",
"postgres_ffi",
"pq_proto",
"procfs 0.14.2",
"procfs",
"rand 0.8.5",
"regex",
"remote_storage",
@@ -3719,6 +3654,7 @@ dependencies = [
"anyhow",
"async-compression",
"async-stream",
"async-trait",
"byteorder",
"bytes",
"chrono",
@@ -4188,29 +4124,6 @@ dependencies = [
"rustix 0.36.16",
]
[[package]]
name = "procfs"
version = "0.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4"
dependencies = [
"bitflags 2.4.1",
"hex",
"lazy_static",
"procfs-core",
"rustix 0.38.28",
]
[[package]]
name = "procfs-core"
version = "0.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29"
dependencies = [
"bitflags 2.4.1",
"hex",
]
[[package]]
name = "prometheus"
version = "0.13.3"
@@ -4223,7 +4136,7 @@ dependencies = [
"libc",
"memchr",
"parking_lot 0.12.1",
"procfs 0.14.2",
"procfs",
"thiserror",
]
@@ -4244,7 +4157,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "119533552c9a7ffacc21e099c24a0ac8bb19c2a2a3f363de84cd9b844feab270"
dependencies = [
"bytes",
"heck 0.4.1",
"heck",
"itertools",
"lazy_static",
"log",
@@ -4286,9 +4199,7 @@ name = "proxy"
version = "0.1.0"
dependencies = [
"anyhow",
"async-compression",
"async-trait",
"atomic-take",
"aws-config",
"aws-sdk-iam",
"aws-sigv4",
@@ -4312,15 +4223,11 @@ dependencies = [
"hmac",
"hostname",
"http 1.1.0",
"http-body-util",
"humantime",
"hyper 0.14.26",
"hyper 1.2.0",
"hyper",
"hyper-tungstenite",
"hyper-util",
"ipnet",
"itertools",
"jsonwebtoken",
"lasso",
"md5",
"metrics",
@@ -4651,7 +4558,7 @@ dependencies = [
"futures-util",
"http-types",
"humantime",
"hyper 0.14.26",
"hyper",
"itertools",
"metrics",
"once_cell",
@@ -4681,10 +4588,10 @@ dependencies = [
"encoding_rs",
"futures-core",
"futures-util",
"h2 0.3.26",
"h2",
"http 0.2.9",
"http-body 0.4.5",
"hyper 0.14.26",
"http-body",
"hyper",
"hyper-rustls",
"hyper-tls",
"ipnet",
@@ -4742,7 +4649,7 @@ dependencies = [
"futures",
"getrandom 0.2.11",
"http 0.2.9",
"hyper 0.14.26",
"hyper",
"parking_lot 0.11.2",
"reqwest",
"reqwest-middleware",
@@ -4829,7 +4736,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "496c1d3718081c45ba9c31fbfc07417900aa96f4070ff90dc29961836b7a9945"
dependencies = [
"http 0.2.9",
"hyper 0.14.26",
"hyper",
"lazy_static",
"percent-encoding",
"regex",
@@ -4941,19 +4848,6 @@ dependencies = [
"windows-sys 0.48.0",
]
[[package]]
name = "rustix"
version = "0.38.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72e572a5e8ca657d7366229cdde4bd14c4eb5499a9573d4d366fe1b599daa316"
dependencies = [
"bitflags 2.4.1",
"errno",
"libc",
"linux-raw-sys 0.4.13",
"windows-sys 0.52.0",
]
[[package]]
name = "rustls"
version = "0.21.9"
@@ -5134,7 +5028,7 @@ dependencies = [
"git-version",
"hex",
"humantime",
"hyper 0.14.26",
"hyper",
"metrics",
"once_cell",
"parking_lot 0.12.1",
@@ -5619,9 +5513,9 @@ dependencies = [
[[package]]
name = "smallvec"
version = "1.13.1"
version = "1.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7"
checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9"
[[package]]
name = "smol_str"
@@ -5713,7 +5607,7 @@ dependencies = [
"futures-util",
"git-version",
"humantime",
"hyper 0.14.26",
"hyper",
"metrics",
"once_cell",
"parking_lot 0.12.1",
@@ -5727,65 +5621,6 @@ dependencies = [
"workspace_hack",
]
[[package]]
name = "storage_controller"
version = "0.1.0"
dependencies = [
"anyhow",
"aws-config",
"bytes",
"camino",
"clap",
"control_plane",
"diesel",
"diesel_migrations",
"fail",
"futures",
"git-version",
"hex",
"humantime",
"hyper 0.14.26",
"itertools",
"lasso",
"measured",
"metrics",
"once_cell",
"pageserver_api",
"pageserver_client",
"postgres_connection",
"r2d2",
"reqwest",
"routerify",
"serde",
"serde_json",
"thiserror",
"tokio",
"tokio-util",
"tracing",
"utils",
"workspace_hack",
]
[[package]]
name = "storcon_cli"
version = "0.1.0"
dependencies = [
"anyhow",
"clap",
"comfy-table",
"hyper 0.14.26",
"pageserver_api",
"pageserver_client",
"reqwest",
"serde",
"serde_json",
"thiserror",
"tokio",
"tracing",
"utils",
"workspace_hack",
]
[[package]]
name = "stringprep"
version = "0.1.2"
@@ -5814,7 +5649,7 @@ version = "0.24.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e385be0d24f186b4ce2f9982191e7101bb737312ad61c1f2f984f34bcf85d59"
dependencies = [
"heck 0.4.1",
"heck",
"proc-macro2",
"quote",
"rustversion",
@@ -5942,23 +5777,23 @@ dependencies = [
[[package]]
name = "test-context"
version = "0.3.0"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6676ab8513edfd2601a108621103fdb45cac9098305ca25ec93f7023b06b05d9"
checksum = "055831a02a4f5aa28fede67f2902014273eb8c21b958ac5ebbd59b71ef30dbc3"
dependencies = [
"async-trait",
"futures",
"test-context-macros",
]
[[package]]
name = "test-context-macros"
version = "0.3.0"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78ea17a2dc368aeca6f554343ced1b1e31f76d63683fa8016e5844bd7a5144a1"
checksum = "8901a55b0a7a06ebc4a674dcca925170da8e613fa3b163a1df804ed10afb154d"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.52",
"syn 1.0.109",
]
[[package]]
@@ -6099,9 +5934,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
[[package]]
name = "tokio"
version = "1.37.0"
version = "1.36.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1adbebffeca75fcfd058afa480fb6c0b81e165a0323f9c9d39c9697e37c46787"
checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931"
dependencies = [
"backtrace",
"bytes",
@@ -6256,19 +6091,7 @@ dependencies = [
"futures-util",
"log",
"tokio",
"tungstenite 0.20.1",
]
[[package]]
name = "tokio-tungstenite"
version = "0.21.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c83b561d025642014097b66e6c1bb422783339e0909e4429cde4749d1990bc38"
dependencies = [
"futures-util",
"log",
"tokio",
"tungstenite 0.21.0",
"tungstenite",
]
[[package]]
@@ -6335,10 +6158,10 @@ dependencies = [
"bytes",
"futures-core",
"futures-util",
"h2 0.3.26",
"h2",
"http 0.2.9",
"http-body 0.4.5",
"hyper 0.14.26",
"http-body",
"hyper",
"hyper-timeout",
"percent-encoding",
"pin-project",
@@ -6524,7 +6347,7 @@ dependencies = [
name = "tracing-utils"
version = "0.1.0"
dependencies = [
"hyper 0.14.26",
"hyper",
"opentelemetry",
"opentelemetry-otlp",
"opentelemetry-semantic-conventions",
@@ -6561,25 +6384,6 @@ dependencies = [
"utf-8",
]
[[package]]
name = "tungstenite"
version = "0.21.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ef1a641ea34f399a848dea702823bbecfb4c486f911735368f1f137cb8257e1"
dependencies = [
"byteorder",
"bytes",
"data-encoding",
"http 1.1.0",
"httparse",
"log",
"rand 0.8.5",
"sha1",
"thiserror",
"url",
"utf-8",
]
[[package]]
name = "twox-hash"
version = "1.6.3"
@@ -6744,8 +6548,7 @@ dependencies = [
"heapless",
"hex",
"hex-literal",
"humantime",
"hyper 0.14.26",
"hyper",
"jsonwebtoken",
"leaky-bucket",
"metrics",
@@ -7105,15 +6908,6 @@ dependencies = [
"windows-targets 0.48.0",
]
[[package]]
name = "windows-sys"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
dependencies = [
"windows-targets 0.52.4",
]
[[package]]
name = "windows-targets"
version = "0.42.2"
@@ -7144,21 +6938,6 @@ dependencies = [
"windows_x86_64_msvc 0.48.0",
]
[[package]]
name = "windows-targets"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b"
dependencies = [
"windows_aarch64_gnullvm 0.52.4",
"windows_aarch64_msvc 0.52.4",
"windows_i686_gnu 0.52.4",
"windows_i686_msvc 0.52.4",
"windows_x86_64_gnu 0.52.4",
"windows_x86_64_gnullvm 0.52.4",
"windows_x86_64_msvc 0.52.4",
]
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.42.2"
@@ -7171,12 +6950,6 @@ version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc"
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9"
[[package]]
name = "windows_aarch64_msvc"
version = "0.42.2"
@@ -7189,12 +6962,6 @@ version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3"
[[package]]
name = "windows_aarch64_msvc"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675"
[[package]]
name = "windows_i686_gnu"
version = "0.42.2"
@@ -7207,12 +6974,6 @@ version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241"
[[package]]
name = "windows_i686_gnu"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3"
[[package]]
name = "windows_i686_msvc"
version = "0.42.2"
@@ -7225,12 +6986,6 @@ version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00"
[[package]]
name = "windows_i686_msvc"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02"
[[package]]
name = "windows_x86_64_gnu"
version = "0.42.2"
@@ -7243,12 +6998,6 @@ version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1"
[[package]]
name = "windows_x86_64_gnu"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.42.2"
@@ -7261,12 +7010,6 @@ version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177"
[[package]]
name = "windows_x86_64_msvc"
version = "0.42.2"
@@ -7279,12 +7022,6 @@ version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a"
[[package]]
name = "windows_x86_64_msvc"
version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8"
[[package]]
name = "winnow"
version = "0.4.6"
@@ -7333,10 +7070,11 @@ dependencies = [
"futures-sink",
"futures-util",
"getrandom 0.2.11",
"hashbrown 0.13.2",
"hashbrown 0.14.0",
"hex",
"hmac",
"hyper 0.14.26",
"hyper",
"indexmap 1.9.3",
"itertools",
"libc",
@@ -7374,6 +7112,7 @@ dependencies = [
"tower",
"tracing",
"tracing-core",
"tungstenite",
"url",
"uuid",
"zeroize",


@@ -3,7 +3,7 @@ resolver = "2"
members = [
"compute_tools",
"control_plane",
"control_plane/storcon_cli",
"control_plane/attachment_service",
"pageserver",
"pageserver/compaction",
"pageserver/ctl",
@@ -12,7 +12,6 @@ members = [
"proxy",
"safekeeper",
"storage_broker",
"storage_controller",
"s3_scrubber",
"workspace_hack",
"trace",
@@ -44,7 +43,6 @@ license = "Apache-2.0"
anyhow = { version = "1.0", features = ["backtrace"] }
arc-swap = "1.6"
async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
atomic-take = "1.1.0"
azure_core = "0.18"
azure_identity = "0.18"
azure_storage = "0.18"
@@ -98,7 +96,7 @@ http-types = { version = "2", default-features = false }
humantime = "2.1"
humantime-serde = "1.1.1"
hyper = "0.14"
hyper-tungstenite = "0.13.0"
hyper-tungstenite = "0.11"
inotify = "0.10.2"
ipnet = "2.9.0"
itertools = "0.10"
@@ -107,8 +105,7 @@ lasso = "0.7"
leaky-bucket = "1.0.1"
libc = "0.2"
md5 = "0.7.0"
measured = { version = "0.0.20", features=["lasso"] }
measured-process = { version = "0.0.20" }
measured = { version = "0.0.13", features=["default", "lasso"] }
memoffset = "0.8"
native-tls = "0.2"
nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
@@ -161,7 +158,7 @@ svg_fmt = "0.4.1"
sync_wrapper = "0.1.2"
tar = "0.4"
task-local-extensions = "0.1.4"
test-context = "0.3"
test-context = "0.1"
thiserror = "1.0"
tikv-jemallocator = "0.5"
tikv-jemalloc-ctl = "0.5"


@@ -944,9 +944,6 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local
COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
# Create remote extension download directory
RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/local/download_extensions
# Install:
# libreadline8 for psql
# libicu67, locales for collations (including ICU and plpgsql_check)


@@ -1262,12 +1262,10 @@ LIMIT 100",
.await
.map_err(DownloadError::Other);
if download_size.is_ok() {
self.ext_download_progress
.write()
.expect("bad lock")
.insert(ext_archive_name.to_string(), (download_start, true));
}
self.ext_download_progress
.write()
.expect("bad lock")
.insert(ext_archive_name.to_string(), (download_start, true));
download_size
}
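The hunk above changes when extension download progress is persisted: one side guards the insert with `download_size.is_ok()`, so a failed download leaves the map untouched. A minimal sketch of that guard, with stand-in types rather than the real compute_ctl state:

```rust
use std::collections::HashMap;
use std::sync::RwLock;

// Stand-in for the compute state: archive name -> (start time, finished flag).
struct ComputeState {
    ext_download_progress: RwLock<HashMap<String, (u64, bool)>>,
}

impl ComputeState {
    // Record progress only when the download succeeded; on failure the
    // map keeps whatever was there before.
    fn record(&self, ext_archive_name: &str, download_start: u64, ok: bool) {
        if ok {
            self.ext_download_progress
                .write()
                .expect("bad lock")
                .insert(ext_archive_name.to_string(), (download_start, true));
        }
    }
}
```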


@@ -6,8 +6,8 @@ use std::path::Path;
use anyhow::Result;
use crate::pg_helpers::escape_conf_value;
use crate::pg_helpers::{GenericOptionExt, PgOptionsSerialize};
use compute_api::spec::{ComputeMode, ComputeSpec, GenericOption};
use crate::pg_helpers::PgOptionsSerialize;
use compute_api::spec::{ComputeMode, ComputeSpec};
/// Check that `line` is inside a text file and put it there if it is not.
/// Create file if it doesn't exist.
@@ -92,27 +92,6 @@ pub fn write_postgres_conf(
}
}
if cfg!(target_os = "linux") {
// Check /proc/sys/vm/overcommit_memory -- if it equals 2 (i.e. linux memory overcommit is
// disabled), then the control plane has enabled swap and we should set
// dynamic_shared_memory_type = 'mmap'.
//
// This is (maybe?) temporary - for more, see https://github.com/neondatabase/cloud/issues/12047.
let overcommit_memory_contents = std::fs::read_to_string("/proc/sys/vm/overcommit_memory")
// ignore any errors - they may be expected to occur in certain situations (e.g. when
// not running on Linux).
.unwrap_or_else(|_| String::new());
if overcommit_memory_contents.trim() == "2" {
let opt = GenericOption {
name: "dynamic_shared_memory_type".to_owned(),
value: Some("mmap".to_owned()),
vartype: "enum".to_owned(),
};
write!(file, "{}", opt.to_pg_setting())?;
}
}
// If there are any extra options in the 'settings' field, append those
if spec.cluster.settings.is_some() {
writeln!(file, "# Managed by compute_ctl: begin")?;
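For reference, the overcommit check in the hunk above boils down to the following sketch (path and setting name as in the hunk; the helper name is ours):

```rust
use std::io::Write;

// If Linux memory overcommit is disabled (/proc/sys/vm/overcommit_memory == 2),
// emit dynamic_shared_memory_type = 'mmap' into the generated postgresql.conf.
fn maybe_enable_mmap_shmem(file: &mut impl Write) -> std::io::Result<()> {
    let contents = std::fs::read_to_string("/proc/sys/vm/overcommit_memory")
        // Read errors are expected when not running on Linux; treat as unset.
        .unwrap_or_default();
    if contents.trim() == "2" {
        writeln!(file, "dynamic_shared_memory_type = 'mmap'")?;
    }
    Ok(())
}
```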


@@ -44,7 +44,7 @@ pub fn escape_conf_value(s: &str) -> String {
format!("'{}'", res)
}
pub trait GenericOptionExt {
trait GenericOptionExt {
fn to_pg_option(&self) -> String;
fn to_pg_setting(&self) -> String;
}


@@ -302,9 +302,9 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
RoleAction::Create => {
// This branch only runs when roles are created through the console, so it is
// safe to add more permissions here. BYPASSRLS and REPLICATION are inherited
// from neon_superuser.
// from neon_superuser. (NOTE: REPLICATION has been removed from here for now).
let mut query: String = format!(
"CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
"CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser",
name.pg_quote()
);
info!("running role create query: '{}'", &query);
@@ -743,24 +743,21 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
// which may happen in two cases:
// - extension was just installed
// - extension was already installed and is up to date
let query = "ALTER EXTENSION neon UPDATE";
info!("update neon extension version with query: {}", query);
if let Err(e) = client.simple_query(query) {
error!(
"failed to upgrade neon extension during `handle_extension_neon`: {}",
e
);
}
// DISABLED due to compute node unpinning epic
// let query = "ALTER EXTENSION neon UPDATE";
// info!("update neon extension version with query: {}", query);
// client.simple_query(query)?;
Ok(())
}
#[instrument(skip_all)]
pub fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> {
info!("handle neon extension upgrade");
let query = "ALTER EXTENSION neon UPDATE";
info!("update neon extension version with query: {}", query);
client.simple_query(query)?;
pub fn handle_neon_extension_upgrade(_client: &mut Client) -> Result<()> {
info!("handle neon extension upgrade (not really)");
// DISABLED due to compute node unpinning epic
// let query = "ALTER EXTENSION neon UPDATE";
// info!("update neon extension version with query: {}", query);
// client.simple_query(query)?;
Ok(())
}
@@ -809,8 +806,19 @@ $$;"#,
"",
"",
"",
"",
// Add new migrations below.
r#"
DO $$
DECLARE
role_name TEXT;
BEGIN
FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE
LOOP
RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name);
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION';
END LOOP;
END
$$;"#,
];
let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
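The migration list above is applied one statement at a time; the new entry strips REPLICATION from every role that still has it, matching the CREATE ROLE change earlier in this diff. A hedged sketch of such a runner (the helper name is ours; the real code appears to also track the applied migration id in the neon_migration schema created above):

```rust
// Apply each pending migration with a simple query; empty strings are
// placeholders for migrations that were turned into no-ops.
fn apply_migrations(client: &mut postgres::Client, migrations: &[&str]) -> anyhow::Result<()> {
    for (id, migration) in migrations.iter().enumerate() {
        if migration.is_empty() {
            continue;
        }
        client.simple_query(migration)?;
        tracing::info!("applied migration {id}");
    }
    Ok(())
}
```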


@@ -1,5 +1,5 @@
[package]
name = "storage_controller"
name = "attachment_service"
version = "0.1.0"
edition.workspace = true
license.workspace = true
@@ -25,7 +25,6 @@ git-version.workspace = true
hex.workspace = true
hyper.workspace = true
humantime.workspace = true
itertools.workspace = true
lasso.workspace = true
once_cell.workspace = true
pageserver_api.workspace = true
@@ -45,8 +44,8 @@ diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] }
diesel_migrations = { version = "2.1.0" }
r2d2 = { version = "0.8.10" }
utils = { path = "../libs/utils/" }
metrics = { path = "../libs/metrics/" }
control_plane = { path = "../control_plane" }
workspace_hack = { version = "0.1", path = "../workspace_hack" }
utils = { path = "../../libs/utils/" }
metrics = { path = "../../libs/metrics/" }
control_plane = { path = ".." }
workspace_hack = { version = "0.1", path = "../../workspace_hack" }


@@ -1,4 +1,3 @@
use std::sync::Arc;
use std::{collections::HashMap, time::Duration};
use control_plane::endpoint::{ComputeControlPlane, EndpointStatus};
@@ -15,30 +14,19 @@ use utils::{
use crate::service::Config;
const BUSY_DELAY: Duration = Duration::from_secs(1);
const SLOWDOWN_DELAY: Duration = Duration::from_secs(5);
pub(crate) const API_CONCURRENCY: usize = 32;
struct UnshardedComputeHookTenant {
// Which node is this tenant attached to
node_id: NodeId,
// Must hold this lock to send a notification.
send_lock: Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>>,
}
struct ShardedComputeHookTenant {
stripe_size: ShardStripeSize,
shard_count: ShardCount,
shards: Vec<(ShardNumber, NodeId)>,
// Must hold this lock to send a notification. The contents represent
// the last successfully sent notification, and are used to coalesce multiple
// updates by only sending when there is a change since our last successful send.
send_lock: Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>>,
}
enum ComputeHookTenant {
Unsharded(UnshardedComputeHookTenant),
Unsharded(NodeId),
Sharded(ShardedComputeHookTenant),
}
@@ -50,20 +38,9 @@ impl ComputeHookTenant {
shards: vec![(tenant_shard_id.shard_number, node_id)],
stripe_size,
shard_count: tenant_shard_id.shard_count,
send_lock: Arc::default(),
})
} else {
Self::Unsharded(UnshardedComputeHookTenant {
node_id,
send_lock: Arc::default(),
})
}
}
fn get_send_lock(&self) -> &Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>> {
match self {
Self::Unsharded(unsharded_tenant) => &unsharded_tenant.send_lock,
Self::Sharded(sharded_tenant) => &sharded_tenant.send_lock,
Self::Unsharded(node_id)
}
}
@@ -76,8 +53,8 @@ impl ComputeHookTenant {
node_id: NodeId,
) {
match self {
Self::Unsharded(unsharded_tenant) if tenant_shard_id.shard_count.count() == 1 => {
unsharded_tenant.node_id = node_id
Self::Unsharded(existing_node_id) if tenant_shard_id.shard_count.count() == 1 => {
*existing_node_id = node_id
}
Self::Sharded(sharded_tenant)
if sharded_tenant.stripe_size == stripe_size
@@ -104,14 +81,14 @@ impl ComputeHookTenant {
}
}
#[derive(Serialize, Deserialize, Debug, Eq, PartialEq)]
#[derive(Serialize, Deserialize, Debug)]
struct ComputeHookNotifyRequestShard {
node_id: NodeId,
shard_number: ShardNumber,
}
/// Request body that we send to the control plane to notify it of where a tenant is attached
#[derive(Serialize, Deserialize, Debug, Eq, PartialEq)]
#[derive(Serialize, Deserialize, Debug)]
struct ComputeHookNotifyRequest {
tenant_id: TenantId,
stripe_size: Option<ShardStripeSize>,
@@ -144,44 +121,14 @@ pub(crate) enum NotifyError {
Fatal(StatusCode),
}
enum MaybeSendResult {
// Please send this request while holding the lock, and if you succeed then write
// the request into the lock.
Transmit(
(
ComputeHookNotifyRequest,
tokio::sync::OwnedMutexGuard<Option<ComputeHookNotifyRequest>>,
),
),
// Something requires sending, but you must wait for a current sender then call again
AwaitLock(Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>>),
// Nothing requires sending
Noop,
}
impl ComputeHookTenant {
fn maybe_send(
&self,
tenant_id: TenantId,
lock: Option<tokio::sync::OwnedMutexGuard<Option<ComputeHookNotifyRequest>>>,
) -> MaybeSendResult {
let locked = match lock {
Some(already_locked) => already_locked,
None => {
// Lock order: this _must_ be only a try_lock, because we are called inside of the [`ComputeHook::state`] lock.
let Ok(locked) = self.get_send_lock().clone().try_lock_owned() else {
return MaybeSendResult::AwaitLock(self.get_send_lock().clone());
};
locked
}
};
let request = match self {
Self::Unsharded(unsharded_tenant) => Some(ComputeHookNotifyRequest {
fn maybe_reconfigure(&self, tenant_id: TenantId) -> Option<ComputeHookNotifyRequest> {
match self {
Self::Unsharded(node_id) => Some(ComputeHookNotifyRequest {
tenant_id,
shards: vec![ComputeHookNotifyRequestShard {
shard_number: ShardNumber(0),
node_id: unsharded_tenant.node_id,
node_id: *node_id,
}],
stripe_size: None,
}),
@@ -205,25 +152,12 @@ impl ComputeHookTenant {
// Sharded tenant doesn't yet have information for all its shards
tracing::info!(
"ComputeHookTenant::maybe_send: not enough shards ({}/{})",
"ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})",
sharded_tenant.shards.len(),
sharded_tenant.shard_count.count()
);
None
}
};
match request {
None => {
// Not yet ready to emit a notification
tracing::info!("Tenant isn't yet ready to emit a notification");
MaybeSendResult::Noop
}
Some(request) if Some(&request) == locked.as_ref() => {
// No change from the last value successfully sent
MaybeSendResult::Noop
}
Some(request) => MaybeSendResult::Transmit((request, locked)),
}
}
}
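Distilled from the hunk above, the coalescing pattern: under the synchronous state lock we may only `try_lock` the per-tenant async send lock, and contention returns the Arc so the caller can await it after releasing the state lock. A self-contained sketch, with a generic payload standing in for ComputeHookNotifyRequest:

```rust
use std::sync::Arc;
use tokio::sync::{Mutex, OwnedMutexGuard};

enum MaybeSend<T> {
    // Send while holding this guard, then store the sent value into it.
    Transmit((T, OwnedMutexGuard<Option<T>>)),
    // Another task is mid-send: await this lock outside the state lock, retry.
    AwaitLock(Arc<Mutex<Option<T>>>),
    // Nothing new to send.
    Noop,
}

fn maybe_send<T: PartialEq>(
    send_lock: &Arc<Mutex<Option<T>>>,
    request: Option<T>,
) -> MaybeSend<T> {
    // Lock order: called inside a sync mutex, so this must be try_lock only.
    let Ok(guard) = send_lock.clone().try_lock_owned() else {
        return MaybeSend::AwaitLock(send_lock.clone());
    };
    match request {
        None => MaybeSend::Noop,
        // Identical to the last successful send: coalesce into a no-op.
        Some(r) if Some(&r) == guard.as_ref() => MaybeSend::Noop,
        Some(r) => MaybeSend::Transmit((r, guard)),
    }
}
```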
@@ -233,15 +167,8 @@ impl ComputeHookTenant {
/// the compute connection string.
pub(super) struct ComputeHook {
config: Config,
state: std::sync::Mutex<HashMap<TenantId, ComputeHookTenant>>,
state: tokio::sync::Mutex<HashMap<TenantId, ComputeHookTenant>>,
authorization_header: Option<String>,
// Concurrency limiter, so that we do not overload the cloud control plane when updating
// large numbers of tenants (e.g. when failing over after a node failure)
api_concurrency: tokio::sync::Semaphore,
// This lock is only used in testing environments, to serialize calls into neon_local
neon_local_lock: tokio::sync::Mutex<()>,
}
impl ComputeHook {
@@ -255,20 +182,14 @@ impl ComputeHook {
state: Default::default(),
config,
authorization_header,
neon_local_lock: Default::default(),
api_concurrency: tokio::sync::Semaphore::new(API_CONCURRENCY),
}
}
/// For test environments: use neon_local's LocalEnv to update compute
async fn do_notify_local(
&self,
reconfigure_request: &ComputeHookNotifyRequest,
reconfigure_request: ComputeHookNotifyRequest,
) -> anyhow::Result<()> {
// neon_local updates are not safe to call concurrently, use a lock to serialize
// all calls to this function
let _locked = self.neon_local_lock.lock().await;
let env = match LocalEnv::load_config() {
Ok(e) => e,
Err(e) => {
@@ -285,7 +206,7 @@ impl ComputeHook {
} = reconfigure_request;
let compute_pageservers = shards
.iter()
.into_iter()
.map(|shard| {
let ps_conf = env
.get_pageserver_conf(shard.node_id)
@@ -297,10 +218,10 @@ impl ComputeHook {
.collect::<Vec<_>>();
for (endpoint_name, endpoint) in &cplane.endpoints {
if endpoint.tenant_id == *tenant_id && endpoint.status() == EndpointStatus::Running {
if endpoint.tenant_id == tenant_id && endpoint.status() == EndpointStatus::Running {
tracing::info!("Reconfiguring endpoint {}", endpoint_name,);
endpoint
.reconfigure(compute_pageservers.clone(), *stripe_size)
.reconfigure(compute_pageservers.clone(), stripe_size)
.await?;
}
}
@@ -359,10 +280,11 @@ impl ComputeHook {
Err(NotifyError::SlowDown)
}
StatusCode::LOCKED => {
// We consider this fatal, because it's possible that the operation blocking the control plane is
// also the one that is waiting for this reconcile. We should let the reconciler calling
// this hook fail, to give the control plane a chance to un-lock.
tracing::info!("Control plane reports tenant is locked, dropping out of notify");
// Delay our retry if busy: the usual fast exponential backoff in backoff::retry
// is not appropriate
tokio::time::timeout(BUSY_DELAY, cancel.cancelled())
.await
.ok();
Err(NotifyError::Busy)
}
StatusCode::SERVICE_UNAVAILABLE
@@ -378,29 +300,13 @@ impl ComputeHook {
async fn do_notify(
&self,
url: &String,
reconfigure_request: &ComputeHookNotifyRequest,
reconfigure_request: ComputeHookNotifyRequest,
cancel: &CancellationToken,
) -> Result<(), NotifyError> {
let client = reqwest::Client::new();
// We hold these semaphore units across all retries, rather than only across each
// HTTP request: this is to preserve fairness and avoid a situation where a retry might
// time out waiting for a semaphore.
let _units = self
.api_concurrency
.acquire()
.await
// Interpret closed semaphore as shutdown
.map_err(|_| NotifyError::ShuttingDown)?;
backoff::retry(
|| self.do_notify_iteration(&client, url, reconfigure_request, cancel),
|e| {
matches!(
e,
NotifyError::Fatal(_) | NotifyError::Unexpected(_) | NotifyError::Busy
)
},
|| self.do_notify_iteration(&client, url, &reconfigure_request, cancel),
|e| matches!(e, NotifyError::Fatal(_) | NotifyError::Unexpected(_)),
3,
10,
"Send compute notification",
@@ -434,70 +340,42 @@ impl ComputeHook {
stripe_size: ShardStripeSize,
cancel: &CancellationToken,
) -> Result<(), NotifyError> {
let maybe_send_result = {
let mut state_locked = self.state.lock().unwrap();
let mut locked = self.state.lock().await;
use std::collections::hash_map::Entry;
let tenant = match state_locked.entry(tenant_shard_id.tenant_id) {
Entry::Vacant(e) => e.insert(ComputeHookTenant::new(
tenant_shard_id,
stripe_size,
node_id,
)),
Entry::Occupied(e) => {
let tenant = e.into_mut();
tenant.update(tenant_shard_id, stripe_size, node_id);
tenant
}
};
tenant.maybe_send(tenant_shard_id.tenant_id, None)
use std::collections::hash_map::Entry;
let tenant = match locked.entry(tenant_shard_id.tenant_id) {
Entry::Vacant(e) => e.insert(ComputeHookTenant::new(
tenant_shard_id,
stripe_size,
node_id,
)),
Entry::Occupied(e) => {
let tenant = e.into_mut();
tenant.update(tenant_shard_id, stripe_size, node_id);
tenant
}
};
// Process result: we may get an update to send, or we may have to wait for a lock
// before trying again.
let (request, mut send_lock_guard) = match maybe_send_result {
MaybeSendResult::Noop => {
return Ok(());
}
MaybeSendResult::AwaitLock(send_lock) => {
let send_locked = send_lock.lock_owned().await;
// Lock order: maybe_send is called within the [`Self::state`] lock and takes the send lock, but here
// we have acquired the send lock and then take the [`Self::state`] lock. This is safe because maybe_send
// only uses try_lock.
let state_locked = self.state.lock().unwrap();
let Some(tenant) = state_locked.get(&tenant_shard_id.tenant_id) else {
return Ok(());
};
match tenant.maybe_send(tenant_shard_id.tenant_id, Some(send_locked)) {
MaybeSendResult::AwaitLock(_) => {
unreachable!("We supplied lock guard")
}
MaybeSendResult::Noop => {
return Ok(());
}
MaybeSendResult::Transmit((request, lock)) => (request, lock),
}
}
MaybeSendResult::Transmit((request, lock)) => (request, lock),
let reconfigure_request = tenant.maybe_reconfigure(tenant_shard_id.tenant_id);
let Some(reconfigure_request) = reconfigure_request else {
// The tenant doesn't yet have pageservers for all its shards: we won't notify anything
// until it does.
tracing::info!("Tenant isn't yet ready to emit a notification");
return Ok(());
};
let result = if let Some(notify_url) = &self.config.compute_hook_url {
self.do_notify(notify_url, &request, cancel).await
if let Some(notify_url) = &self.config.compute_hook_url {
self.do_notify(notify_url, reconfigure_request, cancel)
.await
} else {
self.do_notify_local(&request).await.map_err(|e| {
// This path is for testing only, so munge the error into our prod-style error type.
tracing::error!("Local notification hook failed: {e}");
NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR)
})
};
if result.is_ok() {
// Before dropping the send lock, stash the request we just sent so that
// subsequent callers can avoid redundantly re-sending the same thing.
*send_lock_guard = Some(request);
self.do_notify_local(reconfigure_request)
.await
.map_err(|e| {
// This path is for testing only, so munge the error into our prod-style error type.
tracing::error!("Local notification hook failed: {e}");
NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR)
})
}
result
}
}
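The fairness comment above is the crux of the semaphore change: the permit is acquired once and held for the whole retry loop, not per HTTP request. A minimal generic sketch of that shape (names ours, not the neon backoff helper):

```rust
use tokio::sync::Semaphore;

// Hold one concurrency permit across every attempt, so a retry never loses
// its place to newly arriving requests and times out on the semaphore.
async fn with_permit_retries<T, E, F, Fut>(
    sem: &Semaphore,
    attempts: usize,
    mut attempt: F,
) -> Result<T, E>
where
    F: FnMut() -> Fut,
    Fut: std::future::Future<Output = Result<T, E>>,
{
    let _permit = sem.acquire().await.expect("semaphore closed");
    let mut last = attempt().await;
    for _ in 1..attempts {
        if last.is_ok() {
            break;
        }
        last = attempt().await;
    }
    last
}
```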
@@ -521,22 +399,21 @@ pub(crate) mod tests {
NodeId(1),
);
// An unsharded tenant is always ready to emit a notification, but won't
// send the same one twice
let send_result = tenant_state.maybe_send(tenant_id, None);
let MaybeSendResult::Transmit((request, mut guard)) = send_result else {
anyhow::bail!("Wrong send result");
};
assert_eq!(request.shards.len(), 1);
assert!(request.stripe_size.is_none());
// Simulate successful send
*guard = Some(request);
drop(guard);
// Try asking again: this should be a no-op
let send_result = tenant_state.maybe_send(tenant_id, None);
assert!(matches!(send_result, MaybeSendResult::Noop));
// An unsharded tenant is always ready to emit a notification
assert!(tenant_state.maybe_reconfigure(tenant_id).is_some());
assert_eq!(
tenant_state
.maybe_reconfigure(tenant_id)
.unwrap()
.shards
.len(),
1
);
assert!(tenant_state
.maybe_reconfigure(tenant_id)
.unwrap()
.stripe_size
.is_none());
// Writing the first shard of a multi-sharded situation (i.e. in a split)
// resets the tenant state and puts it in a non-notifying state (need to
@@ -550,10 +427,7 @@ pub(crate) mod tests {
ShardStripeSize(32768),
NodeId(1),
);
assert!(matches!(
tenant_state.maybe_send(tenant_id, None),
MaybeSendResult::Noop
));
assert!(tenant_state.maybe_reconfigure(tenant_id).is_none());
// Writing the second shard makes it ready to notify
tenant_state.update(
@@ -566,16 +440,22 @@ pub(crate) mod tests {
NodeId(1),
);
let send_result = tenant_state.maybe_send(tenant_id, None);
let MaybeSendResult::Transmit((request, mut guard)) = send_result else {
anyhow::bail!("Wrong send result");
};
assert_eq!(request.shards.len(), 2);
assert_eq!(request.stripe_size, Some(ShardStripeSize(32768)));
// Simulate successful send
*guard = Some(request);
drop(guard);
assert!(tenant_state.maybe_reconfigure(tenant_id).is_some());
assert_eq!(
tenant_state
.maybe_reconfigure(tenant_id)
.unwrap()
.shards
.len(),
2
);
assert_eq!(
tenant_state
.maybe_reconfigure(tenant_id)
.unwrap()
.stripe_size,
Some(ShardStripeSize(32768))
);
Ok(())
}


@@ -8,7 +8,6 @@ use futures::Future;
use hyper::header::CONTENT_TYPE;
use hyper::{Body, Request, Response};
use hyper::{StatusCode, Uri};
use metrics::{BuildInfo, NeonMetrics};
use pageserver_api::models::{
TenantConfigRequest, TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
TenantTimeTravelRequest, TimelineCreateRequest,
@@ -35,8 +34,7 @@ use utils::{
};
use pageserver_api::controller_api::{
NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantPolicyRequest,
TenantShardMigrateRequest,
NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantShardMigrateRequest,
};
use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest};
@@ -45,19 +43,15 @@ use control_plane::storage_controller::{AttachHookRequest, InspectRequest};
use routerify::Middleware;
/// State available to HTTP request handlers
#[derive(Clone)]
pub struct HttpState {
service: Arc<crate::service::Service>,
auth: Option<Arc<SwappableJwtAuth>>,
neon_metrics: NeonMetrics,
allowlist_routes: Vec<Uri>,
}
impl HttpState {
pub fn new(
service: Arc<crate::service::Service>,
auth: Option<Arc<SwappableJwtAuth>>,
build_info: BuildInfo,
) -> Self {
pub fn new(service: Arc<crate::service::Service>, auth: Option<Arc<SwappableJwtAuth>>) -> Self {
let allowlist_routes = ["/status", "/ready", "/metrics"]
.iter()
.map(|v| v.parse().unwrap())
@@ -65,7 +59,6 @@ impl HttpState {
Self {
service,
auth,
neon_metrics: NeonMetrics::new(build_info),
allowlist_routes,
}
}
@@ -405,15 +398,6 @@ async fn handle_tenant_describe(
json_response(StatusCode::OK, service.tenant_describe(tenant_id)?)
}
async fn handle_tenant_list(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
json_response(StatusCode::OK, service.tenant_list())
}
async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
@@ -427,10 +411,7 @@ async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError
check_permissions(&req, Scope::Admin)?;
let state = get_state(&req);
let nodes = state.service.node_list().await?;
let api_nodes = nodes.into_iter().map(|n| n.describe()).collect::<Vec<_>>();
json_response(StatusCode::OK, api_nodes)
json_response(StatusCode::OK, state.service.node_list().await?)
}
async fn handle_node_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -497,22 +478,6 @@ async fn handle_tenant_shard_migrate(
)
}
async fn handle_tenant_update_policy(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
let update_req = json_request::<TenantPolicyRequest>(&mut req).await?;
let state = get_state(&req);
json_response(
StatusCode::OK,
state
.service
.tenant_update_policy(tenant_id, update_req)
.await?,
)
}
async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
check_permissions(&req, Scope::PageServerApi)?;
@@ -544,14 +509,6 @@ async fn handle_consistency_check(req: Request<Body>) -> Result<Response<Body>,
json_response(StatusCode::OK, state.service.consistency_check().await?)
}
async fn handle_reconcile_all(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let state = get_state(&req);
json_response(StatusCode::OK, state.service.reconcile_all_now().await?)
}
/// Status endpoint is just used for checking that our HTTP listener is up
async fn handle_status(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
json_response(StatusCode::OK, ())
@@ -608,17 +565,9 @@ where
.await
}
/// Check if the required scope is held in the request's token, or if the request has
/// a token with 'admin' scope then always permit it.
fn check_permissions(request: &Request<Body>, required_scope: Scope) -> Result<(), ApiError> {
check_permission_with(request, |claims| {
match crate::auth::check_permission(claims, required_scope) {
Err(e) => match crate::auth::check_permission(claims, Scope::Admin) {
Ok(()) => Ok(()),
Err(_) => Err(e),
},
Ok(()) => Ok(()),
}
crate::auth::check_permission(claims, required_scope)
})
}
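One side of this hunk makes check_permissions fall back to the 'admin' scope when the required scope is missing. A toy sketch of that rule with stand-in types (the real code works on JWT claims):

```rust
#[derive(Clone, Copy, PartialEq, Debug)]
enum Scope {
    Admin,
    PageServerApi,
}

// Permit if the token carries the required scope, or the admin scope.
fn check_permission(held: &[Scope], required: Scope) -> Result<(), String> {
    if held.contains(&required) || held.contains(&Scope::Admin) {
        Ok(())
    } else {
        Err(format!("missing scope {required:?}"))
    }
}
```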
@@ -678,11 +627,10 @@ fn epilogue_metrics_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>
})
}
pub async fn measured_metrics_handler(req: Request<Body>) -> Result<Response<Body>, ApiError> {
pub async fn measured_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
pub const TEXT_FORMAT: &str = "text/plain; version=0.0.4";
let state = get_state(&req);
let payload = crate::metrics::METRICS_REGISTRY.encode(&state.neon_metrics);
let payload = crate::metrics::METRICS_REGISTRY.encode();
let response = Response::builder()
.status(200)
.header(CONTENT_TYPE, TEXT_FORMAT)
@@ -711,7 +659,6 @@ where
pub fn make_router(
service: Arc<Service>,
auth: Option<Arc<SwappableJwtAuth>>,
build_info: BuildInfo,
) -> RouterBuilder<hyper::Body, ApiError> {
let mut router = endpoint::make_router()
.middleware(prologue_metrics_middleware())
@@ -728,7 +675,7 @@ pub fn make_router(
}
router
.data(Arc::new(HttpState::new(service, auth, build_info)))
.data(Arc::new(HttpState::new(service, auth)))
.get("/metrics", |r| {
named_request_span(r, measured_metrics_handler, RequestName("metrics"))
})
@@ -779,9 +726,6 @@ pub fn make_router(
RequestName("debug_v1_consistency_check"),
)
})
.post("/debug/v1/reconcile_all", |r| {
request_span(r, handle_reconcile_all)
})
.put("/debug/v1/failpoints", |r| {
request_span(r, |r| failpoints_handler(r, CancellationToken::new()))
})
@@ -821,16 +765,6 @@ pub fn make_router(
RequestName("control_v1_tenant_describe"),
)
})
.get("/control/v1/tenant", |r| {
tenant_service_handler(r, handle_tenant_list, RequestName("control_v1_tenant_list"))
})
.put("/control/v1/tenant/:tenant_id/policy", |r| {
named_request_span(
r,
handle_tenant_update_policy,
RequestName("control_v1_tenant_policy"),
)
})
// Tenant operations
// The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
// this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.


@@ -14,7 +14,7 @@ mod reconciler;
mod scheduler;
mod schema;
pub mod service;
mod tenant_shard;
mod tenant_state;
#[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone, Serialize)]
struct Sequence(u64);


@@ -1,20 +1,18 @@
use anyhow::{anyhow, Context};
use attachment_service::http::make_router;
use attachment_service::metrics::preinitialize_metrics;
use attachment_service::persistence::Persistence;
use attachment_service::service::{Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT};
use camino::Utf8PathBuf;
use clap::Parser;
use diesel::Connection;
use metrics::launch_timestamp::LaunchTimestamp;
use metrics::BuildInfo;
use std::sync::Arc;
use storage_controller::http::make_router;
use storage_controller::metrics::preinitialize_metrics;
use storage_controller::persistence::Persistence;
use storage_controller::service::{Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT};
use tokio::signal::unix::SignalKind;
use tokio_util::sync::CancellationToken;
use utils::auth::{JwtAuth, SwappableJwtAuth};
use utils::logging::{self, LogFormat};
use utils::sentry_init::init_sentry;
use utils::{project_build_tag, project_git_version, tcp_listener};
project_git_version!(GIT_VERSION);
@@ -52,7 +50,7 @@ struct Cli {
#[arg(short, long)]
path: Option<Utf8PathBuf>,
/// URL to connect to postgres, like postgresql://localhost:1234/storage_controller
/// URL to connect to postgres, like postgresql://localhost:1234/attachment_service
#[arg(long)]
database_url: Option<String>,
@@ -160,8 +158,6 @@ fn main() -> anyhow::Result<()> {
std::process::exit(1);
}));
let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
tokio::runtime::Builder::new_current_thread()
// We use spawn_blocking for database operations, so require approximately
// as many blocking threads as we will open database connections.
@@ -193,11 +189,6 @@ async fn async_main() -> anyhow::Result<()> {
args.listen
);
let build_info = BuildInfo {
revision: GIT_VERSION,
build_tag: BUILD_TAG,
};
let strict_mode = if args.dev {
StrictMode::Dev
} else {
@@ -259,7 +250,7 @@ async fn async_main() -> anyhow::Result<()> {
let auth = secrets
.public_key
.map(|jwt_auth| Arc::new(SwappableJwtAuth::new(jwt_auth)));
let router = make_router(service.clone(), auth, build_info)
let router = make_router(service.clone(), auth)
.build()
.map_err(|err| anyhow!(err))?;
let router_service = utils::http::RouterService::new(router).unwrap();


@@ -8,8 +8,10 @@
//! The rest of the code defines label group types and deals with converting outer types to labels.
//!
use bytes::Bytes;
use measured::{label::LabelValue, metric::histogram, FixedCardinalityLabel, MetricGroup};
use metrics::NeonMetrics;
use measured::{
label::{LabelValue, StaticLabelSet},
FixedCardinalityLabel, MetricGroup,
};
use once_cell::sync::Lazy;
use std::sync::Mutex;
@@ -24,28 +26,21 @@ pub fn preinitialize_metrics() {
pub(crate) struct StorageControllerMetrics {
pub(crate) metrics_group: StorageControllerMetricGroup,
encoder: Mutex<measured::text::BufferedTextEncoder>,
encoder: Mutex<measured::text::TextEncoder>,
}
#[derive(measured::MetricGroup)]
#[metric(new())]
pub(crate) struct StorageControllerMetricGroup {
/// Count of how many times we spawn a reconcile task
pub(crate) storage_controller_reconcile_spawn: measured::Counter,
/// Reconciler tasks completed, broken down by success/failure/cancelled
pub(crate) storage_controller_reconcile_complete:
measured::CounterVec<ReconcileCompleteLabelGroupSet>,
/// Count of how many times we make an optimization change to a tenant's scheduling
pub(crate) storage_controller_schedule_optimization: measured::Counter,
/// HTTP request status counters for handled requests
pub(crate) storage_controller_http_request_status:
measured::CounterVec<HttpRequestStatusLabelGroupSet>,
/// HTTP request handler latency across all status codes
#[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
pub(crate) storage_controller_http_request_latency:
measured::HistogramVec<HttpRequestLatencyLabelGroupSet, 5>,
@@ -57,7 +52,6 @@ pub(crate) struct StorageControllerMetricGroup {
/// Latency of HTTP requests to the pageserver, broken down by pageserver
/// node id, request name and method. This includes both successful and unsuccessful
/// requests.
#[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
pub(crate) storage_controller_pageserver_request_latency:
measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
@@ -69,7 +63,6 @@ pub(crate) struct StorageControllerMetricGroup {
/// Latency of pass-through HTTP requests to the pageserver, broken down by pageserver
/// node id, request name and method. This includes both successful and unsuccessful
/// requests.
#[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
pub(crate) storage_controller_passthrough_request_latency:
measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
@@ -78,34 +71,75 @@ pub(crate) struct StorageControllerMetricGroup {
measured::CounterVec<DatabaseQueryErrorLabelGroupSet>,
/// Latency of database queries, broken down by operation.
#[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
pub(crate) storage_controller_database_query_latency:
measured::HistogramVec<DatabaseQueryLatencyLabelGroupSet, 5>,
}
impl StorageControllerMetrics {
pub(crate) fn encode(&self, neon_metrics: &NeonMetrics) -> Bytes {
pub(crate) fn encode(&self) -> Bytes {
let mut encoder = self.encoder.lock().unwrap();
neon_metrics
.collect_group_into(&mut *encoder)
.unwrap_or_else(|infallible| match infallible {});
self.metrics_group
.collect_group_into(&mut *encoder)
.unwrap_or_else(|infallible| match infallible {});
self.metrics_group.collect_into(&mut *encoder);
encoder.finish()
}
}
impl Default for StorageControllerMetrics {
fn default() -> Self {
let mut metrics_group = StorageControllerMetricGroup::new();
metrics_group
.storage_controller_reconcile_complete
.init_all_dense();
Self {
metrics_group,
encoder: Mutex::new(measured::text::BufferedTextEncoder::new()),
metrics_group: StorageControllerMetricGroup::new(),
encoder: Mutex::new(measured::text::TextEncoder::new()),
}
}
}
impl StorageControllerMetricGroup {
pub(crate) fn new() -> Self {
Self {
storage_controller_reconcile_spawn: measured::Counter::new(),
storage_controller_reconcile_complete: measured::CounterVec::new(
ReconcileCompleteLabelGroupSet {
status: StaticLabelSet::new(),
},
),
storage_controller_http_request_status: measured::CounterVec::new(
HttpRequestStatusLabelGroupSet {
path: lasso::ThreadedRodeo::new(),
method: StaticLabelSet::new(),
status: StaticLabelSet::new(),
},
),
storage_controller_http_request_latency: measured::HistogramVec::new(
measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
),
storage_controller_pageserver_request_error: measured::CounterVec::new(
PageserverRequestLabelGroupSet {
pageserver_id: lasso::ThreadedRodeo::new(),
path: lasso::ThreadedRodeo::new(),
method: StaticLabelSet::new(),
},
),
storage_controller_pageserver_request_latency: measured::HistogramVec::new(
measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
),
storage_controller_passthrough_request_error: measured::CounterVec::new(
PageserverRequestLabelGroupSet {
pageserver_id: lasso::ThreadedRodeo::new(),
path: lasso::ThreadedRodeo::new(),
method: StaticLabelSet::new(),
},
),
storage_controller_passthrough_request_latency: measured::HistogramVec::new(
measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
),
storage_controller_database_query_error: measured::CounterVec::new(
DatabaseQueryErrorLabelGroupSet {
operation: StaticLabelSet::new(),
error_type: StaticLabelSet::new(),
},
),
storage_controller_database_query_latency: measured::HistogramVec::new(
measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
),
}
}
}
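The explicit constructors above exist because these label sets mix `StaticLabelSet` with dynamic `lasso` interners. A small sketch of the same `measured` patterns, reusing only the derives and attributes visible in this diff; the `inc` entry point is an assumption on our part:

```rust
#[derive(measured::FixedCardinalityLabel, Copy, Clone)]
enum Outcome {
    Ok,
    Error,
}

#[derive(measured::LabelGroup)]
#[label(set = RequestLabelSet)]
struct RequestLabels<'a> {
    // Dynamic label interned through lasso, as in the groups above; `default`
    // lets #[metric(new())] construct the set.
    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
    path: &'a str,
    outcome: Outcome,
}

#[derive(measured::MetricGroup)]
#[metric(new())]
struct Metrics {
    // Requests broken down by (path, outcome).
    requests: measured::CounterVec<RequestLabelSet>,
}

fn record(metrics: &Metrics, path: &str, outcome: Outcome) {
    // Assumed entry point: CounterVec::inc with a label group value.
    metrics.requests.inc(RequestLabels { path, outcome });
}
```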
@@ -119,7 +153,7 @@ pub(crate) struct ReconcileCompleteLabelGroup {
#[derive(measured::LabelGroup)]
#[label(set = HttpRequestStatusLabelGroupSet)]
pub(crate) struct HttpRequestStatusLabelGroup<'a> {
#[label(dynamic_with = lasso::ThreadedRodeo, default)]
#[label(dynamic_with = lasso::ThreadedRodeo)]
pub(crate) path: &'a str,
pub(crate) method: Method,
pub(crate) status: StatusCode,
@@ -128,21 +162,40 @@ pub(crate) struct HttpRequestStatusLabelGroup<'a> {
#[derive(measured::LabelGroup)]
#[label(set = HttpRequestLatencyLabelGroupSet)]
pub(crate) struct HttpRequestLatencyLabelGroup<'a> {
#[label(dynamic_with = lasso::ThreadedRodeo, default)]
#[label(dynamic_with = lasso::ThreadedRodeo)]
pub(crate) path: &'a str,
pub(crate) method: Method,
}
impl Default for HttpRequestLatencyLabelGroupSet {
fn default() -> Self {
Self {
path: lasso::ThreadedRodeo::new(),
method: StaticLabelSet::new(),
}
}
}
#[derive(measured::LabelGroup, Clone)]
#[label(set = PageserverRequestLabelGroupSet)]
pub(crate) struct PageserverRequestLabelGroup<'a> {
#[label(dynamic_with = lasso::ThreadedRodeo, default)]
#[label(dynamic_with = lasso::ThreadedRodeo)]
pub(crate) pageserver_id: &'a str,
#[label(dynamic_with = lasso::ThreadedRodeo, default)]
#[label(dynamic_with = lasso::ThreadedRodeo)]
pub(crate) path: &'a str,
pub(crate) method: Method,
}
impl Default for PageserverRequestLabelGroupSet {
fn default() -> Self {
Self {
pageserver_id: lasso::ThreadedRodeo::new(),
path: lasso::ThreadedRodeo::new(),
method: StaticLabelSet::new(),
}
}
}
#[derive(measured::LabelGroup)]
#[label(set = DatabaseQueryErrorLabelGroupSet)]
pub(crate) struct DatabaseQueryErrorLabelGroup {
@@ -156,7 +209,7 @@ pub(crate) struct DatabaseQueryLatencyLabelGroup {
pub(crate) operation: DatabaseOperation,
}
#[derive(FixedCardinalityLabel, Clone, Copy)]
#[derive(FixedCardinalityLabel)]
pub(crate) enum ReconcileOutcome {
#[label(rename = "ok")]
Success,
@@ -164,7 +217,7 @@ pub(crate) enum ReconcileOutcome {
Cancel,
}
#[derive(FixedCardinalityLabel, Copy, Clone)]
#[derive(FixedCardinalityLabel, Clone)]
pub(crate) enum Method {
Get,
Put,
@@ -189,12 +242,11 @@ impl From<hyper::Method> for Method {
}
}
#[derive(Clone, Copy)]
pub(crate) struct StatusCode(pub(crate) hyper::http::StatusCode);
impl LabelValue for StatusCode {
fn visit<V: measured::label::LabelVisitor>(&self, v: V) -> V::Output {
v.write_int(self.0.as_u16() as i64)
v.write_int(self.0.as_u16() as u64)
}
}
@@ -212,7 +264,7 @@ impl FixedCardinalityLabel for StatusCode {
}
}
#[derive(FixedCardinalityLabel, Clone, Copy)]
#[derive(FixedCardinalityLabel)]
pub(crate) enum DatabaseErrorLabel {
Query,
Connection,


@@ -3,8 +3,7 @@ use std::{str::FromStr, time::Duration};
use hyper::StatusCode;
use pageserver_api::{
controller_api::{
NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy,
TenantLocateResponseShard,
NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, TenantLocateResponseShard,
},
shard::TenantShardId,
};
@@ -257,19 +256,6 @@ impl Node {
)
.await
}
/// Generate the simplified API-friendly description of a node's state
pub(crate) fn describe(&self) -> NodeDescribeResponse {
NodeDescribeResponse {
id: self.id,
availability: self.availability.into(),
scheduling: self.scheduling,
listen_http_addr: self.listen_http_addr.clone(),
listen_http_port: self.listen_http_port,
listen_pg_addr: self.listen_pg_addr.clone(),
listen_pg_port: self.listen_pg_port,
}
}
}
impl std::fmt::Display for Node {


@@ -9,7 +9,6 @@ use camino::Utf8PathBuf;
use diesel::pg::PgConnection;
use diesel::prelude::*;
use diesel::Connection;
use pageserver_api::controller_api::ShardSchedulingPolicy;
use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy};
use pageserver_api::models::TenantConfig;
use pageserver_api::shard::ShardConfigError;
@@ -79,7 +78,7 @@ pub(crate) enum DatabaseError {
Logical(String),
}
#[derive(measured::FixedCardinalityLabel, Copy, Clone)]
#[derive(measured::FixedCardinalityLabel, Clone)]
pub(crate) enum DatabaseOperation {
InsertNode,
UpdateNode,
@@ -108,12 +107,6 @@ pub(crate) enum AbortShardSplitStatus {
pub(crate) type DatabaseResult<T> = Result<T, DatabaseError>;
/// Some methods can operate on either a whole tenant or a single shard
pub(crate) enum TenantFilter {
Tenant(TenantId),
Shard(TenantShardId),
}
impl Persistence {
// The default postgres connection limit is 100. We use up to 99, to leave one free for a human admin under
// normal circumstances. This assumes we have exclusive use of the database cluster to which we connect.
@@ -147,13 +140,15 @@ impl Persistence {
/// Wraps `with_conn` in order to collect latency and error metrics
async fn with_measured_conn<F, R>(&self, op: DatabaseOperation, func: F) -> DatabaseResult<R>
where
F: FnOnce(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
R: Send + 'static,
{
let latency = &METRICS_REGISTRY
.metrics_group
.storage_controller_database_query_latency;
let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup { operation: op });
let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup {
operation: op.clone(),
});
let res = self.with_conn(func).await;
@@ -173,7 +168,7 @@ impl Persistence {
/// Call the provided function in a tokio blocking thread, with a Diesel database connection.
async fn with_conn<F, R>(&self, func: F) -> DatabaseResult<R>
where
F: FnOnce(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
R: Send + 'static,
{
let mut conn = self.connection_pool.get()?;
@@ -280,11 +275,6 @@ impl Persistence {
// Backward compat for test data after PR https://github.com/neondatabase/neon/pull/7165
shard.placement_policy = "{\"Attached\":0}".to_string();
}
if shard.scheduling_policy.is_empty() {
shard.scheduling_policy =
serde_json::to_string(&ShardSchedulingPolicy::default()).unwrap();
}
}
let tenants: Vec<TenantShardPersistence> = decoded.tenants.into_values().collect();
@@ -475,45 +465,59 @@ impl Persistence {
/// that we only do the first time a tenant is set to an attached policy via /location_config.
pub(crate) async fn update_tenant_shard(
&self,
tenant: TenantFilter,
input_placement_policy: Option<PlacementPolicy>,
input_config: Option<TenantConfig>,
tenant_shard_id: TenantShardId,
input_placement_policy: PlacementPolicy,
input_config: TenantConfig,
input_generation: Option<Generation>,
input_scheduling_policy: Option<ShardSchedulingPolicy>,
) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_measured_conn(DatabaseOperation::UpdateTenantShard, move |conn| {
let query = match tenant {
TenantFilter::Shard(tenant_shard_id) => diesel::update(tenant_shards)
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
.into_boxed(),
TenantFilter::Tenant(input_tenant_id) => diesel::update(tenant_shards)
.filter(tenant_id.eq(input_tenant_id.to_string()))
.into_boxed(),
};
let query = diesel::update(tenant_shards)
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32));
#[derive(AsChangeset)]
#[diesel(table_name = crate::schema::tenant_shards)]
struct ShardUpdate {
generation: Option<i32>,
placement_policy: Option<String>,
config: Option<String>,
scheduling_policy: Option<String>,
if let Some(input_generation) = input_generation {
// Update includes generation column
query
.set((
generation.eq(Some(input_generation.into().unwrap() as i32)),
placement_policy
.eq(serde_json::to_string(&input_placement_policy).unwrap()),
config.eq(serde_json::to_string(&input_config).unwrap()),
))
.execute(conn)?;
} else {
// Update does not include generation column
query
.set((
placement_policy
.eq(serde_json::to_string(&input_placement_policy).unwrap()),
config.eq(serde_json::to_string(&input_config).unwrap()),
))
.execute(conn)?;
}
let update = ShardUpdate {
generation: input_generation.map(|g| g.into().unwrap() as i32),
placement_policy: input_placement_policy
.map(|p| serde_json::to_string(&p).unwrap()),
config: input_config.map(|c| serde_json::to_string(&c).unwrap()),
scheduling_policy: input_scheduling_policy
.map(|p| serde_json::to_string(&p).unwrap()),
};
Ok(())
})
.await?;
query.set(update).execute(conn)?;
Ok(())
}
pub(crate) async fn update_tenant_config(
&self,
input_tenant_id: TenantId,
input_config: TenantConfig,
) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_measured_conn(DatabaseOperation::UpdateTenantConfig, move |conn| {
diesel::update(tenant_shards)
.filter(tenant_id.eq(input_tenant_id.to_string()))
.set((config.eq(serde_json::to_string(&input_config).unwrap()),))
.execute(conn)?;
Ok(())
})
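The `ShardUpdate` changeset above replaces two hand-written UPDATE branches: with Diesel, `Option` fields in an `AsChangeset` struct are skipped when `None`, so a single statement covers every combination of columns. A self-contained sketch against a toy table (table and helper are illustrative, not the real schema):

```rust
use diesel::prelude::*;

diesel::table! {
    tenant_shards (tenant_id) {
        tenant_id -> Varchar,
        generation -> Int4,
        placement_policy -> Varchar,
        config -> Text,
    }
}

#[derive(AsChangeset)]
#[diesel(table_name = tenant_shards)]
struct ShardUpdate {
    // None means "leave this column unchanged".
    generation: Option<i32>,
    placement_policy: Option<String>,
    config: Option<String>,
}

fn bump_generation(conn: &mut PgConnection, id: &str) -> QueryResult<usize> {
    use tenant_shards::dsl::*;
    // At least one field should be Some, or Diesel has nothing to update.
    diesel::update(tenant_shards.filter(tenant_id.eq(id)))
        .set(ShardUpdate {
            generation: Some(3),
            placement_policy: None,
            config: None,
        })
        .execute(conn)
}
```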
@@ -694,7 +698,7 @@ impl Persistence {
}
}
/// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably
/// Parts of [`crate::tenant_state::TenantState`] that are stored durably
#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)]
#[diesel(table_name = crate::schema::tenant_shards)]
pub(crate) struct TenantShardPersistence {
@@ -724,8 +728,6 @@ pub(crate) struct TenantShardPersistence {
pub(crate) splitting: SplitState,
#[serde(default)]
pub(crate) config: String,
#[serde(default)]
pub(crate) scheduling_policy: String,
}
impl TenantShardPersistence {


@@ -18,14 +18,14 @@ use utils::sync::gate::GateGuard;
use crate::compute_hook::{ComputeHook, NotifyError};
use crate::node::Node;
use crate::tenant_shard::{IntentState, ObservedState, ObservedStateLocation};
use crate::tenant_state::{IntentState, ObservedState, ObservedStateLocation};
const DEFAULT_HEATMAP_PERIOD: &str = "60s";
/// Object with the lifetime of the background reconcile task that is created
/// for tenants which have a difference between their intent and observed states.
pub(super) struct Reconciler {
/// See [`crate::tenant_shard::TenantShard`] for the meanings of these fields: they are a snapshot
/// See [`crate::tenant_state::TenantState`] for the meanings of these fields: they are a snapshot
/// of a tenant's state from when we spawned a reconcile task.
pub(super) tenant_shard_id: TenantShardId,
pub(crate) shard: ShardIdentity,
@@ -48,11 +48,11 @@ pub(super) struct Reconciler {
/// To avoid stalling if the cloud control plane is unavailable, we may proceed
/// past failures in [`ComputeHook::notify`], but we _must_ remember that we failed
/// so that we can set [`crate::tenant_shard::TenantShard::pending_compute_notification`] to ensure a later retry.
/// so that we can set [`crate::tenant_state::TenantState::pending_compute_notification`] to ensure a later retry.
pub(crate) compute_notify_failure: bool,
/// A means to abort background reconciliation: it is essential to
/// call this when something changes in the original TenantShard that
/// call this when something changes in the original TenantState that
/// will make this reconciliation impossible or unnecessary, for
/// example when a pageserver node goes offline, or the PlacementPolicy for
/// the tenant is changed.
@@ -66,7 +66,7 @@ pub(super) struct Reconciler {
pub(crate) persistence: Arc<Persistence>,
}
/// This is a snapshot of [`crate::tenant_shard::IntentState`], but it does not do any
/// This is a snapshot of [`crate::tenant_state::IntentState`], but it does not do any
/// reference counting for Scheduler. The IntentState is what the scheduler works with,
/// and the TargetState is just the instruction for a particular Reconciler run.
#[derive(Debug)]
@@ -487,7 +487,6 @@ impl Reconciler {
while let Err(e) = self.compute_notify().await {
match e {
NotifyError::Fatal(_) => return Err(ReconcileError::Notify(e)),
NotifyError::ShuttingDown => return Err(ReconcileError::Cancel),
_ => {
tracing::warn!(
"Live migration blocked by compute notification error, retrying: {e}"


@@ -1,4 +1,4 @@
use crate::{node::Node, tenant_shard::TenantShard};
use crate::{node::Node, tenant_state::TenantState};
use pageserver_api::controller_api::UtilizationScore;
use serde::Serialize;
use std::collections::HashMap;
@@ -27,7 +27,7 @@ pub enum MaySchedule {
#[derive(Serialize)]
struct SchedulerNode {
/// How many shards are currently scheduled on this node, via their [`crate::tenant_shard::IntentState`].
/// How many shards are currently scheduled on this node, via their [`crate::tenant_state::IntentState`].
shard_count: usize,
/// Whether this node is currently eligible to have new shards scheduled (this is derived
@@ -58,70 +58,6 @@ pub(crate) struct Scheduler {
nodes: HashMap<NodeId, SchedulerNode>,
}
/// Score for soft constraint scheduling: lower scores are preferred to higher scores.
///
/// For example, we may set an affinity score based on the number of shards from the same
/// tenant already on a node, to implicitly prefer to balance out shards.
#[derive(Copy, Clone, Debug, Eq, PartialEq, PartialOrd, Ord)]
pub(crate) struct AffinityScore(pub(crate) usize);
impl AffinityScore {
/// If we have no anti-affinity at all toward a node, this is its score. It means
/// the scheduler has a free choice amongst nodes with this score, and may pick a node
/// based on other information such as total utilization.
pub(crate) const FREE: Self = Self(0);
pub(crate) fn inc(&mut self) {
self.0 += 1;
}
}
impl std::ops::Add for AffinityScore {
type Output = Self;
fn add(self, rhs: Self) -> Self::Output {
Self(self.0 + rhs.0)
}
}
// For carrying state between multiple calls to [`TenantShard::schedule`], e.g. when calling
// it for many shards in the same tenant.
#[derive(Debug, Default)]
pub(crate) struct ScheduleContext {
/// Sparse map of nodes: omitting a node implicitly makes its affinity [`AffinityScore::FREE`]
pub(crate) nodes: HashMap<NodeId, AffinityScore>,
/// Specifically how many _attached_ locations are on each node
pub(crate) attached_nodes: HashMap<NodeId, usize>,
}
impl ScheduleContext {
/// Input is a list of nodes we would like to avoid using again within this context. The more
/// times a node is passed into this call, the less inclined we are to use it.
pub(crate) fn avoid(&mut self, nodes: &[NodeId]) {
for node_id in nodes {
let entry = self.nodes.entry(*node_id).or_insert(AffinityScore::FREE);
entry.inc()
}
}
pub(crate) fn push_attached(&mut self, node_id: NodeId) {
let entry = self.attached_nodes.entry(node_id).or_default();
*entry += 1;
}
pub(crate) fn get_node_affinity(&self, node_id: NodeId) -> AffinityScore {
self.nodes
.get(&node_id)
.copied()
.unwrap_or(AffinityScore::FREE)
}
pub(crate) fn get_node_attachments(&self, node_id: NodeId) -> usize {
self.attached_nodes.get(&node_id).copied().unwrap_or(0)
}
}
impl Scheduler {
pub(crate) fn new<'a>(nodes: impl Iterator<Item = &'a Node>) -> Self {
let mut scheduler_nodes = HashMap::new();
@@ -147,7 +83,7 @@ impl Scheduler {
pub(crate) fn consistency_check<'a>(
&self,
nodes: impl Iterator<Item = &'a Node>,
shards: impl Iterator<Item = &'a TenantShard>,
shards: impl Iterator<Item = &'a TenantState>,
) -> anyhow::Result<()> {
let mut expect_nodes: HashMap<NodeId, SchedulerNode> = HashMap::new();
for node in nodes {
@@ -288,47 +224,27 @@ impl Scheduler {
node.and_then(|(node_id, may_schedule)| if may_schedule { Some(node_id) } else { None })
}
/// hard_exclude: it is forbidden to use nodes in this list, typically because they
/// are already in use by this shard -- we use this to avoid picking the same node
/// as both attached and secondary location. This is a hard constraint: if we cannot
/// find any nodes that aren't in this list, then we will return a [`ScheduleError::ImpossibleConstraint`].
///
/// context: we prefer to avoid using nodes identified in the context, according
/// to their anti-affinity score. We use this to avoid placing shards of
/// the same tenant on the same node. This is a soft constraint: the context will never
/// cause us to fail to schedule a shard.
pub(crate) fn schedule_shard(
&self,
hard_exclude: &[NodeId],
context: &ScheduleContext,
) -> Result<NodeId, ScheduleError> {
pub(crate) fn schedule_shard(&self, hard_exclude: &[NodeId]) -> Result<NodeId, ScheduleError> {
if self.nodes.is_empty() {
return Err(ScheduleError::NoPageservers);
}
let mut scores: Vec<(NodeId, AffinityScore, usize)> = self
let mut tenant_counts: Vec<(NodeId, usize)> = self
.nodes
.iter()
.filter_map(|(k, v)| {
if hard_exclude.contains(k) || v.may_schedule == MaySchedule::No {
None
} else {
Some((
*k,
context.nodes.get(k).copied().unwrap_or(AffinityScore::FREE),
v.shard_count,
))
Some((*k, v.shard_count))
}
})
.collect();
// Sort by, in order of precedence:
// 1st: Affinity score. We should never pick a higher-score node if a lower-score node is available
// 2nd: Utilization. Within nodes with the same affinity, use the least loaded nodes.
// 3rd: Node ID. This is a convenience to make selection deterministic in tests and empty systems.
scores.sort_by_key(|i| (i.1, i.2, i.0));
// Sort by tenant count. Nodes with the same tenant count are sorted by ID.
tenant_counts.sort_by_key(|i| (i.1, i.0));
if scores.is_empty() {
if tenant_counts.is_empty() {
// After applying constraints, no pageservers were left. We log some detail about
// the state of nodes to help understand why this happened. This is not logged as an error because
// it is legitimately possible for enough nodes to be Offline to prevent scheduling a shard.
@@ -344,11 +260,10 @@ impl Scheduler {
return Err(ScheduleError::ImpossibleConstraint);
}
// Lowest score wins
let node_id = scores.first().unwrap().0;
let node_id = tenant_counts.first().unwrap().0;
tracing::info!(
"scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?})",
scores.iter().map(|i| i.0 .0).collect::<Vec<_>>()
"scheduler selected node {node_id} (elegible nodes {:?}, exclude: {hard_exclude:?})",
tenant_counts.iter().map(|i| i.0 .0).collect::<Vec<_>>()
);
// Note that we do not update shard count here to reflect the scheduling: that
@@ -356,12 +271,6 @@ impl Scheduler {
Ok(node_id)
}
/// Unit test access to internal state
#[cfg(test)]
pub(crate) fn get_node_shard_count(&self, node_id: NodeId) -> usize {
self.nodes.get(&node_id).unwrap().shard_count
}
}
#[cfg(test)]
@@ -398,7 +307,7 @@ pub(crate) mod test_utils {
mod tests {
use super::*;
use crate::tenant_shard::IntentState;
use crate::tenant_state::IntentState;
#[test]
fn scheduler_basic() -> anyhow::Result<()> {
let nodes = test_utils::make_test_nodes(2);
@@ -407,17 +316,15 @@ mod tests {
let mut t1_intent = IntentState::new();
let mut t2_intent = IntentState::new();
let context = ScheduleContext::default();
let scheduled = scheduler.schedule_shard(&[], &context)?;
let scheduled = scheduler.schedule_shard(&[])?;
t1_intent.set_attached(&mut scheduler, Some(scheduled));
let scheduled = scheduler.schedule_shard(&[], &context)?;
let scheduled = scheduler.schedule_shard(&[])?;
t2_intent.set_attached(&mut scheduler, Some(scheduled));
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);
assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1);
let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers(), &context)?;
let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers())?;
t1_intent.push_secondary(&mut scheduler, scheduled);
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);


@@ -22,7 +22,6 @@ diesel::table! {
placement_policy -> Varchar,
splitting -> Int2,
config -> Text,
scheduling_policy -> Varchar,
}
}
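For context, the dropped `scheduling_policy` Varchar stores a JSON-serialized `ShardSchedulingPolicy`, matching the `serde_json` round-trips visible elsewhere in this diff; a small sketch, assuming the enum derives `Default` and `PartialEq`:
// Sketch of the column's round-trip (assumed derives; see the
// to_persistent/from_persistent hunks later in this diff).
let stored: String = serde_json::to_string(&ShardSchedulingPolicy::default()).unwrap();
let loaded: ShardSchedulingPolicy = serde_json::from_str(&stored).unwrap();
assert_eq!(loaded, ShardSchedulingPolicy::default());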


@@ -8,10 +8,7 @@ use std::{
};
use crate::{
id_lock_map::IdLockMap,
persistence::{AbortShardSplitStatus, TenantFilter},
reconciler::ReconcileError,
scheduler::ScheduleContext,
id_lock_map::IdLockMap, persistence::AbortShardSplitStatus, reconciler::ReconcileError,
};
use anyhow::Context;
use control_plane::storage_controller::{
@@ -20,14 +17,12 @@ use control_plane::storage_controller::{
use diesel::result::DatabaseErrorKind;
use futures::{stream::FuturesUnordered, StreamExt};
use hyper::StatusCode;
use itertools::Itertools;
use pageserver_api::{
controller_api::{
NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
ShardSchedulingPolicy, TenantCreateResponse, TenantCreateResponseShard,
TenantDescribeResponse, TenantDescribeResponseShard, TenantLocateResponse,
TenantPolicyRequest, TenantShardMigrateRequest, TenantShardMigrateResponse,
UtilizationScore,
TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse,
TenantDescribeResponseShard, TenantLocateResponse, TenantShardMigrateRequest,
TenantShardMigrateResponse, UtilizationScore,
},
models::{SecondaryProgress, TenantConfigRequest},
};
@@ -56,6 +51,7 @@ use utils::{
generation::Generation,
http::error::ApiError,
id::{NodeId, TenantId, TimelineId},
seqwait::SeqWait,
sync::gate::Gate,
};
@@ -66,10 +62,11 @@ use crate::{
persistence::{split_state::SplitState, DatabaseError, Persistence, TenantShardPersistence},
reconciler::attached_location_conf,
scheduler::Scheduler,
tenant_shard::{
tenant_state::{
IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError,
ReconcilerWaiter, TenantShard,
ReconcilerWaiter, TenantState,
},
Sequence,
};
// For operations that should be quick, like attaching a new tenant
@@ -92,7 +89,7 @@ pub const MAX_UNAVAILABLE_INTERVAL_DEFAULT: Duration = Duration::from_secs(30);
// Top level state available to all HTTP handlers
struct ServiceState {
tenants: BTreeMap<TenantShardId, TenantShard>,
tenants: BTreeMap<TenantShardId, TenantState>,
nodes: Arc<HashMap<NodeId, Node>>,
@@ -102,7 +99,7 @@ struct ServiceState {
impl ServiceState {
fn new(
nodes: HashMap<NodeId, Node>,
tenants: BTreeMap<TenantShardId, TenantShard>,
tenants: BTreeMap<TenantShardId, TenantState>,
scheduler: Scheduler,
) -> Self {
Self {
@@ -116,7 +113,7 @@ impl ServiceState {
&mut self,
) -> (
&mut Arc<HashMap<NodeId, Node>>,
&mut BTreeMap<TenantShardId, TenantShard>,
&mut BTreeMap<TenantShardId, TenantState>,
&mut Scheduler,
) {
(&mut self.nodes, &mut self.tenants, &mut self.scheduler)
@@ -335,11 +332,11 @@ impl Service {
for (tenant_shard_id, shard_observations) in observed {
for (node_id, observed_loc) in shard_observations {
let Some(tenant_shard) = tenants.get_mut(&tenant_shard_id) else {
let Some(tenant_state) = tenants.get_mut(&tenant_shard_id) else {
cleanup.push((tenant_shard_id, node_id));
continue;
};
tenant_shard
tenant_state
.observed
.locations
.insert(node_id, ObservedStateLocation { conf: observed_loc });
@@ -347,15 +344,9 @@ impl Service {
}
// Populate each tenant's intent state
let mut schedule_context = ScheduleContext::default();
for (tenant_shard_id, tenant_shard) in tenants.iter_mut() {
if tenant_shard_id.shard_number == ShardNumber(0) {
// Reset scheduling context each time we advance to the next Tenant
schedule_context = ScheduleContext::default();
}
tenant_shard.intent_from_observed(scheduler);
if let Err(e) = tenant_shard.schedule(scheduler, &mut schedule_context) {
for (tenant_shard_id, tenant_state) in tenants.iter_mut() {
tenant_state.intent_from_observed(scheduler);
if let Err(e) = tenant_state.schedule(scheduler) {
// Non-fatal error: we are unable to properly schedule the tenant, perhaps because
// not enough pageservers are available. The tenant may well still be available
// to clients.
@@ -364,11 +355,11 @@ impl Service {
// If we're both intending and observed to be attached at a particular node, we will
// emit a compute notification for this. In the case where our observed state does not
// yet match our intent, we will eventually reconcile, and that will emit a compute notification.
if let Some(attached_at) = tenant_shard.stably_attached() {
if let Some(attached_at) = tenant_state.stably_attached() {
compute_notifications.push((
*tenant_shard_id,
attached_at,
tenant_shard.shard.stripe_size,
tenant_state.shard.stripe_size,
));
}
}
@@ -679,13 +670,7 @@ impl Service {
let mut interval = tokio::time::interval(BACKGROUND_RECONCILE_PERIOD);
while !self.cancel.is_cancelled() {
tokio::select! {
_ = interval.tick() => {
let reconciles_spawned = self.reconcile_all();
if reconciles_spawned == 0 {
// Run optimizer only when we didn't find any other work to do
self.optimize_all();
}
}
_ = interval.tick() => { self.reconcile_all(); }
_ = self.cancel.cancelled() => return
}
}
@@ -743,7 +728,7 @@ impl Service {
/// Apply the contents of a [`ReconcileResult`] to our in-memory state: if the reconciliation
/// was successful, this will update the observed state of the tenant such that subsequent
/// calls to [`TenantShard::maybe_reconcile`] will do nothing.
/// calls to [`TenantState::maybe_reconcile`] will do nothing.
#[instrument(skip_all, fields(
tenant_id=%result.tenant_shard_id.tenant_id, shard_id=%result.tenant_shard_id.shard_slug(),
sequence=%result.sequence
@@ -761,10 +746,10 @@ impl Service {
tenant.generation = std::cmp::max(tenant.generation, result.generation);
// If the reconciler signals that it failed to notify compute, set this state on
// the shard so that a future [`TenantShard::maybe_reconcile`] will try again.
// the shard so that a future [`TenantState::maybe_reconcile`] will try again.
tenant.pending_compute_notification = result.pending_compute_notification;
// Let the TenantShard know it is idle.
// Let the TenantState know it is idle.
tenant.reconcile_complete(result.sequence);
match result.result {
@@ -972,14 +957,30 @@ impl Service {
}
for tsp in tenant_shard_persistence {
let tenant_shard_id = tsp.get_tenant_shard_id()?;
let shard_identity = tsp.get_shard_identity()?;
// We will populate intent properly later in [`Self::startup_reconcile`]; initially we populate
// it with what we can infer: the node for which a generation was most recently issued.
let mut intent = IntentState::new();
if let Some(generation_pageserver) = tsp.generation_pageserver {
intent.set_attached(&mut scheduler, Some(NodeId(generation_pageserver as u64)));
}
let new_tenant = TenantShard::from_persistent(tsp, intent)?;
let new_tenant = TenantState {
tenant_shard_id,
shard: shard_identity,
sequence: Sequence::initial(),
generation: tsp.generation.map(|g| Generation::new(g as u32)),
policy: serde_json::from_str(&tsp.placement_policy).unwrap(),
intent,
observed: ObservedState::new(),
config: serde_json::from_str(&tsp.config).unwrap(),
reconciler: None,
splitting: tsp.splitting,
waiter: Arc::new(SeqWait::new(Sequence::initial())),
error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
last_error: Arc::default(),
pending_compute_notification: false,
};
tenants.insert(tenant_shard_id, new_tenant);
}
@@ -1103,8 +1104,6 @@ impl Service {
placement_policy: serde_json::to_string(&PlacementPolicy::Attached(0)).unwrap(),
config: serde_json::to_string(&TenantConfig::default()).unwrap(),
splitting: SplitState::default(),
scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default())
.unwrap(),
};
match self.persistence.insert_tenant_shards(vec![tsp]).await {
@@ -1126,7 +1125,7 @@ impl Service {
let mut locked = self.inner.write().unwrap();
locked.tenants.insert(
attach_req.tenant_shard_id,
TenantShard::new(
TenantState::new(
attach_req.tenant_shard_id,
ShardIdentity::unsharded(),
PlacementPolicy::Attached(0),
@@ -1157,10 +1156,9 @@ impl Service {
// when we are reattaching a detached tenant.
self.persistence
.update_tenant_shard(
TenantFilter::Shard(attach_req.tenant_shard_id),
Some(PlacementPolicy::Attached(0)),
Some(conf),
None,
attach_req.tenant_shard_id,
PlacementPolicy::Attached(0),
conf,
None,
)
.await?;
@@ -1178,32 +1176,32 @@ impl Service {
let mut locked = self.inner.write().unwrap();
let (_nodes, tenants, scheduler) = locked.parts_mut();
let tenant_shard = tenants
let tenant_state = tenants
.get_mut(&attach_req.tenant_shard_id)
.expect("Checked for existence above");
if let Some(new_generation) = new_generation {
tenant_shard.generation = Some(new_generation);
tenant_shard.policy = PlacementPolicy::Attached(0);
tenant_state.generation = Some(new_generation);
tenant_state.policy = PlacementPolicy::Attached(0);
} else {
// This is a detach notification. We must update placement policy to avoid re-attaching
// during background scheduling/reconciliation, or during storage controller restart.
assert!(attach_req.node_id.is_none());
tenant_shard.policy = PlacementPolicy::Detached;
tenant_state.policy = PlacementPolicy::Detached;
}
if let Some(attaching_pageserver) = attach_req.node_id.as_ref() {
tracing::info!(
tenant_id = %attach_req.tenant_shard_id,
ps_id = %attaching_pageserver,
generation = ?tenant_shard.generation,
generation = ?tenant_state.generation,
"issuing",
);
} else if let Some(ps_id) = tenant_shard.intent.get_attached() {
} else if let Some(ps_id) = tenant_state.intent.get_attached() {
tracing::info!(
tenant_id = %attach_req.tenant_shard_id,
%ps_id,
generation = ?tenant_shard.generation,
generation = ?tenant_state.generation,
"dropping",
);
} else {
@@ -1211,14 +1209,14 @@ impl Service {
tenant_id = %attach_req.tenant_shard_id,
"no-op: tenant already has no pageserver");
}
tenant_shard
tenant_state
.intent
.set_attached(scheduler, attach_req.node_id);
tracing::info!(
"attach_hook: tenant {} set generation {:?}, pageserver {}",
attach_req.tenant_shard_id,
tenant_shard.generation,
tenant_state.generation,
// TODO: this is an odd number of 0xf's
attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff))
);
@@ -1230,36 +1228,36 @@ impl Service {
#[cfg(feature = "testing")]
{
if let Some(node_id) = attach_req.node_id {
tenant_shard.observed.locations = HashMap::from([(
tenant_state.observed.locations = HashMap::from([(
node_id,
ObservedStateLocation {
conf: Some(attached_location_conf(
tenant_shard.generation.unwrap(),
&tenant_shard.shard,
&tenant_shard.config,
tenant_state.generation.unwrap(),
&tenant_state.shard,
&tenant_state.config,
false,
)),
},
)]);
} else {
tenant_shard.observed.locations.clear();
tenant_state.observed.locations.clear();
}
}
Ok(AttachHookResponse {
gen: attach_req
.node_id
.map(|_| tenant_shard.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap()),
.map(|_| tenant_state.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap()),
})
}
pub(crate) fn inspect(&self, inspect_req: InspectRequest) -> InspectResponse {
let locked = self.inner.read().unwrap();
let tenant_shard = locked.tenants.get(&inspect_req.tenant_shard_id);
let tenant_state = locked.tenants.get(&inspect_req.tenant_shard_id);
InspectResponse {
attachment: tenant_shard.and_then(|s| {
attachment: tenant_state.and_then(|s| {
s.intent
.get_attached()
.map(|ps| (s.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap(), ps))
@@ -1321,11 +1319,11 @@ impl Service {
let mut locked = self.inner.write().unwrap();
for (tenant_shard_id, observed_loc) in configs.tenant_shards {
let Some(tenant_shard) = locked.tenants.get_mut(&tenant_shard_id) else {
let Some(tenant_state) = locked.tenants.get_mut(&tenant_shard_id) else {
cleanup.push(tenant_shard_id);
continue;
};
tenant_shard
tenant_state
.observed
.locations
.insert(node.get_id(), ObservedStateLocation { conf: observed_loc });
@@ -1496,13 +1494,13 @@ impl Service {
};
for req_tenant in validate_req.tenants {
if let Some(tenant_shard) = locked.tenants.get(&req_tenant.id) {
let valid = tenant_shard.generation == Some(Generation::new(req_tenant.gen));
if let Some(tenant_state) = locked.tenants.get(&req_tenant.id) {
let valid = tenant_state.generation == Some(Generation::new(req_tenant.gen));
tracing::info!(
"handle_validate: {}(gen {}): valid={valid} (latest {:?})",
req_tenant.id,
req_tenant.gen,
tenant_shard.generation
tenant_state.generation
);
response.tenants.push(ValidateResponseTenant {
id: req_tenant.id,
@@ -1617,8 +1615,6 @@ impl Service {
placement_policy: serde_json::to_string(&placement_policy).unwrap(),
config: serde_json::to_string(&create_req.config).unwrap(),
splitting: SplitState::default(),
scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default())
.unwrap(),
})
.collect();
@@ -1641,8 +1637,6 @@ impl Service {
Err(e) => return Err(ApiError::InternalServerError(anyhow::anyhow!(e))),
};
let mut schedule_context = ScheduleContext::default();
let (waiters, response_shards) = {
let mut locked = self.inner.write().unwrap();
let (nodes, tenants, scheduler) = locked.parts_mut();
@@ -1664,14 +1658,11 @@ impl Service {
// attached and secondary locations (independently) away from those
// pageservers also holding a shard for this tenant.
entry
.get_mut()
.schedule(scheduler, &mut schedule_context)
.map_err(|e| {
ApiError::Conflict(format!(
"Failed to schedule shard {tenant_shard_id}: {e}"
))
})?;
entry.get_mut().schedule(scheduler).map_err(|e| {
ApiError::Conflict(format!(
"Failed to schedule shard {tenant_shard_id}: {e}"
))
})?;
if let Some(node_id) = entry.get().intent.get_attached() {
let generation = entry
@@ -1688,7 +1679,7 @@ impl Service {
continue;
}
Entry::Vacant(entry) => {
let state = entry.insert(TenantShard::new(
let state = entry.insert(TenantState::new(
tenant_shard_id,
ShardIdentity::from_params(
tenant_shard_id.shard_number,
@@ -1699,7 +1690,7 @@ impl Service {
state.generation = initial_generation;
state.config = create_req.config.clone();
if let Err(e) = state.schedule(scheduler, &mut schedule_context) {
if let Err(e) = state.schedule(scheduler) {
schcedule_error = Some(e);
}
@@ -1763,9 +1754,6 @@ impl Service {
/// Part of [`Self::tenant_location_config`]: dissect an incoming location config request,
/// and transform it into either a tenant creation or a series of shard updates.
///
/// If the incoming request makes no changes, a [`TenantCreateOrUpdate::Update`] result will
/// still be returned.
fn tenant_location_config_prepare(
&self,
tenant_id: TenantId,
@@ -1813,12 +1801,17 @@ impl Service {
_ => None,
};
updates.push(ShardUpdate {
tenant_shard_id: *shard_id,
placement_policy: placement_policy.clone(),
tenant_config: req.config.tenant_conf.clone(),
generation: set_generation,
});
if shard.policy != placement_policy
|| shard.config != req.config.tenant_conf
|| set_generation.is_some()
{
updates.push(ShardUpdate {
tenant_shard_id: *shard_id,
placement_policy: placement_policy.clone(),
tenant_config: req.config.tenant_conf.clone(),
generation: set_generation,
});
}
}
if create {
@@ -1847,7 +1840,6 @@ impl Service {
},
)
} else {
assert!(!updates.is_empty());
TenantCreateOrUpdate::Update(updates)
}
}
@@ -1906,7 +1898,6 @@ impl Service {
// Persist updates
// Ordering: write to the database before applying changes in-memory, so that
// we will not appear to time-travel backwards on a restart.
let mut schedule_context = ScheduleContext::default();
for ShardUpdate {
tenant_shard_id,
placement_policy,
@@ -1916,11 +1907,10 @@ impl Service {
{
self.persistence
.update_tenant_shard(
TenantFilter::Shard(*tenant_shard_id),
Some(placement_policy.clone()),
Some(tenant_config.clone()),
*tenant_shard_id,
placement_policy.clone(),
tenant_config.clone(),
*generation,
None,
)
.await?;
}
@@ -1954,7 +1944,7 @@ impl Service {
shard.generation = Some(generation);
}
shard.schedule(scheduler, &mut schedule_context)?;
shard.schedule(scheduler)?;
let maybe_waiter = self.maybe_reconcile_shard(shard, nodes);
if let Some(waiter) = maybe_waiter {
@@ -1998,13 +1988,7 @@ impl Service {
let config = req.config;
self.persistence
.update_tenant_shard(
TenantFilter::Tenant(req.tenant_id),
None,
Some(config.clone()),
None,
None,
)
.update_tenant_config(req.tenant_id, config.clone())
.await?;
let waiters = {
@@ -2114,7 +2098,7 @@ impl Service {
let scheduler = &locked.scheduler;
// Right now we only perform the operation on a single node without parallelization
// TODO fan out the operation to multiple nodes for better performance
let node_id = scheduler.schedule_shard(&[], &ScheduleContext::default())?;
let node_id = scheduler.schedule_shard(&[])?;
let node = locked
.nodes
.get(&node_id)
@@ -2357,58 +2341,6 @@ impl Service {
Ok(StatusCode::NOT_FOUND)
}
/// Naming: this configures the storage controller's policies for a tenant, whereas [`Self::tenant_config_set`] is "set the TenantConfig"
/// for a tenant. The TenantConfig is passed through to pageservers, whereas this function modifies
/// the tenant's policies (configuration) within the storage controller
pub(crate) async fn tenant_update_policy(
&self,
tenant_id: TenantId,
req: TenantPolicyRequest,
) -> Result<(), ApiError> {
// We require an exclusive lock, because we are updating persistent and in-memory state
let _tenant_lock = self.tenant_op_locks.exclusive(tenant_id).await;
let TenantPolicyRequest {
placement,
scheduling,
} = req;
self.persistence
.update_tenant_shard(
TenantFilter::Tenant(tenant_id),
placement.clone(),
None,
None,
scheduling,
)
.await?;
let mut schedule_context = ScheduleContext::default();
let mut locked = self.inner.write().unwrap();
let (nodes, tenants, scheduler) = locked.parts_mut();
for (shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
if let Some(placement) = &placement {
shard.policy = placement.clone();
tracing::info!(tenant_id=%shard_id.tenant_id, shard_id=%shard_id.shard_slug(),
"Updated placement policy to {placement:?}");
}
if let Some(scheduling) = &scheduling {
shard.set_scheduling_policy(*scheduling);
tracing::info!(tenant_id=%shard_id.tenant_id, shard_id=%shard_id.shard_slug(),
"Updated scheduling policy to {scheduling:?}");
}
// In case scheduling is being switched back on, try it now.
shard.schedule(scheduler, &mut schedule_context).ok();
self.maybe_reconcile_shard(shard, nodes);
}
Ok(())
}
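A hedged sketch of calling this policy API, with field names as destructured above (`service` and `tenant_id` are hypothetical handles):
// Sketch: pause scheduling for every shard in a tenant, leaving placement unchanged.
let req = TenantPolicyRequest {
    placement: None,
    scheduling: Some(ShardSchedulingPolicy::Pause),
};
service.tenant_update_policy(tenant_id, req).await?;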
pub(crate) async fn tenant_timeline_create(
&self,
tenant_id: TenantId,
@@ -2735,71 +2667,45 @@ impl Service {
})
}
/// Returns None if the input iterator of shards does not include a shard with number=0
fn tenant_describe_impl<'a>(
&self,
shards: impl Iterator<Item = &'a TenantShard>,
) -> Option<TenantDescribeResponse> {
let mut shard_zero = None;
let mut describe_shards = Vec::new();
for shard in shards {
if shard.tenant_shard_id.is_zero() {
shard_zero = Some(shard);
}
describe_shards.push(TenantDescribeResponseShard {
tenant_shard_id: shard.tenant_shard_id,
node_attached: *shard.intent.get_attached(),
node_secondary: shard.intent.get_secondary().to_vec(),
last_error: shard.last_error.lock().unwrap().clone(),
is_reconciling: shard.reconciler.is_some(),
is_pending_compute_notification: shard.pending_compute_notification,
is_splitting: matches!(shard.splitting, SplitState::Splitting),
scheduling_policy: *shard.get_scheduling_policy(),
})
}
let shard_zero = shard_zero?;
Some(TenantDescribeResponse {
tenant_id: shard_zero.tenant_shard_id.tenant_id,
shards: describe_shards,
stripe_size: shard_zero.shard.stripe_size,
policy: shard_zero.policy.clone(),
config: shard_zero.config.clone(),
})
}
pub(crate) fn tenant_describe(
&self,
tenant_id: TenantId,
) -> Result<TenantDescribeResponse, ApiError> {
let locked = self.inner.read().unwrap();
self.tenant_describe_impl(
locked
.tenants
.range(TenantShardId::tenant_range(tenant_id))
.map(|(_k, v)| v),
)
.ok_or_else(|| ApiError::NotFound(anyhow::anyhow!("Tenant {tenant_id} not found").into()))
}
let mut shard_zero = None;
let mut shards = Vec::new();
pub(crate) fn tenant_list(&self) -> Vec<TenantDescribeResponse> {
let locked = self.inner.read().unwrap();
let mut result = Vec::new();
for (_tenant_id, tenant_shards) in
&locked.tenants.iter().group_by(|(id, _shard)| id.tenant_id)
for (tenant_shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id))
{
result.push(
self.tenant_describe_impl(tenant_shards.map(|(_k, v)| v))
.expect("Groups are always non-empty"),
);
if tenant_shard_id.is_zero() {
shard_zero = Some(shard);
}
let response_shard = TenantDescribeResponseShard {
tenant_shard_id: *tenant_shard_id,
node_attached: *shard.intent.get_attached(),
node_secondary: shard.intent.get_secondary().to_vec(),
last_error: shard.last_error.lock().unwrap().clone(),
is_reconciling: shard.reconciler.is_some(),
is_pending_compute_notification: shard.pending_compute_notification,
is_splitting: matches!(shard.splitting, SplitState::Splitting),
};
shards.push(response_shard);
}
result
let Some(shard_zero) = shard_zero else {
return Err(ApiError::NotFound(
anyhow::anyhow!("Tenant {tenant_id} not found").into(),
));
};
Ok(TenantDescribeResponse {
shards,
stripe_size: shard_zero.shard.stripe_size,
policy: shard_zero.policy.clone(),
config: shard_zero.config.clone(),
})
}
#[instrument(skip_all, fields(tenant_id=%op.tenant_id))]
@@ -2892,7 +2798,7 @@ impl Service {
tracing::info!("Restoring parent shard {tenant_shard_id}");
shard.splitting = SplitState::Idle;
if let Err(e) = shard.schedule(scheduler, &mut ScheduleContext::default()) {
if let Err(e) = shard.schedule(scheduler) {
// If this shard can't be scheduled now (perhaps due to offline nodes or
// capacity issues), that must not prevent us rolling back a split. In this
// case it should be eventually scheduled in the background.
@@ -3016,7 +2922,6 @@ impl Service {
)
};
let mut schedule_context = ScheduleContext::default();
for child in child_ids {
let mut child_shard = parent_ident;
child_shard.number = child.shard_number;
@@ -3038,7 +2943,7 @@ impl Service {
},
);
let mut child_state = TenantShard::new(child, child_shard, policy.clone());
let mut child_state = TenantState::new(child, child_shard, policy.clone());
child_state.intent = IntentState::single(scheduler, Some(pageserver));
child_state.observed = ObservedState {
locations: child_observed,
@@ -3046,13 +2951,13 @@ impl Service {
child_state.generation = Some(generation);
child_state.config = config.clone();
// The child's TenantShard::splitting is intentionally left at the default value of Idle,
// The child's TenantState::splitting is intentionally left at the default value of Idle,
// as at this point in the split process we have succeeded and this part is infallible:
// we will never need to do any special recovery from this state.
child_locations.push((child, pageserver, child_shard.stripe_size));
if let Err(e) = child_state.schedule(scheduler, &mut schedule_context) {
if let Err(e) = child_state.schedule(scheduler) {
// This is not fatal, because we've implicitly already got an attached
// location for the child shard. Failure here just means we couldn't
// find a secondary (e.g. because cluster is overloaded).
@@ -3345,10 +3250,6 @@ impl Service {
placement_policy: serde_json::to_string(&policy).unwrap(),
config: serde_json::to_string(&config).unwrap(),
splitting: SplitState::Splitting,
// Scheduling policies do not carry through to children
scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default())
.unwrap(),
});
}
@@ -3595,8 +3496,8 @@ impl Service {
Ok(())
}
/// For debug/support: a full JSON dump of TenantShards. Returns a response so that
/// we don't have to make TenantShard clonable in the return path.
/// For debug/support: a full JSON dump of TenantStates. Returns a response so that
/// we don't have to make TenantState clonable in the return path.
pub(crate) fn tenants_dump(&self) -> Result<hyper::Response<hyper::Body>, ApiError> {
let serialized = {
let locked = self.inner.read().unwrap();
@@ -3700,7 +3601,7 @@ impl Service {
}
/// For debug/support: a JSON dump of the [`Scheduler`]. Returns a response so that
/// we don't have to make TenantShard clonable in the return path.
/// we don't have to make TenantState clonable in the return path.
pub(crate) fn scheduler_dump(&self) -> Result<hyper::Response<hyper::Body>, ApiError> {
let serialized = {
let locked = self.inner.read().unwrap();
@@ -3916,9 +3817,8 @@ impl Service {
AvailabilityTransition::ToOffline => {
tracing::info!("Node {} transition to offline", node_id);
let mut tenants_affected: usize = 0;
for (tenant_shard_id, tenant_shard) in tenants {
if let Some(observed_loc) = tenant_shard.observed.locations.get_mut(&node_id) {
for (tenant_shard_id, tenant_state) in tenants {
if let Some(observed_loc) = tenant_state.observed.locations.get_mut(&node_id) {
// When a node goes offline, we set its observed configuration to None, indicating unknown: we will
// not assume our knowledge of the node's configuration is accurate until it comes back online
observed_loc.conf = None;
@@ -3931,24 +3831,18 @@ impl Service {
continue;
}
if tenant_shard.intent.demote_attached(node_id) {
tenant_shard.sequence = tenant_shard.sequence.next();
// TODO: populate a ScheduleContext including all shards in the same tenant_id (only matters
// for tenants without secondary locations: if they have a secondary location, then this
// schedule() call is just promoting an existing secondary)
let mut schedule_context = ScheduleContext::default();
match tenant_shard.schedule(scheduler, &mut schedule_context) {
if tenant_state.intent.demote_attached(node_id) {
tenant_state.sequence = tenant_state.sequence.next();
match tenant_state.schedule(scheduler) {
Err(e) => {
// It is possible that some tenants will become unschedulable when too many pageservers
// go offline: in this case there isn't much we can do other than make the issue observable.
// TODO: give TenantShard a scheduling error attribute to be queried later.
// TODO: give TenantState a scheduling error attribute to be queried later.
tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", node_id);
}
Ok(()) => {
if self
.maybe_reconcile_shard(tenant_shard, &new_nodes)
.maybe_reconcile_shard(tenant_state, &new_nodes)
.is_some()
{
tenants_affected += 1;
@@ -3967,10 +3861,10 @@ impl Service {
tracing::info!("Node {} transition to active", node_id);
// When a node comes back online, we must reconcile any tenant that has a None observed
// location on the node.
for tenant_shard in locked.tenants.values_mut() {
if let Some(observed_loc) = tenant_shard.observed.locations.get_mut(&node_id) {
for tenant_state in locked.tenants.values_mut() {
if let Some(observed_loc) = tenant_state.observed.locations.get_mut(&node_id) {
if observed_loc.conf.is_none() {
self.maybe_reconcile_shard(tenant_shard, &new_nodes);
self.maybe_reconcile_shard(tenant_state, &new_nodes);
}
}
}
@@ -3990,6 +3884,9 @@ impl Service {
/// Helper for methods that will try and call pageserver APIs for
/// a tenant, such as timeline CRUD: they cannot proceed unless the tenant
/// is attached somewhere.
///
/// TODO: this doesn't actually ensure attached unless the PlacementPolicy is
/// an attached policy. We should error out if it isn't.
fn ensure_attached_schedule(
&self,
mut locked: std::sync::RwLockWriteGuard<'_, ServiceState>,
@@ -3998,27 +3895,10 @@ impl Service {
let mut waiters = Vec::new();
let (nodes, tenants, scheduler) = locked.parts_mut();
let mut schedule_context = ScheduleContext::default();
for (tenant_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
shard.schedule(scheduler, &mut schedule_context)?;
// The shard's policies may not result in an attached location being scheduled: this
// is an error because our caller needs it attached somewhere.
if shard.intent.get_attached().is_none() {
return Err(anyhow::anyhow!(
"Tenant {tenant_id} not scheduled to be attached"
));
};
if shard.stably_attached().is_some() {
// We do not require the shard to be totally up to date on reconciliation: we just require
// that it has been attached on the intended node. Other dirty state such as unattached secondary
// locations, or compute hook notifications can be ignored.
continue;
}
for (_tenant_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
shard.schedule(scheduler)?;
if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) {
tracing::info!("Waiting for shard {tenant_shard_id} to reconcile, in order to ensure it is attached");
waiters.push(waiter);
}
}
@@ -4053,11 +3933,11 @@ impl Service {
Ok(())
}
/// Convenience wrapper around [`TenantShard::maybe_reconcile`] that provides
/// Convenience wrapper around [`TenantState::maybe_reconcile`] that provides
/// all the references to parts of Self that are needed
fn maybe_reconcile_shard(
&self,
shard: &mut TenantShard,
shard: &mut TenantState,
nodes: &Arc<HashMap<NodeId, Node>>,
) -> Option<ReconcilerWaiter> {
shard.maybe_reconcile(
@@ -4080,144 +3960,8 @@ impl Service {
let (nodes, tenants, _scheduler) = locked.parts_mut();
let pageservers = nodes.clone();
let mut schedule_context = ScheduleContext::default();
let mut reconciles_spawned = 0;
for (tenant_shard_id, shard) in tenants.iter_mut() {
if tenant_shard_id.is_zero() {
schedule_context = ScheduleContext::default();
}
// Eventual consistency: if an earlier reconcile job failed, and the shard is still
// dirty, spawn another one
if self.maybe_reconcile_shard(shard, &pageservers).is_some() {
reconciles_spawned += 1;
}
schedule_context.avoid(&shard.intent.all_pageservers());
}
reconciles_spawned
}
/// `optimize` in this context means identifying shards which have valid scheduled locations, but
/// could be scheduled somewhere better:
/// - Cutting over to a secondary if the node with the secondary is more lightly loaded
/// * e.g. after a node fails then recovers, to move some work back to it
/// - Cutting over to a secondary if it improves the spread of shard attachments within a tenant
/// * e.g. after a shard split, the initial attached locations will all be on the node where
/// we did the split, but are probably better placed elsewhere.
/// - Creating new secondary locations if it improves the spreading of a sharded tenant
/// * e.g. after a shard split, some locations will be on the same node (where the split
/// happened), and will probably be better placed elsewhere.
///
/// To put it more briefly: whereas the scheduler respects soft constraints in a ScheduleContext at
/// the time of scheduling, this function looks for cases where a better-scoring location is available
/// according to those same soft constraints.
fn optimize_all(&self) -> usize {
let mut locked = self.inner.write().unwrap();
let (nodes, tenants, scheduler) = locked.parts_mut();
let pageservers = nodes.clone();
let mut schedule_context = ScheduleContext::default();
let mut reconciles_spawned = 0;
let mut tenant_shards: Vec<&TenantShard> = Vec::new();
// Limit on how many shards' optimizations each call to this function will execute. Combined
// with the frequency of background calls, this acts as an implicit rate limit that runs a small
// trickle of optimizations in the background, rather than executing a large number in parallel
// when a change occurs.
const MAX_OPTIMIZATIONS_PER_PASS: usize = 2;
let mut work = Vec::new();
for (tenant_shard_id, shard) in tenants.iter() {
if tenant_shard_id.is_zero() {
// Reset accumulators on the first shard in a tenant
schedule_context = ScheduleContext::default();
tenant_shards.clear();
}
if work.len() >= MAX_OPTIMIZATIONS_PER_PASS {
break;
}
match shard.get_scheduling_policy() {
ShardSchedulingPolicy::Active => {
// Ok to do optimization
}
ShardSchedulingPolicy::Essential
| ShardSchedulingPolicy::Pause
| ShardSchedulingPolicy::Stop => {
// Policy prevents optimizing this shard.
continue;
}
}
// Accumulate the schedule context for all the shards in a tenant: we must have
// the total view of all shards before we can try to optimize any of them.
schedule_context.avoid(&shard.intent.all_pageservers());
if let Some(attached) = shard.intent.get_attached() {
schedule_context.push_attached(*attached);
}
tenant_shards.push(shard);
// Once we have seen the last shard in the tenant, proceed to search across all shards
// in the tenant for optimizations
if shard.shard.number.0 == shard.shard.count.count() - 1 {
if tenant_shards.iter().any(|s| s.reconciler.is_some()) {
// Do not start any optimizations while another change to the tenant is ongoing: this
// is not necessary for correctness, but simplifies operations and implicitly throttles
// optimization changes to happen in a "trickle" over time.
continue;
}
if tenant_shards.iter().any(|s| {
!matches!(s.splitting, SplitState::Idle)
|| matches!(s.policy, PlacementPolicy::Detached)
}) {
// Never attempt to optimize a tenant that is currently being split, or
// a tenant that is meant to be detached
continue;
}
// TODO: optimization calculations are relatively expensive: create some fast-path for
// the common idle case (avoiding the search on tenants that we have recently checked)
for shard in &tenant_shards {
if let Some(optimization) =
// If idle, maybe optimize attachments: if a shard has a secondary location that is preferable to
// its primary location based on soft constraints, cut it over.
shard.optimize_attachment(nodes, &schedule_context)
{
work.push((shard.tenant_shard_id, optimization));
break;
} else if let Some(optimization) =
// If idle, maybe optimize secondary locations: if a shard has a secondary location that would be
// better placed on another node, based on ScheduleContext, then adjust it. This
// covers cases like after a shard split, where we might have too many shards
// in the same tenant with secondary locations on the node where they originally split.
shard.optimize_secondary(scheduler, &schedule_context)
{
work.push((shard.tenant_shard_id, optimization));
break;
}
// TODO: extend this mechanism to prefer attaching on nodes with fewer attached
// tenants (i.e. extend schedule state to distinguish attached from secondary counts),
// for the total number of attachments on a node (not just within a tenant.)
}
}
}
for (tenant_shard_id, optimization) in work {
let shard = tenants
.get_mut(&tenant_shard_id)
.expect("We held lock from place we got this ID");
shard.apply_optimization(scheduler, optimization);
for (_tenant_shard_id, shard) in tenants.iter_mut() {
if self.maybe_reconcile_shard(shard, &pageservers).is_some() {
reconciles_spawned += 1;
}
@@ -4226,35 +3970,9 @@ impl Service {
reconciles_spawned
}
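The removed loops above share one pattern worth spelling out: the `ScheduleContext` is reset at shard zero and accumulates every shard's locations, so decisions are only made with the whole tenant in view. A condensed sketch (names as in the surrounding code):
// Sketch of the accumulate-per-tenant pattern from the removed side of this hunk.
let mut schedule_context = ScheduleContext::default();
for (tenant_shard_id, shard) in tenants.iter() {
    if tenant_shard_id.is_zero() {
        // Shard zero marks the start of the next tenant: reset accumulators.
        schedule_context = ScheduleContext::default();
    }
    schedule_context.avoid(&shard.intent.all_pageservers());
    if let Some(attached) = shard.intent.get_attached() {
        schedule_context.push_attached(*attached);
    }
    // ...by the tenant's last shard, schedule_context holds the full footprint
    // and candidate optimizations can be scored against it.
}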
/// Useful for tests: run whatever work a background [`Self::reconcile_all`] would have done, but
/// also wait for any generated Reconcilers to complete. Calling this until it returns zero should
/// put the system into a quiescent state where future background reconciliations won't do anything.
pub(crate) async fn reconcile_all_now(&self) -> Result<usize, ReconcileWaitError> {
let reconciles_spawned = self.reconcile_all();
if reconciles_spawned == 0 {
// Only optimize when we are otherwise idle
self.optimize_all();
}
let waiters = {
let mut waiters = Vec::new();
let locked = self.inner.read().unwrap();
for (_tenant_shard_id, shard) in locked.tenants.iter() {
if let Some(waiter) = shard.get_waiter() {
waiters.push(waiter);
}
}
waiters
};
let waiter_count = waiters.len();
self.await_waiters(waiters, RECONCILE_TIMEOUT).await?;
Ok(waiter_count)
}
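Per the doc comment above, a test can drive the controller to quiescence by looping until no reconciles are spawned; a minimal sketch, assuming an async test with a `service` handle:
// Sketch: iterate until a background pass would find nothing left to do.
loop {
    let reconciles = service.reconcile_all_now().await?;
    if reconciles == 0 {
        break;
    }
}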
pub async fn shutdown(&self) {
// Note that this already stops processing any results from reconciles: so
// we do not expect that our [`TenantShard`] objects will reach a neat
// we do not expect that our [`TenantState`] objects will reach a neat
// final state.
self.cancel.cancel();


@@ -7,9 +7,8 @@ use std::{
use crate::{
metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome},
persistence::TenantShardPersistence,
scheduler::{AffinityScore, MaySchedule, ScheduleContext},
};
use pageserver_api::controller_api::{PlacementPolicy, ShardSchedulingPolicy};
use pageserver_api::controller_api::PlacementPolicy;
use pageserver_api::{
models::{LocationConfig, LocationConfigMode, TenantConfig},
shard::{ShardIdentity, TenantShardId},
@@ -50,7 +49,7 @@ where
/// This struct implements Serialize for debugging purposes, but is _not_ persisted
/// itself: see [`crate::persistence`] for the subset of tenant shard state that is persisted.
#[derive(Serialize)]
pub(crate) struct TenantShard {
pub(crate) struct TenantState {
pub(crate) tenant_shard_id: TenantShardId,
pub(crate) shard: ShardIdentity,
@@ -117,10 +116,6 @@ pub(crate) struct TenantShard {
/// sending it. This is the mechanism by which compute notifications are included in the scope
/// of state that we publish externally in an eventually consistent way.
pub(crate) pending_compute_notification: bool,
// Support/debug tool: if something is going wrong or flapping with scheduling, this may
// be set to a non-active state to avoid making changes while the issue is fixed.
scheduling_policy: ShardSchedulingPolicy,
}
#[derive(Default, Clone, Debug, Serialize)]
@@ -251,13 +246,8 @@ impl IntentState {
impl Drop for IntentState {
fn drop(&mut self) {
// Must clear before dropping, to avoid leaving stale refcounts in the Scheduler.
// We do not check this while panicking, to avoid polluting unit test failures or
// other assertions with this assertion's output. It's still wrong to leak these,
// but if we already have a panic then we don't need to independently flag this case.
if !(std::thread::panicking()) {
debug_assert!(self.attached.is_none() && self.secondary.is_empty());
}
// Must clear before dropping, to avoid leaving stale refcounts in the Scheduler
debug_assert!(self.attached.is_none() && self.secondary.is_empty());
}
}
@@ -302,26 +292,6 @@ pub enum ReconcileWaitError {
Failed(TenantShardId, String),
}
#[derive(Eq, PartialEq, Debug)]
pub(crate) struct ReplaceSecondary {
old_node_id: NodeId,
new_node_id: NodeId,
}
#[derive(Eq, PartialEq, Debug)]
pub(crate) struct MigrateAttachment {
old_attached_node_id: NodeId,
new_attached_node_id: NodeId,
}
#[derive(Eq, PartialEq, Debug)]
pub(crate) enum ScheduleOptimization {
// Replace one of our secondary locations with a different node
ReplaceSecondary(ReplaceSecondary),
// Migrate attachment to an existing secondary location
MigrateAttachment(MigrateAttachment),
}
impl ReconcilerWaiter {
pub(crate) async fn wait_timeout(&self, timeout: Duration) -> Result<(), ReconcileWaitError> {
tokio::select! {
@@ -354,7 +324,7 @@ pub(crate) struct ReconcilerHandle {
}
/// When a reconcile task completes, it sends this result object
/// to be applied to the primary TenantShard.
/// to be applied to the primary TenantState.
pub(crate) struct ReconcileResult {
pub(crate) sequence: Sequence,
/// On errors, `observed` should be treated as an incompleted description
@@ -367,7 +337,7 @@ pub(crate) struct ReconcileResult {
pub(crate) generation: Option<Generation>,
pub(crate) observed: ObservedState,
/// Set [`TenantShard::pending_compute_notification`] from this flag
/// Set [`TenantState::pending_compute_notification`] from this flag
pub(crate) pending_compute_notification: bool,
}
@@ -379,7 +349,7 @@ impl ObservedState {
}
}
impl TenantShard {
impl TenantState {
pub(crate) fn new(
tenant_shard_id: TenantShardId,
shard: ShardIdentity,
@@ -400,7 +370,6 @@ impl TenantShard {
error_waiter: Arc::new(SeqWait::new(Sequence(0))),
last_error: Arc::default(),
pending_compute_notification: false,
scheduling_policy: ShardSchedulingPolicy::default(),
}
}
@@ -456,7 +425,6 @@ impl TenantShard {
fn schedule_attached(
&mut self,
scheduler: &mut Scheduler,
context: &ScheduleContext,
) -> Result<(bool, NodeId), ScheduleError> {
// No work to do if we already have an attached tenant
if let Some(node_id) = self.intent.attached {
@@ -470,33 +438,14 @@ impl TenantShard {
Ok((true, promote_secondary))
} else {
// Pick a fresh node: either we had no secondaries or none were schedulable
let node_id = scheduler.schedule_shard(&self.intent.secondary, context)?;
let node_id = scheduler.schedule_shard(&self.intent.secondary)?;
tracing::debug!("Selected {} as attached", node_id);
self.intent.set_attached(scheduler, Some(node_id));
Ok((true, node_id))
}
}
pub(crate) fn schedule(
&mut self,
scheduler: &mut Scheduler,
context: &mut ScheduleContext,
) -> Result<(), ScheduleError> {
let r = self.do_schedule(scheduler, context);
context.avoid(&self.intent.all_pageservers());
if let Some(attached) = self.intent.get_attached() {
context.push_attached(*attached);
}
r
}
pub(crate) fn do_schedule(
&mut self,
scheduler: &mut Scheduler,
context: &ScheduleContext,
) -> Result<(), ScheduleError> {
pub(crate) fn schedule(&mut self, scheduler: &mut Scheduler) -> Result<(), ScheduleError> {
// TODO: before scheduling new nodes, check if any existing content in
// self.intent refers to pageservers that are offline, and pick other
// pageservers if so.
@@ -504,16 +453,6 @@ impl TenantShard {
// TODO: respect the splitting bit on tenants: if they are currently splitting then we may not
// change their attach location.
match self.scheduling_policy {
ShardSchedulingPolicy::Active | ShardSchedulingPolicy::Essential => {}
ShardSchedulingPolicy::Pause | ShardSchedulingPolicy::Stop => {
// Warn to make it obvious why other things aren't happening/working, if we skip scheduling
tracing::warn!(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(),
"Scheduling is disabled by policy {:?}", self.scheduling_policy);
return Ok(());
}
}
// Build the set of pageservers already in use by this tenant, to avoid scheduling
// more work on the same pageservers we're already using.
let mut modified = false;
@@ -540,13 +479,12 @@ impl TenantShard {
}
// Should have exactly one attached, and N secondaries
let (modified_attached, attached_node_id) =
self.schedule_attached(scheduler, context)?;
let (modified_attached, attached_node_id) = self.schedule_attached(scheduler)?;
modified |= modified_attached;
let mut used_pageservers = vec![attached_node_id];
while self.intent.secondary.len() < secondary_count {
let node_id = scheduler.schedule_shard(&used_pageservers, context)?;
let node_id = scheduler.schedule_shard(&used_pageservers)?;
self.intent.push_secondary(scheduler, node_id);
used_pageservers.push(node_id);
modified = true;
@@ -559,7 +497,7 @@ impl TenantShard {
modified = true;
} else if self.intent.secondary.is_empty() {
// Populate secondary by scheduling a fresh node
let node_id = scheduler.schedule_shard(&[], context)?;
let node_id = scheduler.schedule_shard(&[])?;
self.intent.push_secondary(scheduler, node_id);
modified = true;
}
@@ -586,167 +524,6 @@ impl TenantShard {
Ok(())
}
/// Optimize attachments: if a shard has a secondary location that is preferable to
/// its primary location based on soft constraints, switch that secondary location
/// to be attached.
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
pub(crate) fn optimize_attachment(
&self,
nodes: &HashMap<NodeId, Node>,
schedule_context: &ScheduleContext,
) -> Option<ScheduleOptimization> {
let attached = (*self.intent.get_attached())?;
if self.intent.secondary.is_empty() {
// We can only do useful work if we have both attached and secondary locations: this
// function doesn't schedule new locations, only swaps between attached and secondaries.
return None;
}
let current_affinity_score = schedule_context.get_node_affinity(attached);
let current_attachment_count = schedule_context.get_node_attachments(attached);
// Generate score for each node, dropping any un-schedulable nodes.
let all_pageservers = self.intent.all_pageservers();
let mut scores = all_pageservers
.iter()
.flat_map(|node_id| {
if matches!(
nodes
.get(node_id)
.map(|n| n.may_schedule())
.unwrap_or(MaySchedule::No),
MaySchedule::No
) {
None
} else {
let affinity_score = schedule_context.get_node_affinity(*node_id);
let attachment_count = schedule_context.get_node_attachments(*node_id);
Some((*node_id, affinity_score, attachment_count))
}
})
.collect::<Vec<_>>();
// Sort precedence:
// 1st - prefer nodes with the lowest total affinity score
// 2nd - prefer nodes with the lowest number of attachments in this context
// 3rd - if all else is equal, sort by node ID for determinism in tests.
scores.sort_by_key(|i| (i.1, i.2, i.0));
if let Some((preferred_node, preferred_affinity_score, preferred_attachment_count)) =
scores.first()
{
if attached != *preferred_node {
// The best alternative must be more than 1 better than us, otherwise we could end
// up flapping back next time we're called (e.g. there's no point migrating from
// a location with score 1 to one with score zero, because after the move the situation
// would be the same, but in reverse).
if current_affinity_score > *preferred_affinity_score + AffinityScore(1)
|| current_attachment_count > *preferred_attachment_count + 1
{
tracing::info!(
"Identified optimization: migrate attachment {attached}->{preferred_node} (secondaries {:?})",
self.intent.get_secondary()
);
return Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment {
old_attached_node_id: attached,
new_attached_node_id: *preferred_node,
}));
}
} else {
tracing::debug!(
"Node {} is already preferred (score {:?})",
preferred_node,
preferred_affinity_score
);
}
}
// Fall-through: we didn't find an optimization
None
}
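To make the anti-flap threshold above concrete, a couple of illustrative comparisons (`AffinityScore` supports `+` and ordering, as used in this function):
// Sketch: "more than 1 better" in numbers.
// current=1 vs preferred=0: 1 > 0 + 1 is false, so we stay put; migrating
// would just recreate the same imbalance in reverse.
assert!(!(AffinityScore(1) > AffinityScore(0) + AffinityScore(1)));
// current=2 vs preferred=0: 2 > 0 + 1 is true, so the attachment migrates.
assert!(AffinityScore(2) > AffinityScore(0) + AffinityScore(1));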
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
pub(crate) fn optimize_secondary(
&self,
scheduler: &Scheduler,
schedule_context: &ScheduleContext,
) -> Option<ScheduleOptimization> {
if self.intent.secondary.is_empty() {
// We can only do useful work if we have both attached and secondary locations: this
// function doesn't schedule new locations, only swaps between attached and secondaries.
return None;
}
for secondary in self.intent.get_secondary() {
let Some(affinity_score) = schedule_context.nodes.get(secondary) else {
// We're already on a node unaffected by any affinity constraints,
// so we won't change it.
continue;
};
// Let the scheduler suggest a node, where it would put us if we were scheduling afresh
// This implicitly limits the choice to nodes that are available, and prefers nodes
// with lower utilization.
let Ok(candidate_node) =
scheduler.schedule_shard(&self.intent.all_pageservers(), schedule_context)
else {
// A scheduling error means we have no possible candidate replacements
continue;
};
let candidate_affinity_score = schedule_context
.nodes
.get(&candidate_node)
.unwrap_or(&AffinityScore::FREE);
// The best alternative must be more than 1 better than us, otherwise we could end
// up flapping back next time we're called.
if *candidate_affinity_score + AffinityScore(1) < *affinity_score {
// If some other node is available and has a lower score than this node, then
// that other node is a good place to migrate to.
tracing::info!(
"Identified optimization: replace secondary {secondary}->{candidate_node} (current secondaries {:?})",
self.intent.get_secondary()
);
return Some(ScheduleOptimization::ReplaceSecondary(ReplaceSecondary {
old_node_id: *secondary,
new_node_id: candidate_node,
}));
}
}
None
}
pub(crate) fn apply_optimization(
&mut self,
scheduler: &mut Scheduler,
optimization: ScheduleOptimization,
) {
metrics::METRICS_REGISTRY
.metrics_group
.storage_controller_schedule_optimization
.inc();
match optimization {
ScheduleOptimization::MigrateAttachment(MigrateAttachment {
old_attached_node_id,
new_attached_node_id,
}) => {
self.intent.demote_attached(old_attached_node_id);
self.intent
.promote_attached(scheduler, new_attached_node_id);
}
ScheduleOptimization::ReplaceSecondary(ReplaceSecondary {
old_node_id,
new_node_id,
}) => {
self.intent.remove_secondary(scheduler, old_node_id);
self.intent.push_secondary(scheduler, new_node_id);
}
}
}
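A short sketch of how the three functions above compose, following the find-then-apply pattern used by [`Service::optimize_all`] earlier in this diff:
// Sketch: score one shard against the accumulated context and apply the result.
if let Some(optimization) = shard.optimize_attachment(&nodes, &schedule_context) {
    shard.apply_optimization(&mut scheduler, optimization);
} else if let Some(optimization) = shard.optimize_secondary(&scheduler, &schedule_context) {
    shard.apply_optimization(&mut scheduler, optimization);
}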
/// Query whether the tenant's observed state for the attached node matches its intent state, and if so,
/// yield the node ID. This is appropriate for emitting compute hook notifications: we are checking that
/// the node in question is not only where we intend to attach, but that the tenant is indeed already attached there.
@@ -891,19 +668,6 @@ impl TenantShard {
}
}
// Pre-checks done: finally check whether we may actually do the work
match self.scheduling_policy {
ShardSchedulingPolicy::Active
| ShardSchedulingPolicy::Essential
| ShardSchedulingPolicy::Pause => {}
ShardSchedulingPolicy::Stop => {
// We only reach this point if there is work to do and we're going to skip
// doing it: warn to make it obvious why this tenant isn't doing what it ought to.
tracing::warn!("Skipping reconcile for policy {:?}", self.scheduling_policy);
return None;
}
}
// Build list of nodes from which the reconciler should detach
let mut detach = Vec::new();
for node_id in self.observed.locations.keys() {
@@ -1040,22 +804,6 @@ impl TenantShard {
})
}
/// Get a waiter for any reconciliation in flight, but do not start reconciliation
/// if it is not already running
pub(crate) fn get_waiter(&self) -> Option<ReconcilerWaiter> {
if self.reconciler.is_some() {
Some(ReconcilerWaiter {
tenant_shard_id: self.tenant_shard_id,
seq_wait: self.waiter.clone(),
error_seq_wait: self.error_waiter.clone(),
error: self.last_error.clone(),
seq: self.sequence,
})
} else {
None
}
}
/// Called when a ReconcileResult has been emitted and the service is updating
/// our state: if the result is from a sequence >= my ReconcileHandle, then drop
/// the handle to indicate there is no longer a reconciliation in progress.
@@ -1081,40 +829,6 @@ impl TenantShard {
debug_assert!(!self.intent.all_pageservers().contains(&node_id));
}
pub(crate) fn set_scheduling_policy(&mut self, p: ShardSchedulingPolicy) {
self.scheduling_policy = p;
}
pub(crate) fn get_scheduling_policy(&self) -> &ShardSchedulingPolicy {
&self.scheduling_policy
}
pub(crate) fn from_persistent(
tsp: TenantShardPersistence,
intent: IntentState,
) -> anyhow::Result<Self> {
let tenant_shard_id = tsp.get_tenant_shard_id()?;
let shard_identity = tsp.get_shard_identity()?;
Ok(Self {
tenant_shard_id,
shard: shard_identity,
sequence: Sequence::initial(),
generation: tsp.generation.map(|g| Generation::new(g as u32)),
policy: serde_json::from_str(&tsp.placement_policy).unwrap(),
intent,
observed: ObservedState::new(),
config: serde_json::from_str(&tsp.config).unwrap(),
reconciler: None,
splitting: tsp.splitting,
waiter: Arc::new(SeqWait::new(Sequence::initial())),
error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
last_error: Arc::default(),
pending_compute_notification: false,
scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(),
})
}
pub(crate) fn to_persistent(&self) -> TenantShardPersistence {
TenantShardPersistence {
tenant_id: self.tenant_shard_id.tenant_id.to_string(),
@@ -1126,7 +840,6 @@ impl TenantShard {
placement_policy: serde_json::to_string(&self.policy).unwrap(),
config: serde_json::to_string(&self.config).unwrap(),
splitting: SplitState::default(),
scheduling_policy: serde_json::to_string(&self.scheduling_policy).unwrap(),
}
}
}
@@ -1143,7 +856,7 @@ pub(crate) mod tests {
use super::*;
fn make_test_tenant_shard(policy: PlacementPolicy) -> TenantShard {
fn make_test_tenant_shard(policy: PlacementPolicy) -> TenantState {
let tenant_id = TenantId::generate();
let shard_number = ShardNumber(0);
let shard_count = ShardCount::new(1);
@@ -1153,7 +866,7 @@ pub(crate) mod tests {
shard_number,
shard_count,
};
TenantShard::new(
TenantState::new(
tenant_shard_id,
ShardIdentity::new(
shard_number,
@@ -1165,32 +878,6 @@ pub(crate) mod tests {
)
}
fn make_test_tenant(policy: PlacementPolicy, shard_count: ShardCount) -> Vec<TenantShard> {
let tenant_id = TenantId::generate();
(0..shard_count.count())
.map(|i| {
let shard_number = ShardNumber(i);
let tenant_shard_id = TenantShardId {
tenant_id,
shard_number,
shard_count,
};
TenantShard::new(
tenant_shard_id,
ShardIdentity::new(
shard_number,
shard_count,
pageserver_api::shard::ShardStripeSize(32768),
)
.unwrap(),
policy.clone(),
)
})
.collect()
}
/// Test the scheduling behaviors used when a tenant configured for HA is subject
/// to nodes being marked offline.
#[test]
@@ -1200,26 +887,25 @@ pub(crate) mod tests {
let mut nodes = make_test_nodes(3);
let mut scheduler = Scheduler::new(nodes.values());
let mut context = ScheduleContext::default();
let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1));
tenant_shard
.schedule(&mut scheduler, &mut context)
let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1));
tenant_state
.schedule(&mut scheduler)
.expect("we have enough nodes, scheduling should work");
// Expect to initially be scheduled onto different nodes
assert_eq!(tenant_shard.intent.secondary.len(), 1);
assert!(tenant_shard.intent.attached.is_some());
assert_eq!(tenant_state.intent.secondary.len(), 1);
assert!(tenant_state.intent.attached.is_some());
let attached_node_id = tenant_shard.intent.attached.unwrap();
let secondary_node_id = *tenant_shard.intent.secondary.iter().last().unwrap();
let attached_node_id = tenant_state.intent.attached.unwrap();
let secondary_node_id = *tenant_state.intent.secondary.iter().last().unwrap();
assert_ne!(attached_node_id, secondary_node_id);
// Notifying that the attached node is offline should demote it to a secondary
let changed = tenant_shard.intent.demote_attached(attached_node_id);
let changed = tenant_state.intent.demote_attached(attached_node_id);
assert!(changed);
assert!(tenant_shard.intent.attached.is_none());
assert_eq!(tenant_shard.intent.secondary.len(), 2);
assert!(tenant_state.intent.attached.is_none());
assert_eq!(tenant_state.intent.secondary.len(), 2);
// Update the scheduler state to indicate the node is offline
nodes
@@ -1229,18 +915,18 @@ pub(crate) mod tests {
scheduler.node_upsert(nodes.get(&attached_node_id).unwrap());
// Scheduling the shard should promote the still-available secondary node to attached
tenant_shard
.schedule(&mut scheduler, &mut context)
tenant_state
.schedule(&mut scheduler)
.expect("active nodes are available");
assert_eq!(tenant_shard.intent.attached.unwrap(), secondary_node_id);
assert_eq!(tenant_state.intent.attached.unwrap(), secondary_node_id);
// The original attached node should have been retained as a secondary
assert_eq!(
*tenant_shard.intent.secondary.iter().last().unwrap(),
*tenant_state.intent.secondary.iter().last().unwrap(),
attached_node_id
);
tenant_shard.intent.clear(&mut scheduler);
tenant_state.intent.clear(&mut scheduler);
Ok(())
}
@@ -1250,263 +936,48 @@ pub(crate) mod tests {
let nodes = make_test_nodes(3);
let mut scheduler = Scheduler::new(nodes.values());
let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1));
let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1));
tenant_shard.observed.locations.insert(
tenant_state.observed.locations.insert(
NodeId(3),
ObservedStateLocation {
conf: Some(LocationConfig {
mode: LocationConfigMode::AttachedMulti,
generation: Some(2),
secondary_conf: None,
shard_number: tenant_shard.shard.number.0,
shard_count: tenant_shard.shard.count.literal(),
shard_stripe_size: tenant_shard.shard.stripe_size.0,
shard_number: tenant_state.shard.number.0,
shard_count: tenant_state.shard.count.literal(),
shard_stripe_size: tenant_state.shard.stripe_size.0,
tenant_conf: TenantConfig::default(),
}),
},
);
tenant_shard.observed.locations.insert(
tenant_state.observed.locations.insert(
NodeId(2),
ObservedStateLocation {
conf: Some(LocationConfig {
mode: LocationConfigMode::AttachedStale,
generation: Some(1),
secondary_conf: None,
shard_number: tenant_shard.shard.number.0,
shard_count: tenant_shard.shard.count.literal(),
shard_stripe_size: tenant_shard.shard.stripe_size.0,
shard_number: tenant_state.shard.number.0,
shard_count: tenant_state.shard.count.literal(),
shard_stripe_size: tenant_state.shard.stripe_size.0,
tenant_conf: TenantConfig::default(),
}),
},
);
tenant_shard.intent_from_observed(&mut scheduler);
tenant_state.intent_from_observed(&mut scheduler);
// The highest-generation attached location gets used as attached
assert_eq!(tenant_shard.intent.attached, Some(NodeId(3)));
assert_eq!(tenant_state.intent.attached, Some(NodeId(3)));
// Other locations get used as secondary
assert_eq!(tenant_shard.intent.secondary, vec![NodeId(2)]);
assert_eq!(tenant_state.intent.secondary, vec![NodeId(2)]);
scheduler.consistency_check(nodes.values(), [&tenant_shard].into_iter())?;
tenant_shard.intent.clear(&mut scheduler);
Ok(())
}
#[test]
fn scheduling_mode() -> anyhow::Result<()> {
let nodes = make_test_nodes(3);
let mut scheduler = Scheduler::new(nodes.values());
let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1));
// In pause mode, schedule() shouldn't do anything
tenant_shard.scheduling_policy = ShardSchedulingPolicy::Pause;
assert!(tenant_shard
.schedule(&mut scheduler, &mut ScheduleContext::default())
.is_ok());
assert!(tenant_shard.intent.all_pageservers().is_empty());
// In active mode, schedule() works
tenant_shard.scheduling_policy = ShardSchedulingPolicy::Active;
assert!(tenant_shard
.schedule(&mut scheduler, &mut ScheduleContext::default())
.is_ok());
assert!(!tenant_shard.intent.all_pageservers().is_empty());
tenant_shard.intent.clear(&mut scheduler);
Ok(())
}
#[test]
fn optimize_attachment() -> anyhow::Result<()> {
let nodes = make_test_nodes(3);
let mut scheduler = Scheduler::new(nodes.values());
let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1));
let mut shard_b = make_test_tenant_shard(PlacementPolicy::Attached(1));
// Initially: both shards are attached to node 1, and each has a secondary location
// on a different node.
shard_a.intent.set_attached(&mut scheduler, Some(NodeId(1)));
shard_a.intent.push_secondary(&mut scheduler, NodeId(2));
shard_b.intent.set_attached(&mut scheduler, Some(NodeId(1)));
shard_b.intent.push_secondary(&mut scheduler, NodeId(3));
let mut schedule_context = ScheduleContext::default();
schedule_context.avoid(&shard_a.intent.all_pageservers());
schedule_context.push_attached(shard_a.intent.get_attached().unwrap());
schedule_context.avoid(&shard_b.intent.all_pageservers());
schedule_context.push_attached(shard_b.intent.get_attached().unwrap());
let optimization_a = shard_a.optimize_attachment(&nodes, &schedule_context);
// Either shard should recognize that it has the option to switch to a secondary location where there
// would be no other shards from the same tenant, and request to do so.
assert_eq!(
optimization_a,
Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment {
old_attached_node_id: NodeId(1),
new_attached_node_id: NodeId(2)
}))
);
// Note that optimizing two shards in the same tenant with the same ScheduleContext is
// mutually exclusive (the optimization of one invalidates the stats) -- it is the responsibility
// of [`Service::optimize_all`] to avoid trying
// to do optimizations for multiple shards in the same tenant at the same time. Generating
// both optimizations is just done for test purposes
let optimization_b = shard_b.optimize_attachment(&nodes, &schedule_context);
assert_eq!(
optimization_b,
Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment {
old_attached_node_id: NodeId(1),
new_attached_node_id: NodeId(3)
}))
);
// Applying these optimizations should result in the end state proposed
shard_a.apply_optimization(&mut scheduler, optimization_a.unwrap());
assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(2)));
assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(1)]);
shard_b.apply_optimization(&mut scheduler, optimization_b.unwrap());
assert_eq!(shard_b.intent.get_attached(), &Some(NodeId(3)));
assert_eq!(shard_b.intent.get_secondary(), &vec![NodeId(1)]);
shard_a.intent.clear(&mut scheduler);
shard_b.intent.clear(&mut scheduler);
Ok(())
}
#[test]
fn optimize_secondary() -> anyhow::Result<()> {
let nodes = make_test_nodes(4);
let mut scheduler = Scheduler::new(nodes.values());
let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1));
let mut shard_b = make_test_tenant_shard(PlacementPolicy::Attached(1));
// Initially: shard_a is attached to node 1 and shard_b to node 2, and both have their
// secondary location on the same node (node 3).
shard_a.intent.set_attached(&mut scheduler, Some(NodeId(1)));
shard_a.intent.push_secondary(&mut scheduler, NodeId(3));
shard_b.intent.set_attached(&mut scheduler, Some(NodeId(2)));
shard_b.intent.push_secondary(&mut scheduler, NodeId(3));
let mut schedule_context = ScheduleContext::default();
schedule_context.avoid(&shard_a.intent.all_pageservers());
schedule_context.push_attached(shard_a.intent.get_attached().unwrap());
schedule_context.avoid(&shard_b.intent.all_pageservers());
schedule_context.push_attached(shard_b.intent.get_attached().unwrap());
let optimization_a = shard_a.optimize_secondary(&scheduler, &schedule_context);
// Since there is a node with no locations available, the node with two locations for the
// same tenant should generate an optimization to move one away
assert_eq!(
optimization_a,
Some(ScheduleOptimization::ReplaceSecondary(ReplaceSecondary {
old_node_id: NodeId(3),
new_node_id: NodeId(4)
}))
);
shard_a.apply_optimization(&mut scheduler, optimization_a.unwrap());
assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(1)));
assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(4)]);
shard_a.intent.clear(&mut scheduler);
shard_b.intent.clear(&mut scheduler);
Ok(())
}
// Optimize til quiescent: this emulates what Service::optimize_all does, when
// called repeatedly in the background.
fn optimize_til_idle(
nodes: &HashMap<NodeId, Node>,
scheduler: &mut Scheduler,
shards: &mut [TenantShard],
) {
let mut loop_n = 0;
loop {
let mut schedule_context = ScheduleContext::default();
let mut any_changed = false;
for shard in shards.iter() {
schedule_context.avoid(&shard.intent.all_pageservers());
if let Some(attached) = shard.intent.get_attached() {
schedule_context.push_attached(*attached);
}
}
for shard in shards.iter_mut() {
let optimization = shard.optimize_attachment(nodes, &schedule_context);
if let Some(optimization) = optimization {
shard.apply_optimization(scheduler, optimization);
any_changed = true;
break;
}
let optimization = shard.optimize_secondary(scheduler, &schedule_context);
if let Some(optimization) = optimization {
shard.apply_optimization(scheduler, optimization);
any_changed = true;
break;
}
}
if !any_changed {
break;
}
// Assert no infinite loop
loop_n += 1;
assert!(loop_n < 1000);
}
}
/// Test the balancing behavior of shard scheduling: that it achieves a balance, and
/// that it converges.
#[test]
fn optimize_add_nodes() -> anyhow::Result<()> {
let nodes = make_test_nodes(4);
// Only show the scheduler a couple of nodes
let mut scheduler = Scheduler::new([].iter());
scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap());
scheduler.node_upsert(nodes.get(&NodeId(2)).unwrap());
let mut shards = make_test_tenant(PlacementPolicy::Attached(1), ShardCount::new(4));
let mut schedule_context = ScheduleContext::default();
for shard in &mut shards {
assert!(shard
.schedule(&mut scheduler, &mut schedule_context)
.is_ok());
}
// We should see an equal number of locations on the two nodes.
assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 4);
assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 4);
// Add another two nodes: we should see the shards spread out when their optimize
// methods are called
scheduler.node_upsert(nodes.get(&NodeId(3)).unwrap());
scheduler.node_upsert(nodes.get(&NodeId(4)).unwrap());
optimize_til_idle(&nodes, &mut scheduler, &mut shards);
assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 2);
assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 2);
assert_eq!(scheduler.get_node_shard_count(NodeId(3)), 2);
assert_eq!(scheduler.get_node_shard_count(NodeId(4)), 2);
for shard in shards.iter_mut() {
shard.intent.clear(&mut scheduler);
}
scheduler.consistency_check(nodes.values(), [&tenant_state].into_iter())?;
tenant_state.intent.clear(&mut scheduler);
Ok(())
}
}


@@ -86,10 +86,7 @@ where
.stdout(process_log_file)
.stderr(same_file_for_stderr)
.args(args);
let filled_cmd = fill_env_vars_prefixed_neon(fill_remote_storage_secrets_vars(
fill_rust_env_vars(background_command),
));
let filled_cmd = fill_remote_storage_secrets_vars(fill_rust_env_vars(background_command));
filled_cmd.envs(envs);
let pid_file_to_check = match &initial_pid_file {
@@ -271,15 +268,6 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command {
cmd
}
fn fill_env_vars_prefixed_neon(mut cmd: &mut Command) -> &mut Command {
for (var, val) in std::env::vars() {
if var.starts_with("NEON_PAGESERVER_") {
cmd = cmd.env(var, val);
}
}
cmd
}
/// Add a `pre_exec` to the cmd that, inbetween fork() and exec(),
/// 1. Claims a pidfile with a fcntl lock on it and
/// 2. Sets up the pidfile's file descriptor so that it (and the lock)


@@ -14,7 +14,9 @@ use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
use control_plane::safekeeper::SafekeeperNode;
use control_plane::storage_controller::StorageController;
use control_plane::{broker, local_env};
use pageserver_api::controller_api::PlacementPolicy;
use pageserver_api::controller_api::{
NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy, PlacementPolicy,
};
use pageserver_api::models::{
ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
};
@@ -1058,6 +1060,21 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
}
}
Some(("set-state", subcommand_args)) => {
let pageserver = get_pageserver(env, subcommand_args)?;
let scheduling = subcommand_args.get_one("scheduling");
let availability = subcommand_args.get_one("availability");
let storage_controller = StorageController::from_env(env);
storage_controller
.node_configure(NodeConfigureRequest {
node_id: pageserver.conf.id,
scheduling: scheduling.cloned(),
availability: availability.cloned(),
})
.await?;
}
Some(("status", subcommand_args)) => {
match get_pageserver(env, subcommand_args)?.check_status().await {
Ok(_) => println!("Page server is up and running"),
@@ -1498,6 +1515,12 @@ fn cli() -> Command {
.about("Restart local pageserver")
.arg(pageserver_config_args.clone())
)
.subcommand(Command::new("set-state")
.arg(Arg::new("availability").value_parser(value_parser!(NodeAvailability)).long("availability").action(ArgAction::Set).help("Availability state: offline,active"))
.arg(Arg::new("scheduling").value_parser(value_parser!(NodeSchedulingPolicy)).long("scheduling").action(ArgAction::Set).help("Scheduling state: draining,pause,filling,active"))
.about("Set scheduling or availability state of pageserver node")
.arg(pageserver_config_args.clone())
)
)
.subcommand(
Command::new("storage_controller")

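Based on the clap wiring in the hunk above, a hypothetical invocation of the set-state subcommand might look as follows (the binary name and any node-selection flags contributed by pageserver_config_args are assumptions, not confirmed by this diff):

    neon_local pageserver set-state --availability offline
    neon_local pageserver set-state --scheduling pause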

@@ -389,10 +389,6 @@ impl PageServerNode {
.remove("image_creation_threshold")
.map(|x| x.parse::<usize>())
.transpose()?,
image_layer_creation_check_threshold: settings
.remove("image_layer_creation_check_threshold")
.map(|x| x.parse::<u8>())
.transpose()?,
pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
walreceiver_connect_timeout: settings
.remove("walreceiver_connect_timeout")
@@ -505,12 +501,6 @@ impl PageServerNode {
.map(|x| x.parse::<usize>())
.transpose()
.context("Failed to parse 'image_creation_threshold' as non zero integer")?,
image_layer_creation_check_threshold: settings
.remove("image_layer_creation_check_threshold")
.map(|x| x.parse::<u8>())
.transpose()
.context("Failed to parse 'image_creation_check_threshold' as integer")?,
pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
walreceiver_connect_timeout: settings
.remove("walreceiver_connect_timeout")


@@ -1,23 +0,0 @@
[package]
name = "storcon_cli"
version = "0.1.0"
edition.workspace = true
license.workspace = true
[dependencies]
anyhow.workspace = true
clap.workspace = true
comfy-table.workspace = true
hyper.workspace = true
pageserver_api.workspace = true
pageserver_client.workspace = true
reqwest.workspace = true
serde.workspace = true
serde_json = { workspace = true, features = ["raw_value"] }
thiserror.workspace = true
tokio.workspace = true
tracing.workspace = true
utils.workspace = true
workspace_hack.workspace = true


@@ -1,587 +0,0 @@
use std::{collections::HashMap, str::FromStr};
use clap::{Parser, Subcommand};
use hyper::Method;
use pageserver_api::{
controller_api::{
NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy,
TenantDescribeResponse, TenantPolicyRequest,
},
models::{
ShardParameters, TenantConfig, TenantConfigRequest, TenantCreateRequest,
TenantShardSplitRequest, TenantShardSplitResponse,
},
shard::{ShardStripeSize, TenantShardId},
};
use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt};
use reqwest::Url;
use serde::{de::DeserializeOwned, Serialize};
use utils::id::{NodeId, TenantId};
use pageserver_api::controller_api::{
NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
TenantLocateResponse, TenantShardMigrateRequest, TenantShardMigrateResponse,
};
#[derive(Subcommand, Debug)]
enum Command {
/// Register a pageserver with the storage controller. This shouldn't usually be necessary,
/// since pageservers auto-register when they start up
NodeRegister {
#[arg(long)]
node_id: NodeId,
#[arg(long)]
listen_pg_addr: String,
#[arg(long)]
listen_pg_port: u16,
#[arg(long)]
listen_http_addr: String,
#[arg(long)]
listen_http_port: u16,
},
/// Modify a node's configuration in the storage controller
NodeConfigure {
#[arg(long)]
node_id: NodeId,
/// Availability is usually auto-detected based on heartbeats. Set 'offline' here to
/// manually mark a node offline
#[arg(long)]
availability: Option<NodeAvailabilityArg>,
/// Scheduling policy controls whether tenant shards may be scheduled onto this node.
#[arg(long)]
scheduling: Option<NodeSchedulingPolicy>,
},
/// Modify a tenant's policies in the storage controller
TenantPolicy {
#[arg(long)]
tenant_id: TenantId,
/// Placement policy controls whether a tenant is `detached`, has only a secondary location (`secondary`),
/// or is in the normal attached state with N secondary locations (`attached:N`)
#[arg(long)]
placement: Option<PlacementPolicyArg>,
/// Scheduling policy enables pausing the controller's scheduling activity involving this tenant. `active` is normal,
/// `essential` disables optimization scheduling changes, `pause` disables all scheduling changes, and `stop` prevents
/// all reconciliation activity including for scheduling changes already made. `pause` and `stop` can make a tenant
/// unavailable, and are only for use in emergencies.
#[arg(long)]
scheduling: Option<ShardSchedulingPolicyArg>,
},
/// List nodes known to the storage controller
Nodes {},
/// List tenants known to the storage controller
Tenants {},
/// Create a new tenant in the storage controller, and by extension on pageservers.
TenantCreate {
#[arg(long)]
tenant_id: TenantId,
},
/// Delete a tenant in the storage controller, and by extension on pageservers.
TenantDelete {
#[arg(long)]
tenant_id: TenantId,
},
/// Split an existing tenant into a higher number of shards than its current shard count.
TenantShardSplit {
#[arg(long)]
tenant_id: TenantId,
#[arg(long)]
shard_count: u8,
/// Optional, in 8kiB pages. e.g. set 2048 for 16MB stripes.
#[arg(long)]
stripe_size: Option<u32>,
},
/// Migrate the attached location for a tenant shard to a specific pageserver.
TenantShardMigrate {
#[arg(long)]
tenant_shard_id: TenantShardId,
#[arg(long)]
node: NodeId,
},
/// Modify the pageserver tenant configuration of a tenant: this is the configuration structure
/// that is passed through to pageservers, and does not affect storage controller behavior.
TenantConfig {
#[arg(long)]
tenant_id: TenantId,
#[arg(long)]
config: String,
},
/// Attempt to balance the locations for a tenant across pageservers. This is a client-side
/// alternative to the storage controller's scheduling optimization behavior.
TenantScatter {
#[arg(long)]
tenant_id: TenantId,
},
/// Print details about a particular tenant, including all its shards' states.
TenantDescribe {
#[arg(long)]
tenant_id: TenantId,
},
}
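Given clap's default kebab-case naming for the subcommands and arguments above, a hypothetical session with this CLI could look like the following (binary name, URL, and tenant ID are placeholders):

    storcon_cli --api http://127.0.0.1:1234 tenants
    storcon_cli --api http://127.0.0.1:1234 tenant-policy --tenant-id <TENANT_ID> --placement attached:1 --scheduling active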
#[derive(Parser)]
#[command(
author,
version,
about,
long_about = "CLI for Storage Controller Support/Debug"
)]
#[command(arg_required_else_help(true))]
struct Cli {
#[arg(long)]
/// URL to storage controller. e.g. http://127.0.0.1:1234 when using `neon_local`
api: Url,
#[arg(long)]
/// JWT token for authenticating with storage controller. Depending on the API used, this
/// should have either `pageserverapi` or `admin` scopes: for convenience, you should mint
/// a token with both scopes to use with this tool.
jwt: Option<String>,
#[command(subcommand)]
command: Command,
}
#[derive(Debug, Clone)]
struct PlacementPolicyArg(PlacementPolicy);
impl FromStr for PlacementPolicyArg {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"detached" => Ok(Self(PlacementPolicy::Detached)),
"secondary" => Ok(Self(PlacementPolicy::Secondary)),
_ if s.starts_with("attached:") => {
let mut splitter = s.split(':');
let _prefix = splitter.next().unwrap();
match splitter.next().and_then(|s| s.parse::<usize>().ok()) {
Some(n) => Ok(Self(PlacementPolicy::Attached(n))),
None => Err(anyhow::anyhow!(
"Invalid format '{s}', a valid example is 'attached:1'"
)),
}
}
_ => Err(anyhow::anyhow!(
"Unknown placement policy '{s}', try detached,secondary,attached:<n>"
)),
}
}
}
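As a minimal sketch of the parsing behavior above, a hypothetical unit test (assuming the types in this file are in scope):

#[cfg(test)]
mod placement_policy_arg_tests {
    use super::*;

    #[test]
    fn parses_placement_policy_strings() {
        // Bare keywords map directly to their variants.
        assert!(matches!(
            "detached".parse::<PlacementPolicyArg>(),
            Ok(PlacementPolicyArg(PlacementPolicy::Detached))
        ));
        // "attached:N" carries the secondary location count.
        assert!(matches!(
            "attached:1".parse::<PlacementPolicyArg>(),
            Ok(PlacementPolicyArg(PlacementPolicy::Attached(1)))
        ));
        // A malformed count is rejected rather than defaulted.
        assert!("attached:x".parse::<PlacementPolicyArg>().is_err());
    }
}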
#[derive(Debug, Clone)]
struct ShardSchedulingPolicyArg(ShardSchedulingPolicy);
impl FromStr for ShardSchedulingPolicyArg {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"active" => Ok(Self(ShardSchedulingPolicy::Active)),
"essential" => Ok(Self(ShardSchedulingPolicy::Essential)),
"pause" => Ok(Self(ShardSchedulingPolicy::Pause)),
"stop" => Ok(Self(ShardSchedulingPolicy::Stop)),
_ => Err(anyhow::anyhow!(
"Unknown scheduling policy '{s}', try active,essential,pause,stop"
)),
}
}
}
#[derive(Debug, Clone)]
struct NodeAvailabilityArg(NodeAvailabilityWrapper);
impl FromStr for NodeAvailabilityArg {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"active" => Ok(Self(NodeAvailabilityWrapper::Active)),
"offline" => Ok(Self(NodeAvailabilityWrapper::Offline)),
_ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
}
}
}
struct Client {
base_url: Url,
jwt_token: Option<String>,
client: reqwest::Client,
}
impl Client {
fn new(base_url: Url, jwt_token: Option<String>) -> Self {
Self {
base_url,
jwt_token,
client: reqwest::ClientBuilder::new()
.build()
.expect("Failed to construct http client"),
}
}
/// Simple HTTP request wrapper for calling into storage controller
async fn dispatch<RQ, RS>(
&self,
method: hyper::Method,
path: String,
body: Option<RQ>,
) -> mgmt_api::Result<RS>
where
RQ: Serialize + Sized,
RS: DeserializeOwned + Sized,
{
// The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
// for general purpose API access.
let url = Url::from_str(&format!(
"http://{}:{}/{path}",
self.base_url.host_str().unwrap(),
self.base_url.port().unwrap()
))
.unwrap();
let mut builder = self.client.request(method, url);
if let Some(body) = body {
builder = builder.json(&body)
}
if let Some(jwt_token) = &self.jwt_token {
builder = builder.header(
reqwest::header::AUTHORIZATION,
format!("Bearer {jwt_token}"),
);
}
let response = builder.send().await.map_err(mgmt_api::Error::ReceiveBody)?;
let response = response.error_from_body().await?;
response
.json()
.await
.map_err(pageserver_client::mgmt_api::Error::ReceiveBody)
}
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
let cli = Cli::parse();
let storcon_client = Client::new(cli.api.clone(), cli.jwt.clone());
let mut trimmed = cli.api.to_string();
trimmed.pop();
let vps_client = mgmt_api::Client::new(trimmed, cli.jwt.as_deref());
match cli.command {
Command::NodeRegister {
node_id,
listen_pg_addr,
listen_pg_port,
listen_http_addr,
listen_http_port,
} => {
storcon_client
.dispatch::<_, ()>(
Method::POST,
"control/v1/node".to_string(),
Some(NodeRegisterRequest {
node_id,
listen_pg_addr,
listen_pg_port,
listen_http_addr,
listen_http_port,
}),
)
.await?;
}
Command::TenantCreate { tenant_id } => {
vps_client
.tenant_create(&TenantCreateRequest {
new_tenant_id: TenantShardId::unsharded(tenant_id),
generation: None,
shard_parameters: ShardParameters::default(),
placement_policy: Some(PlacementPolicy::Attached(1)),
config: TenantConfig::default(),
})
.await?;
}
Command::TenantDelete { tenant_id } => {
let status = vps_client
.tenant_delete(TenantShardId::unsharded(tenant_id))
.await?;
tracing::info!("Delete status: {}", status);
}
Command::Nodes {} => {
let resp = storcon_client
.dispatch::<(), Vec<NodeDescribeResponse>>(
Method::GET,
"control/v1/node".to_string(),
None,
)
.await?;
let mut table = comfy_table::Table::new();
table.set_header(["Id", "Hostname", "Scheduling", "Availability"]);
for node in resp {
table.add_row([
format!("{}", node.id),
node.listen_http_addr,
format!("{:?}", node.scheduling),
format!("{:?}", node.availability),
]);
}
println!("{table}");
}
Command::NodeConfigure {
node_id,
availability,
scheduling,
} => {
let req = NodeConfigureRequest {
node_id,
availability: availability.map(|a| a.0),
scheduling,
};
storcon_client
.dispatch::<_, ()>(
Method::PUT,
format!("control/v1/node/{node_id}/config"),
Some(req),
)
.await?;
}
Command::Tenants {} => {
let resp = storcon_client
.dispatch::<(), Vec<TenantDescribeResponse>>(
Method::GET,
"control/v1/tenant".to_string(),
None,
)
.await?;
let mut table = comfy_table::Table::new();
table.set_header([
"TenantId",
"ShardCount",
"StripeSize",
"Placement",
"Scheduling",
]);
for tenant in resp {
let shard_zero = tenant.shards.into_iter().next().unwrap();
table.add_row([
format!("{}", tenant.tenant_id),
format!("{}", shard_zero.tenant_shard_id.shard_count.literal()),
format!("{:?}", tenant.stripe_size),
format!("{:?}", tenant.policy),
format!("{:?}", shard_zero.scheduling_policy),
]);
}
println!("{table}");
}
Command::TenantPolicy {
tenant_id,
placement,
scheduling,
} => {
let req = TenantPolicyRequest {
scheduling: scheduling.map(|s| s.0),
placement: placement.map(|p| p.0),
};
storcon_client
.dispatch::<_, ()>(
Method::PUT,
format!("control/v1/tenant/{tenant_id}/policy"),
Some(req),
)
.await?;
}
Command::TenantShardSplit {
tenant_id,
shard_count,
stripe_size,
} => {
let req = TenantShardSplitRequest {
new_shard_count: shard_count,
new_stripe_size: stripe_size.map(ShardStripeSize),
};
let response = storcon_client
.dispatch::<TenantShardSplitRequest, TenantShardSplitResponse>(
Method::PUT,
format!("control/v1/tenant/{tenant_id}/shard_split"),
Some(req),
)
.await?;
println!(
"Split tenant {} into {} shards: {}",
tenant_id,
shard_count,
response
.new_shards
.iter()
.map(|s| format!("{:?}", s))
.collect::<Vec<_>>()
.join(",")
);
}
Command::TenantShardMigrate {
tenant_shard_id,
node,
} => {
let req = TenantShardMigrateRequest {
tenant_shard_id,
node_id: node,
};
storcon_client
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
Method::PUT,
format!("control/v1/tenant/{tenant_shard_id}/migrate"),
Some(req),
)
.await?;
}
Command::TenantConfig { tenant_id, config } => {
let tenant_conf = serde_json::from_str(&config)?;
vps_client
.tenant_config(&TenantConfigRequest {
tenant_id,
config: tenant_conf,
})
.await?;
}
Command::TenantScatter { tenant_id } => {
// Find the shards
let locate_response = storcon_client
.dispatch::<(), TenantLocateResponse>(
Method::GET,
format!("control/v1/tenant/{tenant_id}/locate"),
None,
)
.await?;
let shards = locate_response.shards;
let mut node_to_shards: HashMap<NodeId, Vec<TenantShardId>> = HashMap::new();
let shard_count = shards.len();
for s in shards {
let entry = node_to_shards.entry(s.node_id).or_default();
entry.push(s.shard_id);
}
// Load list of available nodes
let nodes_resp = storcon_client
.dispatch::<(), Vec<NodeDescribeResponse>>(
Method::GET,
"control/v1/node".to_string(),
None,
)
.await?;
for node in nodes_resp {
if matches!(node.availability, NodeAvailabilityWrapper::Active) {
node_to_shards.entry(node.id).or_default();
}
}
let max_shard_per_node = shard_count / node_to_shards.len();
loop {
let mut migrate_shard = None;
for shards in node_to_shards.values_mut() {
if shards.len() > max_shard_per_node {
// Pop a shard from an over-full node
migrate_shard = Some(shards.pop().unwrap());
}
}
let Some(migrate_shard) = migrate_shard else {
break;
};
// Pick the emptiest node to migrate to
let mut destinations = node_to_shards
.iter()
.map(|(k, v)| (k, v.len()))
.collect::<Vec<_>>();
destinations.sort_by_key(|i| i.1);
let (destination_node, destination_count) = *destinations.first().unwrap();
if destination_count + 1 > max_shard_per_node {
// Even the emptiest destination doesn't have space: we're done
break;
}
let destination_node = *destination_node;
node_to_shards
.get_mut(&destination_node)
.unwrap()
.push(migrate_shard);
println!("Migrate {} -> {} ...", migrate_shard, destination_node);
storcon_client
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
Method::PUT,
format!("control/v1/tenant/{migrate_shard}/migrate"),
Some(TenantShardMigrateRequest {
tenant_shard_id: migrate_shard,
node_id: destination_node,
}),
)
.await?;
println!("Migrate {} -> {} OK", migrate_shard, destination_node);
}
// Spread the shards across the nodes
}
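As a worked example of the scatter loop above: with 6 shards initially on one node and two other active but empty nodes, max_shard_per_node is 6 / 3 = 2, so each pass pops one shard off the over-full node and migrates it to whichever node currently holds the fewest, converging after 4 migrations with 2 shards per node.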
Command::TenantDescribe { tenant_id } => {
let describe_response = storcon_client
.dispatch::<(), TenantDescribeResponse>(
Method::GET,
format!("control/v1/tenant/{tenant_id}"),
None,
)
.await?;
let shards = describe_response.shards;
let mut table = comfy_table::Table::new();
table.set_header(["Shard", "Attached", "Secondary", "Last error", "status"]);
for shard in shards {
let secondary = shard
.node_secondary
.iter()
.map(|n| format!("{}", n))
.collect::<Vec<_>>()
.join(",");
let mut status_parts = Vec::new();
if shard.is_reconciling {
status_parts.push("reconciling");
}
if shard.is_pending_compute_notification {
status_parts.push("pending_compute");
}
if shard.is_splitting {
status_parts.push("splitting");
}
let status = status_parts.join(",");
table.add_row([
format!("{}", shard.tenant_shard_id),
shard
.node_attached
.map(|n| format!("{}", n))
.unwrap_or(String::new()),
secondary,
shard.last_error,
status,
]);
}
println!("{table}");
}
}
Ok(())
}


@@ -2,8 +2,8 @@
# see https://diesel.rs/guides/configuring-diesel-cli
[print_schema]
file = "storage_controller/src/schema.rs"
file = "control_plane/attachment_service/src/schema.rs"
custom_type_derives = ["diesel::query_builder::QueryId"]
[migrations_directory]
dir = "storage_controller/migrations"
dir = "control_plane/attachment_service/migrations"


@@ -7,11 +7,6 @@ Below you will find a brief overview of each subdir in the source tree in alphab
Neon storage broker, providing messaging between safekeepers and pageservers.
[storage_broker.md](./storage_broker.md)
`storage_controller`:
Neon storage controller, manages a cluster of pageservers and exposes an API that enables
managing a many-sharded tenant as a single entity.
`/control_plane`:
Local control plane.


@@ -10,13 +10,11 @@ libc.workspace = true
once_cell.workspace = true
chrono.workspace = true
twox-hash.workspace = true
measured.workspace = true
workspace_hack.workspace = true
[target.'cfg(target_os = "linux")'.dependencies]
procfs.workspace = true
measured-process.workspace = true
[dev-dependencies]
rand = "0.8"


@@ -4,17 +4,6 @@
//! a default registry.
#![deny(clippy::undocumented_unsafe_blocks)]
use measured::{
label::{LabelGroupVisitor, LabelName, NoLabels},
metric::{
counter::CounterState,
gauge::GaugeState,
group::{Encoding, MetricValue},
name::{MetricName, MetricNameEncoder},
MetricEncoding, MetricFamilyEncoding,
},
FixedCardinalityLabel, LabelGroup, MetricGroup,
};
use once_cell::sync::Lazy;
use prometheus::core::{
Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec,
@@ -22,7 +11,6 @@ use prometheus::core::{
pub use prometheus::opts;
pub use prometheus::register;
pub use prometheus::Error;
use prometheus::Registry;
pub use prometheus::{core, default_registry, proto};
pub use prometheus::{exponential_buckets, linear_buckets};
pub use prometheus::{register_counter_vec, Counter, CounterVec};
@@ -35,6 +23,7 @@ pub use prometheus::{register_int_counter_vec, IntCounterVec};
pub use prometheus::{register_int_gauge, IntGauge};
pub use prometheus::{register_int_gauge_vec, IntGaugeVec};
pub use prometheus::{Encoder, TextEncoder};
use prometheus::{Registry, Result};
pub mod launch_timestamp;
mod wrappers;
@@ -70,7 +59,7 @@ static INTERNAL_REGISTRY: Lazy<Registry> = Lazy::new(Registry::new);
/// Register a collector in the internal registry. MUST be called before the first call to `gather()`.
/// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector
/// while holding the lock.
pub fn register_internal(c: Box<dyn Collector>) -> prometheus::Result<()> {
pub fn register_internal(c: Box<dyn Collector>) -> Result<()> {
INTERNAL_REGISTRY.register(c)
}
@@ -107,127 +96,6 @@ pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[
0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5,
];
pub struct BuildInfo {
pub revision: &'static str,
pub build_tag: &'static str,
}
// todo: allow label group without the set
impl LabelGroup for BuildInfo {
fn visit_values(&self, v: &mut impl LabelGroupVisitor) {
const REVISION: &LabelName = LabelName::from_str("revision");
v.write_value(REVISION, &self.revision);
const BUILD_TAG: &LabelName = LabelName::from_str("build_tag");
v.write_value(BUILD_TAG, &self.build_tag);
}
}
impl<T: Encoding> MetricFamilyEncoding<T> for BuildInfo
where
GaugeState: MetricEncoding<T>,
{
fn collect_family_into(
&self,
name: impl measured::metric::name::MetricNameEncoder,
enc: &mut T,
) -> Result<(), T::Err> {
enc.write_help(&name, "Build/version information")?;
GaugeState::write_type(&name, enc)?;
GaugeState {
count: std::sync::atomic::AtomicI64::new(1),
}
.collect_into(&(), self, name, enc)
}
}
#[derive(MetricGroup)]
#[metric(new(build_info: BuildInfo))]
pub struct NeonMetrics {
#[cfg(target_os = "linux")]
#[metric(namespace = "process")]
#[metric(init = measured_process::ProcessCollector::for_self())]
process: measured_process::ProcessCollector,
#[metric(namespace = "libmetrics")]
#[metric(init = LibMetrics::new(build_info))]
libmetrics: LibMetrics,
}
#[derive(MetricGroup)]
#[metric(new(build_info: BuildInfo))]
pub struct LibMetrics {
#[metric(init = build_info)]
build_info: BuildInfo,
#[metric(flatten)]
rusage: Rusage,
serve_count: CollectionCounter,
}
fn write_gauge<Enc: Encoding>(
x: i64,
labels: impl LabelGroup,
name: impl MetricNameEncoder,
enc: &mut Enc,
) -> Result<(), Enc::Err> {
enc.write_metric_value(name, labels, MetricValue::Int(x))
}
#[derive(Default)]
struct Rusage;
#[derive(FixedCardinalityLabel, Clone, Copy)]
#[label(singleton = "io_operation")]
enum IoOp {
Read,
Write,
}
impl<T: Encoding> MetricGroup<T> for Rusage
where
GaugeState: MetricEncoding<T>,
{
fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> {
const DISK_IO: &MetricName = MetricName::from_str("disk_io_bytes_total");
const MAXRSS: &MetricName = MetricName::from_str("maxrss_kb");
let ru = get_rusage_stats();
enc.write_help(
DISK_IO,
"Bytes written and read from disk, grouped by the operation (read|write)",
)?;
GaugeState::write_type(DISK_IO, enc)?;
write_gauge(ru.ru_inblock * BYTES_IN_BLOCK, IoOp::Read, DISK_IO, enc)?;
write_gauge(ru.ru_oublock * BYTES_IN_BLOCK, IoOp::Write, DISK_IO, enc)?;
enc.write_help(MAXRSS, "Memory usage (Maximum Resident Set Size)")?;
GaugeState::write_type(MAXRSS, enc)?;
write_gauge(ru.ru_maxrss, IoOp::Read, MAXRSS, enc)?;
Ok(())
}
}
#[derive(Default)]
struct CollectionCounter(CounterState);
impl<T: Encoding> MetricFamilyEncoding<T> for CollectionCounter
where
CounterState: MetricEncoding<T>,
{
fn collect_family_into(
&self,
name: impl measured::metric::name::MetricNameEncoder,
enc: &mut T,
) -> Result<(), T::Err> {
self.0.inc();
enc.write_help(&name, "Number of metric requests made")?;
self.0.collect_into(&(), NoLabels, name, enc)
}
}
pub fn set_build_info_metric(revision: &str, build_tag: &str) {
let metric = register_int_gauge_vec!(
"libmetrics_build_info",
@@ -237,7 +105,6 @@ pub fn set_build_info_metric(revision: &str, build_tag: &str) {
.expect("Failed to register build info metric");
metric.with_label_values(&[revision, build_tag]).set(1);
}
const BYTES_IN_BLOCK: i64 = 512;
// Records I/O stats in a "cross-platform" way.
// Compiles both on macOS and Linux, but current macOS implementation always returns 0 as values for I/O stats.
@@ -250,6 +117,7 @@ const BYTES_IN_BLOCK: i64 = 512;
fn update_rusage_metrics() {
let rusage_stats = get_rusage_stats();
const BYTES_IN_BLOCK: i64 = 512;
DISK_IO_BYTES
.with_label_values(&["read"])
.set(rusage_stats.ru_inblock * BYTES_IN_BLOCK);
@@ -283,7 +151,6 @@ macro_rules! register_int_counter_pair_vec {
}
}};
}
/// Create an [`IntCounterPair`] and registers to default registry.
#[macro_export(local_inner_macros)]
macro_rules! register_int_counter_pair {
@@ -321,10 +188,7 @@ impl<P: Atomic> GenericCounterPairVec<P> {
///
/// An error is returned if the number of label values is not the same as the
/// number of VariableLabels in Desc.
pub fn get_metric_with_label_values(
&self,
vals: &[&str],
) -> prometheus::Result<GenericCounterPair<P>> {
pub fn get_metric_with_label_values(&self, vals: &[&str]) -> Result<GenericCounterPair<P>> {
Ok(GenericCounterPair {
inc: self.inc.get_metric_with_label_values(vals)?,
dec: self.dec.get_metric_with_label_values(vals)?,
@@ -337,7 +201,7 @@ impl<P: Atomic> GenericCounterPairVec<P> {
self.get_metric_with_label_values(vals).unwrap()
}
pub fn remove_label_values(&self, res: &mut [prometheus::Result<()>; 2], vals: &[&str]) {
pub fn remove_label_values(&self, res: &mut [Result<()>; 2], vals: &[&str]) {
res[0] = self.inc.remove_label_values(vals);
res[1] = self.dec.remove_label_values(vals);
}


@@ -2,9 +2,9 @@ use std::str::FromStr;
/// Request/response types for the storage controller
/// API (`/control/v1` prefix). Implemented by the server
/// in [`storage_controller::http`]
/// in [`attachment_service::http`]
use serde::{Deserialize, Serialize};
use utils::id::{NodeId, TenantId};
use utils::id::NodeId;
use crate::{
models::{ShardParameters, TenantConfig},
@@ -42,12 +42,6 @@ pub struct NodeConfigureRequest {
pub scheduling: Option<NodeSchedulingPolicy>,
}
#[derive(Serialize, Deserialize)]
pub struct TenantPolicyRequest {
pub placement: Option<PlacementPolicy>,
pub scheduling: Option<ShardSchedulingPolicy>,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantLocateResponseShard {
pub shard_id: TenantShardId,
@@ -68,27 +62,12 @@ pub struct TenantLocateResponse {
#[derive(Serialize, Deserialize)]
pub struct TenantDescribeResponse {
pub tenant_id: TenantId,
pub shards: Vec<TenantDescribeResponseShard>,
pub stripe_size: ShardStripeSize,
pub policy: PlacementPolicy,
pub config: TenantConfig,
}
#[derive(Serialize, Deserialize)]
pub struct NodeDescribeResponse {
pub id: NodeId,
pub availability: NodeAvailabilityWrapper,
pub scheduling: NodeSchedulingPolicy,
pub listen_http_addr: String,
pub listen_http_port: u16,
pub listen_pg_addr: String,
pub listen_pg_port: u16,
}
#[derive(Serialize, Deserialize)]
pub struct TenantDescribeResponseShard {
pub tenant_shard_id: TenantShardId,
@@ -104,8 +83,6 @@ pub struct TenantDescribeResponseShard {
pub is_pending_compute_notification: bool,
/// A shard split is currently underway
pub is_splitting: bool,
pub scheduling_policy: ShardSchedulingPolicy,
}
/// Explicitly migrating a particular shard is a low level operation
@@ -120,7 +97,7 @@ pub struct TenantShardMigrateRequest {
/// Utilisation score indicating how good a candidate a pageserver
/// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`].
/// Lower values are better.
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Debug)]
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
pub struct UtilizationScore(pub u64);
impl UtilizationScore {
@@ -129,7 +106,7 @@ impl UtilizationScore {
}
}
#[derive(Serialize, Deserialize, Clone, Copy, Debug)]
#[derive(Serialize, Clone, Copy)]
#[serde(into = "NodeAvailabilityWrapper")]
pub enum NodeAvailability {
// Normal, happy state
@@ -152,7 +129,7 @@ impl Eq for NodeAvailability {}
// This wrapper provides serde functionality and it should only be used to
// communicate with external callers which don't know or care about the
// utilisation score of the pageserver it is targeting.
#[derive(Serialize, Deserialize, Clone, Copy, Debug)]
#[derive(Serialize, Deserialize, Clone)]
pub enum NodeAvailabilityWrapper {
Active,
Offline,
@@ -178,33 +155,22 @@ impl From<NodeAvailability> for NodeAvailabilityWrapper {
}
}
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
pub enum ShardSchedulingPolicy {
// Normal mode: the tenant's scheduled locations may be updated at will, including
// for non-essential optimization.
Active,
impl FromStr for NodeAvailability {
type Err = anyhow::Error;
// Disable optimizations, but permit scheduling when necessary to fulfil the PlacementPolicy.
// For example, this still permits a node's attachment location to change to a secondary in
// response to a node failure, or to assign a new secondary if a node was removed.
Essential,
// No scheduling: leave the shard running wherever it currently is. Even if the shard is
// unavailable, it will not be rescheduled to another node.
Pause,
// No reconciling: we will make no location_conf API calls to pageservers at all. If the
// shard is unavailable, it stays that way. If a node fails, this shard doesn't get failed over.
Stop,
}
impl Default for ShardSchedulingPolicy {
fn default() -> Self {
Self::Active
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
// This is used when parsing node configuration requests from neon-local.
// Assume the worst possible utilisation score
// and let it get updated via the heartbeats.
"active" => Ok(Self::Active(UtilizationScore::worst())),
"offline" => Ok(Self::Offline),
_ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
}
}
}
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
pub enum NodeSchedulingPolicy {
Active,
Filling,


@@ -20,7 +20,6 @@ use utils::{
history_buffer::HistoryBufferWithDropCounter,
id::{NodeId, TenantId, TimelineId},
lsn::Lsn,
serde_system_time,
};
use crate::controller_api::PlacementPolicy;
@@ -302,7 +301,6 @@ pub struct TenantConfig {
pub heatmap_period: Option<String>,
pub lazy_slru_download: Option<bool>,
pub timeline_get_throttle: Option<ThrottleConfig>,
pub image_layer_creation_check_threshold: Option<u8>,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
@@ -759,7 +757,11 @@ pub struct WalRedoManagerStatus {
#[derive(Default, Debug, Serialize, Deserialize, Clone)]
pub struct SecondaryProgress {
/// The remote storage LastModified time of the heatmap object we last downloaded.
pub heatmap_mtime: Option<serde_system_time::SystemTime>,
#[serde(
serialize_with = "opt_ser_rfc3339_millis",
deserialize_with = "opt_deser_rfc3339_millis"
)]
pub heatmap_mtime: Option<SystemTime>,
/// The number of layers currently on-disk
pub layers_downloaded: usize,
@@ -772,6 +774,29 @@ pub struct SecondaryProgress {
pub bytes_total: u64,
}
fn opt_ser_rfc3339_millis<S: serde::Serializer>(
ts: &Option<SystemTime>,
serializer: S,
) -> Result<S::Ok, S::Error> {
match ts {
Some(ts) => serializer.collect_str(&humantime::format_rfc3339_millis(*ts)),
None => serializer.serialize_none(),
}
}
fn opt_deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<Option<SystemTime>, D::Error>
where
D: serde::de::Deserializer<'de>,
{
let s: Option<String> = serde::de::Deserialize::deserialize(deserializer)?;
match s {
None => Ok(None),
Some(s) => humantime::parse_rfc3339(&s)
.map_err(serde::de::Error::custom)
.map(Some),
}
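For illustration: with these helpers, a heatmap_mtime of Some(t) serializes to an RFC3339 string with millisecond precision, e.g. "2024-03-27T11:44:09.123Z", while None serializes to JSON null and round-trips back to None.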
}
pub mod virtual_file {
#[derive(
Copy,


@@ -1,4 +1,4 @@
use utils::serde_system_time::SystemTime;
use std::time::SystemTime;
/// Pageserver current utilization and scoring for how good a candidate the pageserver would be for
/// the next tenant.
@@ -21,9 +21,28 @@ pub struct PageserverUtilization {
/// When was this snapshot captured, pageserver local time.
///
/// Use millis to give confidence that the value is regenerated often enough.
#[serde(
serialize_with = "ser_rfc3339_millis",
deserialize_with = "deser_rfc3339_millis"
)]
pub captured_at: SystemTime,
}
fn ser_rfc3339_millis<S: serde::Serializer>(
ts: &SystemTime,
serializer: S,
) -> Result<S::Ok, S::Error> {
serializer.collect_str(&humantime::format_rfc3339_millis(*ts))
}
fn deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<SystemTime, D::Error>
where
D: serde::de::Deserializer<'de>,
{
let s: String = serde::de::Deserialize::deserialize(deserializer)?;
humantime::parse_rfc3339(&s).map_err(serde::de::Error::custom)
}
/// openapi knows only `format: int64`, so avoid outputting a value that generated clients cannot parse.
///
/// Instead of a newtype, use this because a newtype would require handling deserializing values
@@ -50,9 +69,7 @@ mod tests {
disk_usage_bytes: u64::MAX,
free_space_bytes: 0,
utilization_score: u64::MAX,
captured_at: SystemTime(
std::time::SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779),
),
captured_at: SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779),
};
let s = serde_json::to_string(&doc).unwrap();


@@ -565,16 +565,6 @@ impl GenericRemoteStorage {
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct StorageMetadata(HashMap<String, String>);
impl<const N: usize> From<[(&str, &str); N]> for StorageMetadata {
fn from(arr: [(&str, &str); N]) -> Self {
let map: HashMap<String, String> = arr
.iter()
.map(|(k, v)| (k.to_string(), v.to_string()))
.collect();
Self(map)
}
}
/// External backup storage configuration, enough for creating a client for that storage.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RemoteStorageConfig {


@@ -57,6 +57,7 @@ enum MaybeEnabledStorage {
Disabled,
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledStorage {
async fn setup() -> Self {
ensure_logging_ready();
@@ -85,6 +86,7 @@ struct AzureWithTestBlobs {
remote_blobs: HashSet<RemotePath>,
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
async fn setup() -> Self {
ensure_logging_ready();
@@ -146,6 +148,7 @@ struct AzureWithSimpleTestBlobs {
remote_blobs: HashSet<RemotePath>,
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
async fn setup() -> Self {
ensure_logging_ready();


@@ -219,6 +219,7 @@ enum MaybeEnabledStorage {
Disabled,
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledStorage {
async fn setup() -> Self {
ensure_logging_ready();
@@ -247,6 +248,7 @@ struct S3WithTestBlobs {
remote_blobs: HashSet<RemotePath>,
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
async fn setup() -> Self {
ensure_logging_ready();
@@ -308,6 +310,7 @@ struct S3WithSimpleTestBlobs {
remote_blobs: HashSet<RemotePath>,
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
async fn setup() -> Self {
ensure_logging_ready();


@@ -22,7 +22,6 @@ camino.workspace = true
chrono.workspace = true
heapless.workspace = true
hex = { workspace = true, features = ["serde"] }
humantime.workspace = true
hyper = { workspace = true, features = ["full"] }
fail.workspace = true
futures = { workspace = true}


@@ -1,21 +0,0 @@
//! Wrapper around `std::env::var` for parsing environment variables.
use std::{fmt::Display, str::FromStr};
pub fn var<V, E>(varname: &str) -> Option<V>
where
V: FromStr<Err = E>,
E: Display,
{
match std::env::var(varname) {
Ok(s) => Some(
s.parse()
.map_err(|e| format!("failed to parse env var {varname}: {e:#}"))
.unwrap(),
),
Err(std::env::VarError::NotPresent) => None,
Err(std::env::VarError::NotUnicode(_)) => {
panic!("env var {varname} is not unicode")
}
}
}
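For reference, a hypothetical call site of the helper removed in this hunk (the environment variable name is invented for illustration); any FromStr target type works the same way:

// None if the variable is unset; panics with a descriptive message
// if it is set but does not parse as a u64.
let timeout_secs: Option<u64> = utils::env::var("NEON_EXAMPLE_TIMEOUT_SECS");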


@@ -63,7 +63,6 @@ pub mod measured_stream;
pub mod serde_percent;
pub mod serde_regex;
pub mod serde_system_time;
pub mod pageserver_feedback;
@@ -90,8 +89,6 @@ pub mod yielding_loop;
pub mod zstd;
pub mod env;
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
///
/// we have several cases:


@@ -182,18 +182,6 @@ where
}
}
/// Check if [`Self::wait_for`] or [`Self::wait_for_timeout`] would wait if called with `num`.
pub fn would_wait_for(&self, num: V) -> Result<(), V> {
let internal = self.internal.lock().unwrap();
let cnt = internal.current.cnt_value();
drop(internal);
if cnt >= num {
Ok(())
} else {
Err(cnt)
}
}
/// Register and return a channel that will be notified when a number arrives,
/// or None, if it has already arrived.
fn queue_for_wait(&self, num: V) -> Result<Option<Receiver<()>>, SeqWaitError> {


@@ -1,55 +0,0 @@
//! A `serde::{Deserialize,Serialize}` type for SystemTime with RFC3339 format and millisecond precision.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, serde::Serialize, serde::Deserialize)]
#[serde(transparent)]
pub struct SystemTime(
#[serde(
deserialize_with = "deser_rfc3339_millis",
serialize_with = "ser_rfc3339_millis"
)]
pub std::time::SystemTime,
);
fn ser_rfc3339_millis<S: serde::ser::Serializer>(
ts: &std::time::SystemTime,
serializer: S,
) -> Result<S::Ok, S::Error> {
serializer.collect_str(&humantime::format_rfc3339_millis(*ts))
}
fn deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<std::time::SystemTime, D::Error>
where
D: serde::de::Deserializer<'de>,
{
let s: String = serde::de::Deserialize::deserialize(deserializer)?;
humantime::parse_rfc3339(&s).map_err(serde::de::Error::custom)
}
#[cfg(test)]
mod tests {
use super::*;
/// Helper function to make a SystemTime have millisecond precision by truncating additional nanoseconds.
fn to_millisecond_precision(time: SystemTime) -> SystemTime {
match time.0.duration_since(std::time::SystemTime::UNIX_EPOCH) {
Ok(duration) => {
let total_millis = duration.as_secs() * 1_000 + u64::from(duration.subsec_millis());
SystemTime(
std::time::SystemTime::UNIX_EPOCH
+ std::time::Duration::from_millis(total_millis),
)
}
Err(_) => time,
}
}
#[test]
fn test_serialize_deserialize() {
let input = SystemTime(std::time::SystemTime::now());
let expected_serialized = format!("\"{}\"", humantime::format_rfc3339_millis(input.0));
let serialized = serde_json::to_string(&input).unwrap();
assert_eq!(expected_serialized, serialized);
let deserialized: SystemTime = serde_json::from_str(&expected_serialized).unwrap();
assert_eq!(to_millisecond_precision(input), deserialized);
}
}


@@ -27,25 +27,25 @@
//!
//! # Reference Numbers
//!
//! 2024-04-04 on i3en.3xlarge
//! 2024-03-20 on i3en.3xlarge
//!
//! ```text
//! short/1 time: [25.925 µs 26.060 µs 26.209 µs]
//! short/2 time: [31.277 µs 31.483 µs 31.722 µs]
//! short/4 time: [45.496 µs 45.831 µs 46.182 µs]
//! short/8 time: [84.298 µs 84.920 µs 85.566 µs]
//! short/16 time: [185.04 µs 186.41 µs 187.88 µs]
//! short/32 time: [385.01 µs 386.77 µs 388.70 µs]
//! short/64 time: [770.24 µs 773.04 µs 776.04 µs]
//! short/128 time: [1.5017 ms 1.5064 ms 1.5113 ms]
//! medium/1 time: [106.65 µs 107.20 µs 107.85 µs]
//! medium/2 time: [153.28 µs 154.24 µs 155.56 µs]
//! medium/4 time: [325.67 µs 327.01 µs 328.71 µs]
//! medium/8 time: [646.82 µs 650.17 µs 653.91 µs]
//! medium/16 time: [1.2645 ms 1.2701 ms 1.2762 ms]
//! medium/32 time: [2.4409 ms 2.4550 ms 2.4692 ms]
//! medium/64 time: [4.6814 ms 4.7114 ms 4.7408 ms]
//! medium/128 time: [8.7790 ms 8.9037 ms 9.0282 ms]
//! short/1 time: [26.483 µs 26.614 µs 26.767 µs]
//! short/2 time: [32.223 µs 32.465 µs 32.767 µs]
//! short/4 time: [47.203 µs 47.583 µs 47.984 µs]
//! short/8 time: [89.135 µs 89.612 µs 90.139 µs]
//! short/16 time: [190.12 µs 191.52 µs 192.88 µs]
//! short/32 time: [380.96 µs 382.63 µs 384.20 µs]
//! short/64 time: [736.86 µs 741.07 µs 745.03 µs]
//! short/128 time: [1.4106 ms 1.4206 ms 1.4294 ms]
//! medium/1 time: [111.81 µs 112.25 µs 112.79 µs]
//! medium/2 time: [158.26 µs 159.13 µs 160.21 µs]
//! medium/4 time: [334.65 µs 337.14 µs 340.07 µs]
//! medium/8 time: [675.32 µs 679.91 µs 685.25 µs]
//! medium/16 time: [1.2929 ms 1.2996 ms 1.3067 ms]
//! medium/32 time: [2.4295 ms 2.4461 ms 2.4623 ms]
//! medium/64 time: [4.3973 ms 4.4458 ms 4.4875 ms]
//! medium/128 time: [7.5955 ms 7.7847 ms 7.9481 ms]
//! ```
use bytes::{Buf, Bytes};


@@ -128,12 +128,12 @@ impl Client {
pub async fn timeline_info(
&self,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
timeline_id: TimelineId,
force_await_logical_size: ForceAwaitLogicalSize,
) -> Result<pageserver_api::models::TimelineInfo> {
let uri = format!(
"{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}",
"{}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
self.mgmt_api_endpoint
);
@@ -151,11 +151,11 @@ impl Client {
pub async fn keyspace(
&self,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Result<pageserver_api::models::partitioning::Partitioning> {
let uri = format!(
"{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/keyspace",
"{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/keyspace",
self.mgmt_api_endpoint
);
self.get(&uri)


@@ -11,6 +11,7 @@ default = []
anyhow.workspace = true
async-compression.workspace = true
async-stream.workspace = true
async-trait.workspace = true
byteorder.workspace = true
bytes.workspace = true
chrono = { workspace = true, features = ["serde"] }


@@ -43,8 +43,7 @@ pub async fn compact_tiered<E: CompactionJobExecutor>(
fanout: u64,
ctx: &E::RequestContext,
) -> anyhow::Result<()> {
assert!(fanout >= 1, "fanout needs to be at least 1 but is {fanout}");
let exp_base = fanout.max(2);
assert!(fanout >= 2);
// Start at L0
let mut current_level_no = 0;
let mut current_level_target_height = target_file_size;
@@ -107,7 +106,7 @@ pub async fn compact_tiered<E: CompactionJobExecutor>(
break;
}
current_level_no += 1;
current_level_target_height = current_level_target_height.saturating_mul(exp_base);
current_level_target_height = current_level_target_height.saturating_mul(fanout);
}
Ok(())
}
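Concretely, in the variant that introduces exp_base: with target_file_size = 128 MiB and fanout = 4, exp_base is 4, so the per-level target heights grow as 128 MiB, 512 MiB, 2 GiB, and so on; with the now-permitted degenerate fanout = 1, exp_base clamps to 2, so heights still double per level rather than staying flat.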


@@ -180,7 +180,7 @@ where
match top.deref_mut() {
LazyLoadLayer::Unloaded(ref mut l) => {
let fut = l.load_keys(this.ctx);
this.load_future.set(Some(Box::pin(fut)));
this.load_future.set(Some(fut));
continue;
}
LazyLoadLayer::Loaded(ref mut entries) => {


@@ -3,6 +3,7 @@
//!
//! All the heavy lifting is done by the create_image and create_delta
//! functions that the implementor provides.
use async_trait::async_trait;
use futures::Future;
use pageserver_api::{key::Key, keyspace::key_range_size};
use std::ops::Range;
@@ -140,16 +141,18 @@ pub trait CompactionLayer<K: CompactionKey + ?Sized> {
fn is_delta(&self) -> bool;
}
#[async_trait]
pub trait CompactionDeltaLayer<E: CompactionJobExecutor + ?Sized>: CompactionLayer<E::Key> {
type DeltaEntry<'a>: CompactionDeltaEntry<'a, E::Key>
where
Self: 'a;
/// Return all keys in this delta layer.
fn load_keys<'a>(
async fn load_keys<'a>(
&self,
ctx: &E::RequestContext,
) -> impl Future<Output = anyhow::Result<Vec<Self::DeltaEntry<'_>>>> + Send;
) -> anyhow::Result<Vec<Self::DeltaEntry<'_>>>;
}
pub trait CompactionImageLayer<E: CompactionJobExecutor + ?Sized>: CompactionLayer<E::Key> {}


@@ -2,6 +2,7 @@ mod draw;
use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp};
use async_trait::async_trait;
use futures::StreamExt;
use rand::Rng;
use tracing::info;
@@ -138,6 +139,7 @@ impl interface::CompactionLayer<Key> for Arc<MockDeltaLayer> {
}
}
#[async_trait]
impl interface::CompactionDeltaLayer<MockTimeline> for Arc<MockDeltaLayer> {
type DeltaEntry<'a> = MockRecord;


@@ -12,14 +12,9 @@ bytes.workspace = true
camino.workspace = true
clap = { workspace = true, features = ["string"] }
git-version.workspace = true
humantime.workspace = true
pageserver = { path = ".." }
pageserver_api.workspace = true
remote_storage = { path = "../../libs/remote_storage" }
postgres_ffi.workspace = true
tokio.workspace = true
tokio-util.workspace = true
toml_edit.workspace = true
utils.workspace = true
svg_fmt.workspace = true
workspace_hack.workspace = true


@@ -9,11 +9,6 @@ mod index_part;
mod layer_map_analyzer;
mod layers;
use std::{
str::FromStr,
time::{Duration, SystemTime},
};
use camino::{Utf8Path, Utf8PathBuf};
use clap::{Parser, Subcommand};
use index_part::IndexPartCmd;
@@ -25,16 +20,8 @@ use pageserver::{
tenant::{dump_layerfile_from_path, metadata::TimelineMetadata},
virtual_file,
};
use pageserver_api::shard::TenantShardId;
use postgres_ffi::ControlFileData;
use remote_storage::{RemotePath, RemoteStorageConfig};
use tokio_util::sync::CancellationToken;
use utils::{
id::TimelineId,
logging::{self, LogFormat, TracingErrorLayerEnablement},
lsn::Lsn,
project_git_version,
};
use utils::{lsn::Lsn, project_git_version};
project_git_version!(GIT_VERSION);
@@ -56,7 +43,6 @@ enum Commands {
#[command(subcommand)]
IndexPart(IndexPartCmd),
PrintLayerFile(PrintLayerFileCmd),
TimeTravelRemotePrefix(TimeTravelRemotePrefixCmd),
DrawTimeline {},
AnalyzeLayerMap(AnalyzeLayerMapCmd),
#[command(subcommand)]
@@ -82,26 +68,6 @@ struct PrintLayerFileCmd {
path: Utf8PathBuf,
}
/// Roll back the time for the specified prefix using S3 history.
///
/// The command is fairly low level and powerful. Validation is deliberately light,
/// which makes it flexible but also potentially dangerous.
#[derive(Parser)]
struct TimeTravelRemotePrefixCmd {
/// A configuration string for the remote_storage configuration.
///
/// Example: `remote_storage = { bucket_name = "aws-storage-bucket-name", bucket_region = "us-east-2" }`
config_toml_str: String,
/// Remote prefix to time-travel recover. For safety reasons, we require the prefix to
/// contain a timeline or tenant ID.
prefix: String,
/// Timestamp to travel to. Given in format like `2024-01-20T10:45:45Z`. Assumes UTC and second accuracy.
travel_to: String,
/// Timestamp of the start of the operation; it must be after any changes we want to roll
/// back. A timestamp from a few seconds before invoking the command works. Same format as `travel_to`.
done_if_after: Option<String>,
}
#[derive(Parser)]
struct AnalyzeLayerMapCmd {
/// Pageserver data path
@@ -112,14 +78,6 @@ struct AnalyzeLayerMapCmd {
#[tokio::main]
async fn main() -> anyhow::Result<()> {
logging::init(
LogFormat::Plain,
TracingErrorLayerEnablement::EnableWithRustLogFilter,
logging::Output::Stdout,
)?;
logging::replace_panic_hook_with_tracing_panic_hook().forget();
let cli = CliOpts::parse();
match cli.command {
@@ -147,42 +105,6 @@ async fn main() -> anyhow::Result<()> {
print_layerfile(&cmd.path).await?;
}
}
Commands::TimeTravelRemotePrefix(cmd) => {
let timestamp = humantime::parse_rfc3339(&cmd.travel_to)
.map_err(|_e| anyhow::anyhow!("Invalid time for travel_to: '{}'", cmd.travel_to))?;
let done_if_after = if let Some(done_if_after) = &cmd.done_if_after {
humantime::parse_rfc3339(done_if_after).map_err(|_e| {
anyhow::anyhow!("Invalid time for done_if_after: '{}'", done_if_after)
})?
} else {
const SAFETY_MARGIN: Duration = Duration::from_secs(3);
tokio::time::sleep(SAFETY_MARGIN).await;
// Convert to string representation and back to get rid of sub-second values
let done_if_after = SystemTime::now();
tokio::time::sleep(SAFETY_MARGIN).await;
done_if_after
};
let timestamp = strip_subsecond(timestamp);
let done_if_after = strip_subsecond(done_if_after);
let Some(prefix) = validate_prefix(&cmd.prefix) else {
println!("specified prefix '{}' failed validation", cmd.prefix);
return Ok(());
};
let toml_document = toml_edit::Document::from_str(&cmd.config_toml_str)?;
let toml_item = toml_document
.get("remote_storage")
.expect("need remote_storage");
let config = RemoteStorageConfig::from_toml(toml_item)?.expect("incomplete config");
let storage = remote_storage::GenericRemoteStorage::from_config(&config);
let cancel = CancellationToken::new();
storage
.unwrap()
.time_travel_recover(Some(&prefix), timestamp, done_if_after, &cancel)
.await?;
}
};
Ok(())
}
@@ -263,89 +185,3 @@ fn handle_metadata(
Ok(())
}
/// Ensures that the given S3 prefix is sufficiently constrained.
/// The command is already very risky and we don't want to expose something that allows
/// usually unintentional and quite catastrophic time travel of an entire bucket: a major
/// catastrophe that would be only one character change away (similar to "rm -r /home /username/foobar").
fn validate_prefix(prefix: &str) -> Option<RemotePath> {
if prefix.is_empty() {
// Empty prefix means we want to specify the *whole* bucket
return None;
}
let components = prefix.split('/').collect::<Vec<_>>();
let (last, components) = {
let last = components.last()?;
if last.is_empty() {
(
components.iter().nth_back(1)?,
&components[..(components.len() - 1)],
)
} else {
(last, &components[..])
}
};
'valid: {
if let Ok(_timeline_id) = TimelineId::from_str(last) {
// Ends in either a tenant or timeline ID
break 'valid;
}
if *last == "timelines" {
if let Some(before_last) = components.iter().nth_back(1) {
if let Ok(_tenant_id) = TenantShardId::from_str(before_last) {
// Has a valid tenant id
break 'valid;
}
}
}
return None;
}
RemotePath::from_string(prefix).ok()
}
fn strip_subsecond(timestamp: SystemTime) -> SystemTime {
let ts_str = humantime::format_rfc3339_seconds(timestamp).to_string();
humantime::parse_rfc3339(&ts_str).expect("can't parse just created timestamp")
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_validate_prefix() {
assert_eq!(validate_prefix(""), None);
assert_eq!(validate_prefix("/"), None);
#[track_caller]
fn assert_valid(prefix: &str) {
let remote_path = RemotePath::from_string(prefix).unwrap();
assert_eq!(validate_prefix(prefix), Some(remote_path));
}
assert_valid("wal/3aa8fcc61f6d357410b7de754b1d9001/641e5342083b2235ee3deb8066819683/");
// Path is not relative but absolute
assert_eq!(
validate_prefix(
"/wal/3aa8fcc61f6d357410b7de754b1d9001/641e5342083b2235ee3deb8066819683/"
),
None
);
assert_valid("wal/3aa8fcc61f6d357410b7de754b1d9001/");
// Partial tenant IDs should be invalid, S3 will match all tenants with the specific ID prefix
assert_eq!(validate_prefix("wal/3aa8fcc61f6d357410b7d"), None);
assert_eq!(validate_prefix("wal"), None);
assert_eq!(validate_prefix("/wal/"), None);
assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001");
// Partial tenant ID
assert_eq!(
validate_prefix("pageserver/v1/tenants/3aa8fcc61f6d357410b"),
None
);
assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines");
assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001-0004/timelines");
assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/");
assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/641e5342083b2235ee3deb8066819683");
assert_eq!(validate_prefix("pageserver/v1/tenants/"), None);
}
}

View File

@@ -1,5 +1,4 @@
use anyhow::Context;
use pageserver_api::shard::TenantShardId;
use pageserver_client::mgmt_api::ForceAwaitLogicalSize;
use pageserver_client::page_service::BasebackupRequest;
@@ -96,7 +95,7 @@ async fn main_impl(
let timeline = *timeline;
let info = mgmt_api_client
.timeline_info(
TenantShardId::unsharded(timeline.tenant_id),
timeline.tenant_id,
timeline.timeline_id,
ForceAwaitLogicalSize::No,
)

View File

@@ -4,7 +4,6 @@ use pageserver_api::key::{is_rel_block_key, key_to_rel_block, Key};
use pageserver_api::keyspace::KeySpaceAccum;
use pageserver_api::models::PagestreamGetPageRequest;
use pageserver_api::shard::TenantShardId;
use tokio_util::sync::CancellationToken;
use utils::id::TenantTimelineId;
use utils::lsn::Lsn;
@@ -174,10 +173,7 @@ async fn main_impl(
let timeline = *timeline;
async move {
let partitioning = mgmt_api_client
.keyspace(
TenantShardId::unsharded(timeline.tenant_id),
timeline.timeline_id,
)
.keyspace(timeline.tenant_id, timeline.timeline_id)
.await?;
let lsn = partitioning.at_lsn;
let start = Instant::now();

View File

@@ -1,7 +1,6 @@
use std::sync::Arc;
use humantime::Duration;
use pageserver_api::shard::TenantShardId;
use tokio::task::JoinSet;
use utils::id::TenantTimelineId;
@@ -60,11 +59,7 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
let mgmt_api_client = Arc::clone(&mgmt_api_client);
js.spawn(async move {
let info = mgmt_api_client
.timeline_info(
TenantShardId::unsharded(tl.tenant_id),
tl.timeline_id,
ForceAwaitLogicalSize::Yes,
)
.timeline_info(tl.tenant_id, tl.timeline_id, ForceAwaitLogicalSize::Yes)
.await
.unwrap();
@@ -79,11 +74,7 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
while !info.current_logical_size_is_accurate {
ticker.tick().await;
info = mgmt_api_client
.timeline_info(
TenantShardId::unsharded(tl.tenant_id),
tl.timeline_id,
ForceAwaitLogicalSize::Yes,
)
.timeline_info(tl.tenant_id, tl.timeline_id, ForceAwaitLogicalSize::Yes)
.await
.unwrap();
}

View File

@@ -18,7 +18,6 @@ use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
use pageserver::task_mgr::WALRECEIVER_RUNTIME;
use pageserver::tenant::{secondary, TenantSharedResources};
use remote_storage::GenericRemoteStorage;
use tokio::signal::unix::SignalKind;
use tokio::time::Instant;
use tracing::*;
@@ -672,37 +671,42 @@ fn start_pageserver(
let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());
// All started up! Now just sit and wait for shutdown signal.
{
BACKGROUND_RUNTIME.block_on(async move {
let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt()).unwrap();
let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate()).unwrap();
let mut sigquit = tokio::signal::unix::signal(SignalKind::quit()).unwrap();
let signal = tokio::select! {
_ = sigquit.recv() => {
info!("Got signal SIGQUIT. Terminating in immediate shutdown mode",);
std::process::exit(111);
}
_ = sigint.recv() => { "SIGINT" },
_ = sigterm.recv() => { "SIGTERM" },
};
use signal_hook::consts::*;
let signal_handler = BACKGROUND_RUNTIME.spawn_blocking(move || {
let mut signals =
signal_hook::iterator::Signals::new([SIGINT, SIGTERM, SIGQUIT]).unwrap();
return signals
.forever()
.next()
.expect("forever() never returns None unless explicitly closed");
});
let signal = BACKGROUND_RUNTIME
.block_on(signal_handler)
.expect("join error");
match signal {
SIGQUIT => {
info!("Got signal {signal}. Terminating in immediate shutdown mode",);
std::process::exit(111);
}
SIGINT | SIGTERM => {
info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",);
info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",);
// This cancels the `shutdown_pageserver` cancellation tree.
// Right now that tree doesn't reach very far, and `task_mgr` is used instead.
// The plan is to change that over time.
shutdown_pageserver.take();
let bg_remote_storage = remote_storage.clone();
let bg_deletion_queue = deletion_queue.clone();
pageserver::shutdown_pageserver(
&tenant_manager,
bg_remote_storage.map(|_| bg_deletion_queue),
0,
)
.await;
unreachable!()
})
// This cancels the `shutdown_pageserver` cancellation tree.
// Right now that tree doesn't reach very far, and `task_mgr` is used instead.
// The plan is to change that over time.
shutdown_pageserver.take();
let bg_remote_storage = remote_storage.clone();
let bg_deletion_queue = deletion_queue.clone();
BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(
&tenant_manager,
bg_remote_storage.map(|_| bg_deletion_queue),
0,
));
unreachable!()
}
_ => unreachable!(),
}
}
}
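The switch from tokio signal futures to a blocking signal_hook iterator can be exercised in isolation; a minimal sketch, assuming the signal_hook crate:

use signal_hook::consts::{SIGINT, SIGQUIT, SIGTERM};
use signal_hook::iterator::Signals;

// Block the calling thread until one of the listed signals arrives and
// return its number (mirrors the spawn_blocking body above).
fn wait_for_signal() -> i32 {
    let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT]).expect("register handler");
    signals
        .forever()
        .next()
        .expect("forever() never returns None unless the iterator is closed")
}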

View File

@@ -12,7 +12,7 @@ use pageserver_api::{
use serde::{de::DeserializeOwned, Serialize};
use tokio_util::sync::CancellationToken;
use url::Url;
use utils::{backoff, failpoint_support, generation::Generation, id::NodeId};
use utils::{backoff, generation::Generation, id::NodeId};
use crate::{
config::{NodeMetadata, PageServerConf},
@@ -210,10 +210,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
.collect(),
};
failpoint_support::sleep_millis_async!("control-plane-client-validate-sleep", &self.cancel);
if self.cancel.is_cancelled() {
return Err(RetryForeverError::ShuttingDown);
}
fail::fail_point!("control-plane-client-validate");
let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?;

View File

@@ -1629,7 +1629,7 @@ components:
type: integer
format: int64
minimum: 0
description: The amount of disk space currently used.
description: The amount of disk space currently utilized by layer files.
free_space_bytes:
type: integer
format: int64

View File

@@ -993,26 +993,11 @@ async fn tenant_status(
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let state = get_state(&request);
// In tests, sometimes we want to query the state of a tenant without auto-activating it if it's currently waiting.
let activate = true;
#[cfg(feature = "testing")]
let activate = parse_query_param(&request, "activate")?.unwrap_or(activate);
let tenant_info = async {
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
if activate {
// This is advisory: we prefer to let the tenant activate on-demand when this function is
// called, but it is still valid to return 200 and describe the current state of the tenant
// if it doesn't make it into an active state.
tenant
.wait_to_become_active(ACTIVE_TENANT_TIMEOUT)
.await
.ok();
}
// Calculate total physical size of all timelines
let mut current_physical_size = 0;
for timeline in tenant.list_timelines().iter() {

View File

@@ -8,7 +8,6 @@ use anyhow::{bail, ensure, Context, Result};
use bytes::Bytes;
use camino::Utf8Path;
use futures::StreamExt;
use pageserver_api::key::rel_block_to_key;
use tokio::io::{AsyncRead, AsyncReadExt};
use tokio_tar::Archive;
use tracing::*;
@@ -171,10 +170,7 @@ async fn import_rel(
let r = reader.read_exact(&mut buf).await;
match r {
Ok(_) => {
let key = rel_block_to_key(rel, blknum);
if modification.tline.get_shard_identity().is_key_local(&key) {
modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?;
}
modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?;
}
// TODO: UnexpectedEof is expected

View File

@@ -1483,18 +1483,12 @@ pub(crate) static DELETION_QUEUE: Lazy<DeletionQueueMetrics> = Lazy::new(|| {
});
pub(crate) struct WalIngestMetrics {
pub(crate) bytes_received: IntCounter,
pub(crate) records_received: IntCounter,
pub(crate) records_committed: IntCounter,
pub(crate) records_filtered: IntCounter,
}
pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
bytes_received: register_int_counter!(
"pageserver_wal_ingest_bytes_received",
"Bytes of WAL ingested from safekeepers",
)
.unwrap(),
records_received: register_int_counter!(
"pageserver_wal_ingest_records_received",
"Number of WAL records received from safekeepers"
@@ -2100,7 +2094,6 @@ pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
use futures::Future;
use pin_project_lite::pin_project;
use std::collections::HashMap;
use std::num::NonZeroUsize;
use std::pin::Pin;
use std::sync::{Arc, Mutex};
use std::task::{Context, Poll};
@@ -2670,26 +2663,6 @@ pub(crate) mod disk_usage_based_eviction {
pub(crate) static METRICS: Lazy<Metrics> = Lazy::new(Metrics::default);
}
static TOKIO_EXECUTOR_THREAD_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_tokio_executor_thread_configured_count",
"Total number of configued tokio executor threads in the process.
The `setup` label denotes whether we're running with multiple runtimes or a single runtime.",
&["setup"],
)
.unwrap()
});
pub(crate) fn set_tokio_runtime_setup(setup: &str, num_threads: NonZeroUsize) {
static SERIALIZE: std::sync::Mutex<()> = std::sync::Mutex::new(());
let _guard = SERIALIZE.lock().unwrap();
TOKIO_EXECUTOR_THREAD_COUNT.reset();
TOKIO_EXECUTOR_THREAD_COUNT
.get_metric_with_label_values(&[setup])
.unwrap()
.set(u64::try_from(num_threads.get()).unwrap());
}
pub fn preinitialize_metrics() {
// Python tests need these and on some we do alerting.
//

View File

@@ -876,13 +876,7 @@ impl PageServerHandler {
if lsn <= last_record_lsn {
lsn = last_record_lsn;
} else {
timeline
.wait_lsn(
lsn,
crate::tenant::timeline::WaitLsnWaiter::PageService,
ctx,
)
.await?;
timeline.wait_lsn(lsn, ctx).await?;
// Since we waited for 'lsn' to arrive, that is now the last
// record LSN. (Or close enough for our purposes; the
// last-record LSN can advance immediately after we return
@@ -894,13 +888,7 @@ impl PageServerHandler {
"invalid LSN(0) in request".into(),
));
}
timeline
.wait_lsn(
lsn,
crate::tenant::timeline::WaitLsnWaiter::PageService,
ctx,
)
.await?;
timeline.wait_lsn(lsn, ctx).await?;
}
if lsn < **latest_gc_cutoff_lsn {
@@ -1227,13 +1215,7 @@ impl PageServerHandler {
if let Some(lsn) = lsn {
// Backup was requested at a particular LSN. Wait for it to arrive.
info!("waiting for {}", lsn);
timeline
.wait_lsn(
lsn,
crate::tenant::timeline::WaitLsnWaiter::PageService,
ctx,
)
.await?;
timeline.wait_lsn(lsn, ctx).await?;
timeline
.check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn)
.context("invalid basebackup lsn")?;

View File

@@ -33,14 +33,13 @@
use std::collections::HashMap;
use std::fmt;
use std::future::Future;
use std::num::NonZeroUsize;
use std::panic::AssertUnwindSafe;
use std::str::FromStr;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::{Arc, Mutex};
use futures::FutureExt;
use pageserver_api::shard::TenantShardId;
use tokio::runtime::Runtime;
use tokio::task::JoinHandle;
use tokio::task_local;
use tokio_util::sync::CancellationToken;
@@ -49,11 +48,8 @@ use tracing::{debug, error, info, warn};
use once_cell::sync::Lazy;
use utils::env;
use utils::id::TimelineId;
use crate::metrics::set_tokio_runtime_setup;
//
// There are four runtimes:
//
@@ -102,119 +98,52 @@ use crate::metrics::set_tokio_runtime_setup;
// other operations, if the upload tasks e.g. get blocked on locks. It shouldn't
// happen, but still.
//
pub static COMPUTE_REQUEST_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name("compute request worker")
.enable_all()
.build()
.expect("Failed to create compute request runtime")
});
pub(crate) static TOKIO_WORKER_THREADS: Lazy<NonZeroUsize> = Lazy::new(|| {
pub static MGMT_REQUEST_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name("mgmt request worker")
.enable_all()
.build()
.expect("Failed to create mgmt request runtime")
});
pub static WALRECEIVER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name("walreceiver worker")
.enable_all()
.build()
.expect("Failed to create walreceiver runtime")
});
pub static BACKGROUND_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name("background op worker")
// if you change the number of worker threads please change the constant below
.enable_all()
.build()
.expect("Failed to create background op runtime")
});
pub(crate) static BACKGROUND_RUNTIME_WORKER_THREADS: Lazy<usize> = Lazy::new(|| {
// force init and thus panics
let _ = BACKGROUND_RUNTIME.handle();
// replicates tokio-1.28.1::loom::sys::num_cpus which is not available publicly
// tokio would have already panicked for parsing errors or NotUnicode
//
// this will be wrong if any of the runtimes has its worker threads configured to something
// else, but that has not been needed in a long time.
NonZeroUsize::new(
std::env::var("TOKIO_WORKER_THREADS")
.map(|s| s.parse::<usize>().unwrap())
.unwrap_or_else(|_e| usize::max(2, num_cpus::get())),
)
.expect("the max() ensures that this is not zero")
std::env::var("TOKIO_WORKER_THREADS")
.map(|s| s.parse::<usize>().unwrap())
.unwrap_or_else(|_e| usize::max(2, num_cpus::get()))
});
enum TokioRuntimeMode {
SingleThreaded,
MultiThreaded { num_workers: NonZeroUsize },
}
impl FromStr for TokioRuntimeMode {
type Err = String;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"current_thread" => Ok(TokioRuntimeMode::SingleThreaded),
s => match s.strip_prefix("multi_thread:") {
Some("default") => Ok(TokioRuntimeMode::MultiThreaded {
num_workers: *TOKIO_WORKER_THREADS,
}),
Some(suffix) => {
let num_workers = suffix.parse::<NonZeroUsize>().map_err(|e| {
format!(
"invalid number of multi-threaded runtime workers ({suffix:?}): {e}",
)
})?;
Ok(TokioRuntimeMode::MultiThreaded { num_workers })
}
None => Err(format!("invalid runtime config: {s:?}")),
},
}
}
}
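A hedged sketch of the values accepted for `NEON_PAGESERVER_USE_ONE_RUNTIME`, exercising the `FromStr` impl above:

#[test]
fn runtime_mode_parsing_sketch() {
    assert!(matches!(
        "current_thread".parse::<TokioRuntimeMode>(),
        Ok(TokioRuntimeMode::SingleThreaded)
    ));
    assert!(matches!(
        "multi_thread:8".parse::<TokioRuntimeMode>(),
        Ok(TokioRuntimeMode::MultiThreaded { num_workers }) if num_workers.get() == 8
    ));
    // "multi_thread:default" falls back to TOKIO_WORKER_THREADS; zero is rejected.
    assert!("multi_thread:0".parse::<TokioRuntimeMode>().is_err());
}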
static ONE_RUNTIME: Lazy<Option<tokio::runtime::Runtime>> = Lazy::new(|| {
let thread_name = "pageserver-tokio";
let Some(mode) = env::var("NEON_PAGESERVER_USE_ONE_RUNTIME") else {
// If the env var is not set, leave this static as None.
set_tokio_runtime_setup(
"multiple-runtimes",
NUM_MULTIPLE_RUNTIMES
.checked_mul(*TOKIO_WORKER_THREADS)
.unwrap(),
);
return None;
};
Some(match mode {
TokioRuntimeMode::SingleThreaded => {
set_tokio_runtime_setup("one-runtime-single-threaded", NonZeroUsize::new(1).unwrap());
tokio::runtime::Builder::new_current_thread()
.thread_name(thread_name)
.enable_all()
.build()
.expect("failed to create one single runtime")
}
TokioRuntimeMode::MultiThreaded { num_workers } => {
set_tokio_runtime_setup("one-runtime-multi-threaded", num_workers);
tokio::runtime::Builder::new_multi_thread()
.thread_name(thread_name)
.enable_all()
.worker_threads(num_workers.get())
.build()
.expect("failed to create one multi-threaded runtime")
}
})
});
/// Declare a lazy static variable named `$varname` that will resolve
/// to a tokio runtime handle. If the env var `NEON_PAGESERVER_USE_ONE_RUNTIME`
/// is set, this will resolve to `ONE_RUNTIME`. Otherwise, the macro invocation
/// declares a separate runtime and the lazy static variable `$varname`
/// will resolve to that separate runtime.
///
/// The result is that `$varname.spawn()` will use `ONE_RUNTIME` if
/// `NEON_PAGESERVER_USE_ONE_RUNTIME` is set, and will use the separate runtime
/// otherwise.
macro_rules! pageserver_runtime {
($varname:ident, $name:literal) => {
pub static $varname: Lazy<&'static tokio::runtime::Runtime> = Lazy::new(|| {
if let Some(runtime) = &*ONE_RUNTIME {
return runtime;
}
static RUNTIME: Lazy<tokio::runtime::Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name($name)
.worker_threads(TOKIO_WORKER_THREADS.get())
.enable_all()
.build()
.expect(std::concat!("Failed to create runtime ", $name))
});
&*RUNTIME
});
};
}
pageserver_runtime!(COMPUTE_REQUEST_RUNTIME, "compute request worker");
pageserver_runtime!(MGMT_REQUEST_RUNTIME, "mgmt request worker");
pageserver_runtime!(WALRECEIVER_RUNTIME, "walreceiver worker");
pageserver_runtime!(BACKGROUND_RUNTIME, "background op worker");
// Bump this number when adding a new pageserver_runtime!
// SAFETY: the literal 4 is non-zero
const NUM_MULTIPLE_RUNTIMES: NonZeroUsize = unsafe { NonZeroUsize::new_unchecked(4) };
#[derive(Debug, Clone, Copy)]
pub struct PageserverTaskId(u64);
@@ -285,12 +214,13 @@ pub enum TaskKind {
/// Internally, `Client` hands over requests to the `Connection` object.
/// The `Connection` object is responsible for speaking the wire protocol.
///
/// Walreceiver uses a legacy abstraction called `TaskHandle` to represent the activity of establishing and handling a connection.
/// Walreceiver uses its own abstraction called `TaskHandle` to represent the activity of establishing and handling a connection.
/// That abstraction doesn't use `task_mgr`.
/// The `WalReceiverManager` task ensures that this `TaskHandle` task does not outlive the `WalReceiverManager` task.
/// For the `RequestContext` that we hand to the TaskHandle, we use the [`WalReceiverConnectionHandler`] task kind.
///
/// Once the connection is established, the `TaskHandle` task spawns a
/// [`WalReceiverConnectionPoller`] task that is responsible for polling
/// Once the connection is established, the `TaskHandle` task creates a
/// [`WalReceiverConnectionPoller`] task_mgr task that is responsible for polling
/// the `Connection` object.
/// A `CancellationToken` created by the `TaskHandle` task ensures
/// that the [`WalReceiverConnectionPoller`] task will cancel soon after the `TaskHandle` is dropped.
@@ -300,6 +230,7 @@ pub enum TaskKind {
WalReceiverManager,
/// The `TaskHandle` task that executes `handle_walreceiver_connection`.
/// Not a `task_mgr` task, but we use this `TaskKind` for its `RequestContext`.
/// See the comment on [`WalReceiverManager`].
///
/// [`WalReceiverManager`]: Self::WalReceiverManager

View File

@@ -12,7 +12,6 @@
//!
use anyhow::{bail, Context};
use arc_swap::ArcSwap;
use camino::Utf8Path;
use camino::Utf8PathBuf;
use enumset::EnumSet;
@@ -99,7 +98,7 @@ use std::ops::Bound::Included;
use std::sync::atomic::AtomicU64;
use std::sync::atomic::Ordering;
use std::sync::Arc;
use std::sync::Mutex;
use std::sync::{Mutex, RwLock};
use std::time::{Duration, Instant};
use crate::span;
@@ -261,7 +260,7 @@ pub struct Tenant {
// We keep the TenantConfOpt struct here to preserve the information
// about parameters that are not set.
// This is necessary to allow global config updates.
tenant_conf: Arc<ArcSwap<AttachedTenantConf>>,
tenant_conf: Arc<RwLock<AttachedTenantConf>>,
tenant_shard_id: TenantShardId,
@@ -1516,7 +1515,7 @@ impl Tenant {
// sizes etc. and that would get confused if the previous page versions
// are not in the repository yet.
ancestor_timeline
.wait_lsn(*lsn, timeline::WaitLsnWaiter::Tenant, ctx)
.wait_lsn(*lsn, ctx)
.await
.map_err(|e| match e {
e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState) => {
@@ -1607,7 +1606,7 @@ impl Tenant {
);
{
let conf = self.tenant_conf.load();
let conf = self.tenant_conf.read().unwrap();
if !conf.location.may_delete_layers_hint() {
info!("Skipping GC in location state {:?}", conf.location);
@@ -1634,7 +1633,7 @@ impl Tenant {
}
{
let conf = self.tenant_conf.load();
let conf = self.tenant_conf.read().unwrap();
if !conf.location.may_delete_layers_hint() || !conf.location.may_upload_layers_hint() {
info!("Skipping compaction in location state {:?}", conf.location);
return Ok(());
@@ -1783,7 +1782,7 @@ impl Tenant {
async fn shutdown(
&self,
shutdown_progress: completion::Barrier,
shutdown_mode: timeline::ShutdownMode,
freeze_and_flush: bool,
) -> Result<(), completion::Barrier> {
span::debug_assert_current_span_has_tenant_id();
@@ -1830,8 +1829,16 @@ impl Tenant {
timelines.values().for_each(|timeline| {
let timeline = Arc::clone(timeline);
let timeline_id = timeline.timeline_id;
let span = tracing::info_span!("timeline_shutdown", %timeline_id, ?shutdown_mode);
js.spawn(async move { timeline.shutdown(shutdown_mode).instrument(span).await });
let span =
tracing::info_span!("timeline_shutdown", %timeline_id, ?freeze_and_flush);
js.spawn(async move {
if freeze_and_flush {
timeline.flush_and_shutdown().instrument(span).await
} else {
timeline.shutdown().instrument(span).await
}
});
})
};
// test_long_timeline_create_then_tenant_delete is leaning on this message
@@ -2075,14 +2082,14 @@ impl Tenant {
}
pub(crate) fn get_attach_mode(&self) -> AttachmentMode {
self.tenant_conf.load().location.attach_mode
self.tenant_conf.read().unwrap().location.attach_mode
}
/// For API access: generate a LocationConfig equivalent to the one that would be used to
/// create a Tenant in the same state. Do not use this in hot paths: it's for relatively
/// rare external API calls, like a reconciliation at startup.
pub(crate) fn get_location_conf(&self) -> models::LocationConfig {
let conf = self.tenant_conf.load();
let conf = self.tenant_conf.read().unwrap();
let location_config_mode = match conf.location.attach_mode {
AttachmentMode::Single => models::LocationConfigMode::AttachedSingle,
@@ -2229,7 +2236,7 @@ where
impl Tenant {
pub fn tenant_specific_overrides(&self) -> TenantConfOpt {
self.tenant_conf.load().tenant_conf.clone()
self.tenant_conf.read().unwrap().tenant_conf.clone()
}
pub fn effective_config(&self) -> TenantConf {
@@ -2238,84 +2245,84 @@ impl Tenant {
}
pub fn get_checkpoint_distance(&self) -> u64 {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
tenant_conf
.checkpoint_distance
.unwrap_or(self.conf.default_tenant_conf.checkpoint_distance)
}
pub fn get_checkpoint_timeout(&self) -> Duration {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
tenant_conf
.checkpoint_timeout
.unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout)
}
pub fn get_compaction_target_size(&self) -> u64 {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
tenant_conf
.compaction_target_size
.unwrap_or(self.conf.default_tenant_conf.compaction_target_size)
}
pub fn get_compaction_period(&self) -> Duration {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
tenant_conf
.compaction_period
.unwrap_or(self.conf.default_tenant_conf.compaction_period)
}
pub fn get_compaction_threshold(&self) -> usize {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
tenant_conf
.compaction_threshold
.unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
}
pub fn get_gc_horizon(&self) -> u64 {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
tenant_conf
.gc_horizon
.unwrap_or(self.conf.default_tenant_conf.gc_horizon)
}
pub fn get_gc_period(&self) -> Duration {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
tenant_conf
.gc_period
.unwrap_or(self.conf.default_tenant_conf.gc_period)
}
pub fn get_image_creation_threshold(&self) -> usize {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
tenant_conf
.image_creation_threshold
.unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
}
pub fn get_pitr_interval(&self) -> Duration {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
tenant_conf
.pitr_interval
.unwrap_or(self.conf.default_tenant_conf.pitr_interval)
}
pub fn get_trace_read_requests(&self) -> bool {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
tenant_conf
.trace_read_requests
.unwrap_or(self.conf.default_tenant_conf.trace_read_requests)
}
pub fn get_min_resident_size_override(&self) -> Option<u64> {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
tenant_conf
.min_resident_size_override
.or(self.conf.default_tenant_conf.min_resident_size_override)
}
pub fn get_heatmap_period(&self) -> Option<Duration> {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
let heatmap_period = tenant_conf
.heatmap_period
.unwrap_or(self.conf.default_tenant_conf.heatmap_period);
@@ -2327,40 +2334,26 @@ impl Tenant {
}
pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
// Use read-copy-update in order to avoid overwriting the location config
// state if this races with [`Tenant::set_new_location_config`]. Note that
// this race is not possible if both request types come from the storage
// controller (as they should!) because an exclusive op lock is required
// on the storage controller side.
self.tenant_conf.rcu(|inner| {
Arc::new(AttachedTenantConf {
tenant_conf: new_tenant_conf.clone(),
location: inner.location,
})
});
self.tenant_conf_updated(&new_tenant_conf);
self.tenant_conf.write().unwrap().tenant_conf = new_tenant_conf;
self.tenant_conf_updated();
// Don't hold self.timelines.lock() during the notifies.
// There's no risk of deadlock right now, but there could be if we consolidate
// mutexes in struct Timeline in the future.
let timelines = self.list_timelines();
for timeline in timelines {
timeline.tenant_conf_updated(&new_tenant_conf);
timeline.tenant_conf_updated();
}
}
pub(crate) fn set_new_location_config(&self, new_conf: AttachedTenantConf) {
let new_tenant_conf = new_conf.tenant_conf.clone();
self.tenant_conf.store(Arc::new(new_conf));
self.tenant_conf_updated(&new_tenant_conf);
*self.tenant_conf.write().unwrap() = new_conf;
self.tenant_conf_updated();
// Don't hold self.timelines.lock() during the notifies.
// There's no risk of deadlock right now, but there could be if we consolidate
// mutexes in struct Timeline in the future.
let timelines = self.list_timelines();
for timeline in timelines {
timeline.tenant_conf_updated(&new_tenant_conf);
timeline.tenant_conf_updated();
}
}
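The ArcSwap side of this diff follows a read-copy-update pattern that can be shown standalone; a sketch under assumed names, not the project's actual types:

use arc_swap::ArcSwap;
use std::sync::Arc;

struct Conf {
    value: u32,
}

// Writers clone-and-swap atomically; rcu() retries if another writer races.
fn bump(conf: &ArcSwap<Conf>) {
    conf.rcu(|inner| Arc::new(Conf { value: inner.value + 1 }));
}

// Readers take a cheap lock-free snapshot instead of acquiring an RwLock.
fn snapshot(conf: &ArcSwap<Conf>) -> u32 {
    conf.load().value
}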
@@ -2374,8 +2367,11 @@ impl Tenant {
.unwrap_or(psconf.default_tenant_conf.timeline_get_throttle.clone())
}
pub(crate) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) {
let conf = Self::get_timeline_get_throttle_config(self.conf, new_conf);
pub(crate) fn tenant_conf_updated(&self) {
let conf = {
let guard = self.tenant_conf.read().unwrap();
Self::get_timeline_get_throttle_config(self.conf, &guard.tenant_conf)
};
self.timeline_get_throttle.reconfigure(conf)
}
@@ -2523,7 +2519,7 @@ impl Tenant {
Tenant::get_timeline_get_throttle_config(conf, &attached_conf.tenant_conf),
&crate::metrics::tenant_throttling::TIMELINE_GET,
)),
tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
tenant_conf: Arc::new(RwLock::new(attached_conf)),
}
}
@@ -3509,7 +3505,7 @@ impl Tenant {
}
pub(crate) fn get_tenant_conf(&self) -> TenantConfOpt {
self.tenant_conf.load().tenant_conf.clone()
self.tenant_conf.read().unwrap().tenant_conf.clone()
}
}
@@ -3657,9 +3653,6 @@ pub(crate) mod harness {
heatmap_period: Some(tenant_conf.heatmap_period),
lazy_slru_download: Some(tenant_conf.lazy_slru_download),
timeline_get_throttle: Some(tenant_conf.timeline_get_throttle),
image_layer_creation_check_threshold: Some(
tenant_conf.image_layer_creation_check_threshold,
),
}
}
}
@@ -3858,7 +3851,6 @@ mod tests {
use hex_literal::hex;
use pageserver_api::keyspace::KeySpace;
use rand::{thread_rng, Rng};
use tests::timeline::ShutdownMode;
static TEST_KEY: Lazy<Key> =
Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));
@@ -4304,7 +4296,7 @@ mod tests {
make_some_layers(tline.as_ref(), Lsn(0x8000), &ctx).await?;
// so that all uploads finish & we can call harness.load() below again
tenant
.shutdown(Default::default(), ShutdownMode::FreezeAndFlush)
.shutdown(Default::default(), true)
.instrument(harness.span())
.await
.ok()
@@ -4345,7 +4337,7 @@ mod tests {
// so that all uploads finish & we can call harness.load() below again
tenant
.shutdown(Default::default(), ShutdownMode::FreezeAndFlush)
.shutdown(Default::default(), true)
.instrument(harness.span())
.await
.ok()
@@ -5126,7 +5118,7 @@ mod tests {
// Leave the timeline ID in [`Tenant::timelines_creating`] to exclude attempting to create it again
let raw_tline = tline.raw_timeline().unwrap();
raw_tline
.shutdown(super::timeline::ShutdownMode::Hard)
.shutdown()
.instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id, shard_id=%raw_tline.tenant_shard_id.shard_slug(), timeline_id=%TIMELINE_ID))
.await;
std::mem::forget(tline);

View File

@@ -57,9 +57,6 @@ pub mod defaults {
// throughputs up to 1GiB/s per timeline.
pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024;
pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
// By default ingest enough WAL for two new L0 layers before checking if new
// image layers should be created.
pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
}
@@ -365,10 +362,6 @@ pub struct TenantConf {
pub lazy_slru_download: bool,
pub timeline_get_throttle: pageserver_api::models::ThrottleConfig,
// How much WAL must be ingested before checking again whether a new image layer is required.
// Expressed in multiples of checkpoint distance.
pub image_layer_creation_check_threshold: u8,
}
/// Same as TenantConf, but this struct preserves the information about
@@ -461,9 +454,6 @@ pub struct TenantConfOpt {
#[serde(skip_serializing_if = "Option::is_none")]
pub timeline_get_throttle: Option<pageserver_api::models::ThrottleConfig>,
#[serde(skip_serializing_if = "Option::is_none")]
pub image_layer_creation_check_threshold: Option<u8>,
}
impl TenantConfOpt {
@@ -518,9 +508,6 @@ impl TenantConfOpt {
.timeline_get_throttle
.clone()
.unwrap_or(global_conf.timeline_get_throttle),
image_layer_creation_check_threshold: self
.image_layer_creation_check_threshold
.unwrap_or(global_conf.image_layer_creation_check_threshold),
}
}
}
@@ -561,7 +548,6 @@ impl Default for TenantConf {
heatmap_period: Duration::ZERO,
lazy_slru_download: false,
timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
}
}
}
@@ -635,7 +621,6 @@ impl From<TenantConfOpt> for models::TenantConfig {
heatmap_period: value.heatmap_period.map(humantime),
lazy_slru_download: value.lazy_slru_download,
timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
}
}
}

View File

@@ -14,10 +14,7 @@ use crate::{
config::PageServerConf,
context::RequestContext,
task_mgr::{self, TaskKind},
tenant::{
mgr::{TenantSlot, TenantsMapRemoveResult},
timeline::ShutdownMode,
},
tenant::mgr::{TenantSlot, TenantsMapRemoveResult},
};
use super::{
@@ -466,7 +463,7 @@ impl DeleteTenantFlow {
// tenant.shutdown
// Its also bad that we're holding tenants.read here.
// TODO relax set_stopping to be idempotent?
if tenant.shutdown(progress, ShutdownMode::Hard).await.is_err() {
if tenant.shutdown(progress, false).await.is_err() {
return Err(DeleteTenantError::Other(anyhow::anyhow!(
"tenant shutdown is already in progress"
)));

View File

@@ -72,10 +72,6 @@ impl EphemeralFile {
self.len
}
pub(crate) fn id(&self) -> page_cache::FileId {
self.page_cache_file_id
}
pub(crate) async fn read_blk(
&self,
blknum: u32,

View File

@@ -346,6 +346,35 @@ where
}
}
#[derive(PartialEq, Eq, Hash, Debug, Clone)]
pub enum InMemoryLayerHandle {
Open {
lsn_floor: Lsn,
end_lsn: Lsn,
},
Frozen {
idx: usize,
lsn_floor: Lsn,
end_lsn: Lsn,
},
}
impl InMemoryLayerHandle {
pub fn get_lsn_floor(&self) -> Lsn {
match self {
InMemoryLayerHandle::Open { lsn_floor, .. } => *lsn_floor,
InMemoryLayerHandle::Frozen { lsn_floor, .. } => *lsn_floor,
}
}
pub fn get_end_lsn(&self) -> Lsn {
match self {
InMemoryLayerHandle::Open { end_lsn, .. } => *end_lsn,
InMemoryLayerHandle::Frozen { end_lsn, .. } => *end_lsn,
}
}
}
impl LayerMap {
///
/// Find the latest layer (by lsn.end) that covers the given
@@ -547,18 +576,41 @@ impl LayerMap {
self.historic.iter()
}
/// Get a ref counted pointer for the first in memory layer that matches the provided predicate.
pub fn find_in_memory_layer<Pred>(&self, mut pred: Pred) -> Option<Arc<InMemoryLayer>>
/// Get a handle for the first in memory layer that matches the provided predicate.
/// The handle should be used with [`Self::get_in_memory_layer`] to retrieve the actual layer.
///
/// Note: [`Self::find_in_memory_layer`] and [`Self::get_in_memory_layer`] should be called during
/// the same exclusive region established by holding the layer manager lock.
pub fn find_in_memory_layer<Pred>(&self, mut pred: Pred) -> Option<InMemoryLayerHandle>
where
Pred: FnMut(&Arc<InMemoryLayer>) -> bool,
{
if let Some(open) = &self.open_layer {
if pred(open) {
return Some(open.clone());
return Some(InMemoryLayerHandle::Open {
lsn_floor: open.get_lsn_range().start,
end_lsn: open.get_lsn_range().end,
});
}
}
self.frozen_layers.iter().rfind(|l| pred(l)).cloned()
let pos = self.frozen_layers.iter().rev().position(pred);
pos.map(|rev_idx| {
let idx = self.frozen_layers.len() - 1 - rev_idx;
InMemoryLayerHandle::Frozen {
idx,
lsn_floor: self.frozen_layers[idx].get_lsn_range().start,
end_lsn: self.frozen_layers[idx].get_lsn_range().end,
}
})
}
/// Get the layer pointed to by the provided handle.
pub fn get_in_memory_layer(&self, handle: &InMemoryLayerHandle) -> Option<Arc<InMemoryLayer>> {
match handle {
InMemoryLayerHandle::Open { .. } => self.open_layer.clone(),
InMemoryLayerHandle::Frozen { idx, .. } => self.frozen_layers.get(*idx).cloned(),
}
}
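A hypothetical caller sketch of the find/get pair; per the note above, both calls must happen under the same layer manager lock:

// Hypothetical helper (predicate and names assumed): both lookups use the
// same `&LayerMap` borrow, so the frozen-layer index in the handle stays valid.
fn resolve_in_memory_layer(map: &LayerMap, lsn: Lsn) -> Option<Arc<InMemoryLayer>> {
    let handle = map.find_in_memory_layer(|l| l.get_lsn_range().start <= lsn)?;
    map.get_in_memory_layer(&handle)
}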
///

View File

@@ -44,7 +44,6 @@ use crate::tenant::config::{
use crate::tenant::delete::DeleteTenantFlow;
use crate::tenant::span::debug_assert_current_span_has_tenant_id;
use crate::tenant::storage_layer::inmemory_layer;
use crate::tenant::timeline::ShutdownMode;
use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState};
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TEMP_FILE_SUFFIX};
@@ -784,9 +783,11 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
shutdown_state.insert(tenant_shard_id, TenantSlot::Attached(t.clone()));
join_set.spawn(
async move {
let freeze_and_flush = true;
let res = {
let (_guard, shutdown_progress) = completion::channel();
t.shutdown(shutdown_progress, ShutdownMode::FreezeAndFlush).await
t.shutdown(shutdown_progress, freeze_and_flush).await
};
if let Err(other_progress) = res {
@@ -1106,7 +1107,7 @@ impl TenantManager {
};
info!("Shutting down attached tenant");
match tenant.shutdown(progress, ShutdownMode::Hard).await {
match tenant.shutdown(progress, false).await {
Ok(()) => {}
Err(barrier) => {
info!("Shutdown already in progress, waiting for it to complete");
@@ -1222,7 +1223,7 @@ impl TenantManager {
TenantSlot::Attached(tenant) => {
let (_guard, progress) = utils::completion::channel();
info!("Shutting down just-spawned tenant, because tenant manager is shut down");
match tenant.shutdown(progress, ShutdownMode::Hard).await {
match tenant.shutdown(progress, false).await {
Ok(()) => {
info!("Finished shutting down just-spawned tenant");
}
@@ -1272,7 +1273,7 @@ impl TenantManager {
};
let (_guard, progress) = utils::completion::channel();
match tenant.shutdown(progress, ShutdownMode::Hard).await {
match tenant.shutdown(progress, false).await {
Ok(()) => {
slot_guard.drop_old_value()?;
}
@@ -1648,14 +1649,7 @@ impl TenantManager {
fail::fail_point!("shard-split-lsn-wait", |_| Err(anyhow::anyhow!(
"failpoint"
)));
if let Err(e) = timeline
.wait_lsn(
*target_lsn,
crate::tenant::timeline::WaitLsnWaiter::Tenant,
ctx,
)
.await
{
if let Err(e) = timeline.wait_lsn(*target_lsn, ctx).await {
// Failure here might mean shutdown, in any case this part is an optimization
// and we shouldn't hold up the split operation.
tracing::warn!(
@@ -1676,7 +1670,7 @@ impl TenantManager {
// Phase 5: Shut down the parent shard, and erase it from disk
let (_guard, progress) = completion::channel();
match parent.shutdown(progress, ShutdownMode::Hard).await {
match parent.shutdown(progress, false).await {
Ok(()) => {}
Err(other) => {
other.wait().await;
@@ -2663,11 +2657,11 @@ where
let attached_tenant = match slot_guard.get_old_value() {
Some(TenantSlot::Attached(tenant)) => {
// whenever we remove a tenant from memory, we don't want to flush and wait for upload
let shutdown_mode = ShutdownMode::Hard;
let freeze_and_flush = false;
// shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so
// that we can continue safely to cleanup.
match tenant.shutdown(progress, shutdown_mode).await {
match tenant.shutdown(progress, freeze_and_flush).await {
Ok(()) => {}
Err(_other) => {
// if pageserver shutdown or other detach/ignore is already ongoing, we don't want to

View File

@@ -200,7 +200,6 @@ use utils::backoff::{
use std::collections::{HashMap, VecDeque};
use std::sync::atomic::{AtomicU32, Ordering};
use std::sync::{Arc, Mutex};
use std::time::Duration;
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath, TimeoutOrCancel};
use std::ops::DerefMut;
@@ -208,7 +207,7 @@ use tracing::{debug, error, info, instrument, warn};
use tracing::{info_span, Instrument};
use utils::lsn::Lsn;
use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError};
use crate::deletion_queue::DeletionQueueClient;
use crate::metrics::{
MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics,
RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES,
@@ -262,10 +261,6 @@ pub(crate) const INITDB_PRESERVED_PATH: &str = "initdb-preserved.tar.zst";
/// Default buffer size when interfacing with [`tokio::fs::File`].
pub(crate) const BUFFER_SIZE: usize = 32 * 1024;
/// Doing non-essential flushes of deletion queue is subject to this timeout, after
/// which we warn and skip.
const DELETION_QUEUE_FLUSH_TIMEOUT: Duration = Duration::from_secs(10);
pub enum MaybeDeletedIndexPart {
IndexPart(IndexPart),
Deleted(IndexPart),
@@ -593,14 +588,14 @@ impl RemoteTimelineClient {
upload_queue: &mut UploadQueueInitialized,
metadata: TimelineMetadata,
) {
let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
info!(
"scheduling metadata upload up to consistent LSN {disk_consistent_lsn} with {} files ({} changed)",
"scheduling metadata upload with {} files ({} changed)",
upload_queue.latest_files.len(),
upload_queue.latest_files_changes_since_metadata_upload_scheduled,
);
let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
let index_part = IndexPart::new(
upload_queue.latest_files.clone(),
disk_consistent_lsn,
@@ -1055,26 +1050,6 @@ impl RemoteTimelineClient {
Ok(())
}
async fn flush_deletion_queue(&self) -> Result<(), DeletionQueueError> {
match tokio::time::timeout(
DELETION_QUEUE_FLUSH_TIMEOUT,
self.deletion_queue_client.flush_immediate(),
)
.await
{
Ok(result) => result,
Err(_timeout) => {
// Flushing remote deletions is not mandatory: we flush here to make the system easier to test, and
// to ensure that _usually_ objects are really gone after a DELETE is acked. However, in case of deletion
// queue issues (https://github.com/neondatabase/neon/issues/6440), we don't want to wait indefinitely here.
tracing::warn!(
"Timed out waiting for deletion queue flush, acking deletion anyway"
);
Ok(())
}
}
}
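The bounded-flush shape above can be extracted as a generic sketch (helper name hypothetical):

// Treat a slow flush as a warning rather than blocking the caller forever.
async fn flush_with_timeout<F: std::future::Future<Output = ()>>(
    flush: F,
    limit: std::time::Duration,
) {
    if tokio::time::timeout(limit, flush).await.is_err() {
        tracing::warn!("flush timed out after {limit:?}, continuing anyway");
    }
}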
/// Prerequisites: UploadQueue should be in stopped state and deleted_at should be successfully set.
/// The function deletes layer files one by one, then lists the prefix to see if we leaked anything,
/// deletes any leaked files, and finally deletes the index file.
@@ -1124,7 +1099,7 @@ impl RemoteTimelineClient {
// Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't
// taking the burden of listing all the layers that we already know we should delete.
self.flush_deletion_queue().await?;
self.deletion_queue_client.flush_immediate().await?;
let cancel = shutdown_token();
@@ -1198,7 +1173,7 @@ impl RemoteTimelineClient {
// Timeline deletion is rare and we have probably emitted a reasonable number of objects: wait
// for a flush to a persistent deletion list so that we may be sure deletion will occur.
self.flush_deletion_queue().await?;
self.deletion_queue_client.flush_immediate().await?;
fail::fail_point!("timeline-delete-after-index-delete", |_| {
Err(anyhow::anyhow!(
@@ -1594,7 +1569,7 @@ impl RemoteTimelineClient {
/// Use [`RemoteTimelineClient::shutdown`] for graceful stop.
///
/// In-progress operations will still be running after this function returns.
/// Use `task_mgr::shutdown_tasks(Some(TaskKind::RemoteUploadTask), Some(self.tenant_shard_id), Some(timeline_id))`
/// Use `task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id))`
/// to wait for them to complete, after calling this function.
pub(crate) fn stop(&self) {
// Whichever *task* for this RemoteTimelineClient grabs the mutex first will transition the queue

View File

@@ -51,7 +51,7 @@ use tokio_util::sync::CancellationToken;
use tracing::{info_span, instrument, warn, Instrument};
use utils::{
backoff, completion::Barrier, crashsafe::path_with_suffix_extension, failpoint_support, fs_ext,
id::TimelineId, serde_system_time,
id::TimelineId,
};
use super::{
@@ -591,7 +591,7 @@ impl<'a> TenantDownloader<'a> {
let mut progress = SecondaryProgress {
layers_total: heatmap_stats.layers,
bytes_total: heatmap_stats.bytes,
heatmap_mtime: Some(serde_system_time::SystemTime(heatmap_mtime)),
heatmap_mtime: Some(heatmap_mtime),
layers_downloaded: 0,
bytes_downloaded: 0,
};
@@ -786,35 +786,6 @@ impl<'a> TenantDownloader<'a> {
// Existing on-disk layers: just update their access time.
if let Some(on_disk) = timeline_state.on_disk_layers.get(&layer.name) {
tracing::debug!("Layer {} is already on disk", layer.name);
if cfg!(debug_assertions) {
// Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think
// are already present on disk are really there.
let local_path = self
.conf
.timeline_path(tenant_shard_id, &timeline.timeline_id)
.join(layer.name.file_name());
match tokio::fs::metadata(&local_path).await {
Ok(meta) => {
tracing::debug!(
"Layer {} present at {}, size {}",
layer.name,
local_path,
meta.len(),
);
}
Err(e) => {
tracing::warn!(
"Layer {} not found at {} ({})",
layer.name,
local_path,
e
);
debug_assert!(false);
}
}
}
if on_disk.metadata != LayerFileMetadata::from(&layer.metadata)
|| on_disk.access_time != layer.access_time
{

View File

@@ -25,7 +25,7 @@ use std::cmp::{Ordering, Reverse};
use std::collections::hash_map::Entry;
use std::collections::{BinaryHeap, HashMap};
use std::ops::Range;
use std::sync::{Arc, Mutex};
use std::sync::Mutex;
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use tracing::warn;
use utils::history_buffer::HistoryBufferWithDropCounter;
@@ -41,8 +41,8 @@ pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
pub(crate) use layer::{EvictionError, Layer, ResidentLayer};
use self::inmemory_layer::InMemoryLayerFileId;
use super::layer_map::InMemoryLayerHandle;
use super::timeline::layer_manager::LayerManager;
use super::timeline::GetVectoredError;
use super::PageReconstructError;
@@ -204,30 +204,23 @@ impl Default for ValuesReconstructState {
}
}
/// A key that uniquely identifies a layer in a timeline
#[derive(Debug, PartialEq, Eq, Clone, Hash)]
pub(crate) enum LayerId {
PersitentLayerId(PersistentLayerKey),
InMemoryLayerId(InMemoryLayerFileId),
/// Description of layer to be read - the layer map can turn
/// this description into the actual layer.
#[derive(PartialEq, Eq, Hash, Debug, Clone)]
pub(crate) enum ReadableLayerDesc {
Persistent {
desc: PersistentLayerDesc,
lsn_range: Range<Lsn>,
},
InMemory {
handle: InMemoryLayerHandle,
lsn_ceil: Lsn,
},
}
/// Layer wrapper for the read path. Note that it is valid
/// to use these layers even after external operations have
/// been performed on them (compaction, freeze, etc.).
/// Wrapper for `ReadableLayerDesc`, ordered by LSN
#[derive(Debug)]
pub(crate) enum ReadableLayer {
PersistentLayer(Layer),
InMemoryLayer(Arc<InMemoryLayer>),
}
/// A partial description of a read to be done.
#[derive(Debug, Clone)]
struct ReadDesc {
/// An id used to resolve the readable layer within the fringe
layer_id: LayerId,
/// Lsn range for the read, used for selecting the next read
lsn_range: Range<Lsn>,
}
struct ReadableLayerDescOrdered(ReadableLayerDesc);
/// Data structure which maintains a fringe of layers for the
/// read path. The fringe is the set of layers which intersects
@@ -238,64 +231,41 @@ struct ReadDesc {
/// a two layer indexing scheme.
#[derive(Debug)]
pub(crate) struct LayerFringe {
planned_reads_by_lsn: BinaryHeap<ReadDesc>,
layers: HashMap<LayerId, LayerKeyspace>,
}
#[derive(Debug)]
struct LayerKeyspace {
layer: ReadableLayer,
target_keyspace: KeySpace,
layers_by_lsn: BinaryHeap<ReadableLayerDescOrdered>,
layers: HashMap<ReadableLayerDesc, KeySpace>,
}
impl LayerFringe {
pub(crate) fn new() -> Self {
LayerFringe {
planned_reads_by_lsn: BinaryHeap::new(),
layers_by_lsn: BinaryHeap::new(),
layers: HashMap::new(),
}
}
pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayer, KeySpace, Range<Lsn>)> {
let read_desc = match self.planned_reads_by_lsn.pop() {
Some(desc) => desc,
pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayerDesc, KeySpace)> {
let handle = match self.layers_by_lsn.pop() {
Some(h) => h,
None => return None,
};
let removed = self.layers.remove_entry(&read_desc.layer_id);
let removed = self.layers.remove_entry(&handle.0);
match removed {
Some((
_,
LayerKeyspace {
layer,
target_keyspace,
},
)) => Some((layer, target_keyspace, read_desc.lsn_range)),
Some((layer, keyspace)) => Some((layer, keyspace)),
None => unreachable!("fringe internals are always consistent"),
}
}
pub(crate) fn update(
&mut self,
layer: ReadableLayer,
keyspace: KeySpace,
lsn_range: Range<Lsn>,
) {
let layer_id = layer.id();
let entry = self.layers.entry(layer_id.clone());
pub(crate) fn update(&mut self, layer: ReadableLayerDesc, keyspace: KeySpace) {
let entry = self.layers.entry(layer.clone());
match entry {
Entry::Occupied(mut entry) => {
entry.get_mut().target_keyspace.merge(&keyspace);
entry.get_mut().merge(&keyspace);
}
Entry::Vacant(entry) => {
self.planned_reads_by_lsn.push(ReadDesc {
lsn_range,
layer_id: layer_id.clone(),
});
entry.insert(LayerKeyspace {
layer,
target_keyspace: keyspace,
});
self.layers_by_lsn
.push(ReadableLayerDescOrdered(entry.key().clone()));
entry.insert(keyspace);
}
}
}
@@ -307,55 +277,77 @@ impl Default for LayerFringe {
}
}
impl Ord for ReadDesc {
impl Ord for ReadableLayerDescOrdered {
fn cmp(&self, other: &Self) -> Ordering {
let ord = self.lsn_range.end.cmp(&other.lsn_range.end);
let ord = self.0.get_lsn_ceil().cmp(&other.0.get_lsn_ceil());
if ord == std::cmp::Ordering::Equal {
self.lsn_range.start.cmp(&other.lsn_range.start).reverse()
self.0
.get_lsn_floor()
.cmp(&other.0.get_lsn_floor())
.reverse()
} else {
ord
}
}
}
impl PartialOrd for ReadDesc {
impl PartialOrd for ReadableLayerDescOrdered {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl PartialEq for ReadDesc {
impl PartialEq for ReadableLayerDescOrdered {
fn eq(&self, other: &Self) -> bool {
self.lsn_range == other.lsn_range
self.0.get_lsn_floor() == other.0.get_lsn_floor()
&& self.0.get_lsn_ceil() == other.0.get_lsn_ceil()
}
}
impl Eq for ReadDesc {}
impl Eq for ReadableLayerDescOrdered {}
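The heap ordering can be checked in isolation; a standalone sketch using plain integer ranges in place of LSNs:

use std::cmp::Ordering;
use std::collections::BinaryHeap;

// Max-heap by range end; on ties, the smaller start wins (hence the reverse()).
#[derive(PartialEq, Eq)]
struct ByEnd(std::ops::Range<u64>);

impl Ord for ByEnd {
    fn cmp(&self, other: &Self) -> Ordering {
        self.0
            .end
            .cmp(&other.0.end)
            .then_with(|| self.0.start.cmp(&other.0.start).reverse())
    }
}

impl PartialOrd for ByEnd {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

fn main() {
    let mut heap = BinaryHeap::from([ByEnd(5..10), ByEnd(0..10), ByEnd(0..7)]);
    let first = heap.pop().unwrap();
    assert_eq!((first.0.start, first.0.end), (0, 10)); // highest end, lowest start
}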
impl ReadableLayer {
pub(crate) fn id(&self) -> LayerId {
impl ReadableLayerDesc {
pub(crate) fn get_lsn_floor(&self) -> Lsn {
match self {
Self::PersistentLayer(layer) => LayerId::PersitentLayerId(layer.layer_desc().key()),
Self::InMemoryLayer(layer) => LayerId::InMemoryLayerId(layer.file_id()),
ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.start,
ReadableLayerDesc::InMemory { handle, .. } => handle.get_lsn_floor(),
}
}
pub(crate) fn get_lsn_ceil(&self) -> Lsn {
match self {
ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.end,
ReadableLayerDesc::InMemory { lsn_ceil, .. } => *lsn_ceil,
}
}
pub(crate) async fn get_values_reconstruct_data(
&self,
layer_manager: &LayerManager,
keyspace: KeySpace,
lsn_range: Range<Lsn>,
reconstruct_state: &mut ValuesReconstructState,
ctx: &RequestContext,
) -> Result<(), GetVectoredError> {
match self {
ReadableLayer::PersistentLayer(layer) => {
ReadableLayerDesc::Persistent { desc, lsn_range } => {
let layer = layer_manager.get_from_desc(desc);
layer
.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, ctx)
.get_values_reconstruct_data(
keyspace,
lsn_range.clone(),
reconstruct_state,
ctx,
)
.await
}
ReadableLayer::InMemoryLayer(layer) => {
ReadableLayerDesc::InMemory { handle, lsn_ceil } => {
let layer = layer_manager
.layer_map()
.get_in_memory_layer(handle)
.unwrap();
layer
.get_values_reconstruct_data(keyspace, lsn_range.end, reconstruct_state, ctx)
.get_values_reconstruct_data(keyspace, *lsn_ceil, reconstruct_state, ctx)
.await
}
}

View File

@@ -47,7 +47,6 @@ use anyhow::{anyhow, bail, ensure, Context, Result};
use bytes::BytesMut;
use camino::{Utf8Path, Utf8PathBuf};
use futures::StreamExt;
use itertools::Itertools;
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::LayerAccessKind;
use pageserver_api::shard::TenantShardId;
@@ -947,34 +946,6 @@ impl DeltaLayerInner {
Ok(planner.finish())
}
fn get_min_read_buffer_size(
planned_reads: &[VectoredRead],
read_size_soft_max: usize,
) -> usize {
let Some(largest_read) = planned_reads.iter().max_by_key(|read| read.size()) else {
return read_size_soft_max;
};
let largest_read_size = largest_read.size();
if largest_read_size > read_size_soft_max {
// If the read is oversized, it should only contain one key.
let offenders = largest_read
.blobs_at
.as_slice()
.iter()
.map(|(_, blob_meta)| format!("{}@{}", blob_meta.key, blob_meta.lsn))
.join(", ");
tracing::warn!(
"Oversized vectored read ({} > {}) for keys {}",
largest_read_size,
read_size_soft_max,
offenders
);
}
largest_read_size
}
async fn do_reads_and_update_state(
&self,
reads: Vec<VectoredRead>,
@@ -988,8 +959,7 @@ impl DeltaLayerInner {
.expect("Layer is loaded with max vectored bytes config")
.0
.into();
let buf_size = Self::get_min_read_buffer_size(&reads, max_vectored_read_bytes);
let mut buf = Some(BytesMut::with_capacity(buf_size));
let mut buf = Some(BytesMut::with_capacity(max_vectored_read_bytes));
// Note that reads are processed in reverse order (from highest key+lsn).
// This is the order that `ReconstructState` requires such that it can
@@ -1016,7 +986,7 @@ impl DeltaLayerInner {
// We have "lost" the buffer since the lower level IO api
// doesn't return the buffer on error. Allocate a new one.
buf = Some(BytesMut::with_capacity(buf_size));
buf = Some(BytesMut::with_capacity(max_vectored_read_bytes));
continue;
}
@@ -1240,16 +1210,9 @@ impl<'a> pageserver_compaction::interface::CompactionDeltaEntry<'a, Key> for Del
mod test {
use std::collections::BTreeMap;
use itertools::MinMaxResult;
use rand::prelude::{SeedableRng, SliceRandom, StdRng};
use rand::RngCore;
use super::*;
use crate::{
context::DownloadBehavior,
task_mgr::TaskKind,
tenant::{disk_btree::tests::TestDisk, harness::TenantHarness},
DEFAULT_PG_VERSION,
context::DownloadBehavior, task_mgr::TaskKind, tenant::disk_btree::tests::TestDisk,
};
/// Construct an index for a fictional delta layer and then
@@ -1369,229 +1332,4 @@ mod test {
assert_eq!(planned_blobs, expected_blobs);
}
mod constants {
use utils::lsn::Lsn;
/// Offset used by all lsns in this test
pub(super) const LSN_OFFSET: Lsn = Lsn(0x08);
/// Number of unique keys including in the test data
pub(super) const KEY_COUNT: u8 = 60;
/// Max number of different lsns for each key
pub(super) const MAX_ENTRIES_PER_KEY: u8 = 20;
/// Possible value sizes for each key along with a probability weight
pub(super) const VALUE_SIZES: [(usize, u8); 3] = [(100, 2), (1024, 2), (1024 * 1024, 1)];
/// Probability that there will be a gap between the current key and the next one (33.3%)
pub(super) const KEY_GAP_CHANGES: [(bool, u8); 2] = [(true, 1), (false, 2)];
/// The minimum size of a key range in all the generated reads
pub(super) const MIN_RANGE_SIZE: i128 = 10;
/// The number of ranges included in each vectored read
pub(super) const RANGES_COUNT: u8 = 2;
/// The number of vectored reads performed
pub(super) const READS_COUNT: u8 = 100;
/// Soft max size of a vectored read. Will be violated if we have to read keys
/// with values larger than the limit
pub(super) const MAX_VECTORED_READ_BYTES: usize = 64 * 1024;
}
struct Entry {
key: Key,
lsn: Lsn,
value: Vec<u8>,
}
fn generate_entries(rng: &mut StdRng) -> Vec<Entry> {
let mut current_key = Key::MIN;
let mut entries = Vec::new();
for _ in 0..constants::KEY_COUNT {
let count = rng.gen_range(1..constants::MAX_ENTRIES_PER_KEY);
let mut lsns_iter =
std::iter::successors(Some(Lsn(constants::LSN_OFFSET.0 + 0x08)), |lsn| {
Some(Lsn(lsn.0 + 0x08))
});
let mut lsns = Vec::new();
while lsns.len() < count as usize {
let take = rng.gen_bool(0.5);
let lsn = lsns_iter.next().unwrap();
if take {
lsns.push(lsn);
}
}
for lsn in lsns {
let size = constants::VALUE_SIZES
.choose_weighted(rng, |item| item.1)
.unwrap()
.0;
let mut buf = vec![0; size];
rng.fill_bytes(&mut buf);
entries.push(Entry {
key: current_key,
lsn,
value: buf,
})
}
let gap = constants::KEY_GAP_CHANGES
.choose_weighted(rng, |item| item.1)
.unwrap()
.0;
if gap {
current_key = current_key.add(2);
} else {
current_key = current_key.add(1);
}
}
entries
}
struct EntriesMeta {
key_range: Range<Key>,
lsn_range: Range<Lsn>,
index: BTreeMap<(Key, Lsn), Vec<u8>>,
}
fn get_entries_meta(entries: &[Entry]) -> EntriesMeta {
let key_range = match entries.iter().minmax_by_key(|e| e.key) {
MinMaxResult::MinMax(min, max) => min.key..max.key.next(),
_ => panic!("More than one entry is always expected"),
};
let lsn_range = match entries.iter().minmax_by_key(|e| e.lsn) {
MinMaxResult::MinMax(min, max) => min.lsn..Lsn(max.lsn.0 + 1),
_ => panic!("More than one entry is always expected"),
};
let mut index = BTreeMap::new();
for entry in entries.iter() {
index.insert((entry.key, entry.lsn), entry.value.clone());
}
EntriesMeta {
key_range,
lsn_range,
index,
}
}
fn pick_random_keyspace(rng: &mut StdRng, key_range: &Range<Key>) -> KeySpace {
let start = key_range.start.to_i128();
let end = key_range.end.to_i128();
let mut keyspace = KeySpace::default();
for _ in 0..constants::RANGES_COUNT {
let mut range: Option<Range<Key>> = Option::default();
while range.is_none() || keyspace.overlaps(range.as_ref().unwrap()) {
let range_start = rng.gen_range(start..end);
let range_end_offset = range_start + constants::MIN_RANGE_SIZE;
if range_end_offset >= end {
range = Some(Key::from_i128(range_start)..Key::from_i128(end));
} else {
let range_end = rng.gen_range((range_start + constants::MIN_RANGE_SIZE)..end);
range = Some(Key::from_i128(range_start)..Key::from_i128(range_end));
}
}
keyspace.ranges.push(range.unwrap());
}
keyspace
}
#[tokio::test]
async fn test_delta_layer_vectored_read_end_to_end() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_delta_layer_oversized_vectored_read")?;
let (tenant, ctx) = harness.load().await;
let timeline_id = TimelineId::generate();
let timeline = tenant
.create_test_timeline(timeline_id, constants::LSN_OFFSET, DEFAULT_PG_VERSION, &ctx)
.await?;
tracing::info!("Generating test data ...");
let rng = &mut StdRng::seed_from_u64(0);
let entries = generate_entries(rng);
let entries_meta = get_entries_meta(&entries);
tracing::info!("Done generating {} entries", entries.len());
tracing::info!("Writing test data to delta layer ...");
let mut writer = DeltaLayerWriter::new(
harness.conf,
timeline_id,
harness.tenant_shard_id,
entries_meta.key_range.start,
entries_meta.lsn_range.clone(),
)
.await?;
for entry in entries {
let (_, res) = writer
.put_value_bytes(entry.key, entry.lsn, entry.value, false)
.await;
res?;
}
let resident = writer.finish(entries_meta.key_range.end, &timeline).await?;
let inner = resident.get_inner_delta(&ctx).await?;
let file_size = inner.file.metadata().await?.len();
tracing::info!(
"Done writing test data to delta layer. Resulting file size is: {}",
file_size
);
for i in 0..constants::READS_COUNT {
tracing::info!("Doing vectored read {}/{}", i + 1, constants::READS_COUNT);
let block_reader = FileBlockReader::new(&inner.file, inner.file_id);
let index_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
inner.index_start_blk,
inner.index_root_blk,
block_reader,
);
let planner = VectoredReadPlanner::new(constants::MAX_VECTORED_READ_BYTES);
let mut reconstruct_state = ValuesReconstructState::new();
let keyspace = pick_random_keyspace(rng, &entries_meta.key_range);
let data_end_offset = inner.index_start_blk as u64 * PAGE_SZ as u64;
let vectored_reads = DeltaLayerInner::plan_reads(
keyspace.clone(),
entries_meta.lsn_range.clone(),
data_end_offset,
index_reader,
planner,
&mut reconstruct_state,
&ctx,
)
.await?;
let vectored_blob_reader = VectoredBlobReader::new(&inner.file);
let buf_size = DeltaLayerInner::get_min_read_buffer_size(
&vectored_reads,
constants::MAX_VECTORED_READ_BYTES,
);
let mut buf = Some(BytesMut::with_capacity(buf_size));
for read in vectored_reads {
let blobs_buf = vectored_blob_reader
.read_blobs(&read, buf.take().expect("Should have a buffer"))
.await?;
for meta in blobs_buf.blobs.iter() {
let value = &blobs_buf.buf[meta.start..meta.end];
assert_eq!(value, entries_meta.index[&(meta.meta.key, meta.meta.lsn)]);
}
buf = Some(blobs_buf.buf);
}
}
Ok(())
}
}


@@ -44,7 +44,6 @@ use anyhow::{anyhow, bail, ensure, Context, Result};
use bytes::{Bytes, BytesMut};
use camino::{Utf8Path, Utf8PathBuf};
use hex;
use itertools::Itertools;
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::LayerAccessKind;
use pageserver_api::shard::TenantShardId;
@@ -541,25 +540,7 @@ impl ImageLayerInner {
let vectored_blob_reader = VectoredBlobReader::new(&self.file);
for read in reads.into_iter() {
let buf_size = read.size();
if buf_size > max_vectored_read_bytes {
// If the read is oversized, it should only contain one key.
let offenders = read
.blobs_at
.as_slice()
.iter()
.map(|(_, blob_meta)| format!("{}@{}", blob_meta.key, blob_meta.lsn))
.join(", ");
tracing::warn!(
"Oversized vectored read ({} > {}) for keys {}",
buf_size,
max_vectored_read_bytes,
offenders
);
}
let buf = BytesMut::with_capacity(buf_size);
let buf = BytesMut::with_capacity(max_vectored_read_bytes);
let res = vectored_blob_reader.read_blobs(&read, buf).await;
match res {


@@ -12,14 +12,13 @@ use crate::tenant::ephemeral_file::EphemeralFile;
use crate::tenant::storage_layer::ValueReconstructResult;
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::{PageReconstructError, Timeline};
use crate::{page_cache, walrecord};
use crate::walrecord;
use anyhow::{anyhow, ensure, Result};
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::InMemoryLayerInfo;
use pageserver_api::shard::TenantShardId;
use std::collections::{BinaryHeap, HashMap, HashSet};
use std::sync::{Arc, OnceLock};
use std::time::Instant;
use tracing::*;
use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap};
// avoid binding to Write (conflicts with std::io::Write)
@@ -37,14 +36,10 @@ use super::{
ValuesReconstructState,
};
#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)]
pub(crate) struct InMemoryLayerFileId(page_cache::FileId);
pub struct InMemoryLayer {
conf: &'static PageServerConf,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
file_id: InMemoryLayerFileId,
/// This layer contains all the changes from 'start_lsn'. The
/// start is inclusive.
@@ -54,8 +49,6 @@ pub struct InMemoryLayer {
/// Writes are only allowed when this is `None`.
end_lsn: OnceLock<Lsn>,
opened_at: Instant,
/// The above fields never change, except for `end_lsn`, which is only set once.
/// All other changing parts are in `inner`, and protected by a mutex.
inner: RwLock<InMemoryLayerInner>,
@@ -207,10 +200,6 @@ pub(crate) static GLOBAL_RESOURCES: GlobalResources = GlobalResources {
};
impl InMemoryLayer {
pub(crate) fn file_id(&self) -> InMemoryLayerFileId {
self.file_id
}
pub(crate) fn get_timeline_id(&self) -> TimelineId {
self.timeline_id
}
@@ -454,16 +443,13 @@ impl InMemoryLayer {
trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");
let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id).await?;
let key = InMemoryLayerFileId(file.id());
Ok(InMemoryLayer {
file_id: key,
conf,
timeline_id,
tenant_shard_id,
start_lsn,
end_lsn: OnceLock::new(),
opened_at: Instant::now(),
inner: RwLock::new(InMemoryLayerInner {
index: HashMap::new(),
file,
@@ -524,10 +510,6 @@ impl InMemoryLayer {
Ok(())
}
pub(crate) fn get_opened_at(&self) -> Instant {
self.opened_at
}
pub(crate) async fn tick(&self) -> Option<u64> {
let mut inner = self.inner.write().await;
let size = inner.file.len();


@@ -1759,18 +1759,6 @@ impl ResidentLayer {
pub(crate) fn metadata(&self) -> LayerFileMetadata {
self.owner.metadata()
}
#[cfg(test)]
pub(crate) async fn get_inner_delta<'a>(
&'a self,
ctx: &RequestContext,
) -> anyhow::Result<&'a delta_layer::DeltaLayerInner> {
let owner = &self.owner.0;
match self.downloaded.get(owner, ctx).await? {
LayerKind::Delta(d) => Ok(d),
LayerKind::Image(_) => Err(anyhow::anyhow!("Expected a delta layer")),
}
}
}
impl AsLayerDesc for ResidentLayer {


@@ -18,7 +18,7 @@ use utils::{backoff, completion};
static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy<tokio::sync::Semaphore> =
once_cell::sync::Lazy::new(|| {
let total_threads = task_mgr::TOKIO_WORKER_THREADS.get();
let total_threads = *task_mgr::BACKGROUND_RUNTIME_WORKER_THREADS;
let permits = usize::max(
1,
// while a lot of the work is done on spawn_blocking, we still do
@@ -72,7 +72,6 @@ pub(crate) async fn concurrent_background_tasks_rate_limit_permit(
loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation
);
// TODO: assert that we run on BACKGROUND_RUNTIME; requires tokio_unstable Handle::id();
match CONCURRENT_BACKGROUND_TASKS.acquire().await {
Ok(permit) => permit,
Err(_closed) => unreachable!("we never close the semaphore"),
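// Hedged editor's sketch (not part of this diff): the global-semaphore permit
// pattern used above. The permit sizing below stands in for the runtime's
// worker-thread count referenced in the diff and is an assumption of this
// sketch, not the pageserver's actual formula.
use once_cell::sync::Lazy;
use tokio::sync::{Semaphore, SemaphorePermit};

static CONCURRENT_BACKGROUND_TASKS_SKETCH: Lazy<Semaphore> = Lazy::new(|| {
    let total_threads = std::thread::available_parallelism()
        .map(|n| n.get())
        .unwrap_or(1);
    // Keep at least one permit so background work can always make progress.
    Semaphore::new(usize::max(1, total_threads))
});

async fn background_permit() -> SemaphorePermit<'static> {
    match CONCURRENT_BACKGROUND_TASKS_SKETCH.acquire().await {
        Ok(permit) => permit,
        Err(_closed) => unreachable!("we never close the semaphore"),
    }
}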


@@ -9,7 +9,6 @@ pub mod uninit;
mod walreceiver;
use anyhow::{anyhow, bail, ensure, Context, Result};
use arc_swap::ArcSwap;
use bytes::Bytes;
use camino::Utf8Path;
use enumset::EnumSet;
@@ -119,11 +118,11 @@ use self::layer_manager::LayerManager;
use self::logical_size::LogicalSize;
use self::walreceiver::{WalReceiver, WalReceiverConf};
use super::config::TenantConf;
use super::remote_timeline_client::RemoteTimelineClient;
use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline};
use super::{config::TenantConf, storage_layer::ReadableLayerDesc};
use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer};
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub(super) enum FlushLoopState {
@@ -184,7 +183,7 @@ pub(crate) struct AuxFilesState {
pub struct Timeline {
conf: &'static PageServerConf,
tenant_conf: Arc<ArcSwap<AttachedTenantConf>>,
tenant_conf: Arc<RwLock<AttachedTenantConf>>,
myself: Weak<Self>,
@@ -282,12 +281,10 @@ pub struct Timeline {
pub(super) flush_loop_state: Mutex<FlushLoopState>,
/// layer_flush_start_tx can be used to wake up the layer-flushing task.
/// - The u64 value is a counter, incremented every time a new flush cycle is requested.
/// The flush cycle counter is sent back on the layer_flush_done channel when
/// the flush finishes. You can use that to wait for the flush to finish.
/// - The LSN is updated to max() of its current value and the latest disk_consistent_lsn
/// read by whoever sends an update
layer_flush_start_tx: tokio::sync::watch::Sender<(u64, Lsn)>,
/// The value is a counter, incremented every time a new flush cycle is requested.
/// The flush cycle counter is sent back on the layer_flush_done channel when
/// the flush finishes. You can use that to wait for the flush to finish.
layer_flush_start_tx: tokio::sync::watch::Sender<u64>,
/// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel
layer_flush_done_tx: tokio::sync::watch::Sender<(u64, Result<(), FlushLayerError>)>,
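// Hedged editor's sketch (not part of this diff): the request/acknowledge
// pattern the doc comment above describes, using two tokio watch channels.
// One side of this diff carries (counter, Lsn) on the start channel; the
// sketch uses a plain u64 payload to stay self-contained.
use tokio::sync::watch;

#[tokio::main]
async fn main() {
    let (start_tx, mut start_rx) = watch::channel((0u64, 0u64)); // (counter, payload)
    let (done_tx, mut done_rx) = watch::channel(0u64); // echoes the counter back

    tokio::spawn(async move {
        while start_rx.changed().await.is_ok() {
            let (counter, _payload) = *start_rx.borrow();
            // ... perform the flush work for `_payload` here ...
            let _ = done_tx.send(counter);
        }
    });

    // Requester: bump the counter, max-merge the payload, then wait for the
    // counter to be echoed back on the done channel.
    let mut my_request = 0u64;
    start_tx.send_modify(|(counter, payload)| {
        *counter += 1;
        my_request = *counter;
        *payload = (*payload).max(42);
    });
    while *done_rx.borrow_and_update() < my_request {
        if done_rx.changed().await.is_err() {
            break;
        }
    }
}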
@@ -312,8 +309,6 @@ pub struct Timeline {
/// Configuration: how often should the partitioning be recalculated.
repartition_threshold: u64,
last_image_layer_creation_check_at: AtomicLsn,
/// Current logical size of the "datadir", at the last LSN.
current_logical_size: LogicalSize,
@@ -615,25 +610,6 @@ pub enum GetVectoredImpl {
Vectored,
}
pub(crate) enum WaitLsnWaiter<'a> {
Timeline(&'a Timeline),
Tenant,
PageService,
}
/// Argument to [`Timeline::shutdown`].
#[derive(Debug, Clone, Copy)]
pub(crate) enum ShutdownMode {
/// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then
/// also to remote storage. This method can easily take multiple seconds for a busy timeline.
///
/// While we are flushing, we continue to accept read I/O for LSNs ingested before
/// the call to [`Timeline::shutdown`].
FreezeAndFlush,
/// Shut down immediately, without waiting for any open layers to flush.
Hard,
}
/// Public interface functions
impl Timeline {
/// Get the LSN where this branch was created
@@ -1082,8 +1058,7 @@ impl Timeline {
pub(crate) async fn wait_lsn(
&self,
lsn: Lsn,
who_is_waiting: WaitLsnWaiter<'_>,
ctx: &RequestContext, /* Prepare for use by cancellation */
_ctx: &RequestContext, /* Prepare for use by cancellation */
) -> Result<(), WaitLsnError> {
if self.cancel.is_cancelled() {
return Err(WaitLsnError::Shutdown);
@@ -1091,28 +1066,20 @@ impl Timeline {
return Err(WaitLsnError::BadState);
}
if cfg!(debug_assertions) {
match ctx.task_kind() {
TaskKind::WalReceiverManager
| TaskKind::WalReceiverConnectionHandler
| TaskKind::WalReceiverConnectionPoller => {
let is_myself = match who_is_waiting {
WaitLsnWaiter::Timeline(waiter) => Weak::ptr_eq(&waiter.myself, &self.myself),
WaitLsnWaiter::Tenant | WaitLsnWaiter::PageService => unreachable!("tenant or page_service context are not expected to have task kind {:?}", ctx.task_kind()),
};
if is_myself {
if let Err(current) = self.last_record_lsn.would_wait_for(lsn) {
// walingest is the only one that can advance last_record_lsn; it should make sure to never reach here
panic!("this timeline's walingest task is calling wait_lsn({lsn}) but we only have last_record_lsn={current}; would deadlock");
}
} else {
// if another timeline's walreceiver is waiting for us, there's no deadlock risk because
// our walreceiver task can make progress independent of theirs
}
}
_ => {}
}
}
// This should never be called from the WAL receiver, because that could lead
// to a deadlock.
debug_assert!(
task_mgr::current_task_kind() != Some(TaskKind::WalReceiverManager),
"wait_lsn cannot be called in WAL receiver"
);
debug_assert!(
task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionHandler),
"wait_lsn cannot be called in WAL receiver"
);
debug_assert!(
task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionPoller),
"wait_lsn cannot be called in WAL receiver"
);
let _timer = crate::metrics::WAIT_LSN_TIME.start_timer();
@@ -1171,8 +1138,8 @@ impl Timeline {
/// Flush to disk all data that was written with the put_* functions
#[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))]
pub(crate) async fn freeze_and_flush(&self) -> anyhow::Result<()> {
let to_lsn = self.freeze_inmem_layer(false).await;
self.flush_frozen_layers_and_wait(to_lsn).await
self.freeze_inmem_layer(false).await;
self.flush_frozen_layers_and_wait().await
}
/// If there is no writer, and conditions for rolling the latest layer are met, then freeze it.
@@ -1192,39 +1159,7 @@ impl Timeline {
};
let Some(open_layer) = &layers_guard.layer_map().open_layer else {
// If there is no open layer, we have no layer freezing to do. However, we might need to generate
// some updates to disk_consistent_lsn and remote_consistent_lsn, in case we ingested some WAL regions
// that didn't result in writes to this shard.
// Must not hold the layers lock while waiting for a flush.
drop(layers_guard);
let last_record_lsn = self.get_last_record_lsn();
let disk_consistent_lsn = self.get_disk_consistent_lsn();
if last_record_lsn > disk_consistent_lsn {
// We have no open layer, but disk_consistent_lsn is behind the last record: this indicates
// we are a sharded tenant and have skipped some WAL
let last_freeze_ts = *self.last_freeze_ts.read().unwrap();
if last_freeze_ts.elapsed() >= self.get_checkpoint_timeout() {
// This should be somewhat rare, so we log it at INFO level.
//
// We checked for checkpoint timeout so that a shard without any
// data ingested (yet) doesn't write a remote index as soon as it
// sees its LSN advance: we only do this if we've been layer-less
// for some time.
tracing::info!(
"Advancing disk_consistent_lsn past WAL ingest gap {} -> {}",
disk_consistent_lsn,
last_record_lsn
);
// The flush loop will update remote consistent LSN as well as disk consistent LSN.
self.flush_frozen_layers_and_wait(last_record_lsn)
.await
.ok();
}
}
// No open layer, no work to do.
return;
};
@@ -1257,7 +1192,7 @@ impl Timeline {
checkpoint_distance,
self.get_last_record_lsn(),
self.last_freeze_at.load(),
open_layer.get_opened_at(),
*self.last_freeze_ts.read().unwrap(),
) {
match open_layer.info() {
InMemoryLayerInfo::Frozen { lsn_start, lsn_end } => {
@@ -1353,119 +1288,83 @@ impl Timeline {
self.launch_eviction_task(parent, background_jobs_can_start);
}
/// After this function returns, no timeline-scoped tasks are left running.
/// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then
/// also to remote storage. This method can easily take multiple seconds for a busy timeline.
///
/// The preferred pattern is:
/// - in any spawned tasks, keep Timeline::guard open + Timeline::cancel / child token
/// - if early shutdown (not just cancellation) of a sub-tree of tasks is required,
/// go the extra mile and keep track of JoinHandles
/// - Keep track of JoinHandles using a passed-down `Arc<Mutex<Option<JoinSet>>>` or similar,
/// instead of spawning directly on a runtime. It is a more composable / testable pattern.
///
/// For legacy reasons, we still have multiple tasks spawned using
/// `task_mgr::spawn(X, Some(tenant_id), Some(timeline_id))`.
/// We refer to these as "timeline-scoped task_mgr tasks".
/// Some of these tasks are already sensitive to Timeline::cancel while others are
/// not sensitive to Timeline::cancel and instead respect [`task_mgr::shutdown_token`]
/// or [`task_mgr::shutdown_watcher`].
/// We want to gradually convert the code base away from these.
///
/// Here is an inventory of timeline-scoped task_mgr tasks that are still sensitive to
/// `task_mgr::shutdown_{token,watcher}` (there are also tenant-scoped and global-scoped
/// ones that aren't mentioned here):
/// - [`TaskKind::TimelineDeletionWorker`]
/// - NB: also used for tenant deletion
/// - [`TaskKind::RemoteUploadTask`]
/// - [`TaskKind::InitialLogicalSizeCalculation`]
/// - [`TaskKind::DownloadAllRemoteLayers`] (can we get rid of it?)
// Inventory of timeline-scoped task_mgr tasks that use spawn but aren't sensitive:
/// - [`TaskKind::Eviction`]
/// - [`TaskKind::LayerFlushTask`]
/// - [`TaskKind::OndemandLogicalSizeCalculation`]
/// - [`TaskKind::GarbageCollector`] (immediate_gc is timeline-scoped)
pub(crate) async fn shutdown(&self, mode: ShutdownMode) {
/// While we are flushing, we continue to accept read I/O.
pub(crate) async fn flush_and_shutdown(&self) {
debug_assert_current_span_has_tenant_and_timeline_id();
let try_freeze_and_flush = match mode {
ShutdownMode::FreezeAndFlush => true,
ShutdownMode::Hard => false,
};
// Stop ingesting data, so that we are not still writing to an InMemoryLayer while
// trying to flush
tracing::debug!("Waiting for WalReceiverManager...");
task_mgr::shutdown_tasks(
Some(TaskKind::WalReceiverManager),
Some(self.tenant_shard_id),
Some(self.timeline_id),
)
.await;
// Regardless of whether we're going to try_freeze_and_flush
// or not, stop ingesting any more data. Walreceiver only provides
// cancellation but no "wait until gone", because it uses the Timeline::gate.
// So, only after the self.gate.close() below will we know for sure that
// no walreceiver tasks are left.
// For `try_freeze_and_flush=true`, this means that we might still be ingesting
// data during the call to `self.freeze_and_flush()` below.
// That's not ideal, but, we don't have the concept of a ChildGuard,
// which is what we'd need to properly model early shutdown of the walreceiver
// task sub-tree before the other Timeline task sub-trees.
let walreceiver = self.walreceiver.lock().unwrap().take();
tracing::debug!(
is_some = walreceiver.is_some(),
"Waiting for WalReceiverManager..."
);
if let Some(walreceiver) = walreceiver {
walreceiver.cancel();
}
// ... and inform any waiters for newer LSNs that there won't be any.
// Since we have shut down WAL ingest, we should not let anyone start waiting for the LSN to advance
self.last_record_lsn.shutdown();
if try_freeze_and_flush {
// we shut down walreceiver above, so, we won't add anything more
// to the InMemoryLayer; freeze it and wait for all frozen layers
// to reach the disk & upload queue, then shut the upload queue and
// wait for it to drain.
match self.freeze_and_flush().await {
Ok(_) => {
// drain the upload queue
if let Some(client) = self.remote_client.as_ref() {
// if we did not wait for completion here, our shutdown process might not
// have waited for remote uploads to complete at all, as new tasks can forever
// be spawned.
//
// what is problematic is the shutting down of RemoteTimelineClient, because
// obviously it does not make sense to stop while we wait for it, but what
// about corner cases like s3 suddenly hanging up?
client.shutdown().await;
}
}
Err(e) => {
// Non-fatal. Shutdown is infallible. Failures to flush just mean that
// we have some extra WAL replay to do next time the timeline starts.
warn!("failed to freeze and flush: {e:#}");
// now all writers to InMemory layer are gone, do the final flush if requested
match self.freeze_and_flush().await {
Ok(_) => {
// drain the upload queue
if let Some(client) = self.remote_client.as_ref() {
// if we did not wait for completion here, our shutdown process might not
// have waited for remote uploads to complete at all, as new tasks can forever
// be spawned.
//
// what is problematic is the shutting down of RemoteTimelineClient, because
// obviously it does not make sense to stop while we wait for it, but what
// about corner cases like s3 suddenly hanging up?
client.shutdown().await;
}
}
Err(e) => {
// Non-fatal. Shutdown is infallible. Failures to flush just mean that
// we have some extra WAL replay to do next time the timeline starts.
warn!("failed to freeze and flush: {e:#}");
}
}
self.shutdown().await;
}
/// Shut down immediately, without waiting for any open layers to flush to disk. This is a subset of
/// the graceful [`Timeline::flush_and_shutdown`] function.
pub(crate) async fn shutdown(&self) {
debug_assert_current_span_has_tenant_and_timeline_id();
// Signal any subscribers to our cancellation token to drop out
tracing::debug!("Cancelling CancellationToken");
self.cancel.cancel();
// Transition the remote_client into a state where it's only useful for timeline deletion.
// (The deletion use case is why we can't just hook up remote_client to Self::cancel.)
// Page request handlers might be waiting for LSN to advance: they do not respect Timeline::cancel
// while doing so.
self.last_record_lsn.shutdown();
// Shut down the layer flush task before the remote client, as one depends on the other
task_mgr::shutdown_tasks(
Some(TaskKind::LayerFlushTask),
Some(self.tenant_shard_id),
Some(self.timeline_id),
)
.await;
// Shut down remote timeline client: this gracefully moves its metadata into its Stopping state in
// case our caller wants to use that for a deletion
if let Some(remote_client) = self.remote_client.as_ref() {
remote_client.stop();
// As documented in remote_client.stop()'s doc comment, it's our responsibility
// to shut down the upload queue tasks.
// TODO: fix that, task management should be encapsulated inside remote_client.
task_mgr::shutdown_tasks(
Some(TaskKind::RemoteUploadTask),
Some(self.tenant_shard_id),
Some(self.timeline_id),
)
.await;
}
// TODO: work toward making this a no-op. See this function's doc comment for more context.
tracing::debug!("Waiting for tasks...");
task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), Some(self.timeline_id)).await;
// Finally wait until any gate-holders are complete.
//
// TODO: once above shutdown_tasks is a no-op, we can close the gate before calling shutdown_tasks
// and use a TBD variant of shutdown_tasks that asserts that there were no tasks left.
// Finally wait until any gate-holders are complete
self.gate.close().await;
self.metrics.shutdown();
@@ -1622,7 +1521,7 @@ impl Timeline {
checkpoint_distance: u64,
projected_lsn: Lsn,
last_freeze_at: Lsn,
opened_at: Instant,
last_freeze_ts: Instant,
) -> bool {
let distance = projected_lsn.widening_sub(last_freeze_at);
@@ -1648,13 +1547,13 @@ impl Timeline {
);
true
} else if distance > 0 && opened_at.elapsed() >= self.get_checkpoint_timeout() {
} else if distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout() {
info!(
"Will roll layer at {} with layer size {} due to time since first write to the layer ({:?})",
projected_lsn,
layer_size,
opened_at.elapsed()
);
"Will roll layer at {} with layer size {} due to time since last flush ({:?})",
projected_lsn,
layer_size,
last_freeze_ts.elapsed()
);
true
} else {
@@ -1669,65 +1568,57 @@ const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10;
// Private functions
impl Timeline {
pub(crate) fn get_lazy_slru_download(&self) -> bool {
let tenant_conf = self.tenant_conf.load();
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
tenant_conf
.tenant_conf
.lazy_slru_download
.unwrap_or(self.conf.default_tenant_conf.lazy_slru_download)
}
fn get_checkpoint_distance(&self) -> u64 {
let tenant_conf = self.tenant_conf.load();
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
tenant_conf
.tenant_conf
.checkpoint_distance
.unwrap_or(self.conf.default_tenant_conf.checkpoint_distance)
}
fn get_checkpoint_timeout(&self) -> Duration {
let tenant_conf = self.tenant_conf.load();
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
tenant_conf
.tenant_conf
.checkpoint_timeout
.unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout)
}
fn get_compaction_target_size(&self) -> u64 {
let tenant_conf = self.tenant_conf.load();
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
tenant_conf
.tenant_conf
.compaction_target_size
.unwrap_or(self.conf.default_tenant_conf.compaction_target_size)
}
fn get_compaction_threshold(&self) -> usize {
let tenant_conf = self.tenant_conf.load();
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
tenant_conf
.tenant_conf
.compaction_threshold
.unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
}
fn get_image_creation_threshold(&self) -> usize {
let tenant_conf = self.tenant_conf.load();
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
tenant_conf
.tenant_conf
.image_creation_threshold
.unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
}
fn get_compaction_algorithm(&self) -> CompactionAlgorithm {
let tenant_conf = &self.tenant_conf.load();
let tenant_conf = &self.tenant_conf.read().unwrap().tenant_conf;
tenant_conf
.tenant_conf
.compaction_algorithm
.unwrap_or(self.conf.default_tenant_conf.compaction_algorithm)
}
fn get_eviction_policy(&self) -> EvictionPolicy {
let tenant_conf = self.tenant_conf.load();
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
tenant_conf
.tenant_conf
.eviction_policy
.unwrap_or(self.conf.default_tenant_conf.eviction_policy)
}
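// Hedged editor's sketch (not part of this diff): the lock-free config
// snapshot pattern behind the `self.tenant_conf.load()` calls above, using
// the arc_swap crate. Types are simplified stand-ins, not pageserver types.
use std::sync::Arc;
use arc_swap::ArcSwap;

struct ConfSketch {
    checkpoint_distance: Option<u64>,
}

struct TimelineSketch {
    tenant_conf: Arc<ArcSwap<ConfSketch>>,
}

impl TimelineSketch {
    fn get_checkpoint_distance(&self) -> u64 {
        // `load()` takes a cheap snapshot of the current Arc; readers never
        // block a concurrent `store()` swapping in an updated config, unlike
        // the RwLock::read() path on the other side of this diff.
        let conf = self.tenant_conf.load();
        conf.checkpoint_distance.unwrap_or(256 * 1024 * 1024)
    }
}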
@@ -1741,26 +1632,14 @@ impl Timeline {
.unwrap_or(default_tenant_conf.evictions_low_residence_duration_metric_threshold)
}
fn get_image_layer_creation_check_threshold(&self) -> u8 {
let tenant_conf = self.tenant_conf.load();
tenant_conf
.tenant_conf
.image_layer_creation_check_threshold
.unwrap_or(
self.conf
.default_tenant_conf
.image_layer_creation_check_threshold,
)
}
pub(super) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) {
pub(super) fn tenant_conf_updated(&self) {
// NB: Most tenant conf options are read by background loops, so,
// changes will automatically be picked up.
// The threshold is embedded in the metric. So, we need to update it.
{
let new_threshold = Self::get_evictions_low_residence_duration_metric_threshold(
new_conf,
&self.tenant_conf.read().unwrap().tenant_conf,
&self.conf.default_tenant_conf,
);
@@ -1787,7 +1666,7 @@ impl Timeline {
#[allow(clippy::too_many_arguments)]
pub(super) fn new(
conf: &'static PageServerConf,
tenant_conf: Arc<ArcSwap<AttachedTenantConf>>,
tenant_conf: Arc<RwLock<AttachedTenantConf>>,
metadata: &TimelineMetadata,
ancestor: Option<Arc<Timeline>>,
timeline_id: TimelineId,
@@ -1803,16 +1682,17 @@ impl Timeline {
let disk_consistent_lsn = metadata.disk_consistent_lsn();
let (state, _) = watch::channel(state);
let (layer_flush_start_tx, _) = tokio::sync::watch::channel((0, disk_consistent_lsn));
let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0);
let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(())));
let evictions_low_residence_duration_metric_threshold = {
let loaded_tenant_conf = tenant_conf.load();
let tenant_conf_guard = tenant_conf.read().unwrap();
let evictions_low_residence_duration_metric_threshold =
Self::get_evictions_low_residence_duration_metric_threshold(
&loaded_tenant_conf.tenant_conf,
&tenant_conf_guard.tenant_conf,
&conf.default_tenant_conf,
)
};
);
drop(tenant_conf_guard);
Arc::new_cyclic(|myself| {
let mut result = Timeline {
@@ -1889,7 +1769,6 @@ impl Timeline {
},
partitioning: tokio::sync::Mutex::new((KeyPartitioning::new(), Lsn(0))),
repartition_threshold: 0,
last_image_layer_creation_check_at: AtomicLsn::new(0),
last_received_wal: Mutex::new(None),
rel_size_cache: RwLock::new(HashMap::new()),
@@ -1918,7 +1797,6 @@ impl Timeline {
};
result.repartition_threshold =
result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
result
.metrics
.last_record_gauge
@@ -1995,19 +1873,20 @@ impl Timeline {
self.timeline_id, self.tenant_shard_id
);
let tenant_conf = self.tenant_conf.load();
let wal_connect_timeout = tenant_conf
let tenant_conf_guard = self.tenant_conf.read().unwrap();
let wal_connect_timeout = tenant_conf_guard
.tenant_conf
.walreceiver_connect_timeout
.unwrap_or(self.conf.default_tenant_conf.walreceiver_connect_timeout);
let lagging_wal_timeout = tenant_conf
let lagging_wal_timeout = tenant_conf_guard
.tenant_conf
.lagging_wal_timeout
.unwrap_or(self.conf.default_tenant_conf.lagging_wal_timeout);
let max_lsn_wal_lag = tenant_conf
let max_lsn_wal_lag = tenant_conf_guard
.tenant_conf
.max_lsn_wal_lag
.unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag);
drop(tenant_conf_guard);
let mut guard = self.walreceiver.lock().unwrap();
assert!(
@@ -2555,6 +2434,10 @@ impl Timeline {
debug!("cancelling logical size calculation for timeline shutdown");
calculation.await
}
_ = task_mgr::shutdown_watcher() => {
debug!("cancelling logical size calculation for task shutdown");
calculation.await
}
}
}
@@ -3009,6 +2892,16 @@ impl Timeline {
let mut completed_keyspace = KeySpace::default();
// Hold the layer map whilst visiting the timeline to prevent
// compaction, eviction and flushes from rendering the layers unreadable.
//
// TODO: Do we actually need to do this? In theory holding on
// to [`tenant::storage_layer::Layer`] should be enough. However,
// [`Timeline::get`] also holds the lock during IO, so more investigation
// is needed.
let guard = timeline.layers.read().await;
let layers = guard.layer_map();
loop {
if cancel.is_cancelled() {
return Err(GetVectoredError::Cancelled);
@@ -3018,9 +2911,6 @@ impl Timeline {
unmapped_keyspace.remove_overlapping_with(&keys_done_last_step);
completed_keyspace.merge(&keys_done_last_step);
let guard = timeline.layers.read().await;
let layers = guard.layer_map();
let in_memory_layer = layers.find_in_memory_layer(|l| {
let start_lsn = l.get_lsn_range().start;
cont_lsn > start_lsn
@@ -3028,11 +2918,12 @@ impl Timeline {
match in_memory_layer {
Some(l) => {
let lsn_range = l.get_lsn_range().start..cont_lsn;
fringe.update(
ReadableLayer::InMemoryLayer(l),
ReadableLayerDesc::InMemory {
handle: l,
lsn_ceil: cont_lsn,
},
unmapped_keyspace.clone(),
lsn_range,
);
}
None => {
@@ -3044,43 +2935,30 @@ impl Timeline {
.into_iter()
.map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
(
ReadableLayer::PersistentLayer(guard.get_from_desc(&layer)),
ReadableLayerDesc::Persistent {
desc: (*layer).clone(),
lsn_range: lsn_floor..cont_lsn,
},
keyspace_accum.to_keyspace(),
lsn_floor..cont_lsn,
)
})
.for_each(|(layer, keyspace, lsn_range)| {
fringe.update(layer, keyspace, lsn_range)
});
.for_each(|(layer, keyspace)| fringe.update(layer, keyspace));
}
}
}
// It's safe to drop the layer map lock after planning the next round of reads.
// The fringe keeps readable handles for the layers which are safe to read even
// if layers were compacted or flushed.
//
// The more interesting consideration is: "Why is the read algorithm still correct
// if the layer map changes while it is operating?". Doing a vectored read on a
// timeline boils down to pushing an imaginary lsn boundary downwards for each range
// covered by the read. The layer map tells us how to move the lsn downwards for a
// range at *a particular point in time*. It is fine for the answer to be different
// at two different time points.
drop(guard);
if let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() {
let next_cont_lsn = lsn_range.start;
if let Some((layer_to_read, keyspace_to_read)) = fringe.next_layer() {
layer_to_read
.get_values_reconstruct_data(
&guard,
keyspace_to_read.clone(),
lsn_range,
reconstruct_state,
ctx,
)
.await?;
unmapped_keyspace = keyspace_to_read;
cont_lsn = next_cont_lsn;
cont_lsn = layer_to_read.get_lsn_floor();
} else {
break;
}
@@ -3158,7 +3036,7 @@ impl Timeline {
}
}
ancestor
.wait_lsn(self.ancestor_lsn, WaitLsnWaiter::Timeline(self), ctx)
.wait_lsn(self.ancestor_lsn, ctx)
.await
.map_err(|e| match e {
e @ WaitLsnError::Timeout(_) => GetReadyAncestorError::AncestorLsnTimeout(e),
@@ -3208,9 +3086,7 @@ impl Timeline {
self.last_record_lsn.advance(new_lsn);
}
/// Whether there was a layer to freeze or not, return the value of get_last_record_lsn
/// before we attempted the freeze: this guarantees that ingested data is frozen up to this lsn (inclusive).
async fn freeze_inmem_layer(&self, write_lock_held: bool) -> Lsn {
async fn freeze_inmem_layer(&self, write_lock_held: bool) {
// Freeze the current open in-memory layer. It will be written to disk on next
// iteration.
@@ -3220,9 +3096,7 @@ impl Timeline {
Some(self.write_lock.lock().await)
};
let to_lsn = self.get_last_record_lsn();
self.freeze_inmem_layer_at(to_lsn).await;
to_lsn
self.freeze_inmem_layer_at(self.get_last_record_lsn()).await;
}
async fn freeze_inmem_layer_at(&self, at: Lsn) {
@@ -3235,24 +3109,25 @@ impl Timeline {
/// Layer flusher task's main loop.
async fn flush_loop(
self: &Arc<Self>,
mut layer_flush_start_rx: tokio::sync::watch::Receiver<(u64, Lsn)>,
mut layer_flush_start_rx: tokio::sync::watch::Receiver<u64>,
ctx: &RequestContext,
) {
info!("started flush loop");
loop {
tokio::select! {
_ = self.cancel.cancelled() => {
info!("shutting down layer flush task due to Timeline::cancel");
info!("shutting down layer flush task");
break;
},
_ = task_mgr::shutdown_watcher() => {
info!("shutting down layer flush task");
break;
},
_ = layer_flush_start_rx.changed() => {}
}
trace!("waking up");
let (flush_counter, frozen_to_lsn) = *layer_flush_start_rx.borrow();
// The highest LSN to which we flushed in the loop over frozen layers
let mut flushed_to_lsn = Lsn(0);
let flush_counter = *layer_flush_start_rx.borrow();
let result = loop {
if self.cancel.is_cancelled() {
info!("dropping out of flush loop for timeline shutdown");
@@ -3273,9 +3148,7 @@ impl Timeline {
break Ok(());
};
match self.flush_frozen_layer(layer_to_flush, ctx).await {
Ok(this_layer_to_lsn) => {
flushed_to_lsn = std::cmp::max(flushed_to_lsn, this_layer_to_lsn);
}
Ok(()) => {}
Err(FlushLayerError::Cancelled) => {
info!("dropping out of flush loop for timeline shutdown");
return;
@@ -3284,36 +3157,11 @@ impl Timeline {
FlushLayerError::Other(_) | FlushLayerError::CreateImageLayersError(_),
) => {
error!("could not flush frozen layer: {err:?}");
break err.map(|_| ());
break err;
}
}
timer.stop_and_record();
};
// Unsharded tenants should never advance their LSN beyond the end of the
// highest layer they write: such gaps between layer data and the frozen LSN
// are only legal on sharded tenants.
debug_assert!(
self.shard_identity.count.count() > 1
|| flushed_to_lsn >= frozen_to_lsn
|| !flushed_to_lsn.is_valid()
);
if flushed_to_lsn < frozen_to_lsn && self.shard_identity.count.count() > 1 {
// If our layer flushes didn't carry disk_consistent_lsn up to the `to_lsn` advertised
// to us via layer_flush_start_rx, then advance it here.
//
// This path is only taken for tenants with multiple shards: single sharded tenants should
// never encounter a gap in the wal.
let old_disk_consistent_lsn = self.disk_consistent_lsn.load();
tracing::debug!("Advancing disk_consistent_lsn across layer gap {old_disk_consistent_lsn}->{frozen_to_lsn}");
if self.set_disk_consistent_lsn(frozen_to_lsn) {
if let Err(e) = self.schedule_uploads(frozen_to_lsn, vec![]) {
tracing::warn!("Failed to schedule metadata upload after updating disk_consistent_lsn: {e}");
}
}
}
// Notify any listeners that we're done
let _ = self
.layer_flush_done_tx
@@ -3321,13 +3169,7 @@ impl Timeline {
}
}
/// Request the flush loop to write out all frozen layers up to `to_lsn` as Delta L0 files to disk.
/// The caller is responsible for the freezing, e.g., [`Self::freeze_inmem_layer`].
///
/// `last_record_lsn` may be higher than the highest LSN of a frozen layer: if this is the case,
/// it means no data will be written between the top of the highest frozen layer and to_lsn,
/// e.g. because this tenant shard has ingested up to to_lsn and not written any data locally for that part of the WAL.
async fn flush_frozen_layers_and_wait(&self, last_record_lsn: Lsn) -> anyhow::Result<()> {
async fn flush_frozen_layers_and_wait(&self) -> anyhow::Result<()> {
let mut rx = self.layer_flush_done_tx.subscribe();
// Increment the flush cycle counter and wake up the flush task.
@@ -3341,10 +3183,9 @@ impl Timeline {
anyhow::bail!("cannot flush frozen layers when flush_loop is not running, state is {flush_loop_state:?}")
}
self.layer_flush_start_tx.send_modify(|(counter, lsn)| {
self.layer_flush_start_tx.send_modify(|counter| {
my_flush_request = *counter + 1;
*counter = my_flush_request;
*lsn = std::cmp::max(last_record_lsn, *lsn);
});
loop {
@@ -3381,22 +3222,16 @@ impl Timeline {
}
fn flush_frozen_layers(&self) {
self.layer_flush_start_tx.send_modify(|(counter, lsn)| {
*counter += 1;
*lsn = std::cmp::max(*lsn, Lsn(self.last_freeze_at.load().0 - 1));
});
self.layer_flush_start_tx.send_modify(|val| *val += 1);
}
/// Flush one frozen in-memory layer to disk, as a new delta layer.
///
/// Return value is the last lsn (inclusive) of the layer that was frozen.
#[instrument(skip_all, fields(layer=%frozen_layer))]
async fn flush_frozen_layer(
self: &Arc<Self>,
frozen_layer: Arc<InMemoryLayer>,
ctx: &RequestContext,
) -> Result<Lsn, FlushLayerError> {
) -> Result<(), FlushLayerError> {
debug_assert_current_span_has_tenant_and_timeline_id();
// As a special case, when we have just imported an image into the repository,
@@ -3471,6 +3306,7 @@ impl Timeline {
}
let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1);
let old_disk_consistent_lsn = self.disk_consistent_lsn.load();
// The new on-disk layers are now in the layer map. We can remove the
// in-memory layer from the map now. The flushed layer is stored in
@@ -3484,7 +3320,10 @@ impl Timeline {
guard.finish_flush_l0_layer(delta_layer_to_add.as_ref(), &frozen_layer, &self.metrics);
if self.set_disk_consistent_lsn(disk_consistent_lsn) {
if disk_consistent_lsn != old_disk_consistent_lsn {
assert!(disk_consistent_lsn > old_disk_consistent_lsn);
self.disk_consistent_lsn.store(disk_consistent_lsn);
// Schedule remote uploads that will reflect our new disk_consistent_lsn
self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?;
}
@@ -3501,22 +3340,7 @@ impl Timeline {
// This failpoint is used by another test case `test_pageserver_recovery`.
fail_point!("flush-frozen-exit");
Ok(Lsn(lsn_range.end.0 - 1))
}
/// Return true if the value changed
///
/// This function must only be used from the layer flush task, and may not be called concurrently.
fn set_disk_consistent_lsn(&self, new_value: Lsn) -> bool {
// We do a simple load/store cycle: that's why this function isn't safe for concurrent use.
let old_value = self.disk_consistent_lsn.load();
if new_value != old_value {
assert!(new_value >= old_value);
self.disk_consistent_lsn.store(new_value);
true
} else {
false
}
Ok(())
}
/// Update metadata file
@@ -3677,24 +3501,6 @@ impl Timeline {
// Is it time to create a new image layer for the given partition?
async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool {
let last = self.last_image_layer_creation_check_at.load();
if lsn != Lsn(0) {
let distance = lsn
.checked_sub(last)
.expect("Attempt to compact with LSN going backwards");
let min_distance = self.get_image_layer_creation_check_threshold() as u64
* self.get_checkpoint_distance();
// Skip the expensive delta layer counting below if we've not ingested
// sufficient WAL since the last check.
if distance.0 < min_distance {
return false;
}
}
self.last_image_layer_creation_check_at.store(lsn);
let threshold = self.get_image_creation_threshold();
let guard = self.layers.read().await;
@@ -4036,24 +3842,6 @@ impl Timeline {
Ok(())
}
/// Schedules the uploads of the given image layers
fn upload_new_image_layers(
self: &Arc<Self>,
new_images: impl IntoIterator<Item = ResidentLayer>,
) -> anyhow::Result<()> {
let Some(remote_client) = &self.remote_client else {
return Ok(());
};
for layer in new_images {
remote_client.schedule_layer_file_upload(layer)?;
}
// should any new image layer have been created, not uploading index_part will
// result in a mismatch between remote_physical_size and layermap calculated
// size, which will fail some tests, but should not be an issue otherwise.
remote_client.schedule_index_upload_for_file_changes()?;
Ok(())
}
/// Update information about which layer files need to be retained on
/// garbage collection. This is separate from actually performing the GC,
/// and is updated more frequently, so that compaction can remove obsolete
@@ -4703,16 +4491,23 @@ struct TimelineWriterState {
max_lsn: Option<Lsn>,
// Cached details of the last freeze. Avoids going through the atomic/lock on every put.
cached_last_freeze_at: Lsn,
cached_last_freeze_ts: Instant,
}
impl TimelineWriterState {
fn new(open_layer: Arc<InMemoryLayer>, current_size: u64, last_freeze_at: Lsn) -> Self {
fn new(
open_layer: Arc<InMemoryLayer>,
current_size: u64,
last_freeze_at: Lsn,
last_freeze_ts: Instant,
) -> Self {
Self {
open_layer,
current_size,
prev_lsn: None,
max_lsn: None,
cached_last_freeze_at: last_freeze_at,
cached_last_freeze_ts: last_freeze_ts,
}
}
}
@@ -4811,10 +4606,12 @@ impl<'a> TimelineWriter<'a> {
let initial_size = layer.size().await?;
let last_freeze_at = self.last_freeze_at.load();
let last_freeze_ts = *self.last_freeze_ts.read().unwrap();
self.write_guard.replace(TimelineWriterState::new(
layer,
initial_size,
last_freeze_at,
last_freeze_ts,
));
Ok(())
@@ -4861,7 +4658,7 @@ impl<'a> TimelineWriter<'a> {
self.get_checkpoint_distance(),
lsn,
state.cached_last_freeze_at,
state.open_layer.get_opened_at(),
state.cached_last_freeze_ts,
) {
OpenLayerAction::Roll
} else {
